
    wivk                         d dl mZmZmZmZmZ d dlZd dlmZm	Z	 ddl
mZ ddlmZmZ ddlmZ ddlmZmZmZ dd	lmZ d
dlmZmZ  ej        e          ZdZddZ G d dee          ZdS )    )CallableDictListOptionalUnionN)T5EncoderModelT5Tokenizer   )LoraLoaderMixin)Kandinsky3UNetVQModel)DDPMScheduler)	deprecateloggingreplace_example_docstring)randn_tensor   )DiffusionPipelineImagePipelineOutputa  
    Examples:
        ```py
        >>> from diffusers import AutoPipelineForText2Image
        >>> import torch

        >>> pipe = AutoPipelineForText2Image.from_pretrained(
        ...     "kandinsky-community/kandinsky-3", variant="fp16", torch_dtype=torch.float16
        ... )
        >>> pipe.enable_model_cpu_offload()

        >>> prompt = "A photograph of the inside of a subway train. There are raccoons sitting on the seats. One of them is reading a newspaper. The window shows the city in the background."

        >>> generator = torch.Generator(device="cpu").manual_seed(0)
        >>> image = pipe(prompt, num_inference_steps=25, generator=generator).images[0]
        ```

   c                 z    | |dz  z  }| |dz  z  dk    r|dz  }||dz  z  }||dz  z  dk    r|dz  }||z  ||z  fS )Nr   r       )heightwidthscale_factor
new_height	new_widths        /root/.openclaw/workspace/chatterbox_venv_py311/lib/python3.11/site-packages/diffusers/pipelines/kandinsky3/pipeline_kandinsky3.pydownscale_height_and_widthr    (   sm    <?*Ja1$$a
q(I|Q!##Q	$i,&>>>    c            %       @    e Zd ZdZg dZdedededede	f
 fdZ
d	 Z ej                    	 	 	 	 	 	 	 	 	 d*deej                 deej                 deej                 deej                 fd            Zd Z	 	 	 	 	 	 d+dZed             Zed             Zed             Z ej                     ee          dddddddddddddd
dddgfdeeee         f         deded eeeee         f                  d!ee         d"ee         d#ee         d$eeej        eej                 f                  deej                 deej                 deej                 deej                 d%ee         d&ed'ee eee!gdf                  d(ee         f d)                        Z" xZ#S ),Kandinsky3Pipelineztext_encoder->unet->movq)latentsprompt_embedsnegative_prompt_embedsnegative_attention_maskattention_mask	tokenizertext_encoderunet	schedulermovqc                 |    t                                                       |                     |||||           d S )N)r)   r*   r+   r,   r-   )super__init__register_modules)selfr)   r*   r+   r,   r-   	__class__s         r   r0   zKandinsky3Pipeline.__init__<   sO     	lQZae 	 	
 	
 	
 	
 	
r!   c                     |rkt          j        ||dk                       ||dk    <   |                    d                                          dz   }|d d d |f         }|d d d |f         }||fS )Nr   r   )torch
zeros_likesummax)r2   
embeddingsr(   cut_contextmax_seq_lengths        r   process_embedsz!Kandinsky3Pipeline.process_embedsJ   s     	@.3.>z.\]J]?^._._J~*++//337799A=N#AAA$67J+AAA,>?N>))r!   Tr   NFr%   r&   r(   r'   c                 :   |P|Nt          |          t          |          ur0t          dt          |           dt          |           d          || j        }|t          |t                    rd}n4|%t          |t
                    rt          |          }n|j        d         }d}||                     |d|d	d
          }|j	        
                    |          }|j        
                    |          }	|                     ||	          }|d         }|                     ||	|          \  }}	||	                    d          z  }| j        | j        j        }nd}|
                    ||          }|j        \  }}}|                    d|d          }|                    ||z  |d          }|	                    |d          }	|rF|C|dg|z  }nWt          |t                    r|g}n>|t          |          k    r)t%          d| dt          |           d| d| d	          |}||                     |ddd	d	d
          }|j	        
                    |          }|j        
                    |          }
|                     ||
          }|d         }|ddd|j        d         f         }|
ddd|j        d         f         }
||
                    d          z  }n(t'          j        |          }t'          j        |	          }
|r||j        d         }|
                    ||          }|j        |j        k    rG|                    d|d          }|                    ||z  |d          }|
                    |d          }
nd}d}
|||	|
fS )aX  
        Encodes the prompt into text encoder hidden states.

        Args:
            prompt (`str` or `List[str]`, *optional*):
                prompt to be encoded
            device: (`torch.device`, *optional*):
                torch device to place the resulting embeddings on
            num_images_per_prompt (`int`, *optional*, defaults to 1):
                number of images that should be generated per prompt
            do_classifier_free_guidance (`bool`, *optional*, defaults to `True`):
                whether to use classifier free guidance or not
            negative_prompt (`str` or `List[str]`, *optional*):
                The prompt or prompts not to guide the image generation. If not defined, one has to pass
                `negative_prompt_embeds`. instead. If not defined, one has to pass `negative_prompt_embeds`. instead.
                Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`).
            prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                provided, text embeddings will be generated from `prompt` input argument.
            negative_prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
                argument.
            attention_mask (`torch.Tensor`, *optional*):
                Pre-generated attention mask. Must provide if passing `prompt_embeds` directly.
            negative_attention_mask (`torch.Tensor`, *optional*):
                Pre-generated negative attention mask. Must provide if passing `negative_prompt_embeds` directly.
        Nz?`negative_prompt` should be the same type to `prompt`, but got z != .r   r      
max_lengthTpt)paddingrA   
truncationreturn_tensors)r(   r   )dtypedevicer5    z`negative_prompt`: z has batch size z, but `prompt`: zT. Please make sure that passed `negative_prompt` matches the batch size of `prompt`.)rC   rA   rD   return_attention_maskrE   )type	TypeError_execution_device
isinstancestrlistlenshaper)   	input_idstor(   r*   r=   	unsqueezerF   repeatview
ValueErrorr6   r7   )r2   promptdo_classifier_free_guidancenum_images_per_promptrG   negative_promptr%   r&   _cut_contextr(   r'   
batch_sizerA   text_inputstext_input_idsrF   bs_embedseq_len_uncond_tokensuncond_inputs                        r   encode_promptz Kandinsky3Pipeline.encode_promptR   s   T /"=F||4#8#888(VZ[jVkVk ( (V( ( (  
 >+F*VS"9"9JJJvt$<$<VJJ&,Q/J
 ..$%# )  K )255f==N(7::6BBN --- .  M *!,M,0,?,?~_k,l,l)M>)N,D,DQ,G,GGM(%+EEE%((uV(DD,2'1%,,Q0EqII%**86K+KWVXYY'../DaHH& &	K+A+I &!#z 1OS11 	0!0 1s?3333 3/ 3 33K_K_ 3 33 30:3 3 3   !0*#~~!("#*.#'  .     ".!7!:!:6!B!B*6*E*H*H*P*P')-):):"#: *; * *& *@)B&)?C[]EXYZE[C[@[)\&*A!!!E]}GZ[\G]E]B]*^')?BYBcBcdeBfBf)f&& */)9-)H)H&*/*:>*J*J'& 	+,215G%;%>%>USY%>%Z%Z"%+}/BBB)?)F)FqJ_ab)c)c&)?)D)DZRgEgiprt)u)u&*A*H*HI^`a*b*b' &*"&*#4nF]]]r!   c                     |t          ||||          }n:|j        |k    rt          d|j         d|           |                    |          }||j        z  }|S )N)	generatorrG   rF   zUnexpected latents shape, got z, expected )r   rQ   rW   rS   init_noise_sigma)r2   rQ   rF   rG   rg   r$   r,   s          r   prepare_latentsz"Kandinsky3Pipeline.prepare_latents   sr    ?"5IfTYZZZGG}%% !c'-!c!c\a!c!cdddjj((GI66r!   c	                     |>t          |t                    r|dk    r#t          d| dt          |           d          |At	           fd|D                       s&t          d j         d fd|D                        ||t          d	| d
| d          ||t          d          |It          |t                    s4t          |t                    st          dt          |                     ||t          d| d| d          |2|0|j        |j        k    r t          d|j         d|j         d          ||t          d          |B|@|j        d d         |j        k    r(t          d|j        d d          d|j         d          ||t          d          |B|B|j        d d         |j        k    r,t          d|j        d d          d|j         d          d S d S d S )Nr   z5`callback_steps` has to be a positive integer but is z	 of type r?   c              3   *   K   | ]}|j         v V  d S N_callback_tensor_inputs.0kr2   s     r   	<genexpr>z2Kandinsky3Pipeline.check_inputs.<locals>.<genexpr>  D       F
 F
23A--F
 F
 F
 F
 F
 F
r!   2`callback_on_step_end_tensor_inputs` has to be in , but found c                 &    g | ]}|j         v|S r   rm   ro   s     r   
<listcomp>z3Kandinsky3Pipeline.check_inputs.<locals>.<listcomp>  V      pH  pH  pHvw  bc  ko  kG  bG  bGpq  bG  bG  bGr!   zCannot forward both `prompt`: z and `prompt_embeds`: z2. Please make sure to only forward one of the two.zeProvide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined.z2`prompt` has to be of type `str` or `list` but is z'Cannot forward both `negative_prompt`: z and `negative_prompt_embeds`: zu`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but got: `prompt_embeds` z != `negative_prompt_embeds` zLPlease provide `negative_attention_mask` along with `negative_prompt_embeds`r   z`negative_prompt_embeds` and `negative_attention_mask` must have the same batch_size and token length when passed directly, but got: `negative_prompt_embeds` z != `negative_attention_mask` z:Please provide `attention_mask` along with `prompt_embeds`z`prompt_embeds` and `attention_mask` must have the same batch_size and token length when passed directly, but got: `prompt_embeds` z != `attention_mask` )	rM   intrW   rJ   allrn   rN   rO   rQ   )	r2   rX   callback_stepsr[   r%   r&   "callback_on_step_end_tensor_inputsr(   r'   s	   `        r   check_inputszKandinsky3Pipeline.check_inputs   s    %z.#/N/N%R`deReRe, , ,((, , ,   .9# F
 F
 F
 F
7YF
 F
 F
 C
 C
9  JTEa  J  J  pH  pH  pH  pH  |^  pH  pH  pH  J  J   -";0 0 0} 0 0 0   ^ 5w   FC)@)@TZ\`IaIa`RVW]R^R^``aaa&+A+M_/ _ _*_ _ _  
 $)?)K"&<&BBB 8-:-@8 8.48 8 8  
 "-2I2Qklll!-2I2U%+BQB/3J3PPP 96L6RSUTUSU6V9 9/59 9 9   $)?YZZZ$)C"2A2&.*>>> 0-:-@!-D0 0&,0 0 0   %$)C)C>>r!   c                     | j         S rl   _guidance_scaler2   s    r   guidance_scalez!Kandinsky3Pipeline.guidance_scale9  s    ##r!   c                     | j         dk    S )Nr   r   r   s    r   rY   z.Kandinsky3Pipeline.do_classifier_free_guidance=  s    #a''r!   c                     | j         S rl   )_num_timestepsr   s    r   num_timestepsz Kandinsky3Pipeline.num_timestepsA  s    ""r!      g      @i   pilr$   rX   num_inference_stepsr   r[   rZ   r   r   rg   output_typereturn_dictcallback_on_step_endr|   c                 p
    |                     dd          }|                     dd          }|t          ddd           |t          ddd           |At           fd|D                       s&t          d j         d	 fd
|D                        d} j        }                     ||||	|
|||           | _        |t          |t                    rd}n4|%t          |t                    rt          |          }n|	j        d         }                     | j        ||||	|
|||
  
        \  }	}
}} j        r>t          j        |
|	g          }	t          j        ||g                                          } j                            ||            j        j        }t+          ||d          \  }}                     ||z  d||f|	j        ||| j                  }t1           d          r  j         j                                         t          |          | j        j        z  z
  }t          |           _                             |          5 }t=          |          D ]\  }} j        rt          j        |gdz            n|}                     |||	|d          d         } j        r&|                     d          \  }} |dz   | z  ||z  z
  } j        !                    ||||          j"        }|i }!|D ]}"tG                      |"         |!|"<    | |||!          }#|#                     d|          }|#                     d|	          }	|#                     d|
          }
|#                     d|          }|#                     d|          }|t          |          dz
  k    s|dz   |k    r[|dz    j        j        z  dk    rE|$                                 |/||z  dk    r&|tK           j        dd          z  }$ ||$||           |dvrt          d |           |d!k    s j&        '                    |d"          d#         }%|d$v rl|%d%z  d%z   }%|%(                    dd          }%|%)                                *                    ddd&d          +                                ,                                }%|d'k    r -                    |%          }%n|}% .                                 |s|%fcddd           S t_          |%(          cddd           S # 1 swxY w Y   dS ))u"  
        Function invoked when calling the pipeline for generation.

        Args:
            prompt (`str` or `List[str]`, *optional*):
                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
                instead.
            num_inference_steps (`int`, *optional*, defaults to 25):
                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                expense of slower inference.
            timesteps (`List[int]`, *optional*):
                Custom timesteps to use for the denoising process. If not defined, equal spaced `num_inference_steps`
                timesteps are used. Must be in descending order.
            guidance_scale (`float`, *optional*, defaults to 3.0):
                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
                `guidance_scale` is defined as `w` of equation 2. of [Imagen
                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
                usually at the expense of lower image quality.
            negative_prompt (`str` or `List[str]`, *optional*):
                The prompt or prompts not to guide the image generation. If not defined, one has to pass
                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
                less than `1`).
            num_images_per_prompt (`int`, *optional*, defaults to 1):
                The number of images to generate per prompt.
            height (`int`, *optional*, defaults to self.unet.config.sample_size):
                The height in pixels of the generated image.
            width (`int`, *optional*, defaults to self.unet.config.sample_size):
                The width in pixels of the generated image.
            eta (`float`, *optional*, defaults to 0.0):
                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
                [`schedulers.DDIMScheduler`], will be ignored for others.
            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
                to make generation deterministic.
            prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                provided, text embeddings will be generated from `prompt` input argument.
            negative_prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
                argument.
            attention_mask (`torch.Tensor`, *optional*):
                Pre-generated attention mask. Must provide if passing `prompt_embeds` directly.
            negative_attention_mask (`torch.Tensor`, *optional*):
                Pre-generated negative attention mask. Must provide if passing `negative_prompt_embeds` directly.
            output_type (`str`, *optional*, defaults to `"pil"`):
                The output format of the generate image. Choose between
                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~pipelines.stable_diffusion.IFPipelineOutput`] instead of a plain tuple.
            callback (`Callable`, *optional*):
                A function that will be called every `callback_steps` steps during inference. The function will be
                called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
            callback_steps (`int`, *optional*, defaults to 1):
                The frequency at which the `callback` function will be called. If not specified, the callback will be
                called at every step.
            clean_caption (`bool`, *optional*, defaults to `True`):
                Whether or not to clean the caption before creating embeddings. Requires `beautifulsoup4` and `ftfy` to
                be installed. If the dependencies are not installed, the embeddings will be created from the raw
                prompt.
            cross_attention_kwargs (`dict`, *optional*):
                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
                `self.processor` in
                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).

        Examples:

        Returns:
            [`~pipelines.ImagePipelineOutput`] or `tuple`

        callbackNr{   z1.0.0zhPassing `callback` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`znPassing `callback_steps` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`c              3   *   K   | ]}|j         v V  d S rl   rm   ro   s     r   rr   z.Kandinsky3Pipeline.__call__.<locals>.<genexpr>  rs   r!   rt   ru   c                 &    g | ]}|j         v|S r   rm   ro   s     r   rw   z/Kandinsky3Pipeline.__call__.<locals>.<listcomp>  rx   r!   Tr   r   )rZ   rG   r[   r%   r&   r\   r(   r'   )rG   r      text_encoder_offload_hook)totalr   F)encoder_hidden_statesencoder_attention_maskr   g      ?)rg   r$   r%   r&   r(   r'   order)rB   npr   latentzSOnly the output types `pt`, `pil`, `np` and `latent` are supported not output_type=r   )force_not_quantizesample)r   r   g      ?r
   r   )images)0popr   rz   rW   rn   rL   r}   r   rM   rN   rO   rP   rQ   re   rY   r6   catboolr,   set_timesteps	timestepsr    ri   rF   hasattrr   offloadr   r   progress_bar	enumerater+   chunkstepprev_samplelocalsupdategetattrr-   decodeclampcpupermutefloatnumpynumpy_to_pilmaybe_free_model_hooksr   )&r2   rX   r   r   r[   rZ   r   r   rg   r%   r&   r(   r'   r   r   r$   r   r|   kwargsr   r{   r;   rG   r]   r   num_warmup_stepsr   itlatent_model_input
noise_prednoise_pred_uncondnoise_pred_textcallback_kwargsrq   callback_outputsstep_idximages&   `                                     r   __call__zKandinsky3Pipeline.__call__E  s(   @ ::j$//$4d;;z  
 %  A   .9# F
 F
 F
 F
7YF
 F
 F
 C
 C
9  JTEa  J  J  pH  pH  pH  pH  |^  pH  pH  pH  J  J   ' 	".#		
 		
 		
  .*VS"9"9JJJvt$<$<VJJ&,Q/J Z^YkYk,"7+'#9$)$; Zl Z
 Z
V-~?V + 	Y!I'=}&MNNM"Y(?'PQQVVXXN$$%8$HHHN,	 365!DD&&//FEBN
 
 4455 	5$:X:d*22444 y>>,?$.BV,VV!)nn%899 E	5\!),, *7 *71AEAa%nUYy1}%=%=%=gn" "YY&*7+9 % '   
 3 o9C9I9I!9L9L6%"03"6/!IN]nLn!nJ .--'	 .  
   (3&(O? 9 9-3XXa[**';';D!Q'X'X$.229gFFG$4$8$8-$X$XM-=-A-ABZ\r-s-s*%5%9%9:JN%[%[N.>.B.BC\^u.v.v+I***A9I/I/IqSTuX\XfXlNlpqNqNq '')))+N0Ba0G0G#$(K(K#K 1g666 "??? wjuww   (**	((T(JJ8T-//!CK#-E!KK1--E!IIKK//1a;;AACCIIKKE%'' --e44E'')))  xGE	5 E	5 E	5 E	5 E	5 E	5 E	5 E	5J 'e444KE	5 E	5 E	5 E	5 E	5 E	5 E	5 E	5 E	5 E	5 E	5 E	5 E	5 E	5 E	5 E	5 E	5 E	5s   J#T+T++T/2T/)	Tr   NNNNFNN)NNNNNN)$__name__
__module____qualname__model_cpu_offload_seqrn   r	   r   r   r   r   r0   r=   r6   no_gradr   Tensorre   ri   r}   propertyr   rY   r   r   EXAMPLE_DOC_STRINGr   rN   r   ry   r   	Generatorr   r   r   r   __classcell__)r3   s   @r   r#   r#   2   sq       6  

 %
 	

 !
 
 
 
 
 
 
* * * U]__ %)049=15:>S^ S^  -S^ !) 6S^ !.S^ "*%,!7S^ S^ S^ _S^j	 	 	 #+/ $D D D DL $ $ X$ ( ( X( # # X# U]__122 )-#% #;?/0 $#MQ049=15:>%* KO9B%y5 y5c49n%y5 !y5 	y5
 "%T#Y"78y5  (}y5 y5 }y5 E%/43H"HIJy5  -y5 !) 6y5 !.y5 "*%,!7y5 c]y5 y5" 'xc40@$0F'GH#y5$ -1I%y5 y5 y5 32 _y5 y5 y5 y5 y5r!   r#   )r   ) typingr   r   r   r   r   r6   transformersr   r	   loadersr   modelsr   r   
schedulersr   utilsr   r   r   utils.torch_utilsr   pipeline_utilsr   r   
get_loggerr   loggerr   r    r#   r   r!   r   <module>r      s[   8 8 8 8 8 8 8 8 8 8 8 8 8 8  4 4 4 4 4 4 4 4 & & & & & & - - - - - - - - ' ' ' ' ' '         
 . - - - - - C C C C C C C C 
	H	%	% (? ? ? ?N5 N5 N5 N5 N5*O N5 N5 N5 N5 N5r!   