
import inspect
from typing import List, Optional, Tuple, Union

import torch
from torch.nn import functional as F
from transformers import CLIPTextModelWithProjection, CLIPTokenizer
from transformers.models.clip.modeling_clip import CLIPTextModelOutput

from ...models import PriorTransformer, UNet2DConditionModel, UNet2DModel
from ...schedulers import UnCLIPScheduler
from ...utils import logging
from ...utils.torch_utils import randn_tensor
from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput
from .text_proj import UnCLIPTextProjModel


logger = logging.get_logger(__name__)  # pylint: disable=invalid-name


class UnCLIPPipeline(DiffusionPipeline):
    """
    Pipeline for text-to-image generation using unCLIP.

    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
    implemented for all pipelines (downloading, saving, running on a particular device, etc.).

    Args:
        text_encoder ([`~transformers.CLIPTextModelWithProjection`]):
            Frozen text-encoder.
        tokenizer ([`~transformers.CLIPTokenizer`]):
            A `CLIPTokenizer` to tokenize text.
        prior ([`PriorTransformer`]):
            The canonical unCLIP prior to approximate the image embedding from the text embedding.
        text_proj ([`UnCLIPTextProjModel`]):
            Utility class to prepare and combine the embeddings before they are passed to the decoder.
        decoder ([`UNet2DConditionModel`]):
            The decoder to invert the image embedding into an image.
        super_res_first ([`UNet2DModel`]):
            Super resolution UNet. Used in all but the last step of the super resolution diffusion process.
        super_res_last ([`UNet2DModel`]):
            Super resolution UNet. Used in the last step of the super resolution diffusion process.
        prior_scheduler ([`UnCLIPScheduler`]):
            Scheduler used in the prior denoising process (a modified [`DDPMScheduler`]).
        decoder_scheduler ([`UnCLIPScheduler`]):
            Scheduler used in the decoder denoising process (a modified [`DDPMScheduler`]).
        super_res_scheduler ([`UnCLIPScheduler`]):
            Scheduler used in the super resolution denoising process (a modified [`DDPMScheduler`]).

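    Example:

        A minimal usage sketch (not from the original docstring; it assumes the public
        `kakaobrain/karlo-v1-alpha` checkpoint, which hosts all of the components above):

        ```py
        >>> import torch
        >>> from diffusers import UnCLIPPipeline

        >>> pipe = UnCLIPPipeline.from_pretrained("kakaobrain/karlo-v1-alpha", torch_dtype=torch.float16)
        >>> pipe = pipe.to("cuda")
        >>> image = pipe("a high-quality photo of a red panda").images[0]
        ```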
    """

    _exclude_from_cpu_offload = ["prior"]

    prior: PriorTransformer
    decoder: UNet2DConditionModel
    text_proj: UnCLIPTextProjModel
    text_encoder: CLIPTextModelWithProjection
    tokenizer: CLIPTokenizer
    super_res_first: UNet2DModel
    super_res_last: UNet2DModel

    prior_scheduler: UnCLIPScheduler
    decoder_scheduler: UnCLIPScheduler
    super_res_scheduler: UnCLIPScheduler

    model_cpu_offload_seq = "text_encoder->text_proj->decoder->super_res_first->super_res_last"

    def __init__(
        self,
        prior: PriorTransformer,
        decoder: UNet2DConditionModel,
        text_encoder: CLIPTextModelWithProjection,
        tokenizer: CLIPTokenizer,
        text_proj: UnCLIPTextProjModel,
        super_res_first: UNet2DModel,
        super_res_last: UNet2DModel,
        prior_scheduler: UnCLIPScheduler,
        decoder_scheduler: UnCLIPScheduler,
        super_res_scheduler: UnCLIPScheduler,
    ):
        super().__init__()

        self.register_modules(
            prior=prior,
            decoder=decoder,
            text_encoder=text_encoder,
            tokenizer=tokenizer,
            text_proj=text_proj,
            super_res_first=super_res_first,
            super_res_last=super_res_last,
            prior_scheduler=prior_scheduler,
            decoder_scheduler=decoder_scheduler,
            super_res_scheduler=super_res_scheduler,
        )

    def prepare_latents(self, shape, dtype, device, generator, latents, scheduler):
        if latents is None:
            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
        else:
            if latents.shape != shape:
                raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}")
            latents = latents.to(device)

        latents = latents * scheduler.init_noise_sigma
        return latents

    def _encode_prompt(
        self,
        prompt,
        device,
        num_images_per_prompt,
        do_classifier_free_guidance,
        text_model_output: Optional[Union[CLIPTextModelOutput, Tuple]] = None,
        text_attention_mask: Optional[torch.Tensor] = None,
    ):
        if text_model_output is None:
            batch_size = len(prompt) if isinstance(prompt, list) else 1
            # get prompt text embeddings
            text_inputs = self.tokenizer(
                prompt,
                padding="max_length",
                max_length=self.tokenizer.model_max_length,
                truncation=True,
                return_tensors="pt",
            )
            text_input_ids = text_inputs.input_ids
            text_mask = text_inputs.attention_mask.bool().to(device)

            untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids

            if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
                text_input_ids, untruncated_ids
            ):
                removed_text = self.tokenizer.batch_decode(
                    untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
                )
                logger.warning(
                    "The following part of your input was truncated because CLIP can only handle sequences up to"
                    f" {self.tokenizer.model_max_length} tokens: {removed_text}"
                )
                text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length]

            text_encoder_output = self.text_encoder(text_input_ids.to(device))

            prompt_embeds = text_encoder_output.text_embeds
            text_enc_hid_states = text_encoder_output.last_hidden_state

        else:
            batch_size = text_model_output[0].shape[0]
            prompt_embeds, text_enc_hid_states = text_model_output[0], text_model_output[1]
            text_mask = text_attention_mask

        prompt_embeds = prompt_embeds.repeat_interleave(num_images_per_prompt, dim=0)
        text_enc_hid_states = text_enc_hid_states.repeat_interleave(num_images_per_prompt, dim=0)
        text_mask = text_mask.repeat_interleave(num_images_per_prompt, dim=0)

        if do_classifier_free_guidance:
            uncond_tokens = [""] * batch_size

            uncond_input = self.tokenizer(
                uncond_tokens,
                padding="max_length",
                max_length=self.tokenizer.model_max_length,
                truncation=True,
                return_tensors="pt",
            )
            uncond_text_mask = uncond_input.attention_mask.bool().to(device)
            negative_prompt_embeds_text_encoder_output = self.text_encoder(uncond_input.input_ids.to(device))

            negative_prompt_embeds = negative_prompt_embeds_text_encoder_output.text_embeds
            uncond_text_enc_hid_states = negative_prompt_embeds_text_encoder_output.last_hidden_state

            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method

            seq_len = negative_prompt_embeds.shape[1]
            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt)
            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len)

            seq_len = uncond_text_enc_hid_states.shape[1]
            uncond_text_enc_hid_states = uncond_text_enc_hid_states.repeat(1, num_images_per_prompt, 1)
            uncond_text_enc_hid_states = uncond_text_enc_hid_states.view(
                batch_size * num_images_per_prompt, seq_len, -1
            )
            uncond_text_mask = uncond_text_mask.repeat_interleave(num_images_per_prompt, dim=0)

            # done duplicates

            # For classifier free guidance, we need to do two forward passes.
            # Here we concatenate the unconditional and text embeddings into a single batch
            # to avoid doing two forward passes
            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
            text_enc_hid_states = torch.cat([uncond_text_enc_hid_states, text_enc_hid_states])

            text_mask = torch.cat([uncond_text_mask, text_mask])

        return prompt_embeds, text_enc_hid_states, text_mask

    @torch.no_grad()
    def __call__(
        self,
        prompt: Optional[Union[str, List[str]]] = None,
        num_images_per_prompt: int = 1,
        prior_num_inference_steps: int = 25,
        decoder_num_inference_steps: int = 25,
        super_res_num_inference_steps: int = 7,
        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
        prior_latents: Optional[torch.Tensor] = None,
        decoder_latents: Optional[torch.Tensor] = None,
        super_res_latents: Optional[torch.Tensor] = None,
        text_model_output: Optional[Union[CLIPTextModelOutput, Tuple]] = None,
        text_attention_mask: Optional[torch.Tensor] = None,
        prior_guidance_scale: float = 4.0,
        decoder_guidance_scale: float = 8.0,
        output_type: Optional[str] = "pil",
        return_dict: bool = True,
    ):
        """
        The call function to the pipeline for generation.

        Args:
            prompt (`str` or `List[str]`):
                The prompt or prompts to guide image generation. This can only be left undefined if `text_model_output`
                and `text_attention_mask` are passed.
            num_images_per_prompt (`int`, *optional*, defaults to 1):
                The number of images to generate per prompt.
            prior_num_inference_steps (`int`, *optional*, defaults to 25):
                The number of denoising steps for the prior. More denoising steps usually lead to a higher quality
                image at the expense of slower inference.
            decoder_num_inference_steps (`int`, *optional*, defaults to 25):
                The number of denoising steps for the decoder. More denoising steps usually lead to a higher quality
                image at the expense of slower inference.
            super_res_num_inference_steps (`int`, *optional*, defaults to 7):
                The number of denoising steps for super resolution. More denoising steps usually lead to a higher
                quality image at the expense of slower inference.
            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
                generation deterministic.
            prior_latents (`torch.Tensor` of shape (batch size, embeddings dimension), *optional*):
                Pre-generated noisy latents to be used as inputs for the prior.
            decoder_latents (`torch.Tensor` of shape (batch size, channels, height, width), *optional*):
                Pre-generated noisy latents to be used as inputs for the decoder.
            super_res_latents (`torch.Tensor` of shape (batch size, channels, super res height, super res width), *optional*):
                Pre-generated noisy latents to be used as inputs for the super resolution.
            prior_guidance_scale (`float`, *optional*, defaults to 4.0):
                A higher guidance scale value encourages the model to generate images closely linked to the text
                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
            decoder_guidance_scale (`float`, *optional*, defaults to 8.0):
                A higher guidance scale value encourages the model to generate images closely linked to the text
                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
            text_model_output (`CLIPTextModelOutput`, *optional*):
                Pre-defined [`CLIPTextModel`] outputs that can be derived from the text encoder. Pre-defined text
                outputs can be passed for tasks like text embedding interpolations. Make sure to also pass
                `text_attention_mask` in this case. `prompt` can then be left `None`.
            text_attention_mask (`torch.Tensor`, *optional*):
                Pre-defined CLIP text attention mask that can be derived from the tokenizer. Pre-defined text attention
                masks are necessary when passing `text_model_output`.
            output_type (`str`, *optional*, defaults to `"pil"`):
                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.

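        Example:
            A minimal sketch of a seeded, reproducible call (assuming the public
            `kakaobrain/karlo-v1-alpha` checkpoint); the guidance scales shown are the defaults:

            ```py
            >>> import torch
            >>> from diffusers import UnCLIPPipeline

            >>> pipe = UnCLIPPipeline.from_pretrained("kakaobrain/karlo-v1-alpha").to("cuda")
            >>> generator = torch.Generator(device="cuda").manual_seed(0)
            >>> image = pipe(
            ...     "a painting of a lighthouse at dawn",
            ...     generator=generator,
            ...     prior_guidance_scale=4.0,
            ...     decoder_guidance_scale=8.0,
            ... ).images[0]
            ```
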
        Returns:
            [`~pipelines.ImagePipelineOutput`] or `tuple`:
                If `return_dict` is `True`, [`~pipelines.ImagePipelineOutput`] is returned, otherwise a `tuple` is
                returned where the first element is a list with the generated images.
        """
        if prompt is not None:
            if isinstance(prompt, str):
                batch_size = 1
            elif isinstance(prompt, list):
                batch_size = len(prompt)
            else:
                raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
        else:
            batch_size = text_model_output[0].shape[0]

        device = self._execution_device

        batch_size = batch_size * num_images_per_prompt

        do_classifier_free_guidance = prior_guidance_scale > 1.0 or decoder_guidance_scale > 1.0

        prompt_embeds, text_enc_hid_states, text_mask = self._encode_prompt(
            prompt, device, num_images_per_prompt, do_classifier_free_guidance, text_model_output, text_attention_mask
        )

        # prior

        self.prior_scheduler.set_timesteps(prior_num_inference_steps, device=device)
        prior_timesteps_tensor = self.prior_scheduler.timesteps

        embedding_dim = self.prior.config.embedding_dim

        prior_latents = self.prepare_latents(
            (batch_size, embedding_dim),
            prompt_embeds.dtype,
            device,
            generator,
            prior_latents,
            self.prior_scheduler,
        )

        for i, t in enumerate(self.progress_bar(prior_timesteps_tensor)):
            # expand the latents if we are doing classifier free guidance
            latent_model_input = torch.cat([prior_latents] * 2) if do_classifier_free_guidance else prior_latents

            predicted_image_embedding = self.prior(
                latent_model_input,
                timestep=t,
                proj_embedding=prompt_embeds,
                encoder_hidden_states=text_enc_hid_states,
                attention_mask=text_mask,
            ).predicted_image_embedding

            if do_classifier_free_guidance:
                predicted_image_embedding_uncond, predicted_image_embedding_text = predicted_image_embedding.chunk(2)
                predicted_image_embedding = predicted_image_embedding_uncond + prior_guidance_scale * (
                    predicted_image_embedding_text - predicted_image_embedding_uncond
                )

            if i + 1 == prior_timesteps_tensor.shape[0]:
                prev_timestep = None
            else:
                prev_timestep = prior_timesteps_tensor[i + 1]

            prior_latents = self.prior_scheduler.step(
                predicted_image_embedding,
                timestep=t,
                sample=prior_latents,
                generator=generator,
                prev_timestep=prev_timestep,
            ).prev_sample

        prior_latents = self.prior.post_process_latents(prior_latents)

        image_embeddings = prior_latents

        # done prior

        # decoder

        text_enc_hid_states, additive_clip_time_embeddings = self.text_proj(
            image_embeddings=image_embeddings,
            prompt_embeds=prompt_embeds,
            text_encoder_hidden_states=text_enc_hid_states,
            do_classifier_free_guidance=do_classifier_free_guidance,
        )

        if device.type == "mps":
            # HACK: MPS: There is a panic when padding bool tensors,
            # so cast to int, pad, and then cast back to bool.
            text_mask = text_mask.type(torch.int)
            decoder_text_mask = F.pad(text_mask, (self.text_proj.clip_extra_context_tokens, 0), value=1)
            decoder_text_mask = decoder_text_mask.type(torch.bool)
        else:
            decoder_text_mask = F.pad(text_mask, (self.text_proj.clip_extra_context_tokens, 0), value=True)

        self.decoder_scheduler.set_timesteps(decoder_num_inference_steps, device=device)
        decoder_timesteps_tensor = self.decoder_scheduler.timesteps

        num_channels_latents = self.decoder.config.in_channels
        height = self.decoder.config.sample_size
        width = self.decoder.config.sample_size

        decoder_latents = self.prepare_latents(
            (batch_size, num_channels_latents, height, width),
            text_enc_hid_states.dtype,
            device,
            generator,
            decoder_latents,
            self.decoder_scheduler,
        )

        for i, t in enumerate(self.progress_bar(decoder_timesteps_tensor)):
            # expand the latents if we are doing classifier free guidance
            latent_model_input = torch.cat([decoder_latents] * 2) if do_classifier_free_guidance else decoder_latents

            noise_pred = self.decoder(
                sample=latent_model_input,
                timestep=t,
                encoder_hidden_states=text_enc_hid_states,
                class_labels=additive_clip_time_embeddings,
                attention_mask=decoder_text_mask,
            ).sample

            if do_classifier_free_guidance:
                noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
                noise_pred_uncond, _ = noise_pred_uncond.split(latent_model_input.shape[1], dim=1)
                noise_pred_text, predicted_variance = noise_pred_text.split(latent_model_input.shape[1], dim=1)
                noise_pred = noise_pred_uncond + decoder_guidance_scale * (noise_pred_text - noise_pred_uncond)
                noise_pred = torch.cat([noise_pred, predicted_variance], dim=1)

            if i + 1 == decoder_timesteps_tensor.shape[0]:
                prev_timestep = None
            else:
                prev_timestep = decoder_timesteps_tensor[i + 1]

            # compute the previous noisy sample x_t -> x_t-1
            decoder_latents = self.decoder_scheduler.step(
                noise_pred, t, decoder_latents, prev_timestep=prev_timestep, generator=generator
            ).prev_sample

        decoder_latents = decoder_latents.clamp(-1, 1)

        image_small = decoder_latents

        # done decoder

        # super res

        self.super_res_scheduler.set_timesteps(super_res_num_inference_steps, device=device)
        super_res_timesteps_tensor = self.super_res_scheduler.timesteps

        channels = self.super_res_first.config.in_channels // 2
        height = self.super_res_first.config.sample_size
        width = self.super_res_first.config.sample_size

        super_res_latents = self.prepare_latents(
            (batch_size, channels, height, width),
            image_small.dtype,
            device,
            generator,
            super_res_latents,
            self.super_res_scheduler,
        )

        if device.type == "mps":
            # MPS does not support many interpolations
            image_upscaled = F.interpolate(image_small, size=[height, width])
        else:
            interpolate_antialias = {}
            if "antialias" in inspect.signature(F.interpolate).parameters:
                interpolate_antialias["antialias"] = True

            image_upscaled = F.interpolate(
                image_small, size=[height, width], mode="bicubic", align_corners=False, **interpolate_antialias
            )

        for i, t in enumerate(self.progress_bar(super_res_timesteps_tensor)):
            # no classifier free guidance

            if i == super_res_timesteps_tensor.shape[0] - 1:
                unet = self.super_res_last
            else:
                unet = self.super_res_first

            latent_model_input = torch.cat([super_res_latents, image_upscaled], dim=1)

            noise_pred = unet(
                sample=latent_model_input,
                timestep=t,
            ).sample

            if i + 1 == super_res_timesteps_tensor.shape[0]:
                prev_timestep = None
            else:
                prev_timestep = super_res_timesteps_tensor[i + 1]

            # compute the previous noisy sample x_t -> x_t-1
            super_res_latents = self.super_res_scheduler.step(
                noise_pred, t, super_res_latents, prev_timestep=prev_timestep, generator=generator
            ).prev_sample

        image = super_res_latents
        # done super res

        self.maybe_free_model_hooks()

        # post processing

        image = image * 0.5 + 0.5
        image = image.clamp(0, 1)
        image = image.cpu().permute(0, 2, 3, 1).float().numpy()

        if output_type == "pil":
            image = self.numpy_to_pil(image)

        if not return_dict:
            return (image,)

        return ImagePipelineOutput(images=image)
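# A minimal sketch (not part of the module) of the precomputed-embedding path described in
# `__call__` above: `text_model_output` / `text_attention_mask` replace `prompt`, which is what
# enables tricks like text-embedding interpolation. The checkpoint name
# "kakaobrain/karlo-v1-alpha" is an assumption (the public unCLIP/Karlo release):
#
#     import torch
#     from diffusers import UnCLIPPipeline
#
#     pipe = UnCLIPPipeline.from_pretrained("kakaobrain/karlo-v1-alpha").to("cuda")
#     tokens = pipe.tokenizer(
#         ["a red panda"],
#         padding="max_length",
#         max_length=pipe.tokenizer.model_max_length,
#         truncation=True,
#         return_tensors="pt",
#     )
#     text_model_output = pipe.text_encoder(tokens.input_ids.to("cuda"))
#     image = pipe(
#         text_model_output=text_model_output,
#         text_attention_mask=tokens.attention_mask.bool().to("cuda"),
#     ).images[0]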