
    wie                         d dl Z d dlmZmZmZmZmZmZ d dlZ	d dl
Z
d dlmc mZ d dlmZmZmZmZ ddlmZmZ ddlmZ ddlmZmZ ddlmZ d	d
lmZm Z m!Z!  ej"        e#          Z$dZ% G d de e!          Z&dS )    N)AnyCallableDictListOptionalUnion)ClapTextModelWithProjectionRobertaTokenizerRobertaTokenizerFastSpeechT5HifiGan   )AutoencoderKLUNet2DConditionModel)KarrasDiffusionSchedulers)loggingreplace_example_docstring)randn_tensor   )AudioPipelineOutputDiffusionPipelineStableDiffusionMixinaj  
    Examples:
        ```py
        >>> from diffusers import AudioLDMPipeline
        >>> import torch
        >>> import scipy

        >>> repo_id = "cvssp/audioldm-s-full-v2"
        >>> pipe = AudioLDMPipeline.from_pretrained(repo_id, torch_dtype=torch.float16)
        >>> pipe = pipe.to("cuda")

        >>> prompt = "Techno music with a strong, upbeat tempo and high melodic riffs"
        >>> audio = pipe(prompt, num_inference_steps=10, audio_length_in_s=5.0).audios[0]

        >>> # save the audio sample as a .wav file
        >>> scipy.io.wavfile.write("techno.wav", rate=16000, data=audio)
        ```
c            $           e Zd ZdZdZdededeee	f         de
dedef fd	Z	 	 	 d(deej                 deej                 fdZd Zd Zd Z	 	 	 d(dZd)dZ ej                     ee          	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d*deeee         f         dee         dededeeeee         f                  dee         ded eeej        eej                 f                  d!eej                 deej                 deej                 d"ed#ee eeej        gd
f                  d$ee         d%ee!ee"f                  d&ee         f d'                        Z# xZ$S )+AudioLDMPipelinea  
    Pipeline for text-to-audio generation using AudioLDM.

    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
    implemented for all pipelines (downloading, saving, running on a particular device, etc.).

    Args:
        vae ([`AutoencoderKL`]):
            Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
        text_encoder ([`~transformers.ClapTextModelWithProjection`]):
            Frozen text-encoder (`ClapTextModelWithProjection`, specifically the
            [laion/clap-htsat-unfused](https://huggingface.co/laion/clap-htsat-unfused) variant.
        tokenizer ([`PreTrainedTokenizer`]):
            A [`~transformers.RobertaTokenizer`] to tokenize text.
        unet ([`UNet2DConditionModel`]):
            A `UNet2DConditionModel` to denoise the encoded audio latents.
        scheduler ([`SchedulerMixin`]):
            A scheduler to be used in combination with `unet` to denoise the encoded audio latents. Can be one of
            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
        vocoder ([`~transformers.SpeechT5HifiGan`]):
            Vocoder of class `SpeechT5HifiGan`.
    ztext_encoder->unet->vaevaetext_encoder	tokenizerunet	schedulervocoderc                     t                                                       |                     ||||||           dt          | j        j        j                  dz
  z  | _        d S )N)r   r   r   r   r   r   r      )super__init__register_moduleslenr   configblock_out_channelsvae_scale_factor)selfr   r   r   r   r   r   	__class__s          ~/root/.openclaw/workspace/chatterbox_venv_py311/lib/python3.11/site-packages/diffusers/pipelines/audioldm/pipeline_audioldm.pyr#   zAudioLDMPipeline.__init__N   st     	% 	 	
 	
 	
 !"c$(/*L&M&MPQ&Q R    Nprompt_embedsnegative_prompt_embedsc                 :   |t          |t                    rd}n4|%t          |t                    rt          |          }n|j        d         }|7|                     |d| j        j        dd          }	|	j        }
|	j        }|                     |dd	          j        }|j        d
         |
j        d
         k    rrt          j
        |
|          s]| j                            |dd| j        j        dz
  d
f                   }t                              d| j        j         d|            |                     |
                    |          |                    |                    }|j        }t#          j        |d
          }|                    | j        j        |          }|j        \  }}|                    d|          }|                    ||z  |          }|r@|=|dg|z  }nt-          |          t-          |          ur0t/          dt-          |           dt-          |           d          t          |t                    r|g}n>|t          |          k    r)t1          d| dt          |           d| d| d	          |}|j        d         }|                     |d|dd          }|j                            |          }|j                            |          }|                     ||          }|j        }t#          j        |d
          }|rs|j        d         }|                    | j        j        |          }|                    d|          }|                    ||z  |          }t          j        ||g          }|S )a`  
        Encodes the prompt into text encoder hidden states.

        Args:
            prompt (`str` or `List[str]`, *optional*):
                prompt to be encoded
            device (`torch.device`):
                torch device
            num_waveforms_per_prompt (`int`):
                number of waveforms that should be generated per prompt
            do_classifier_free_guidance (`bool`):
                whether to use classifier free guidance or not
            negative_prompt (`str` or `List[str]`, *optional*):
                The prompt or prompts not to guide the audio generation. If not defined, one has to pass
                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
                less than `1`).
            prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                provided, text embeddings will be generated from `prompt` input argument.
            negative_prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
                argument.
        Nr!   r   
max_lengthTpt)paddingr0   
truncationreturn_tensorslongest)r2   r4   z\The following part of your input was truncated because CLAP can only handle sequences up to z	 tokens: )attention_mask)dim)dtypedevice z?`negative_prompt` should be the same type to `prompt`, but got z != .z`negative_prompt`: z has batch size z, but `prompt`: zT. Please make sure that passed `negative_prompt` matches the batch size of `prompt`.)
isinstancestrlistr%   shaper   model_max_length	input_idsr7   torchequalbatch_decodeloggerwarningr   totext_embedsF	normalizer9   repeatviewtype	TypeError
ValueErrorcat)r)   promptr:   num_waveforms_per_promptdo_classifier_free_guidancenegative_promptr-   r.   
batch_sizetext_inputstext_input_idsr7   untruncated_idsremoved_textbs_embedseq_lenuncond_tokensr0   uncond_inputuncond_input_idss                       r+   _encode_promptzAudioLDMPipeline._encode_promptc   sZ   D *VS"9"9JJJvt$<$<VJJ&,Q/J ..$>:# )  K )2N(7N"nnVYW[n\\fO$R(N,@,DDDU[N ND  $~::#AAAt~'F'JR'O$OP    Q7Q QBNQ Q  
 !--!!&))-0088 .  M *5MK2>>>M%((t/@/Fv(VV
 	
 &,,Q0HII%**86N+NPWXX ' &	Q+A+I&!#z 1fT/%:%:::(VZ[jVkVk ( (V( ( (   OS11 	0!0 1s?3333 3/ 3 33K_K_ 3 33 30:3 3 3   !0&,Q/J>>$%# *  L  ,588@@)8;;FCCN%)%6%6 - &7 & &" &<%G"%&[1GR%P%P%P"& 	O,215G%;%>%>TEVE\ek%>%l%l"%;%B%B1F^%_%_"%;%@%@NfAfho%p%p"
 "I'=}&MNNMr,   c                 r    d| j         j        j        z  |z  }| j                             |          j        }|S )Nr!   )r   r&   scaling_factordecodesample)r)   latentsmel_spectrograms      r+   decode_latentszAudioLDMPipeline.decode_latents   s4    dho44w>(//'229r,   c                     |                                 dk    r|                    d          }|                     |          }|                                                                }|S )N   r!   )r8   squeezer   cpufloat)r)   rf   waveforms      r+   mel_spectrogram_to_waveformz,AudioLDMPipeline.mel_spectrogram_to_waveform   s[      A%%-55a88O<<00<<>>''))r,   c                 6   dt          t          j        | j        j                  j                                                  v }i }|r||d<   dt          t          j        | j        j                  j                                                  v }|r||d<   |S )Neta	generator)setinspect	signaturer   step
parameterskeys)r)   rq   rp   accepts_etaextra_step_kwargsaccepts_generators         r+   prepare_extra_step_kwargsz*AudioLDMPipeline.prepare_extra_step_kwargs   s     s7#4T^5H#I#I#T#Y#Y#[#[\\\ 	+'*e$ (3w/@AT/U/U/`/e/e/g/g+h+hh 	7-6k*  r,   c                    || j         z  }||k     rt          d| d| d          | j        j        j        | j         z  dk    r*t          d| j        j        j         d| j          d          ||>t          |t                    r|dk    r#t          d| dt          |           d          ||t          d	| d
| d          ||t          d          |It          |t                    s4t          |t                    st          dt          |                     ||t          d| d| d          |2|2|j
        |j
        k    r$t          d|j
         d|j
         d          d S d S d S )NzH`audio_length_in_s` has to be a positive value greater than or equal to z	, but is r<   r   zwThe number of frequency bins in the vocoder's log-mel spectrogram has to be divisible by the VAE scale factor, but got z bins and a scale factor of z5`callback_steps` has to be a positive integer but is z	 of type zCannot forward both `prompt`: z and `prompt_embeds`: z2. Please make sure to only forward one of the two.zeProvide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined.z2`prompt` has to be of type `str` or `list` but is z'Cannot forward both `negative_prompt`: z and `negative_prompt_embeds`: zu`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but got: `prompt_embeds` z != `negative_prompt_embeds` )r(   rP   r   r&   model_in_dimr=   intrN   r>   r?   r@   )	r)   rR   audio_length_in_svocoder_upsample_factorcallback_stepsrU   r-   r.   min_audio_length_in_ss	            r+   check_inputszAudioLDMPipeline.check_inputs  sz    !8$:O O444+[p + +'+ + +  
 <+d.CCqHH,-1\-@-M, ,(, , ,   "&
>30O0O&SaefSfSf, , ,((, , ,  
 -";0 0 0} 0 0 0   ^ 5w   FC)@)@TZ\`IaIa`RVW]R^R^``aaa&+A+M_/ _ _*_ _ _  
 $)?)K"&<&BBB 8-:-@8 8.48 8 8   %$)K)KBBr,   c                    ||t          |          | j        z  t          | j        j        j                  | j        z  f}t          |t                    r6t          |          |k    r#t          dt          |           d| d          |t          ||||          }n|
                    |          }|| j        j        z  }|S )Nz/You have passed a list of generators of length z+, but requested an effective batch size of z@. Make sure the batch size matches the length of the generators.)rq   r:   r9   )r~   r(   r   r&   r}   r=   r?   r%   rP   r   rH   r   init_noise_sigma)	r)   rV   num_channels_latentsheightr9   r:   rq   re   r@   s	            r+   prepare_latentsz AudioLDMPipeline.prepare_latentsI  s     KK400#011T5JJ	
 i&& 	3y>>Z+G+Gi#i.. i i&i i i  
 ?"5IfTYZZZGGjj((G DN;;r,   
         @r!           TnprR   r   num_inference_stepsguidance_scalerU   rS   rp   rq   re   return_dictcallbackr   cross_attention_kwargsoutput_typec           	         t          j        | j        j        j                  | j        j        j        z  }|| j        j        j        | j        z  |z  }t          ||z            }t          || j        j        j        z            }|| j        z  dk    rXt          t          j
        || j        z                      | j        z  }t                              d| d||z   d| d           |                     ||||||
|           |t          |t                    rd}n4|%t          |t                     rt#          |          }n|
j        d         }| j        }|dk    }|                     ||||||
|	          }
| j                            ||
           | j        j        }| j        j        j        }|                     ||z  |||
j        |||	          }	|                     ||          }t#          |          || j        j        z  z
  }|                     |          5 }t=          |          D ]\  }}|rt?          j         |	gdz            n|	}| j        !                    ||          }|                     ||d|
|          j"        }|r#|#                    d          \  } }!| ||!| z
  z  z   } | j        j$        |||	fi |j%        }	|t#          |          dz
  k    s|dz   |k    r[|dz   | j        j        z  dk    rE|&                                 |/||z  dk    r&|tO          | j        dd          z  }" ||"||	           	 ddd           n# 1 swxY w Y   | (                    |	          }#| )                    |#          }$|$ddd|f         }$|dk    r|$*                                }$|s|$fS tW          |$          S )u  
        The call function to the pipeline for generation.

        Args:
            prompt (`str` or `List[str]`, *optional*):
                The prompt or prompts to guide audio generation. If not defined, you need to pass `prompt_embeds`.
            audio_length_in_s (`int`, *optional*, defaults to 5.12):
                The length of the generated audio sample in seconds.
            num_inference_steps (`int`, *optional*, defaults to 10):
                The number of denoising steps. More denoising steps usually lead to a higher quality audio at the
                expense of slower inference.
            guidance_scale (`float`, *optional*, defaults to 2.5):
                A higher guidance scale value encourages the model to generate audio that is closely linked to the text
                `prompt` at the expense of lower sound quality. Guidance scale is enabled when `guidance_scale > 1`.
            negative_prompt (`str` or `List[str]`, *optional*):
                The prompt or prompts to guide what to not include in audio generation. If not defined, you need to
                pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
            num_waveforms_per_prompt (`int`, *optional*, defaults to 1):
                The number of waveforms to generate per prompt.
            eta (`float`, *optional*, defaults to 0.0):
                Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
                to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
                generation deterministic.
            latents (`torch.Tensor`, *optional*):
                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                tensor is generated by sampling using the supplied random `generator`.
            prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
                provided, text embeddings are generated from the `prompt` input argument.
            negative_prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
                not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~pipelines.AudioPipelineOutput`] instead of a plain tuple.
            callback (`Callable`, *optional*):
                A function that calls every `callback_steps` steps during inference. The function is called with the
                following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
            callback_steps (`int`, *optional*, defaults to 1):
                The frequency at which the `callback` function is called. If not specified, the callback is called at
                every step.
            cross_attention_kwargs (`dict`, *optional*):
                A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
                [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
            output_type (`str`, *optional*, defaults to `"np"`):
                The output format of the generated image. Choose between `"np"` to return a NumPy `np.ndarray` or
                `"pt"` to return a PyTorch `torch.Tensor` object.

        Examples:

        Returns:
            [`~pipelines.AudioPipelineOutput`] or `tuple`:
                If `return_dict` is `True`, [`~pipelines.AudioPipelineOutput`] is returned, otherwise a `tuple` is
                returned where the first element is a list with the generated audio.
        Nr   zAudio length in seconds z is increased to z; so that it can be handled by the model. It will be cut to z after the denoising process.r!   g      ?)r-   r.   )r:   )totalr   )encoder_hidden_statesclass_labelsr   orderr   )audios),r   prodr   r&   upsample_ratessampling_rater   sample_sizer(   r~   ceilrF   infor   r=   r>   r?   r%   r@   _execution_devicer`   r   set_timesteps	timestepsin_channelsr   r9   r{   r   progress_bar	enumeraterC   rQ   scale_model_inputrd   chunkru   prev_sampleupdategetattrrg   rn   numpyr   )%r)   rR   r   r   r   rU   rS   rp   rq   re   r-   r.   r   r   r   r   r   r   r   original_waveform_lengthrV   r:   rT   r   r   ry   num_warmup_stepsr   itlatent_model_input
noise_prednoise_pred_uncondnoise_pred_textstep_idxrf   audios%                                        r+   __call__zAudioLDMPipeline.__call___  s   ^ #%'$,*=*L"M"MPTP\PcPq"q$ $	 0 <t?T TWn n&)@@AA#&'84<;N;\'\#]#] D))Q..$*?!?@@AADDYYFKK&+< & &vXoOo & &M^& & &   	#"	
 	
 	
 *VS"9"9JJJvt$<$<VJJ&,Q/J' '5s&:# ++$''#9 , 
 
 	$$%8$HHHN,	  $y/;&&11 
 
 !::9cJJ y>>,?$.BV,VV%899 	7\!),, 7 71A\%iUYy1}%=%=%=bi"%)^%E%EFXZ[%\%\" "YY&*.!.+A '     / l9C9I9I!9L9L6%!2^YjGj5k!kJ .$.-j!WZZHYZZf I***A9I/I/IqSTuX\XfXlNlpqNqNq '')))+N0Ba0G0G#$(K(K#K 1g66677	7 	7 	7 	7 	7 	7 	7 	7 	7 	7 	7 	7 	7 	7 	7> --g6600AAaaa22223$KKMME 	8O"%0000s   &D'MM!M)NNN)N)NNr   r   Nr!   r   NNNNTNr!   Nr   )%__name__
__module____qualname____doc__model_cpu_offload_seqr   r	   r   r
   r   r   r   r   r#   r   rC   Tensorr`   rg   rn   r{   r   r   no_gradr   EXAMPLE_DOC_STRINGr>   r   rl   r~   	Generatorboolr   r   r   r   __classcell__)r*   s   @r+   r   r   4   s        . 6SS 2S )+??@	S
 #S -S !S S S S S S6 049=I I  -I !) 6I I I IV  
  ! ! !. #8 8 8 8v   , U]__122 )--1#% #;?23MQ*.049= GK();?%)#A1 A1c49n%A1 $E?A1 !	A1
 A1 "%T#Y"78A1 #+3-A1 A1 E%/43H"HIJA1 %,'A1  -A1 !) 6A1 A1 8S#u|$<d$BCDA1 !A1  !)c3h 8!A1" c]#A1 A1 A1 32 _A1 A1 A1 A1 A1r,   r   )'rs   typingr   r   r   r   r   r   r   r   rC   torch.nn.functionalnn
functionalrJ   transformersr	   r
   r   r   modelsr   r   
schedulersr   utilsr   r   utils.torch_utilsr   pipeline_utilsr   r   r   
get_loggerr   rF   r   r    r,   r+   <module>r      sd    = = = = = = = = = = = = = = = =               m m m m m m m m m m m m 9 9 9 9 9 9 9 9 3 3 3 3 3 3 7 7 7 7 7 7 7 7 - - - - - - Y Y Y Y Y Y Y Y Y Y 
	H	%	% (n1 n1 n1 n1 n1(*> n1 n1 n1 n1 n1r,   