
    xid&                     V   d dl Z d dlZd dlmZ d dlmZ d dlZd dlZd dlZd dl	Z
d dlmZ d dlmZ d dlmZ ddlmZ dd	lmZ dd
lmZmZ ddlmZ ddlmZ ddlmZ ddlmZ ddl m!Z! d dl"Z" e"j#        e$          Z%dZ&de'de'fdZ(e G d d                      Z) G d d          Z*dS )    N)	dataclass)Path)	load_file)snapshot_download)AutoTokenizer   )T3)S3_SR)S3GEN_SRS3Gen)EnTokenizer)VoiceEncoder)T3Cond)T3Config)	S3GEN_SILzResembleAI/chatterbox-turbotextreturnc                     t                     dk    rdS  d                                         r% d                                          dd         z    d                                                                g d}|D ]\  }}                     ||                                d           h d}t           fd|D                       s d	z    S )
zt
        Quick cleanup func for punctuation from LLMs or
        containing chars not seen often in the dataset
    r   z)You need to add some text for me to talk.r   N )	)u   …z, ):,)u   —-)u   –r   )z ,r   )u   “")u   ”r   )u   ‘')u   ’r   >   !r   r   .?c              3   B   K   | ]}                     |          V  d S N)endswith).0pr   s     d/root/.openclaw/workspace/chatterbox_venv_py311/lib/python3.11/site-packages/chatterbox/tts_turbo.py	<genexpr>zpunc_norm.<locals>.<genexpr>>   s/      99At}}Q999999    r   )lenislowerupperjoinsplitreplacerstripany)r   punc_to_replaceold_char_sequencenew_charsentence_enderss   `    r#   	punc_normr2      s    
 4yyA~~:: Aw *Aw}}abb) 88DJJLL!!D
 
 
O (7 9 9#8||-x88 ;;sD///O999999999 Kr%   c                   R    e Zd ZU dZeed<   eed<   d ZdefdZ	e
d
d            Zd	S )ConditionalsaR  
    Conditionals for T3 and S3Gen
    - T3 conditionals:
        - speaker_emb
        - clap_emb
        - cond_prompt_speech_tokens
        - cond_prompt_speech_emb
        - emotion_adv
    - S3Gen conditionals:
        - prompt_token
        - prompt_token_len
        - prompt_feat
        - prompt_feat_len
        - embedding
    t3genc                     | j                             |          | _         | j                                        D ]7\  }}t	          j        |          r|                    |          | j        |<   8| S )Ndevice)r5   tor6   itemstorch	is_tensor)selfr9   kvs       r#   r:   zConditionals.toX   si    '**F*++HNN$$ 	2 	2DAqq!! 2dd&d11r%   fpathc                 p    t          | j        j        | j                  }t	          j        ||           d S )N)r5   r6   )dictr5   __dict__r6   r<   save)r>   rA   arg_dicts      r#   rE   zConditionals.save_   s=    w
 
 
 	
8U#####r%   cpuc                     t          |t                    rt          j        |          }t          j        ||d          } | t          di |d         |d                   S )NT)map_locationweights_onlyr5   r6    )
isinstancestrr<   r9   loadr   )clsrA   rI   kwargss       r#   rN   zConditionals.loadf   sa    lC(( 	6 <55LE4PPPs6))F4L))6%=999r%   N)rG   )__name__
__module____qualname____doc__r   __annotations__rC   r:   r   rE   classmethodrN   rK   r%   r#   r4   r4   D   s~           	JJJ	III  $$ $ $ $ $ : : : [: : :r%   r4   c                       e Zd Zdez  Zdez  Z	 ddedede	de
ded	efd
Zedd            Zedd            ZddZddZ	 	 	 	 	 	 	 	 	 ddZdS )ChatterboxTurboTTS   
   Nr5   s3genve	tokenizerr9   condsc                     t           | _        || _        || _        || _        || _        || _        || _        t          j	                    | _
        d S r   )r   srr5   r[   r\   r]   r9   r^   perthPerthImplicitWatermarkerwatermarker)r>   r5   r[   r\   r]   r9   r^   s          r#   __init__zChatterboxTurboTTS.__init__r   sL     
"
 9;;r%   r   c                    t          |          }|dv rt          j        d          }nd }t                      }|                    t          |dz                       |                    |                                           t          d          }d|_	        d|_
        d |_        d|_        d	|_        d	|_        t          |          }t          |d
z            }d|                                v r|d         d         }|                    |           |j        `|                    |                                           t'          d          }t          |dz            }	|                    |	d           |                    |                                           t)          j        |          }
|
j        |
j        |
_        t1          |
          dk    r t3          dt1          |
           d           d }|dz  }|                                r/t6                              ||                              |          } | ||||
||          S )N)rG   mpsrG   zve.safetensorsid  )text_tokens_dict_sizeGPT2_mediumi  iw  Fzt3_turbo_v1.safetensorsmodelr   T)meanflowzs3gen_meanflow.safetensors)strictzWARNING: Tokenizer len z	 != 50276zconds.pt)rI   )r^   )r   r<   r9   r   load_state_dictr   r:   evalr   llama_config_namespeech_tokens_dict_sizeinput_pos_embspeech_cond_prompt_lenuse_perceiver_resampleremotion_advr	   keystfmrwter   r   from_pretrained	pad_token	eos_tokenr&   printexistsr4   rN   )rO   ckpt_dirr9   rI   r\   hpr5   t3_stater[   weightsr]   r^   builtin_voices                r#   
from_localzChatterboxTurboTTS.from_local   sZ   >> ^## <..LLL^^
h!1122	
 	
 	
 	f E222,%)"$'!%*"VVX(AABBhmmoo%%(+H
8$$$GK
ft$$$H'CCDDD 	 	
 	
 	
 	!1(;;	&"+"5Iy>>U""EC	NNEEEFFF :-!! 	[ %%m,%OORRSYZZEs2ub)V5AAAAr%   c                 b   |dk    rgt           j        j                                        sDt           j        j                                        st          d           nt          d           d}t          t          t          j	        d          pdg d          }| 
                    ||          S )	Nrf   zUMPS not available because the current PyTorch install was not built with MPS enabled.z~MPS not available because the current MacOS version is not 12.3+ and/or you do not have an MPS-enabled device on this machine.rG   HF_TOKENT)z*.safetensorsz*.jsonz*.txtz*.ptz*.model)repo_idtokenallow_patterns)r<   backendsrf   is_availableis_builtrz   r   REPO_IDosgetenvr   )rO   r9   
local_paths      r#   rw   z"ChatterboxTurboTTS.from_pretrained   s     U??5>#5#B#B#D#D?>%..00 Xmnnnn  W  X  X  XF&)J''/4RRR	
 
 

 ~~j&111r%   c                    	 t          j        |          }|                    |          }||z
  }d|dz  z  }t          j        |          r|dk    r||z  }n)# t
          $ r}t          d|            Y d }~nd }~ww xY w|S )Ng      $@g      4@        z+Warning: Error in norm_loudness, skipping: )lnMeterintegrated_loudnessmathisfinite	Exceptionrz   )	r>   wavr`   target_lufsmeterloudnessgain_dbgain_lineares	            r#   norm_loudnessz ChatterboxTurboTTS.norm_loudness   s    	EHRLLE0055H!H,G7T>2K}[)) (kC.?.?K' 	E 	E 	ECCCDDDDDDDD	E 
s   AA 
A>"A99A>      ?Tc           
         t          j        |t                    \  }}t          |          |z  dk    s
J d            |r|                     ||          }t          j        |t          t                    }|d | j                 }| j        	                    |t          | j
                  }| j        j        j        x}r`| j        j        }	|	                    |d | j                 g|          \  }
}t#          j        |
                              | j
                  }
t#          j        | j                            |gt                              }|                    dd	
                              | j
                  }t1          ||
|t#          j        ddd          z                                | j
                  }t5          ||          | _        d S )N)r`   g      @z+Audio prompt must be longer than 5 seconds!)orig_sr	target_srr8   )max_lensample_rater   T)axiskeepdimr   )speaker_embcond_prompt_speech_tokensrs   )librosarN   r   r&   r   resampler
   DEC_COND_LENr[   	embed_refr9   r5   r}   rq   r]   forwardENC_COND_LENr<   
atleast_2dr:   
from_numpyr\   embeds_from_wavsmeanr   onesr4   r^   )r>   	wav_fpathexaggerationr   s3gen_ref_wav_srref_16k_wavs3gen_ref_dictplens3_tokzrt3_cond_prompt_tokens_ve_embedt3_conds                 r#   prepare_conditionalsz'ChatterboxTurboTTS.prepare_conditionals   s   $\)AAAs=!!C'#---/\--- 	C ..}cBBM&}hRWXXX%&8t'8&89--mXdk-ZZ 7:444 	\z+H'/'7'7EWdFWEW9X8Ycg'7'h'h$!1$)$45J$K$K$N$Nt{$[$[! #DG$<$<k]X]$<$^$^__==a=6699$+FF &;$uz!Q':'::
 
 
 "DK"
 
 	 	
 "'>::


r%   333333?r   ffffff?皙?  c                 &   |r|                      |||
           n| j        
J d            |dk    s|dk    s|dk    rt                              d           t	          |          }|                     |ddd          }|j                            | j                  }| j	        
                    | j        j	        |||	||          }||d	k              }|                    | j                  }t          j        t          t          t          g                                                              | j                  }t          j        ||g          }| j                            || j        j        d
          \  }}|                    d                                                                                                          }| j                            || j                  }t          j        |                              d          S )N)r   r   zBPlease `prepare_conditionals` first or specify `audio_prompt_path`r   zSCFG, min_p and exaggeration are not supported by Turbo version and will be ignored.ptT)return_tensorspadding
truncation)r   text_tokenstemperaturetop_ktop_prepetition_penaltyi     )speech_tokensref_dictn_cfm_timestepsr   r   )r   r^   loggerwarningr2   r]   	input_idsr:   r9   r5   inference_turbor<   tensorr   longcatr[   	inferencer6   squeezedetachrG   numpyrc   apply_watermarkr`   r   	unsqueeze)r>   r   r   min_pr   audio_prompt_pathr   
cfg_weightr   r   r   r   r   silencer   r   watermarked_wavs                    r#   generatezChatterboxTurboTTS.generate   s     	p%%&7lbo%pppp:))+o)))|c11US[[NNpqqq nnT$Y]n^^!+..t{;;//JM##1 0 
 
 &md&:;%((55,	9i@AAFFHHKKDKXX	='":;;%%'Z^ & 
 
Q
 kk!nn##%%))++1133*::3DG:TT00::1===r%   r   )r   rX   )r   )r   T)	r   r   r   Nr   r   r   r   T)rQ   rR   rS   r
   r   r   r   r	   r   r   r   rM   r4   rd   rV   r   rw   r   r   r   rK   r%   r#   rX   rX   n   s       :L=L #< << < 	<
 < < < < < <$ 2B 2B 2B [2Bh 2 2 2 [2$   ; ; ; ;D 0> 0> 0> 0> 0> 0>r%   rX   )+r   r   dataclassesr   pathlibr   r   r<   ra   
pyloudnormr   safetensors.torchr   huggingface_hubr   transformersr   	models.t3r	   models.s3tokenizerr
   models.s3genr   r   models.tokenizersr   models.voice_encoderr   models.t3.modules.cond_encr   models.t3.modules.t3_configr   models.s3gen.constr   logging	getLoggerrQ   r   r   rM   r2   r4   rX   rK   r%   r#   <module>r      s   				  ! ! ! ! ! !              ' ' ' ' ' ' - - - - - - & & & & & &       % % % % % % ) ) ) ) ) ) ) ) * * * * * * . . . . . . . . . . . . 1 1 1 1 1 1 ) ) ) ) ) ) 		8	$	$
'$C $C $ $ $ $N &: &: &: &: &: &: &: &:Rz> z> z> z> z> z> z> z> z> z>r%   