
import sys
import torchaudio as ta
import torch
import os
from chatterbox.tts_turbo import ChatterboxTurboTTS
from huggingface_hub import login
import tempfile

# Path to the reference audio clip used for voice cloning.
audio_prompt_path = "/root/.openclaw/media/inbound/smriti_ref_voice.wav"


def _require_text() -> str:
    """Return the text to speak from argv[1], or exit with a usage message."""
    if len(sys.argv) < 2:
        sys.stderr.write("Usage: python3.11 chatterbox_speak.py <text_to_speak>\n")
        sys.exit(1)
    return sys.argv[1]


def _login_to_hf() -> None:
    """Log in to Hugging Face using HF_TOKEN from the environment, or exit.

    This script is meant to be called by the agent, so HF_TOKEN will be
    provided via the exec tool.
    """
    token = os.environ.get("HF_TOKEN")
    if not token:
        sys.stderr.write("Error: HF_TOKEN environment variable not set. Cannot generate audio.\n")
        sys.exit(1)
    login(token=token, add_to_git_credential=False)


def _load_model():
    """Load the Chatterbox-Turbo model (explicitly on CPU), or exit on failure."""
    try:
        return ChatterboxTurboTTS.from_pretrained(device="cpu")
    except Exception as e:
        sys.stderr.write(f"Error loading Chatterbox-Turbo model: {e}\n")
        sys.exit(1)


def main() -> None:
    """Synthesize speech for argv[1] and print the output WAV path to stdout."""
    # Validate cheap preconditions first, so bad invocations fail fast
    # instead of paying for login + model load.
    text = _require_text()
    if not os.path.isfile(audio_prompt_path):
        sys.stderr.write(f"Error: reference audio clip not found: {audio_prompt_path}\n")
        sys.exit(1)

    _login_to_hf()
    model = _load_model()

    output_path = None
    try:
        # Generation requires a reference clip for voice cloning.
        wav = model.generate(text, audio_prompt_path=audio_prompt_path)

        # Reserve a temporary WAV path; the handle is closed immediately so
        # torchaudio can reopen the file by name on any platform.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
            output_path = tmp_file.name

        ta.save(output_path, wav, model.sr)
        # Print only the path to stdout for the agent to pick up.
        sys.stdout.write(output_path)
    except Exception as e:
        # Don't leave a zero-byte temp file behind if saving failed.
        if output_path is not None and os.path.exists(output_path):
            try:
                os.remove(output_path)
            except OSError:
                pass  # best-effort cleanup; the error below is what matters
        sys.stderr.write(f"Error generating audio: {e}\n")
        sys.exit(1)


if __name__ == "__main__":
    main()
