edge-tts/web/server.py

#!/usr/bin/env python3
"""
Edge TTS Web API Server

This server provides a REST API for the edge-tts web UI.
"""

import asyncio
import io
import logging
from typing import Optional

from fastapi import FastAPI, HTTPException, Response
from fastapi.middleware.cors import CORSMiddleware
from fastapi.staticfiles import StaticFiles
from fastapi.responses import FileResponse
from pydantic import BaseModel, Field
import uvicorn

# Import edge_tts
import edge_tts
from edge_tts import VoicesManager


# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Create FastAPI app
app = FastAPI(
    title="Edge TTS API",
    description="REST API for Microsoft Edge Text-to-Speech service",
    version="1.0.0"
)

# CORS middleware
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Global voices cache
voices_cache: Optional[list] = None


# Models
class SynthesizeRequest(BaseModel):
    text: str = Field(..., max_length=5000, description="Text to convert to speech")
    voice: str = Field(default="en-US-EmmaMultilingualNeural", description="Voice name")
    rate: str = Field(default="+0%", description="Speech rate (e.g., '+0%', '-50%', '+100%')")
    volume: str = Field(default="+0%", description="Volume (e.g., '+0%', '-50%', '+100%')")
    pitch: str = Field(default="+0Hz", description="Pitch (e.g., '+0Hz', '-500Hz', '+500Hz')")


class VoiceResponse(BaseModel):
    Name: str
    ShortName: str
    Gender: str
    Locale: str
    LocaleName: str
    LocalName: Optional[str] = None
    DisplayName: Optional[str] = None
    Status: Optional[str] = None


# API Routes
@app.get("/")
async def root():
    """Serve the main web page"""
    return FileResponse("index.html")


@app.get("/api/health")
async def health_check():
    """Health check endpoint"""
    return {"status": "healthy", "service": "edge-tts-api"}


@app.get("/api/voices")
async def get_voices():
    """
    Get list of all available voices.

    Returns a list of voice objects with their properties.
    """
    global voices_cache

    try:
        # Use cached voices if available
        if voices_cache is None:
            logger.info("Fetching voices from Edge TTS service...")
            voices_cache = await edge_tts.list_voices()
            logger.info(f"Loaded {len(voices_cache)} voices")

        return voices_cache

    except Exception as e:
        logger.error(f"Error fetching voices: {e}")
        raise HTTPException(status_code=500, detail=f"Failed to fetch voices: {str(e)}")


@app.post("/api/synthesize")
async def synthesize_speech(request: SynthesizeRequest):
    """
    Synthesize speech from text.

    Returns an MP3 audio file.
    """
    try:
        logger.info(f"Synthesizing speech: text_length={len(request.text)}, voice={request.voice}")

        # Validate text
        if not request.text.strip():
            raise HTTPException(status_code=400, detail="Text cannot be empty")

        if len(request.text) > 5000:
            raise HTTPException(status_code=400, detail="Text exceeds maximum length of 5000 characters")

        # Create Communicate instance
        communicate = edge_tts.Communicate(
            text=request.text,
            voice=request.voice,
            rate=request.rate,
            volume=request.volume,
            pitch=request.pitch
        )

        # Generate audio
        audio_data = io.BytesIO()

        async for chunk in communicate.stream():
            if chunk["type"] == "audio":
                audio_data.write(chunk["data"])

        # Check if audio was generated
        audio_data.seek(0)
        if audio_data.getbuffer().nbytes == 0:
            raise HTTPException(status_code=500, detail="No audio was generated")

        logger.info(f"Successfully generated {audio_data.getbuffer().nbytes} bytes of audio")

        # Return audio as MP3
        return Response(
            content=audio_data.getvalue(),
            media_type="audio/mpeg",
            headers={
                "Content-Disposition": "attachment; filename=speech.mp3"
            }
        )

    except edge_tts.exceptions.NoAudioReceived as e:
        logger.error(f"No audio received: {e}")
        raise HTTPException(status_code=400, detail="No audio was generated. Check your parameters.")

    except edge_tts.exceptions.UnknownResponse as e:
        logger.error(f"Unknown response from TTS service: {e}")
        raise HTTPException(status_code=502, detail="Unknown response from TTS service")

    except edge_tts.exceptions.WebSocketError as e:
        logger.error(f"WebSocket error: {e}")
        raise HTTPException(status_code=503, detail="Failed to connect to TTS service")

    except HTTPException:
        raise

    except Exception as e:
        logger.error(f"Error synthesizing speech: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=f"Failed to synthesize speech: {str(e)}")


@app.post("/api/synthesize-with-subtitles")
async def synthesize_with_subtitles(request: SynthesizeRequest):
    """
    Synthesize speech from text and generate subtitles.

    Returns JSON with audio data (base64) and SRT subtitles.
    """
    try:
        logger.info(f"Synthesizing with subtitles: text_length={len(request.text)}, voice={request.voice}")

        # Validate text
        if not request.text.strip():
            raise HTTPException(status_code=400, detail="Text cannot be empty")

        # Create Communicate instance
        communicate = edge_tts.Communicate(
            text=request.text,
            voice=request.voice,
            rate=request.rate,
            volume=request.volume,
            pitch=request.pitch
        )

        # Create subtitle maker
        submaker = edge_tts.SubMaker()

        # Generate audio and subtitles
        audio_data = io.BytesIO()

        async for chunk in communicate.stream():
            if chunk["type"] == "audio":
                audio_data.write(chunk["data"])
            elif chunk["type"] in ("WordBoundary", "SentenceBoundary"):
                submaker.feed(chunk)

        # Get subtitles
        subtitles = submaker.get_srt()

        # Return both audio and subtitles
        import base64
        audio_data.seek(0)
        audio_base64 = base64.b64encode(audio_data.read()).decode('utf-8')

        return {
            "audio": audio_base64,
            "subtitles": subtitles,
            "format": "mp3"
        }

    except Exception as e:
        logger.error(f"Error synthesizing with subtitles: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=f"Failed to synthesize: {str(e)}")


# Mount static files
app.mount("/", StaticFiles(directory=".", html=True), name="static")


def main():
    """Run the server"""
    import argparse

    parser = argparse.ArgumentParser(description="Edge TTS Web API Server")
    parser.add_argument("--host", default="0.0.0.0", help="Host to bind to")
    parser.add_argument("--port", type=int, default=8000, help="Port to bind to")
    parser.add_argument("--reload", action="store_true", help="Enable auto-reload")

    args = parser.parse_args()

    logger.info(f"Starting Edge TTS Web Server on {args.host}:{args.port}")
    logger.info(f"Visit http://localhost:{args.port} to use the web interface")

    uvicorn.run(
        "server:app",
        host=args.host,
        port=args.port,
        reload=args.reload,
        log_level="info"
    )


if __name__ == "__main__":
    main()