257 lines
7.4 KiB
Python
Executable File
257 lines
7.4 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""
|
|
Edge TTS Web API Server
|
|
|
|
This server provides a REST API for the edge-tts web UI.
|
|
"""
|
|
|
|
import asyncio
|
|
import io
|
|
import logging
|
|
from typing import Optional
|
|
|
|
from fastapi import FastAPI, HTTPException, Response
|
|
from fastapi.middleware.cors import CORSMiddleware
|
|
from fastapi.staticfiles import StaticFiles
|
|
from fastapi.responses import FileResponse
|
|
from pydantic import BaseModel, Field
|
|
import uvicorn
|
|
|
|
# Import edge_tts
|
|
import edge_tts
|
|
from edge_tts import VoicesManager
|
|
|
|
|
|
# Configure logging
|
|
logging.basicConfig(level=logging.INFO)
|
|
logger = logging.getLogger(__name__)
|
|
|
|
# Create FastAPI app
|
|
app = FastAPI(
|
|
title="Edge TTS API",
|
|
description="REST API for Microsoft Edge Text-to-Speech service",
|
|
version="1.0.0"
|
|
)
|
|
|
|
# CORS middleware
|
|
app.add_middleware(
|
|
CORSMiddleware,
|
|
allow_origins=["*"],
|
|
allow_credentials=True,
|
|
allow_methods=["*"],
|
|
allow_headers=["*"],
|
|
)
|
|
|
|
# Global voices cache
|
|
voices_cache: Optional[list] = None
|
|
|
|
|
|
# Models
|
|
class SynthesizeRequest(BaseModel):
|
|
text: str = Field(..., max_length=5000, description="Text to convert to speech")
|
|
voice: str = Field(default="en-US-EmmaMultilingualNeural", description="Voice name")
|
|
rate: str = Field(default="+0%", description="Speech rate (e.g., '+0%', '-50%', '+100%')")
|
|
volume: str = Field(default="+0%", description="Volume (e.g., '+0%', '-50%', '+100%')")
|
|
pitch: str = Field(default="+0Hz", description="Pitch (e.g., '+0Hz', '-500Hz', '+500Hz')")
|
|
|
|
|
|
class VoiceResponse(BaseModel):
|
|
Name: str
|
|
ShortName: str
|
|
Gender: str
|
|
Locale: str
|
|
LocaleName: str
|
|
LocalName: Optional[str] = None
|
|
DisplayName: Optional[str] = None
|
|
Status: Optional[str] = None
|
|
|
|
|
|
# API Routes
|
|
@app.get("/")
|
|
async def root():
|
|
"""Serve the main web page"""
|
|
return FileResponse("index.html")
|
|
|
|
|
|
@app.get("/api/health")
|
|
async def health_check():
|
|
"""Health check endpoint"""
|
|
return {"status": "healthy", "service": "edge-tts-api"}
|
|
|
|
|
|
@app.get("/api/voices")
|
|
async def get_voices():
|
|
"""
|
|
Get list of all available voices.
|
|
|
|
Returns a list of voice objects with their properties.
|
|
"""
|
|
global voices_cache
|
|
|
|
try:
|
|
# Use cached voices if available
|
|
if voices_cache is None:
|
|
logger.info("Fetching voices from Edge TTS service...")
|
|
voices_cache = await edge_tts.list_voices()
|
|
logger.info(f"Loaded {len(voices_cache)} voices")
|
|
|
|
return voices_cache
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error fetching voices: {e}")
|
|
raise HTTPException(status_code=500, detail=f"Failed to fetch voices: {str(e)}")
|
|
|
|
|
|
@app.post("/api/synthesize")
|
|
async def synthesize_speech(request: SynthesizeRequest):
|
|
"""
|
|
Synthesize speech from text.
|
|
|
|
Returns an MP3 audio file.
|
|
"""
|
|
try:
|
|
logger.info(f"Synthesizing speech: text_length={len(request.text)}, voice={request.voice}")
|
|
|
|
# Validate text
|
|
if not request.text.strip():
|
|
raise HTTPException(status_code=400, detail="Text cannot be empty")
|
|
|
|
if len(request.text) > 5000:
|
|
raise HTTPException(status_code=400, detail="Text exceeds maximum length of 5000 characters")
|
|
|
|
# Create Communicate instance
|
|
communicate = edge_tts.Communicate(
|
|
text=request.text,
|
|
voice=request.voice,
|
|
rate=request.rate,
|
|
volume=request.volume,
|
|
pitch=request.pitch
|
|
)
|
|
|
|
# Generate audio
|
|
audio_data = io.BytesIO()
|
|
|
|
async for chunk in communicate.stream():
|
|
if chunk["type"] == "audio":
|
|
audio_data.write(chunk["data"])
|
|
|
|
# Check if audio was generated
|
|
audio_data.seek(0)
|
|
if audio_data.getbuffer().nbytes == 0:
|
|
raise HTTPException(status_code=500, detail="No audio was generated")
|
|
|
|
logger.info(f"Successfully generated {audio_data.getbuffer().nbytes} bytes of audio")
|
|
|
|
# Return audio as MP3
|
|
return Response(
|
|
content=audio_data.getvalue(),
|
|
media_type="audio/mpeg",
|
|
headers={
|
|
"Content-Disposition": "attachment; filename=speech.mp3"
|
|
}
|
|
)
|
|
|
|
except edge_tts.exceptions.NoAudioReceived as e:
|
|
logger.error(f"No audio received: {e}")
|
|
raise HTTPException(status_code=400, detail="No audio was generated. Check your parameters.")
|
|
|
|
except edge_tts.exceptions.UnknownResponse as e:
|
|
logger.error(f"Unknown response from TTS service: {e}")
|
|
raise HTTPException(status_code=502, detail="Unknown response from TTS service")
|
|
|
|
except edge_tts.exceptions.WebSocketError as e:
|
|
logger.error(f"WebSocket error: {e}")
|
|
raise HTTPException(status_code=503, detail="Failed to connect to TTS service")
|
|
|
|
except HTTPException:
|
|
raise
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error synthesizing speech: {e}", exc_info=True)
|
|
raise HTTPException(status_code=500, detail=f"Failed to synthesize speech: {str(e)}")
|
|
|
|
|
|
@app.post("/api/synthesize-with-subtitles")
|
|
async def synthesize_with_subtitles(request: SynthesizeRequest):
|
|
"""
|
|
Synthesize speech from text and generate subtitles.
|
|
|
|
Returns JSON with audio data (base64) and SRT subtitles.
|
|
"""
|
|
try:
|
|
logger.info(f"Synthesizing with subtitles: text_length={len(request.text)}, voice={request.voice}")
|
|
|
|
# Validate text
|
|
if not request.text.strip():
|
|
raise HTTPException(status_code=400, detail="Text cannot be empty")
|
|
|
|
# Create Communicate instance
|
|
communicate = edge_tts.Communicate(
|
|
text=request.text,
|
|
voice=request.voice,
|
|
rate=request.rate,
|
|
volume=request.volume,
|
|
pitch=request.pitch
|
|
)
|
|
|
|
# Create subtitle maker
|
|
submaker = edge_tts.SubMaker()
|
|
|
|
# Generate audio and subtitles
|
|
audio_data = io.BytesIO()
|
|
|
|
async for chunk in communicate.stream():
|
|
if chunk["type"] == "audio":
|
|
audio_data.write(chunk["data"])
|
|
elif chunk["type"] in ("WordBoundary", "SentenceBoundary"):
|
|
submaker.feed(chunk)
|
|
|
|
# Get subtitles
|
|
subtitles = submaker.get_srt()
|
|
|
|
# Return both audio and subtitles
|
|
import base64
|
|
audio_data.seek(0)
|
|
audio_base64 = base64.b64encode(audio_data.read()).decode('utf-8')
|
|
|
|
return {
|
|
"audio": audio_base64,
|
|
"subtitles": subtitles,
|
|
"format": "mp3"
|
|
}
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error synthesizing with subtitles: {e}", exc_info=True)
|
|
raise HTTPException(status_code=500, detail=f"Failed to synthesize: {str(e)}")
|
|
|
|
|
|
# Mount static files
|
|
app.mount("/", StaticFiles(directory=".", html=True), name="static")
|
|
|
|
|
|
def main():
|
|
"""Run the server"""
|
|
import argparse
|
|
|
|
parser = argparse.ArgumentParser(description="Edge TTS Web API Server")
|
|
parser.add_argument("--host", default="0.0.0.0", help="Host to bind to")
|
|
parser.add_argument("--port", type=int, default=8000, help="Port to bind to")
|
|
parser.add_argument("--reload", action="store_true", help="Enable auto-reload")
|
|
|
|
args = parser.parse_args()
|
|
|
|
logger.info(f"Starting Edge TTS Web Server on {args.host}:{args.port}")
|
|
logger.info(f"Visit http://localhost:{args.port} to use the web interface")
|
|
|
|
uvicorn.run(
|
|
"server:app",
|
|
host=args.host,
|
|
port=args.port,
|
|
reload=args.reload,
|
|
log_level="info"
|
|
)
|
|
|
|
|
|
if __name__ == "__main__":
|
|
main()
|