DeepCritical / src /services /audio_processing.py
SeasonalFall84's picture
Add TTS on-demand with UI credentials, improve UI layout, and fix References removal
b4f9ff5
Raw
History Blame Contribute Delete
4.92 kB
"""Unified audio processing service for STT and TTS integration."""
from functools import lru_cache
from typing import Any
import numpy as np
import structlog
from src.agents.audio_refiner import audio_refiner
from src.services.stt_gradio import STTService, get_stt_service
from src.utils.config import settings
logger = structlog.get_logger(__name__)
# The TTS backend is an optional dependency. When its module cannot be
# imported, the two public names are bound to None and the availability flag
# is cleared so the rest of this module can branch on it.
try:
    from src.services.tts_modal import TTSService, get_tts_service
except ImportError:
    TTSService = None  # type: ignore[assignment, misc]
    get_tts_service = None  # type: ignore[assignment, misc]
    _TTS_AVAILABLE = False
else:
    # Reached only when the import above succeeded.
    _TTS_AVAILABLE = True
class AudioService:
    """Unified audio processing service.

    Wraps a speech-to-text service (``self.stt``) and an optional
    text-to-speech service (``self.tts``). ``self.tts`` is ``None`` when no
    TTS service was injected and a default one could not be obtained; in that
    case ``generate_audio_output`` returns ``None``.
    """

    def __init__(
        self,
        stt_service: STTService | None = None,
        tts_service: Any | None = None,
    ) -> None:
        """Initialize audio service with STT and TTS.

        Args:
            stt_service: STT service instance (default: get_stt_service())
            tts_service: TTS service instance (default: get_tts_service() if available)
        """
        # Compare against None (as the TTS branch below does) so that an
        # injected service which happens to be falsy is not silently replaced.
        self.stt = stt_service if stt_service is not None else get_stt_service()

        # TTS is optional (requires Modal)
        if tts_service is not None:
            self.tts = tts_service
        elif _TTS_AVAILABLE and settings.modal_available:
            try:
                self.tts = get_tts_service()  # type: ignore[misc]
            except Exception as e:
                # Deliberate best-effort: a failing TTS backend must not
                # prevent the service (and STT) from being constructed.
                logger.warning("tts_service_unavailable", error=str(e))
                self.tts = None
        else:
            self.tts = None

    @staticmethod
    def _is_blank(text: str | None) -> bool:
        """Return True when *text* is None, empty, or whitespace only."""
        return not text or not text.strip()

    async def process_audio_input(
        self,
        audio_input: tuple[int, np.ndarray[Any, Any]] | None,  # type: ignore[type-arg]
        hf_token: str | None = None,
    ) -> str | None:
        """Process audio input and return transcribed text.

        Args:
            audio_input: Tuple of (sample_rate, audio_array) or None
            hf_token: HuggingFace token for authenticated Gradio Spaces

        Returns:
            Transcribed text string, or None if there is no audio input or
            transcription raised an exception (the error is logged).
        """
        if audio_input is None:
            return None

        try:
            transcribed_text = await self.stt.transcribe_audio(audio_input, hf_token=hf_token)
            logger.info("audio_input_processed", text_length=len(transcribed_text))
            return transcribed_text
        except Exception as e:
            logger.error("audio_input_processing_failed", error=str(e))
            # Return None on failure (graceful degradation)
            return None

    async def generate_audio_output(
        self,
        text: str,
        voice: str | None = None,
        speed: float | None = None,
    ) -> tuple[int, np.ndarray[Any, Any]] | None:  # type: ignore[type-arg]
        """Generate audio output from text.

        Args:
            text: Text to synthesize (markdown will be cleaned for audio)
            voice: Voice ID (default: settings.tts_voice)
            speed: Speech speed (default: settings.tts_speed)

        Returns:
            Tuple of (sample_rate, audio_array), or None if TTS is unavailable,
            the text is blank before or after refinement, or synthesis raised
            an exception (the error is logged).
        """
        if self.tts is None:
            logger.warning("tts_unavailable", message="TTS service not available")
            return None

        if self._is_blank(text):
            logger.warning("empty_text_for_tts")
            return None

        try:
            # Refine text for audio (remove markdown, citations, etc.)
            # Use LLM polish if enabled in settings
            refined_text = await audio_refiner.refine_for_audio(
                text, use_llm_polish=settings.tts_use_llm_polish
            )
            logger.info(
                "text_refined_for_audio",
                original_length=len(text),
                refined_length=len(refined_text),
                llm_polish_enabled=settings.tts_use_llm_polish,
            )

            # Refinement removes content, so text that passed the first guard
            # can still end up blank; skip synthesis rather than send it on.
            if self._is_blank(refined_text):
                logger.warning("empty_text_for_tts", stage="refined")
                return None

            # Use provided voice/speed or fallback to settings defaults
            voice = voice if voice else settings.tts_voice
            speed = speed if speed is not None else settings.tts_speed

            audio_output = await self.tts.synthesize_async(refined_text, voice, speed)  # type: ignore[misc]
            if audio_output:
                logger.info(
                    "audio_output_generated",
                    text_length=len(text),
                    sample_rate=audio_output[0],
                )
            return audio_output  # type: ignore[no-any-return]
        except Exception as e:
            logger.error("audio_output_generation_failed", error=str(e))
            # Return None on failure (graceful degradation)
            return None
@lru_cache(maxsize=1)
def get_audio_service() -> AudioService:
    """Return the shared AudioService, constructing it on first use.

    The single-slot ``lru_cache`` stores the result of the first successful
    call, so later calls receive the same object.

    Returns:
        The cached AudioService instance.
    """
    service = AudioService()
    return service