audio.py

import hashlib
import json
import logging
import os
import uuid
import html
import base64

from functools import lru_cache
from pydub import AudioSegment
from pydub.silence import split_on_silence
from concurrent.futures import ThreadPoolExecutor
from typing import Optional
from fnmatch import fnmatch

import aiohttp
import aiofiles
import requests
import mimetypes

from urllib.parse import urljoin, quote

from fastapi import (
    Depends,
    FastAPI,
    File,
    Form,
    HTTPException,
    Request,
    UploadFile,
    status,
    APIRouter,
)
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import FileResponse
from pydantic import BaseModel

from open_webui.utils.auth import get_admin_user, get_verified_user
from open_webui.config import (
    WHISPER_MODEL_AUTO_UPDATE,
    WHISPER_MODEL_DIR,
    CACHE_DIR,
    WHISPER_LANGUAGE,
    ELEVENLABS_API_BASE_URL,
)
from open_webui.constants import ERROR_MESSAGES
from open_webui.env import (
    ENV,
    AIOHTTP_CLIENT_SESSION_SSL,
    AIOHTTP_CLIENT_TIMEOUT,
    SRC_LOG_LEVELS,
    DEVICE_TYPE,
    ENABLE_FORWARD_USER_INFO_HEADERS,
)

router = APIRouter()

# Constants
MAX_FILE_SIZE_MB = 20
MAX_FILE_SIZE = MAX_FILE_SIZE_MB * 1024 * 1024  # Convert MB to bytes
AZURE_MAX_FILE_SIZE_MB = 200
AZURE_MAX_FILE_SIZE = AZURE_MAX_FILE_SIZE_MB * 1024 * 1024  # Convert MB to bytes

log = logging.getLogger(__name__)
log.setLevel(SRC_LOG_LEVELS["AUDIO"])

SPEECH_CACHE_DIR = CACHE_DIR / "audio" / "speech"
SPEECH_CACHE_DIR.mkdir(parents=True, exist_ok=True)


##########################################
#
# Utility functions
#
##########################################

from pydub import AudioSegment
from pydub.utils import mediainfo


def is_audio_conversion_required(file_path):
    """
    Check if the given audio file needs conversion to mp3.
    """
    SUPPORTED_FORMATS = {"flac", "m4a", "mp3", "mp4", "mpeg", "wav", "webm"}

    if not os.path.isfile(file_path):
        log.error(f"File not found: {file_path}")
        return False

    try:
        info = mediainfo(file_path)
        codec_name = info.get("codec_name", "").lower()
        codec_type = info.get("codec_type", "").lower()
        codec_tag_string = info.get("codec_tag_string", "").lower()

        if codec_name == "aac" and codec_type == "audio" and codec_tag_string == "mp4a":
            # File is AAC/mp4a audio, recommend mp3 conversion
            return True

        # If the codec name is in the supported formats
        if codec_name in SUPPORTED_FORMATS:
            return False

        return True
    except Exception as e:
        log.error(f"Error getting audio format: {e}")
        return False


def convert_audio_to_mp3(file_path):
    """Convert audio file to mp3 format."""
    try:
        output_path = os.path.splitext(file_path)[0] + ".mp3"
        audio = AudioSegment.from_file(file_path)
        audio.export(output_path, format="mp3")
        log.info(f"Converted {file_path} to {output_path}")
        return output_path
    except Exception as e:
        log.error(f"Error converting audio file: {e}")
        return None
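

# Illustrative usage (a sketch, not part of the module's control flow): normalize an
# arbitrary upload to mp3 before handing it to an engine that only accepts mp3/wav.
# The path below is a hypothetical example.
#
#   path = "/tmp/upload.ogg"
#   if is_audio_conversion_required(path):
#       path = convert_audio_to_mp3(path) or path  # fall back to the original on failure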


def set_faster_whisper_model(model: str, auto_update: bool = False):
    whisper_model = None
    if model:
        from faster_whisper import WhisperModel

        faster_whisper_kwargs = {
            "model_size_or_path": model,
            "device": DEVICE_TYPE if DEVICE_TYPE and DEVICE_TYPE == "cuda" else "cpu",
            "compute_type": "int8",
            "download_root": WHISPER_MODEL_DIR,
            "local_files_only": not auto_update,
        }

        try:
            whisper_model = WhisperModel(**faster_whisper_kwargs)
        except Exception:
            log.warning(
                "WhisperModel initialization failed, attempting download with local_files_only=False"
            )
            faster_whisper_kwargs["local_files_only"] = False
            whisper_model = WhisperModel(**faster_whisper_kwargs)

    return whisper_model


##########################################
#
# Audio API
#
##########################################


class TTSConfigForm(BaseModel):
    OPENAI_API_BASE_URL: str
    OPENAI_API_KEY: str
    OPENAI_PARAMS: Optional[dict] = None
    API_KEY: str
    ENGINE: str
    MODEL: str
    VOICE: str
    SPLIT_ON: str
    AZURE_SPEECH_REGION: str
    AZURE_SPEECH_BASE_URL: str
    AZURE_SPEECH_OUTPUT_FORMAT: str


class STTConfigForm(BaseModel):
    OPENAI_API_BASE_URL: str
    OPENAI_API_KEY: str
    ENGINE: str
    MODEL: str
    SUPPORTED_CONTENT_TYPES: list[str] = []
    WHISPER_MODEL: str
    DEEPGRAM_API_KEY: str
    AZURE_API_KEY: str
    AZURE_REGION: str
    AZURE_LOCALES: str
    AZURE_BASE_URL: str
    AZURE_MAX_SPEAKERS: str
    MISTRAL_API_KEY: str
    MISTRAL_API_BASE_URL: str
    MISTRAL_USE_CHAT_COMPLETIONS: bool


class AudioConfigUpdateForm(BaseModel):
    tts: TTSConfigForm
    stt: STTConfigForm


@router.get("/config")
async def get_audio_config(request: Request, user=Depends(get_admin_user)):
    return {
        "tts": {
            "OPENAI_API_BASE_URL": request.app.state.config.TTS_OPENAI_API_BASE_URL,
            "OPENAI_API_KEY": request.app.state.config.TTS_OPENAI_API_KEY,
            "OPENAI_PARAMS": request.app.state.config.TTS_OPENAI_PARAMS,
            "API_KEY": request.app.state.config.TTS_API_KEY,
            "ENGINE": request.app.state.config.TTS_ENGINE,
            "MODEL": request.app.state.config.TTS_MODEL,
            "VOICE": request.app.state.config.TTS_VOICE,
            "SPLIT_ON": request.app.state.config.TTS_SPLIT_ON,
            "AZURE_SPEECH_REGION": request.app.state.config.TTS_AZURE_SPEECH_REGION,
            "AZURE_SPEECH_BASE_URL": request.app.state.config.TTS_AZURE_SPEECH_BASE_URL,
            "AZURE_SPEECH_OUTPUT_FORMAT": request.app.state.config.TTS_AZURE_SPEECH_OUTPUT_FORMAT,
        },
        "stt": {
            "OPENAI_API_BASE_URL": request.app.state.config.STT_OPENAI_API_BASE_URL,
            "OPENAI_API_KEY": request.app.state.config.STT_OPENAI_API_KEY,
            "ENGINE": request.app.state.config.STT_ENGINE,
            "MODEL": request.app.state.config.STT_MODEL,
            "SUPPORTED_CONTENT_TYPES": request.app.state.config.STT_SUPPORTED_CONTENT_TYPES,
            "WHISPER_MODEL": request.app.state.config.WHISPER_MODEL,
            "DEEPGRAM_API_KEY": request.app.state.config.DEEPGRAM_API_KEY,
            "AZURE_API_KEY": request.app.state.config.AUDIO_STT_AZURE_API_KEY,
            "AZURE_REGION": request.app.state.config.AUDIO_STT_AZURE_REGION,
            "AZURE_LOCALES": request.app.state.config.AUDIO_STT_AZURE_LOCALES,
            "AZURE_BASE_URL": request.app.state.config.AUDIO_STT_AZURE_BASE_URL,
            "AZURE_MAX_SPEAKERS": request.app.state.config.AUDIO_STT_AZURE_MAX_SPEAKERS,
            "MISTRAL_API_KEY": request.app.state.config.AUDIO_STT_MISTRAL_API_KEY,
            "MISTRAL_API_BASE_URL": request.app.state.config.AUDIO_STT_MISTRAL_API_BASE_URL,
            "MISTRAL_USE_CHAT_COMPLETIONS": request.app.state.config.AUDIO_STT_MISTRAL_USE_CHAT_COMPLETIONS,
        },
    }


@router.post("/config/update")
async def update_audio_config(
    request: Request, form_data: AudioConfigUpdateForm, user=Depends(get_admin_user)
):
    request.app.state.config.TTS_OPENAI_API_BASE_URL = form_data.tts.OPENAI_API_BASE_URL
    request.app.state.config.TTS_OPENAI_API_KEY = form_data.tts.OPENAI_API_KEY
    request.app.state.config.TTS_OPENAI_PARAMS = form_data.tts.OPENAI_PARAMS
    request.app.state.config.TTS_API_KEY = form_data.tts.API_KEY
    request.app.state.config.TTS_ENGINE = form_data.tts.ENGINE
    request.app.state.config.TTS_MODEL = form_data.tts.MODEL
    request.app.state.config.TTS_VOICE = form_data.tts.VOICE
    request.app.state.config.TTS_SPLIT_ON = form_data.tts.SPLIT_ON
    request.app.state.config.TTS_AZURE_SPEECH_REGION = form_data.tts.AZURE_SPEECH_REGION
    request.app.state.config.TTS_AZURE_SPEECH_BASE_URL = (
        form_data.tts.AZURE_SPEECH_BASE_URL
    )
    request.app.state.config.TTS_AZURE_SPEECH_OUTPUT_FORMAT = (
        form_data.tts.AZURE_SPEECH_OUTPUT_FORMAT
    )

    request.app.state.config.STT_OPENAI_API_BASE_URL = form_data.stt.OPENAI_API_BASE_URL
    request.app.state.config.STT_OPENAI_API_KEY = form_data.stt.OPENAI_API_KEY
    request.app.state.config.STT_ENGINE = form_data.stt.ENGINE
    request.app.state.config.STT_MODEL = form_data.stt.MODEL
    request.app.state.config.STT_SUPPORTED_CONTENT_TYPES = (
        form_data.stt.SUPPORTED_CONTENT_TYPES
    )
    request.app.state.config.WHISPER_MODEL = form_data.stt.WHISPER_MODEL
    request.app.state.config.DEEPGRAM_API_KEY = form_data.stt.DEEPGRAM_API_KEY
    request.app.state.config.AUDIO_STT_AZURE_API_KEY = form_data.stt.AZURE_API_KEY
    request.app.state.config.AUDIO_STT_AZURE_REGION = form_data.stt.AZURE_REGION
    request.app.state.config.AUDIO_STT_AZURE_LOCALES = form_data.stt.AZURE_LOCALES
    request.app.state.config.AUDIO_STT_AZURE_BASE_URL = form_data.stt.AZURE_BASE_URL
    request.app.state.config.AUDIO_STT_AZURE_MAX_SPEAKERS = (
        form_data.stt.AZURE_MAX_SPEAKERS
    )
    request.app.state.config.AUDIO_STT_MISTRAL_API_KEY = form_data.stt.MISTRAL_API_KEY
    request.app.state.config.AUDIO_STT_MISTRAL_API_BASE_URL = (
        form_data.stt.MISTRAL_API_BASE_URL
    )
    request.app.state.config.AUDIO_STT_MISTRAL_USE_CHAT_COMPLETIONS = (
        form_data.stt.MISTRAL_USE_CHAT_COMPLETIONS
    )

    if request.app.state.config.STT_ENGINE == "":
        request.app.state.faster_whisper_model = set_faster_whisper_model(
            form_data.stt.WHISPER_MODEL, WHISPER_MODEL_AUTO_UPDATE
        )
    else:
        request.app.state.faster_whisper_model = None

    return {
        "tts": {
            "ENGINE": request.app.state.config.TTS_ENGINE,
            "MODEL": request.app.state.config.TTS_MODEL,
            "VOICE": request.app.state.config.TTS_VOICE,
            "OPENAI_API_BASE_URL": request.app.state.config.TTS_OPENAI_API_BASE_URL,
            "OPENAI_API_KEY": request.app.state.config.TTS_OPENAI_API_KEY,
            "OPENAI_PARAMS": request.app.state.config.TTS_OPENAI_PARAMS,
            "API_KEY": request.app.state.config.TTS_API_KEY,
            "SPLIT_ON": request.app.state.config.TTS_SPLIT_ON,
            "AZURE_SPEECH_REGION": request.app.state.config.TTS_AZURE_SPEECH_REGION,
            "AZURE_SPEECH_BASE_URL": request.app.state.config.TTS_AZURE_SPEECH_BASE_URL,
            "AZURE_SPEECH_OUTPUT_FORMAT": request.app.state.config.TTS_AZURE_SPEECH_OUTPUT_FORMAT,
        },
        "stt": {
            "OPENAI_API_BASE_URL": request.app.state.config.STT_OPENAI_API_BASE_URL,
            "OPENAI_API_KEY": request.app.state.config.STT_OPENAI_API_KEY,
            "ENGINE": request.app.state.config.STT_ENGINE,
            "MODEL": request.app.state.config.STT_MODEL,
            "SUPPORTED_CONTENT_TYPES": request.app.state.config.STT_SUPPORTED_CONTENT_TYPES,
            "WHISPER_MODEL": request.app.state.config.WHISPER_MODEL,
            "DEEPGRAM_API_KEY": request.app.state.config.DEEPGRAM_API_KEY,
            "AZURE_API_KEY": request.app.state.config.AUDIO_STT_AZURE_API_KEY,
            "AZURE_REGION": request.app.state.config.AUDIO_STT_AZURE_REGION,
            "AZURE_LOCALES": request.app.state.config.AUDIO_STT_AZURE_LOCALES,
            "AZURE_BASE_URL": request.app.state.config.AUDIO_STT_AZURE_BASE_URL,
            "AZURE_MAX_SPEAKERS": request.app.state.config.AUDIO_STT_AZURE_MAX_SPEAKERS,
            "MISTRAL_API_KEY": request.app.state.config.AUDIO_STT_MISTRAL_API_KEY,
            "MISTRAL_API_BASE_URL": request.app.state.config.AUDIO_STT_MISTRAL_API_BASE_URL,
            "MISTRAL_USE_CHAT_COMPLETIONS": request.app.state.config.AUDIO_STT_MISTRAL_USE_CHAT_COMPLETIONS,
        },
    }


def load_speech_pipeline(request):
    from transformers import pipeline
    from datasets import load_dataset

    if request.app.state.speech_synthesiser is None:
        request.app.state.speech_synthesiser = pipeline(
            "text-to-speech", "microsoft/speecht5_tts"
        )

    if request.app.state.speech_speaker_embeddings_dataset is None:
        request.app.state.speech_speaker_embeddings_dataset = load_dataset(
            "Matthijs/cmu-arctic-xvectors", split="validation"
        )


@router.post("/speech")
async def speech(request: Request, user=Depends(get_verified_user)):
    body = await request.body()
    name = hashlib.sha256(
        body
        + str(request.app.state.config.TTS_ENGINE).encode("utf-8")
        + str(request.app.state.config.TTS_MODEL).encode("utf-8")
    ).hexdigest()

    file_path = SPEECH_CACHE_DIR.joinpath(f"{name}.mp3")
    file_body_path = SPEECH_CACHE_DIR.joinpath(f"{name}.json")

    # Check if the file already exists in the cache
    if file_path.is_file():
        return FileResponse(file_path)

    payload = None
    try:
        payload = json.loads(body.decode("utf-8"))
    except Exception as e:
        log.exception(e)
        raise HTTPException(status_code=400, detail="Invalid JSON payload")

    r = None
    if request.app.state.config.TTS_ENGINE == "openai":
        payload["model"] = request.app.state.config.TTS_MODEL

        try:
            timeout = aiohttp.ClientTimeout(total=AIOHTTP_CLIENT_TIMEOUT)
            async with aiohttp.ClientSession(
                timeout=timeout, trust_env=True
            ) as session:
                payload = {
                    **payload,
                    **(request.app.state.config.TTS_OPENAI_PARAMS or {}),
                }

                r = await session.post(
                    url=f"{request.app.state.config.TTS_OPENAI_API_BASE_URL}/audio/speech",
                    json=payload,
                    headers={
                        "Content-Type": "application/json",
                        "Authorization": f"Bearer {request.app.state.config.TTS_OPENAI_API_KEY}",
                        **(
                            {
                                "X-OpenWebUI-User-Name": quote(user.name, safe=" "),
                                "X-OpenWebUI-User-Id": user.id,
                                "X-OpenWebUI-User-Email": user.email,
                                "X-OpenWebUI-User-Role": user.role,
                            }
                            if ENABLE_FORWARD_USER_INFO_HEADERS
                            else {}
                        ),
                    },
                    ssl=AIOHTTP_CLIENT_SESSION_SSL,
                )

                r.raise_for_status()

                async with aiofiles.open(file_path, "wb") as f:
                    await f.write(await r.read())

                async with aiofiles.open(file_body_path, "w") as f:
                    await f.write(json.dumps(payload))

            return FileResponse(file_path)
        except Exception as e:
            log.exception(e)
            detail = None
            status_code = 500
            detail = f"Open WebUI: Server Connection Error"

            if r is not None:
                status_code = r.status
                try:
                    res = await r.json()
                    if "error" in res:
                        detail = f"External: {res['error']}"
                except Exception:
                    detail = f"External: {e}"

            raise HTTPException(
                status_code=status_code,
                detail=detail,
            )

    elif request.app.state.config.TTS_ENGINE == "elevenlabs":
        voice_id = payload.get("voice", "")

        if voice_id not in get_available_voices(request):
            raise HTTPException(
                status_code=400,
                detail="Invalid voice id",
            )

        try:
            timeout = aiohttp.ClientTimeout(total=AIOHTTP_CLIENT_TIMEOUT)
            async with aiohttp.ClientSession(
                timeout=timeout, trust_env=True
            ) as session:
                async with session.post(
                    f"{ELEVENLABS_API_BASE_URL}/v1/text-to-speech/{voice_id}",
                    json={
                        "text": payload["input"],
                        "model_id": request.app.state.config.TTS_MODEL,
                        "voice_settings": {"stability": 0.5, "similarity_boost": 0.5},
                    },
                    headers={
                        "Accept": "audio/mpeg",
                        "Content-Type": "application/json",
                        "xi-api-key": request.app.state.config.TTS_API_KEY,
                    },
                    ssl=AIOHTTP_CLIENT_SESSION_SSL,
                ) as r:
                    r.raise_for_status()

                    async with aiofiles.open(file_path, "wb") as f:
                        await f.write(await r.read())

                    async with aiofiles.open(file_body_path, "w") as f:
                        await f.write(json.dumps(payload))

            return FileResponse(file_path)
        except Exception as e:
            log.exception(e)
            detail = None

            try:
                if r.status != 200:
                    res = await r.json()
                    if "error" in res:
                        detail = f"External: {res['error'].get('message', '')}"
            except Exception:
                detail = f"External: {e}"

            raise HTTPException(
                status_code=getattr(r, "status", 500) if r else 500,
                detail=detail if detail else "Open WebUI: Server Connection Error",
            )

    elif request.app.state.config.TTS_ENGINE == "azure":
        try:
            payload = json.loads(body.decode("utf-8"))
        except Exception as e:
            log.exception(e)
            raise HTTPException(status_code=400, detail="Invalid JSON payload")

        region = request.app.state.config.TTS_AZURE_SPEECH_REGION or "eastus"
        base_url = request.app.state.config.TTS_AZURE_SPEECH_BASE_URL
        language = request.app.state.config.TTS_VOICE
        locale = "-".join(request.app.state.config.TTS_VOICE.split("-")[:1])
        output_format = request.app.state.config.TTS_AZURE_SPEECH_OUTPUT_FORMAT

        try:
            data = f"""<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xml:lang="{locale}">
                <voice name="{language}">{html.escape(payload["input"])}</voice>
            </speak>"""

            timeout = aiohttp.ClientTimeout(total=AIOHTTP_CLIENT_TIMEOUT)
            async with aiohttp.ClientSession(
                timeout=timeout, trust_env=True
            ) as session:
                async with session.post(
                    (base_url or f"https://{region}.tts.speech.microsoft.com")
                    + "/cognitiveservices/v1",
                    headers={
                        "Ocp-Apim-Subscription-Key": request.app.state.config.TTS_API_KEY,
                        "Content-Type": "application/ssml+xml",
                        "X-Microsoft-OutputFormat": output_format,
                    },
                    data=data,
                    ssl=AIOHTTP_CLIENT_SESSION_SSL,
                ) as r:
                    r.raise_for_status()

                    async with aiofiles.open(file_path, "wb") as f:
                        await f.write(await r.read())

                    async with aiofiles.open(file_body_path, "w") as f:
                        await f.write(json.dumps(payload))

            return FileResponse(file_path)
        except Exception as e:
            log.exception(e)
            detail = None

            try:
                if r.status != 200:
                    res = await r.json()
                    if "error" in res:
                        detail = f"External: {res['error'].get('message', '')}"
            except Exception:
                detail = f"External: {e}"

            raise HTTPException(
                status_code=getattr(r, "status", 500) if r else 500,
                detail=detail if detail else "Open WebUI: Server Connection Error",
            )

    elif request.app.state.config.TTS_ENGINE == "transformers":
        payload = None
        try:
            payload = json.loads(body.decode("utf-8"))
        except Exception as e:
            log.exception(e)
            raise HTTPException(status_code=400, detail="Invalid JSON payload")

        import torch
        import soundfile as sf

        load_speech_pipeline(request)

        embeddings_dataset = request.app.state.speech_speaker_embeddings_dataset

        speaker_index = 6799
        try:
            speaker_index = embeddings_dataset["filename"].index(
                request.app.state.config.TTS_MODEL
            )
        except Exception:
            pass

        speaker_embedding = torch.tensor(
            embeddings_dataset[speaker_index]["xvector"]
        ).unsqueeze(0)

        speech = request.app.state.speech_synthesiser(
            payload["input"],
            forward_params={"speaker_embeddings": speaker_embedding},
        )

        sf.write(file_path, speech["audio"], samplerate=speech["sampling_rate"])

        async with aiofiles.open(file_body_path, "w") as f:
            await f.write(json.dumps(payload))

        return FileResponse(file_path)


def transcription_handler(request, file_path, metadata):
    filename = os.path.basename(file_path)
    file_dir = os.path.dirname(file_path)
    id = filename.split(".")[0]

    metadata = metadata or {}

    languages = [
        metadata.get("language", None) if not WHISPER_LANGUAGE else WHISPER_LANGUAGE,
        None,  # Always fallback to None in case transcription fails
    ]

    if request.app.state.config.STT_ENGINE == "":
        if request.app.state.faster_whisper_model is None:
            request.app.state.faster_whisper_model = set_faster_whisper_model(
                request.app.state.config.WHISPER_MODEL
            )

        model = request.app.state.faster_whisper_model
        segments, info = model.transcribe(
            file_path,
            beam_size=5,
            vad_filter=request.app.state.config.WHISPER_VAD_FILTER,
            language=languages[0],
        )
        log.info(
            "Detected language '%s' with probability %f"
            % (info.language, info.language_probability)
        )

        transcript = "".join([segment.text for segment in list(segments)])
        data = {"text": transcript.strip()}

        # save the transcript to a json file
        transcript_file = f"{file_dir}/{id}.json"
        with open(transcript_file, "w") as f:
            json.dump(data, f)

        log.debug(data)
        return data
    elif request.app.state.config.STT_ENGINE == "openai":
        r = None
        try:
            for language in languages:
                payload = {
                    "model": request.app.state.config.STT_MODEL,
                }
                if language:
                    payload["language"] = language

                r = requests.post(
                    url=f"{request.app.state.config.STT_OPENAI_API_BASE_URL}/audio/transcriptions",
                    headers={
                        "Authorization": f"Bearer {request.app.state.config.STT_OPENAI_API_KEY}"
                    },
                    files={"file": (filename, open(file_path, "rb"))},
                    data=payload,
                )
                if r.status_code == 200:
                    # Successful transcription
                    break

            r.raise_for_status()
            data = r.json()

            # save the transcript to a json file
            transcript_file = f"{file_dir}/{id}.json"
            with open(transcript_file, "w") as f:
                json.dump(data, f)

            return data
        except Exception as e:
            log.exception(e)

            detail = None
            if r is not None:
                try:
                    res = r.json()
                    if "error" in res:
                        detail = f"External: {res['error'].get('message', '')}"
                except Exception:
                    detail = f"External: {e}"

            raise Exception(detail if detail else "Open WebUI: Server Connection Error")
    elif request.app.state.config.STT_ENGINE == "deepgram":
        try:
            # Determine the MIME type of the file
            mime, _ = mimetypes.guess_type(file_path)
            if not mime:
                mime = "audio/wav"  # fallback to wav if undetectable

            # Read the audio file
            with open(file_path, "rb") as f:
                file_data = f.read()

            # Build headers and parameters
            headers = {
                "Authorization": f"Token {request.app.state.config.DEEPGRAM_API_KEY}",
                "Content-Type": mime,
            }

            for language in languages:
                params = {}
                if request.app.state.config.STT_MODEL:
                    params["model"] = request.app.state.config.STT_MODEL

                if language:
                    params["language"] = language

                # Make request to Deepgram API
                r = requests.post(
                    "https://api.deepgram.com/v1/listen?smart_format=true",
                    headers=headers,
                    params=params,
                    data=file_data,
                )
                if r.status_code == 200:
                    # Successful transcription
                    break

            r.raise_for_status()
            response_data = r.json()

            # Extract transcript from Deepgram response
            try:
                transcript = response_data["results"]["channels"][0]["alternatives"][
                    0
                ].get("transcript", "")
            except (KeyError, IndexError) as e:
                log.error(f"Malformed response from Deepgram: {str(e)}")
                raise Exception(
                    "Failed to parse Deepgram response - unexpected response format"
                )

            data = {"text": transcript.strip()}

            # Save transcript
            transcript_file = f"{file_dir}/{id}.json"
            with open(transcript_file, "w") as f:
                json.dump(data, f)

            return data
        except Exception as e:
            log.exception(e)

            detail = None
            if r is not None:
                try:
                    res = r.json()
                    if "error" in res:
                        detail = f"External: {res['error'].get('message', '')}"
                except Exception:
                    detail = f"External: {e}"

            raise Exception(detail if detail else "Open WebUI: Server Connection Error")
    elif request.app.state.config.STT_ENGINE == "azure":
        # Check file exists and size
        if not os.path.exists(file_path):
            raise HTTPException(status_code=400, detail="Audio file not found")

        # Check file size (Azure has a larger limit of 200MB)
        file_size = os.path.getsize(file_path)
        if file_size > AZURE_MAX_FILE_SIZE:
            raise HTTPException(
                status_code=400,
                detail=f"File size exceeds Azure's limit of {AZURE_MAX_FILE_SIZE_MB}MB",
            )

        api_key = request.app.state.config.AUDIO_STT_AZURE_API_KEY
        region = request.app.state.config.AUDIO_STT_AZURE_REGION or "eastus"
        locales = request.app.state.config.AUDIO_STT_AZURE_LOCALES
        base_url = request.app.state.config.AUDIO_STT_AZURE_BASE_URL
        max_speakers = request.app.state.config.AUDIO_STT_AZURE_MAX_SPEAKERS or 3

        # IF NO LOCALES, USE DEFAULTS
        if len(locales) < 2:
            locales = [
                "en-US",
                "es-ES",
                "es-MX",
                "fr-FR",
                "hi-IN",
                "it-IT",
                "de-DE",
                "en-GB",
                "en-IN",
                "ja-JP",
                "ko-KR",
                "pt-BR",
                "zh-CN",
            ]
            locales = ",".join(locales)

        if not api_key or not region:
            raise HTTPException(
                status_code=400,
                detail="Azure API key is required for Azure STT",
            )

        r = None
        try:
            # Prepare the request
            data = {
                "definition": json.dumps(
                    {
                        "locales": locales.split(","),
                        "diarization": {"maxSpeakers": max_speakers, "enabled": True},
                    }
                    if locales
                    else {}
                )
            }
            url = (
                base_url or f"https://{region}.api.cognitive.microsoft.com"
            ) + "/speechtotext/transcriptions:transcribe?api-version=2024-11-15"

            # Use context manager to ensure file is properly closed
            with open(file_path, "rb") as audio_file:
                r = requests.post(
                    url=url,
                    files={"audio": audio_file},
                    data=data,
                    headers={
                        "Ocp-Apim-Subscription-Key": api_key,
                    },
                )

            r.raise_for_status()
            response = r.json()

            # Extract transcript from response
            if not response.get("combinedPhrases"):
                raise ValueError("No transcription found in response")

            # Get the full transcript from combinedPhrases
            transcript = response["combinedPhrases"][0].get("text", "").strip()
            if not transcript:
                raise ValueError("Empty transcript in response")

            data = {"text": transcript}

            # Save transcript to json file (consistent with other providers)
            transcript_file = f"{file_dir}/{id}.json"
            with open(transcript_file, "w") as f:
                json.dump(data, f)

            log.debug(data)
            return data
        except (KeyError, IndexError, ValueError) as e:
            log.exception("Error parsing Azure response")
            raise HTTPException(
                status_code=500,
                detail=f"Failed to parse Azure response: {str(e)}",
            )
        except requests.exceptions.RequestException as e:
            log.exception(e)
            detail = None

            try:
                if r is not None and r.status_code != 200:
                    res = r.json()
                    if "error" in res:
                        detail = f"External: {res['error'].get('message', '')}"
            except Exception:
                detail = f"External: {e}"

            raise HTTPException(
                status_code=getattr(r, "status_code", 500) if r else 500,
                detail=detail if detail else "Open WebUI: Server Connection Error",
            )
    elif request.app.state.config.STT_ENGINE == "mistral":
        # Check file exists
        if not os.path.exists(file_path):
            raise HTTPException(status_code=400, detail="Audio file not found")

        # Check file size
        file_size = os.path.getsize(file_path)
        if file_size > MAX_FILE_SIZE:
            raise HTTPException(
                status_code=400,
                detail=f"File size exceeds limit of {MAX_FILE_SIZE_MB}MB",
            )

        api_key = request.app.state.config.AUDIO_STT_MISTRAL_API_KEY
        api_base_url = (
            request.app.state.config.AUDIO_STT_MISTRAL_API_BASE_URL
            or "https://api.mistral.ai/v1"
        )
        use_chat_completions = (
            request.app.state.config.AUDIO_STT_MISTRAL_USE_CHAT_COMPLETIONS
        )

        if not api_key:
            raise HTTPException(
                status_code=400,
                detail="Mistral API key is required for Mistral STT",
            )

        r = None
        try:
            # Use voxtral-mini-latest as the default model for transcription
            model = request.app.state.config.STT_MODEL or "voxtral-mini-latest"

            log.info(
                f"Mistral STT - model: {model}, "
                f"method: {'chat_completions' if use_chat_completions else 'transcriptions'}"
            )

            if use_chat_completions:
                # Use chat completions API with audio input
                # This method requires mp3 or wav format
                audio_file_to_use = file_path

                if is_audio_conversion_required(file_path):
                    log.debug("Converting audio to mp3 for chat completions API")
                    converted_path = convert_audio_to_mp3(file_path)
                    if converted_path:
                        audio_file_to_use = converted_path
                    else:
                        log.error("Audio conversion failed")
                        raise HTTPException(
                            status_code=500,
                            detail="Audio conversion failed. Chat completions API requires mp3 or wav format.",
                        )

                # Read and encode audio file as base64
                with open(audio_file_to_use, "rb") as audio_file:
                    audio_base64 = base64.b64encode(audio_file.read()).decode("utf-8")

                # Prepare chat completions request
                url = f"{api_base_url}/chat/completions"

                # Add language instruction if specified
                language = metadata.get("language", None) if metadata else None
                if language:
                    text_instruction = f"Transcribe this audio exactly as spoken in {language}. Do not translate it."
                else:
                    text_instruction = "Transcribe this audio exactly as spoken in its original language. Do not translate it to another language."

                payload = {
                    "model": model,
                    "messages": [
                        {
                            "role": "user",
                            "content": [
                                {
                                    "type": "input_audio",
                                    "input_audio": audio_base64,
                                },
                                {"type": "text", "text": text_instruction},
                            ],
                        }
                    ],
                }

                r = requests.post(
                    url=url,
                    json=payload,
                    headers={
                        "Authorization": f"Bearer {api_key}",
                        "Content-Type": "application/json",
                    },
                )
                r.raise_for_status()
                response = r.json()

                # Extract transcript from chat completion response
                transcript = (
                    response.get("choices", [{}])[0]
                    .get("message", {})
                    .get("content", "")
                    .strip()
                )

                if not transcript:
                    raise ValueError("Empty transcript in response")

                data = {"text": transcript}
            else:
                # Use dedicated transcriptions API
                url = f"{api_base_url}/audio/transcriptions"

                # Determine the MIME type
                mime_type, _ = mimetypes.guess_type(file_path)
                if not mime_type:
                    mime_type = "audio/webm"

                # Use context manager to ensure file is properly closed
                with open(file_path, "rb") as audio_file:
                    files = {"file": (filename, audio_file, mime_type)}
                    data_form = {"model": model}

                    # Add language if specified in metadata
                    language = metadata.get("language", None) if metadata else None
                    if language:
                        data_form["language"] = language

                    r = requests.post(
                        url=url,
                        files=files,
                        data=data_form,
                        headers={
                            "Authorization": f"Bearer {api_key}",
                        },
                    )

                r.raise_for_status()
                response = r.json()

                # Extract transcript from response
                transcript = response.get("text", "").strip()

                if not transcript:
                    raise ValueError("Empty transcript in response")

                data = {"text": transcript}

            # Save transcript to json file (consistent with other providers)
            transcript_file = f"{file_dir}/{id}.json"
            with open(transcript_file, "w") as f:
                json.dump(data, f)

            log.debug(data)
            return data
        except ValueError as e:
            log.exception("Error parsing Mistral response")
            raise HTTPException(
                status_code=500,
                detail=f"Failed to parse Mistral response: {str(e)}",
            )
        except requests.exceptions.RequestException as e:
            log.exception(e)
            detail = None

            try:
                if r is not None and r.status_code != 200:
                    res = r.json()
                    if "error" in res:
                        detail = f"External: {res['error'].get('message', '')}"
                    else:
                        detail = f"External: {r.text}"
            except Exception:
                detail = f"External: {e}"

            raise HTTPException(
                status_code=getattr(r, "status_code", 500) if r else 500,
                detail=detail if detail else "Open WebUI: Server Connection Error",
            )


def transcribe(request: Request, file_path: str, metadata: Optional[dict] = None):
    log.info(f"transcribe: {file_path} {metadata}")

    if is_audio_conversion_required(file_path):
        file_path = convert_audio_to_mp3(file_path)

    try:
        file_path = compress_audio(file_path)
    except Exception as e:
        log.exception(e)

    # Always produce a list of chunk paths (could be one entry if small)
    try:
        chunk_paths = split_audio(file_path, MAX_FILE_SIZE)
        log.debug(f"Chunk paths: {chunk_paths}")
    except Exception as e:
        log.exception(e)
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail=ERROR_MESSAGES.DEFAULT(e),
        )

    results = []
    try:
        with ThreadPoolExecutor() as executor:
            # Submit tasks for each chunk_path
            futures = [
                executor.submit(transcription_handler, request, chunk_path, metadata)
                for chunk_path in chunk_paths
            ]
            # Gather results as they complete
            for future in futures:
                try:
                    results.append(future.result())
                except Exception as transcribe_exc:
                    raise HTTPException(
                        status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
                        detail=f"Error transcribing chunk: {transcribe_exc}",
                    )
    finally:
        # Clean up only the temporary chunks, never the original file
        for chunk_path in chunk_paths:
            if chunk_path != file_path and os.path.isfile(chunk_path):
                try:
                    os.remove(chunk_path)
                except Exception:
                    pass

    return {
        "text": " ".join([result["text"] for result in results]),
    }


def compress_audio(file_path):
    if os.path.getsize(file_path) > MAX_FILE_SIZE:
        id = os.path.splitext(os.path.basename(file_path))[
            0
        ]  # Handles names with multiple dots
        file_dir = os.path.dirname(file_path)

        audio = AudioSegment.from_file(file_path)
        audio = audio.set_frame_rate(16000).set_channels(1)  # Compress audio

        compressed_path = os.path.join(file_dir, f"{id}_compressed.mp3")
        audio.export(compressed_path, format="mp3", bitrate="32k")
        log.debug(f"Compressed audio to {compressed_path}")

        return compressed_path
    else:
        return file_path


def split_audio(file_path, max_bytes, format="mp3", bitrate="32k"):
    """
    Splits audio into chunks not exceeding max_bytes.
    Returns a list of chunk file paths. If audio fits, returns list with original path.
    """
    file_size = os.path.getsize(file_path)
    if file_size <= max_bytes:
        return [file_path]  # Nothing to split

    audio = AudioSegment.from_file(file_path)
    duration_ms = len(audio)
    orig_size = file_size

    approx_chunk_ms = max(int(duration_ms * (max_bytes / orig_size)) - 1000, 1000)
    chunks = []
    start = 0
    i = 0

    base, _ = os.path.splitext(file_path)

    while start < duration_ms:
        end = min(start + approx_chunk_ms, duration_ms)
        chunk = audio[start:end]
        chunk_path = f"{base}_chunk_{i}.{format}"
        chunk.export(chunk_path, format=format, bitrate=bitrate)

        # Reduce chunk duration if still too large
        while os.path.getsize(chunk_path) > max_bytes and (end - start) > 5000:
            end = start + ((end - start) // 2)
            chunk = audio[start:end]
            chunk.export(chunk_path, format=format, bitrate=bitrate)

        if os.path.getsize(chunk_path) > max_bytes:
            os.remove(chunk_path)
            raise Exception("Audio chunk cannot be reduced below max file size.")

        chunks.append(chunk_path)
        start = end
        i += 1

    return chunks
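

# Illustrative usage (a sketch, not part of the request flow): pre-process a large
# recording the same way transcribe() above does before fanning chunks out to the
# STT backend. The path below is a hypothetical example.
#
#   path = compress_audio("/tmp/long_recording.wav")
#   for chunk_path in split_audio(path, MAX_FILE_SIZE):
#       ...  # each chunk is at most MAX_FILE_SIZE bytes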


@router.post("/transcriptions")
def transcription(
    request: Request,
    file: UploadFile = File(...),
    language: Optional[str] = Form(None),
    user=Depends(get_verified_user),
):
    log.info(f"file.content_type: {file.content_type}")

    stt_supported_content_types = getattr(
        request.app.state.config, "STT_SUPPORTED_CONTENT_TYPES", []
    )

    if not any(
        fnmatch(file.content_type, content_type)
        for content_type in (
            stt_supported_content_types
            if stt_supported_content_types
            and any(t.strip() for t in stt_supported_content_types)
            else ["audio/*", "video/webm"]
        )
    ):
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail=ERROR_MESSAGES.FILE_NOT_SUPPORTED,
        )

    try:
        ext = file.filename.split(".")[-1]
        id = uuid.uuid4()

        filename = f"{id}.{ext}"
        contents = file.file.read()

        file_dir = f"{CACHE_DIR}/audio/transcriptions"
        os.makedirs(file_dir, exist_ok=True)
        file_path = f"{file_dir}/{filename}"

        with open(file_path, "wb") as f:
            f.write(contents)

        try:
            metadata = None

            if language:
                metadata = {"language": language}

            result = transcribe(request, file_path, metadata)

            return {
                **result,
                "filename": os.path.basename(file_path),
            }
        except Exception as e:
            log.exception(e)
            raise HTTPException(
                status_code=status.HTTP_400_BAD_REQUEST,
                detail=ERROR_MESSAGES.DEFAULT(e),
            )
    except Exception as e:
        log.exception(e)
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail=ERROR_MESSAGES.DEFAULT(e),
        )


def get_available_models(request: Request) -> list[dict]:
    available_models = []
    if request.app.state.config.TTS_ENGINE == "openai":
        # Use custom endpoint if not using the official OpenAI API URL
        if not request.app.state.config.TTS_OPENAI_API_BASE_URL.startswith(
            "https://api.openai.com"
        ):
            try:
                response = requests.get(
                    f"{request.app.state.config.TTS_OPENAI_API_BASE_URL}/audio/models"
                )
                response.raise_for_status()
                data = response.json()
                available_models = data.get("models", [])
            except Exception as e:
                log.error(f"Error fetching models from custom endpoint: {str(e)}")
                available_models = [{"id": "tts-1"}, {"id": "tts-1-hd"}]
        else:
            available_models = [{"id": "tts-1"}, {"id": "tts-1-hd"}]
    elif request.app.state.config.TTS_ENGINE == "elevenlabs":
        try:
            response = requests.get(
                f"{ELEVENLABS_API_BASE_URL}/v1/models",
                headers={
                    "xi-api-key": request.app.state.config.TTS_API_KEY,
                    "Content-Type": "application/json",
                },
                timeout=5,
            )
            response.raise_for_status()
            models = response.json()
            available_models = [
                {"name": model["name"], "id": model["model_id"]} for model in models
            ]
        except requests.RequestException as e:
            log.error(f"Error fetching voices: {str(e)}")
    return available_models


@router.get("/models")
async def get_models(request: Request, user=Depends(get_verified_user)):
    return {"models": get_available_models(request)}


def get_available_voices(request) -> dict:
    """Returns {voice_id: voice_name} dict"""
    available_voices = {}
    if request.app.state.config.TTS_ENGINE == "openai":
        # Use custom endpoint if not using the official OpenAI API URL
        if not request.app.state.config.TTS_OPENAI_API_BASE_URL.startswith(
            "https://api.openai.com"
        ):
            try:
                response = requests.get(
                    f"{request.app.state.config.TTS_OPENAI_API_BASE_URL}/audio/voices"
                )
                response.raise_for_status()
                data = response.json()
                voices_list = data.get("voices", [])
                available_voices = {voice["id"]: voice["name"] for voice in voices_list}
            except Exception as e:
                log.error(f"Error fetching voices from custom endpoint: {str(e)}")
                available_voices = {
                    "alloy": "alloy",
                    "echo": "echo",
                    "fable": "fable",
                    "onyx": "onyx",
                    "nova": "nova",
                    "shimmer": "shimmer",
                }
        else:
            available_voices = {
                "alloy": "alloy",
                "echo": "echo",
                "fable": "fable",
                "onyx": "onyx",
                "nova": "nova",
                "shimmer": "shimmer",
            }
    elif request.app.state.config.TTS_ENGINE == "elevenlabs":
        try:
            available_voices = get_elevenlabs_voices(
                api_key=request.app.state.config.TTS_API_KEY
            )
        except Exception:
            # Avoided @lru_cache with exception
            pass
    elif request.app.state.config.TTS_ENGINE == "azure":
        try:
            region = request.app.state.config.TTS_AZURE_SPEECH_REGION
            base_url = request.app.state.config.TTS_AZURE_SPEECH_BASE_URL
            url = (
                base_url or f"https://{region}.tts.speech.microsoft.com"
            ) + "/cognitiveservices/voices/list"
            headers = {
                "Ocp-Apim-Subscription-Key": request.app.state.config.TTS_API_KEY
            }

            response = requests.get(url, headers=headers)
            response.raise_for_status()
            voices = response.json()

            for voice in voices:
                available_voices[voice["ShortName"]] = (
                    f"{voice['DisplayName']} ({voice['ShortName']})"
                )
        except requests.RequestException as e:
            log.error(f"Error fetching voices: {str(e)}")

    return available_voices


@lru_cache
def get_elevenlabs_voices(api_key: str) -> dict:
    """
    Note, set the following in your .env file to use Elevenlabs:
    AUDIO_TTS_ENGINE=elevenlabs
    AUDIO_TTS_API_KEY=sk_...  # Your Elevenlabs API key
    AUDIO_TTS_VOICE=EXAVITQu4vr4xnSDxMaL  # From https://api.elevenlabs.io/v1/voices
    AUDIO_TTS_MODEL=eleven_multilingual_v2
    """
    try:
        # TODO: Add retries
        response = requests.get(
            f"{ELEVENLABS_API_BASE_URL}/v1/voices",
            headers={
                "xi-api-key": api_key,
                "Content-Type": "application/json",
            },
        )
        response.raise_for_status()
        voices_data = response.json()

        voices = {}
        for voice in voices_data.get("voices", []):
            voices[voice["voice_id"]] = voice["name"]
    except requests.RequestException as e:
        # Avoid @lru_cache with exception
        log.error(f"Error fetching voices: {str(e)}")
        raise RuntimeError(f"Error fetching voices: {str(e)}")

    return voices


@router.get("/voices")
async def get_voices(request: Request, user=Depends(get_verified_user)):
    return {
        "voices": [
            {"id": k, "name": v} for k, v in get_available_voices(request).items()
        ]
    }