浏览代码

Merge pull request #13540 from NoMoreFood/dev

feat: Azure TTS Allow Base URL
Tim Jaeryang Baek 5 月之前
父节点
当前提交
2a4dfc02a2

+ 8 - 2
backend/open_webui/config.py

@@ -2689,7 +2689,7 @@ AUDIO_STT_AZURE_BASE_URL = PersistentConfig(
 AUDIO_STT_AZURE_MAX_SPEAKERS = PersistentConfig(
 AUDIO_STT_AZURE_MAX_SPEAKERS = PersistentConfig(
     "AUDIO_STT_AZURE_MAX_SPEAKERS",
     "AUDIO_STT_AZURE_MAX_SPEAKERS",
     "audio.stt.azure.max_speakers",
     "audio.stt.azure.max_speakers",
-    os.getenv("AUDIO_STT_AZURE_MAX_SPEAKERS", "3"),
+    os.getenv("AUDIO_STT_AZURE_MAX_SPEAKERS", ""),
 )
 )
 
 
 AUDIO_TTS_OPENAI_API_BASE_URL = PersistentConfig(
 AUDIO_TTS_OPENAI_API_BASE_URL = PersistentConfig(
@@ -2737,7 +2737,13 @@ AUDIO_TTS_SPLIT_ON = PersistentConfig(
 AUDIO_TTS_AZURE_SPEECH_REGION = PersistentConfig(
 AUDIO_TTS_AZURE_SPEECH_REGION = PersistentConfig(
     "AUDIO_TTS_AZURE_SPEECH_REGION",
     "AUDIO_TTS_AZURE_SPEECH_REGION",
     "audio.tts.azure.speech_region",
     "audio.tts.azure.speech_region",
-    os.getenv("AUDIO_TTS_AZURE_SPEECH_REGION", "eastus"),
+    os.getenv("AUDIO_TTS_AZURE_SPEECH_REGION", ""),
+)
+
+AUDIO_TTS_AZURE_SPEECH_BASE_URL = PersistentConfig(
+    "AUDIO_TTS_AZURE_SPEECH_BASE_URL",
+    "audio.tts.azure.speech_base_url",
+    os.getenv("AUDIO_TTS_AZURE_SPEECH_BASE_URL", ""),
 )
 )
 
 
 AUDIO_TTS_AZURE_SPEECH_OUTPUT_FORMAT = PersistentConfig(
 AUDIO_TTS_AZURE_SPEECH_OUTPUT_FORMAT = PersistentConfig(

+ 2 - 0
backend/open_webui/main.py

@@ -166,6 +166,7 @@ from open_webui.config import (
     AUDIO_TTS_SPLIT_ON,
     AUDIO_TTS_SPLIT_ON,
     AUDIO_TTS_VOICE,
     AUDIO_TTS_VOICE,
     AUDIO_TTS_AZURE_SPEECH_REGION,
     AUDIO_TTS_AZURE_SPEECH_REGION,
+    AUDIO_TTS_AZURE_SPEECH_BASE_URL,
     AUDIO_TTS_AZURE_SPEECH_OUTPUT_FORMAT,
     AUDIO_TTS_AZURE_SPEECH_OUTPUT_FORMAT,
     PLAYWRIGHT_WS_URL,
     PLAYWRIGHT_WS_URL,
     PLAYWRIGHT_TIMEOUT,
     PLAYWRIGHT_TIMEOUT,
@@ -852,6 +853,7 @@ app.state.config.TTS_SPLIT_ON = AUDIO_TTS_SPLIT_ON
 
 
 
 
 app.state.config.TTS_AZURE_SPEECH_REGION = AUDIO_TTS_AZURE_SPEECH_REGION
 app.state.config.TTS_AZURE_SPEECH_REGION = AUDIO_TTS_AZURE_SPEECH_REGION
+app.state.config.TTS_AZURE_SPEECH_BASE_URL = AUDIO_TTS_AZURE_SPEECH_BASE_URL
 app.state.config.TTS_AZURE_SPEECH_OUTPUT_FORMAT = AUDIO_TTS_AZURE_SPEECH_OUTPUT_FORMAT
 app.state.config.TTS_AZURE_SPEECH_OUTPUT_FORMAT = AUDIO_TTS_AZURE_SPEECH_OUTPUT_FORMAT
 
 
 
 

+ 18 - 14
backend/open_webui/routers/audio.py

@@ -138,6 +138,7 @@ class TTSConfigForm(BaseModel):
     VOICE: str
     VOICE: str
     SPLIT_ON: str
     SPLIT_ON: str
     AZURE_SPEECH_REGION: str
     AZURE_SPEECH_REGION: str
+    AZURE_SPEECH_BASE_URL: str
     AZURE_SPEECH_OUTPUT_FORMAT: str
     AZURE_SPEECH_OUTPUT_FORMAT: str
 
 
 
 
@@ -172,6 +173,7 @@ async def get_audio_config(request: Request, user=Depends(get_admin_user)):
             "VOICE": request.app.state.config.TTS_VOICE,
             "VOICE": request.app.state.config.TTS_VOICE,
             "SPLIT_ON": request.app.state.config.TTS_SPLIT_ON,
             "SPLIT_ON": request.app.state.config.TTS_SPLIT_ON,
             "AZURE_SPEECH_REGION": request.app.state.config.TTS_AZURE_SPEECH_REGION,
             "AZURE_SPEECH_REGION": request.app.state.config.TTS_AZURE_SPEECH_REGION,
+            "AZURE_SPEECH_BASE_URL": request.app.state.config.TTS_AZURE_SPEECH_BASE_URL,
             "AZURE_SPEECH_OUTPUT_FORMAT": request.app.state.config.TTS_AZURE_SPEECH_OUTPUT_FORMAT,
             "AZURE_SPEECH_OUTPUT_FORMAT": request.app.state.config.TTS_AZURE_SPEECH_OUTPUT_FORMAT,
         },
         },
         "stt": {
         "stt": {
@@ -202,6 +204,9 @@ async def update_audio_config(
     request.app.state.config.TTS_VOICE = form_data.tts.VOICE
     request.app.state.config.TTS_VOICE = form_data.tts.VOICE
     request.app.state.config.TTS_SPLIT_ON = form_data.tts.SPLIT_ON
     request.app.state.config.TTS_SPLIT_ON = form_data.tts.SPLIT_ON
     request.app.state.config.TTS_AZURE_SPEECH_REGION = form_data.tts.AZURE_SPEECH_REGION
     request.app.state.config.TTS_AZURE_SPEECH_REGION = form_data.tts.AZURE_SPEECH_REGION
+    request.app.state.config.TTS_AZURE_SPEECH_BASE_URL = (
+        form_data.tts.AZURE_SPEECH_BASE_URL
+    )
     request.app.state.config.TTS_AZURE_SPEECH_OUTPUT_FORMAT = (
     request.app.state.config.TTS_AZURE_SPEECH_OUTPUT_FORMAT = (
         form_data.tts.AZURE_SPEECH_OUTPUT_FORMAT
         form_data.tts.AZURE_SPEECH_OUTPUT_FORMAT
     )
     )
@@ -235,6 +240,7 @@ async def update_audio_config(
             "VOICE": request.app.state.config.TTS_VOICE,
             "VOICE": request.app.state.config.TTS_VOICE,
             "SPLIT_ON": request.app.state.config.TTS_SPLIT_ON,
             "SPLIT_ON": request.app.state.config.TTS_SPLIT_ON,
             "AZURE_SPEECH_REGION": request.app.state.config.TTS_AZURE_SPEECH_REGION,
             "AZURE_SPEECH_REGION": request.app.state.config.TTS_AZURE_SPEECH_REGION,
+            "AZURE_SPEECH_BASE_URL": request.app.state.config.TTS_AZURE_SPEECH_BASE_URL,
             "AZURE_SPEECH_OUTPUT_FORMAT": request.app.state.config.TTS_AZURE_SPEECH_OUTPUT_FORMAT,
             "AZURE_SPEECH_OUTPUT_FORMAT": request.app.state.config.TTS_AZURE_SPEECH_OUTPUT_FORMAT,
         },
         },
         "stt": {
         "stt": {
@@ -406,7 +412,8 @@ async def speech(request: Request, user=Depends(get_verified_user)):
             log.exception(e)
             log.exception(e)
             raise HTTPException(status_code=400, detail="Invalid JSON payload")
             raise HTTPException(status_code=400, detail="Invalid JSON payload")
 
 
-        region = request.app.state.config.TTS_AZURE_SPEECH_REGION
+        region = request.app.state.config.TTS_AZURE_SPEECH_REGION or "eastus"
+        base_url = request.app.state.config.TTS_AZURE_SPEECH_BASE_URL
         language = request.app.state.config.TTS_VOICE
         language = request.app.state.config.TTS_VOICE
         locale = "-".join(request.app.state.config.TTS_VOICE.split("-")[:1])
         locale = "-".join(request.app.state.config.TTS_VOICE.split("-")[:1])
         output_format = request.app.state.config.TTS_AZURE_SPEECH_OUTPUT_FORMAT
         output_format = request.app.state.config.TTS_AZURE_SPEECH_OUTPUT_FORMAT
@@ -420,7 +427,8 @@ async def speech(request: Request, user=Depends(get_verified_user)):
                 timeout=timeout, trust_env=True
                 timeout=timeout, trust_env=True
             ) as session:
             ) as session:
                 async with session.post(
                 async with session.post(
-                    f"https://{region}.tts.speech.microsoft.com/cognitiveservices/v1",
+                    (base_url or f"https://{region}.tts.speech.microsoft.com")
+                    + "/cognitiveservices/v1",
                     headers={
                     headers={
                         "Ocp-Apim-Subscription-Key": request.app.state.config.TTS_API_KEY,
                         "Ocp-Apim-Subscription-Key": request.app.state.config.TTS_API_KEY,
                         "Content-Type": "application/ssml+xml",
                         "Content-Type": "application/ssml+xml",
@@ -651,10 +659,10 @@ def transcribe(request: Request, file_path):
             )
             )
 
 
         api_key = request.app.state.config.AUDIO_STT_AZURE_API_KEY
         api_key = request.app.state.config.AUDIO_STT_AZURE_API_KEY
-        region = request.app.state.config.AUDIO_STT_AZURE_REGION
+        region = request.app.state.config.AUDIO_STT_AZURE_REGION or "eastus"
         locales = request.app.state.config.AUDIO_STT_AZURE_LOCALES
         locales = request.app.state.config.AUDIO_STT_AZURE_LOCALES
         base_url = request.app.state.config.AUDIO_STT_AZURE_BASE_URL
         base_url = request.app.state.config.AUDIO_STT_AZURE_BASE_URL
-        max_speakers = request.app.state.config.AUDIO_STT_AZURE_MAX_SPEAKERS
+        max_speakers = request.app.state.config.AUDIO_STT_AZURE_MAX_SPEAKERS or 3
 
 
         # IF NO LOCALES, USE DEFAULTS
         # IF NO LOCALES, USE DEFAULTS
         if len(locales) < 2:
         if len(locales) < 2:
@@ -681,12 +689,6 @@ def transcribe(request: Request, file_path):
                 detail="Azure API key is required for Azure STT",
                 detail="Azure API key is required for Azure STT",
             )
             )
 
 
-        if not base_url and not region:
-            raise HTTPException(
-                status_code=400,
-                detail="Azure region or base url is required for Azure STT",
-            )
-
         r = None
         r = None
         try:
         try:
             # Prepare the request
             # Prepare the request
@@ -702,9 +704,8 @@ def transcribe(request: Request, file_path):
             }
             }
 
 
             url = (
             url = (
-                base_url
-                or f"https://{region}.api.cognitive.microsoft.com/speechtotext/transcriptions:transcribe?api-version=2024-11-15"
-            )
+                base_url or f"https://{region}.api.cognitive.microsoft.com"
+            ) + "/speechtotext/transcriptions:transcribe?api-version=2024-11-15"
 
 
             # Use context manager to ensure file is properly closed
             # Use context manager to ensure file is properly closed
             with open(file_path, "rb") as audio_file:
             with open(file_path, "rb") as audio_file:
@@ -939,7 +940,10 @@ def get_available_voices(request) -> dict:
     elif request.app.state.config.TTS_ENGINE == "azure":
     elif request.app.state.config.TTS_ENGINE == "azure":
         try:
         try:
             region = request.app.state.config.TTS_AZURE_SPEECH_REGION
             region = request.app.state.config.TTS_AZURE_SPEECH_REGION
-            url = f"https://{region}.tts.speech.microsoft.com/cognitiveservices/voices/list"
+            base_url = request.app.state.config.TTS_AZURE_SPEECH_BASE_URL
+            url = (
+                base_url or f"https://{region}.tts.speech.microsoft.com"
+            ) + "/cognitiveservices/voices/list"
             headers = {
             headers = {
                 "Ocp-Apim-Subscription-Key": request.app.state.config.TTS_API_KEY
                 "Ocp-Apim-Subscription-Key": request.app.state.config.TTS_API_KEY
             }
             }

+ 48 - 21
src/lib/components/admin/Settings/Audio.svelte

@@ -32,6 +32,7 @@
 	let TTS_VOICE = '';
 	let TTS_VOICE = '';
 	let TTS_SPLIT_ON: TTS_RESPONSE_SPLIT = TTS_RESPONSE_SPLIT.PUNCTUATION;
 	let TTS_SPLIT_ON: TTS_RESPONSE_SPLIT = TTS_RESPONSE_SPLIT.PUNCTUATION;
 	let TTS_AZURE_SPEECH_REGION = '';
 	let TTS_AZURE_SPEECH_REGION = '';
+	let TTS_AZURE_SPEECH_BASE_URL = '';
 	let TTS_AZURE_SPEECH_OUTPUT_FORMAT = '';
 	let TTS_AZURE_SPEECH_OUTPUT_FORMAT = '';
 
 
 	let STT_OPENAI_API_BASE_URL = '';
 	let STT_OPENAI_API_BASE_URL = '';
@@ -105,6 +106,7 @@
 				VOICE: TTS_VOICE,
 				VOICE: TTS_VOICE,
 				SPLIT_ON: TTS_SPLIT_ON,
 				SPLIT_ON: TTS_SPLIT_ON,
 				AZURE_SPEECH_REGION: TTS_AZURE_SPEECH_REGION,
 				AZURE_SPEECH_REGION: TTS_AZURE_SPEECH_REGION,
+				AZURE_SPEECH_BASE_URL: TTS_AZURE_SPEECH_BASE_URL,
 				AZURE_SPEECH_OUTPUT_FORMAT: TTS_AZURE_SPEECH_OUTPUT_FORMAT
 				AZURE_SPEECH_OUTPUT_FORMAT: TTS_AZURE_SPEECH_OUTPUT_FORMAT
 			},
 			},
 			stt: {
 			stt: {
@@ -149,8 +151,9 @@
 
 
 			TTS_SPLIT_ON = res.tts.SPLIT_ON || TTS_RESPONSE_SPLIT.PUNCTUATION;
 			TTS_SPLIT_ON = res.tts.SPLIT_ON || TTS_RESPONSE_SPLIT.PUNCTUATION;
 
 
-			TTS_AZURE_SPEECH_OUTPUT_FORMAT = res.tts.AZURE_SPEECH_OUTPUT_FORMAT;
 			TTS_AZURE_SPEECH_REGION = res.tts.AZURE_SPEECH_REGION;
 			TTS_AZURE_SPEECH_REGION = res.tts.AZURE_SPEECH_REGION;
+			TTS_AZURE_SPEECH_BASE_URL = res.tts.AZURE_SPEECH_BASE_URL;
+			TTS_AZURE_SPEECH_OUTPUT_FORMAT = res.tts.AZURE_SPEECH_OUTPUT_FORMAT;
 
 
 			STT_OPENAI_API_BASE_URL = res.stt.OPENAI_API_BASE_URL;
 			STT_OPENAI_API_BASE_URL = res.stt.OPENAI_API_BASE_URL;
 			STT_OPENAI_API_KEY = res.stt.OPENAI_API_KEY;
 			STT_OPENAI_API_KEY = res.stt.OPENAI_API_KEY;
@@ -272,16 +275,23 @@
 								bind:value={STT_AZURE_API_KEY}
 								bind:value={STT_AZURE_API_KEY}
 								required
 								required
 							/>
 							/>
-							<input
-								class="flex-1 w-full rounded-lg py-2 pl-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-hidden"
-								placeholder={$i18n.t('Azure Region')}
-								bind:value={STT_AZURE_REGION}
-								required
-							/>
 						</div>
 						</div>
 
 
 						<hr class="border-gray-100 dark:border-gray-850 my-2" />
 						<hr class="border-gray-100 dark:border-gray-850 my-2" />
 
 
+						<div>
+							<div class=" mb-1.5 text-sm font-medium">{$i18n.t('Azure Region')}</div>
+							<div class="flex w-full">
+								<div class="flex-1">
+									<input
+										class="w-full rounded-lg py-2 px-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-hidden"
+										bind:value={STT_AZURE_REGION}
+										placeholder={$i18n.t('e.g., westus (leave blank for eastus)')}
+									/>
+								</div>
+							</div>
+						</div>
+
 						<div>
 						<div>
 							<div class=" mb-1.5 text-sm font-medium">{$i18n.t('Language Locales')}</div>
 							<div class=" mb-1.5 text-sm font-medium">{$i18n.t('Language Locales')}</div>
 							<div class="flex w-full">
 							<div class="flex w-full">
@@ -296,13 +306,13 @@
 						</div>
 						</div>
 
 
 						<div>
 						<div>
-							<div class=" mb-1.5 text-sm font-medium">{$i18n.t('Base URL')}</div>
+							<div class=" mb-1.5 text-sm font-medium">{$i18n.t('Endpoint URL')}</div>
 							<div class="flex w-full">
 							<div class="flex w-full">
 								<div class="flex-1">
 								<div class="flex-1">
 									<input
 									<input
 										class="w-full rounded-lg py-2 px-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-hidden"
 										class="w-full rounded-lg py-2 px-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-hidden"
 										bind:value={STT_AZURE_BASE_URL}
 										bind:value={STT_AZURE_BASE_URL}
-										placeholder={$i18n.t('(leave blank for Azure Commercial URL auto-generation)')}
+										placeholder={$i18n.t('(leave blank to use commercial endpoint)')}
 									/>
 									/>
 								</div>
 								</div>
 							</div>
 							</div>
@@ -468,18 +478,35 @@
 				{:else if TTS_ENGINE === 'azure'}
 				{:else if TTS_ENGINE === 'azure'}
 					<div>
 					<div>
 						<div class="mt-1 flex gap-2 mb-1">
 						<div class="mt-1 flex gap-2 mb-1">
-							<input
-								class="flex-1 w-full rounded-lg py-2 pl-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-hidden"
-								placeholder={$i18n.t('API Key')}
-								bind:value={TTS_API_KEY}
-								required
-							/>
-							<input
-								class="flex-1 w-full rounded-lg py-2 pl-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-hidden"
-								placeholder={$i18n.t('Azure Region')}
-								bind:value={TTS_AZURE_SPEECH_REGION}
-								required
-							/>
+							<SensitiveInput placeholder={$i18n.t('API Key')} bind:value={TTS_API_KEY} required />
+						</div>
+
+						<hr class="border-gray-100 dark:border-gray-850 my-2" />
+
+						<div>
+							<div class=" mb-1.5 text-sm font-medium">{$i18n.t('Azure Region')}</div>
+							<div class="flex w-full">
+								<div class="flex-1">
+									<input
+										class="w-full rounded-lg py-2 px-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-hidden"
+										bind:value={TTS_AZURE_SPEECH_REGION}
+										placeholder={$i18n.t('e.g., westus (leave blank for eastus)')}
+									/>
+								</div>
+							</div>
+						</div>
+
+						<div>
+							<div class=" mb-1.5 text-sm font-medium">{$i18n.t('Endpoint URL')}</div>
+							<div class="flex w-full">
+								<div class="flex-1">
+									<input
+										class="w-full rounded-lg py-2 px-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-hidden"
+										bind:value={TTS_AZURE_SPEECH_BASE_URL}
+										placeholder={$i18n.t('(leave blank to use commercial endpoint)')}
+									/>
+								</div>
+							</div>
 						</div>
 						</div>
 					</div>
 					</div>
 				{/if}
 				{/if}