Kaynağa Gözat

Azure STT Allow Base URL & Max Speaker Setting

Bryan Berns 3 ay önce
ebeveyn
işleme
6c8a9d000e

+ 12 - 0
backend/open_webui/config.py

@@ -2650,6 +2650,18 @@ AUDIO_STT_AZURE_LOCALES = PersistentConfig(
     os.getenv("AUDIO_STT_AZURE_LOCALES", ""),
 )
 
+AUDIO_STT_AZURE_BASE_URL = PersistentConfig(
+    "AUDIO_STT_AZURE_BASE_URL",
+    "audio.stt.azure.base_url",
+    os.getenv("AUDIO_STT_AZURE_BASE_URL", ""),
+)
+
+AUDIO_STT_AZURE_MAX_SPEAKERS = PersistentConfig(
+    "AUDIO_STT_AZURE_MAX_SPEAKERS",
+    "audio.stt.azure.max_speakers",
+    os.getenv("AUDIO_STT_AZURE_MAX_SPEAKERS", "3"),
+)
+
 AUDIO_TTS_OPENAI_API_BASE_URL = PersistentConfig(
     "AUDIO_TTS_OPENAI_API_BASE_URL",
     "audio.tts.openai.api_base_url",

+ 4 - 0
backend/open_webui/main.py

@@ -155,6 +155,8 @@ from open_webui.config import (
     AUDIO_STT_AZURE_API_KEY,
     AUDIO_STT_AZURE_REGION,
     AUDIO_STT_AZURE_LOCALES,
+    AUDIO_STT_AZURE_BASE_URL,
+    AUDIO_STT_AZURE_MAX_SPEAKERS,    
     AUDIO_TTS_API_KEY,
     AUDIO_TTS_ENGINE,
     AUDIO_TTS_MODEL,
@@ -822,6 +824,8 @@ app.state.config.DEEPGRAM_API_KEY = DEEPGRAM_API_KEY
 app.state.config.AUDIO_STT_AZURE_API_KEY = AUDIO_STT_AZURE_API_KEY
 app.state.config.AUDIO_STT_AZURE_REGION = AUDIO_STT_AZURE_REGION
 app.state.config.AUDIO_STT_AZURE_LOCALES = AUDIO_STT_AZURE_LOCALES
+app.state.config.AUDIO_STT_AZURE_BASE_URL = AUDIO_STT_AZURE_BASE_URL
+app.state.config.AUDIO_STT_AZURE_MAX_SPEAKERS = AUDIO_STT_AZURE_MAX_SPEAKERS
 
 app.state.config.TTS_OPENAI_API_BASE_URL = AUDIO_TTS_OPENAI_API_BASE_URL
 app.state.config.TTS_OPENAI_API_KEY = AUDIO_TTS_OPENAI_API_KEY

+ 20 - 4
backend/open_webui/routers/audio.py

@@ -150,7 +150,8 @@ class STTConfigForm(BaseModel):
     AZURE_API_KEY: str
     AZURE_REGION: str
     AZURE_LOCALES: str
-
+    AZURE_BASE_URL: str
+    AZURE_MAX_SPEAKERS: str
 
 class AudioConfigUpdateForm(BaseModel):
     tts: TTSConfigForm
@@ -181,6 +182,8 @@ async def get_audio_config(request: Request, user=Depends(get_admin_user)):
             "AZURE_API_KEY": request.app.state.config.AUDIO_STT_AZURE_API_KEY,
             "AZURE_REGION": request.app.state.config.AUDIO_STT_AZURE_REGION,
             "AZURE_LOCALES": request.app.state.config.AUDIO_STT_AZURE_LOCALES,
+            "AZURE_BASE_URL": request.app.state.config.AUDIO_STT_AZURE_BASE_URL,
+            "AZURE_MAX_SPEAKERS": request.app.state.config.AUDIO_STT_AZURE_MAX_SPEAKERS,            
         },
     }
 
@@ -210,6 +213,8 @@ async def update_audio_config(
     request.app.state.config.AUDIO_STT_AZURE_API_KEY = form_data.stt.AZURE_API_KEY
     request.app.state.config.AUDIO_STT_AZURE_REGION = form_data.stt.AZURE_REGION
     request.app.state.config.AUDIO_STT_AZURE_LOCALES = form_data.stt.AZURE_LOCALES
+    request.app.state.config.AUDIO_STT_AZURE_BASE_URL = form_data.stt.AZURE_BASE_URL
+    request.app.state.config.AUDIO_STT_AZURE_MAX_SPEAKERS = form_data.stt.AZURE_MAX_SPEAKERS
 
     if request.app.state.config.STT_ENGINE == "":
         request.app.state.faster_whisper_model = set_faster_whisper_model(
@@ -238,6 +243,8 @@ async def update_audio_config(
             "AZURE_API_KEY": request.app.state.config.AUDIO_STT_AZURE_API_KEY,
             "AZURE_REGION": request.app.state.config.AUDIO_STT_AZURE_REGION,
             "AZURE_LOCALES": request.app.state.config.AUDIO_STT_AZURE_LOCALES,
+            "AZURE_BASE_URL": request.app.state.config.AUDIO_STT_AZURE_BASE_URL,
+            "AZURE_MAX_SPEAKERS": request.app.state.config.AUDIO_STT_AZURE_MAX_SPEAKERS,    
         },
     }
 
@@ -641,6 +648,8 @@ def transcribe(request: Request, file_path):
         api_key = request.app.state.config.AUDIO_STT_AZURE_API_KEY
         region = request.app.state.config.AUDIO_STT_AZURE_REGION
         locales = request.app.state.config.AUDIO_STT_AZURE_LOCALES
+        base_url = request.app.state.config.AUDIO_STT_AZURE_BASE_URL
+        max_speakers = request.app.state.config.AUDIO_STT_AZURE_MAX_SPEAKERS
 
         # IF NO LOCALES, USE DEFAULTS
         if len(locales) < 2:
@@ -664,7 +673,13 @@ def transcribe(request: Request, file_path):
         if not api_key or not region:
             raise HTTPException(
                 status_code=400,
-                detail="Azure API key and region are required for Azure STT",
+                detail="Azure API key is required for Azure STT",
+            )
+
+        if not base_url and not region:
+            raise HTTPException(
+                status_code=400,
+                detail="Azure region or base url is required for Azure STT",
             )
 
         r = None
@@ -674,13 +689,14 @@ def transcribe(request: Request, file_path):
                 "definition": json.dumps(
                     {
                         "locales": locales.split(","),
-                        "diarization": {"maxSpeakers": 3, "enabled": True},
+                        "diarization": {"maxSpeakers": max_speakers, "enabled": True},
                     }
                     if locales
                     else {}
                 )
             }
-            url = f"https://{region}.api.cognitive.microsoft.com/speechtotext/transcriptions:transcribe?api-version=2024-11-15"
+
+            url = base_url or f"https://{region}.api.cognitive.microsoft.com/speechtotext/transcriptions:transcribe?api-version=2024-11-15"
 
             # Use context manager to ensure file is properly closed
             with open(file_path, "rb") as audio_file:

+ 33 - 1
src/lib/components/admin/Settings/Audio.svelte

@@ -42,6 +42,8 @@
 	let STT_AZURE_API_KEY = '';
 	let STT_AZURE_REGION = '';
 	let STT_AZURE_LOCALES = '';
+	let STT_AZURE_BASE_URL = '';
+	let STT_AZURE_MAX_SPEAKERS = '';    
 	let STT_DEEPGRAM_API_KEY = '';
 
 	let STT_WHISPER_MODEL_LOADING = false;
@@ -114,7 +116,9 @@
 				DEEPGRAM_API_KEY: STT_DEEPGRAM_API_KEY,
 				AZURE_API_KEY: STT_AZURE_API_KEY,
 				AZURE_REGION: STT_AZURE_REGION,
-				AZURE_LOCALES: STT_AZURE_LOCALES
+				AZURE_LOCALES: STT_AZURE_LOCALES,
+				AZURE_BASE_URL: STT_AZURE_BASE_URL,
+				AZURE_MAX_SPEAKERS: STT_AZURE_MAX_SPEAKERS
 			}
 		});
 
@@ -157,6 +161,8 @@
 			STT_AZURE_API_KEY = res.stt.AZURE_API_KEY;
 			STT_AZURE_REGION = res.stt.AZURE_REGION;
 			STT_AZURE_LOCALES = res.stt.AZURE_LOCALES;
+			STT_AZURE_BASE_URL = res.stt.AZURE_BASE_URL;
+			STT_AZURE_MAX_SPEAKERS = res.stt.AZURE_MAX_SPEAKERS;
 			STT_DEEPGRAM_API_KEY = res.stt.DEEPGRAM_API_KEY;
 		}
 
@@ -287,6 +293,32 @@
 									/>
 								</div>
 							</div>
+						</div> 
+                        
+                        <div>
+							<div class=" mb-1.5 text-sm font-medium">{$i18n.t('Base URL')}</div>
+							<div class="flex w-full">
+								<div class="flex-1">
+									<input
+										class="w-full rounded-lg py-2 px-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-hidden"
+										bind:value={STT_AZURE_BASE_URL}
+										placeholder={$i18n.t('(leave blank for Azure Commercial URL auto-generation)')}
+									/>
+								</div>
+							</div>
+						</div>
+                        
+                        <div>
+							<div class=" mb-1.5 text-sm font-medium">{$i18n.t('Max Speakers')}</div>
+							<div class="flex w-full">
+								<div class="flex-1">
+									<input
+										class="w-full rounded-lg py-2 px-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-hidden"
+										bind:value={STT_AZURE_MAX_SPEAKERS}
+										placeholder={$i18n.t('e.g., 3, 4, 5 (leave blank for default)')}
+									/>
+								</div>
+							</div>
 						</div>
 					</div>
 				{:else if STT_ENGINE === ''}