Browse Source

Merge pull request #12822 from tth37/feat_external_search_loader

feat: Support for Self-Hosted/External Web Search/Loader Engines
Tim Jaeryang Baek 2 months ago
parent
commit
87844a8042

+ 23 - 0
backend/open_webui/config.py

@@ -2270,6 +2270,29 @@ FIRECRAWL_API_BASE_URL = PersistentConfig(
     os.environ.get("FIRECRAWL_API_BASE_URL", "https://api.firecrawl.dev"),
 )
 
+EXTERNAL_WEB_SEARCH_URL = PersistentConfig(
+    "EXTERNAL_WEB_SEARCH_URL",
+    "rag.web.search.external_web_search_url",
+    os.environ.get("EXTERNAL_WEB_SEARCH_URL", ""),
+)
+
+EXTERNAL_WEB_SEARCH_API_KEY = PersistentConfig(
+    "EXTERNAL_WEB_SEARCH_API_KEY",
+    "rag.web.search.external_web_search_api_key",
+    os.environ.get("EXTERNAL_WEB_SEARCH_API_KEY", ""),
+)
+
+EXTERNAL_WEB_LOADER_URL = PersistentConfig(
+    "EXTERNAL_WEB_LOADER_URL",
+    "rag.web.loader.external_web_loader_url",
+    os.environ.get("EXTERNAL_WEB_LOADER_URL", ""),
+)
+
+EXTERNAL_WEB_LOADER_API_KEY = PersistentConfig(
+    "EXTERNAL_WEB_LOADER_API_KEY",
+    "rag.web.loader.external_web_loader_api_key",
+    os.environ.get("EXTERNAL_WEB_LOADER_API_KEY", ""),
+)
 
 ####################################
 # Images

+ 8 - 0
backend/open_webui/main.py

@@ -251,6 +251,10 @@ from open_webui.config import (
     ENABLE_GOOGLE_DRIVE_INTEGRATION,
     ENABLE_ONEDRIVE_INTEGRATION,
     UPLOAD_DIR,
+    EXTERNAL_WEB_SEARCH_URL,
+    EXTERNAL_WEB_SEARCH_API_KEY,
+    EXTERNAL_WEB_LOADER_URL,
+    EXTERNAL_WEB_LOADER_API_KEY,
     # WebUI
     WEBUI_AUTH,
     WEBUI_NAME,
@@ -678,6 +682,10 @@ app.state.config.EXA_API_KEY = EXA_API_KEY
 app.state.config.PERPLEXITY_API_KEY = PERPLEXITY_API_KEY
 app.state.config.SOUGOU_API_SID = SOUGOU_API_SID
 app.state.config.SOUGOU_API_SK = SOUGOU_API_SK
+app.state.config.EXTERNAL_WEB_SEARCH_URL = EXTERNAL_WEB_SEARCH_URL
+app.state.config.EXTERNAL_WEB_SEARCH_API_KEY = EXTERNAL_WEB_SEARCH_API_KEY
+app.state.config.EXTERNAL_WEB_LOADER_URL = EXTERNAL_WEB_LOADER_URL
+app.state.config.EXTERNAL_WEB_LOADER_API_KEY = EXTERNAL_WEB_LOADER_API_KEY
 
 
 app.state.config.PLAYWRIGHT_WS_URL = PLAYWRIGHT_WS_URL

+ 53 - 0
backend/open_webui/retrieval/loaders/external.py

@@ -0,0 +1,53 @@
+import requests
+import logging
+from typing import Iterator, List, Union
+
+from langchain_core.document_loaders import BaseLoader
+from langchain_core.documents import Document
+from open_webui.env import SRC_LOG_LEVELS
+
+log = logging.getLogger(__name__)
+log.setLevel(SRC_LOG_LEVELS["RAG"])
+
+
+class ExternalLoader(BaseLoader):
+    def __init__(
+        self,
+        web_paths: Union[str, List[str]],
+        external_url: str,
+        external_api_key: str,
+        continue_on_failure: bool = True,
+        **kwargs,
+    ) -> None:
+        self.external_url = external_url
+        self.external_api_key = external_api_key
+        self.urls = web_paths if isinstance(web_paths, list) else [web_paths]
+        self.continue_on_failure = continue_on_failure
+
+    def lazy_load(self) -> Iterator[Document]:
+        batch_size = 20
+        for i in range(0, len(self.urls), batch_size):
+            urls = self.urls[i : i + batch_size]
+            try:
+                response = requests.post(
+                    self.external_url,
+                    headers={
+                        "User-Agent": "Open WebUI (https://github.com/open-webui/open-webui) RAG Bot",
+                        "Authorization": f"Bearer {self.external_api_key}",
+                    },
+                    json={
+                        "urls": urls,
+                    },
+                )
+                response.raise_for_status()
+                results = response.json()
+                for result in results:
+                    yield Document(
+                        page_content=result.get("page_content", ""),
+                        metadata=result.get("metadata", {}),
+                    )
+            except Exception as e:
+                if self.continue_on_failure:
+                    log.error(f"Error extracting content from batch {urls}: {e}")
+                else:
+                    raise e

+ 47 - 0
backend/open_webui/retrieval/web/external.py

@@ -0,0 +1,47 @@
+import logging
+from typing import Optional, List
+
+import requests
+from open_webui.retrieval.web.main import SearchResult, get_filtered_results
+from open_webui.env import SRC_LOG_LEVELS
+
+log = logging.getLogger(__name__)
+log.setLevel(SRC_LOG_LEVELS["RAG"])
+
+
+def search_external(
+    external_url: str,
+    external_api_key: str,
+    query: str,
+    count: int,
+    filter_list: Optional[List[str]] = None,
+) -> List[SearchResult]:
+    try:
+        response = requests.post(
+            external_url,
+            headers={
+                "User-Agent": "Open WebUI (https://github.com/open-webui/open-webui) RAG Bot",
+                "Authorization": f"Bearer {external_api_key}",
+            },
+            json={
+                "query": query,
+                "count": count,
+            },
+        )
+        response.raise_for_status()
+        results = response.json()
+        if filter_list:
+            results = get_filtered_results(results, filter_list)
+        results = [
+            SearchResult(
+                link=result.get("link"),
+                title=result.get("title"),
+                snippet=result.get("snippet"),
+            )
+            for result in results[:count]
+        ]
+        log.info(f"External search results: {results}")
+        return results
+    except Exception as e:
+        log.error(f"Error in External search: {e}")
+        return []

+ 8 - 0
backend/open_webui/retrieval/web/utils.py

@@ -25,6 +25,7 @@ from langchain_community.document_loaders.firecrawl import FireCrawlLoader
 from langchain_community.document_loaders.base import BaseLoader
 from langchain_core.documents import Document
 from open_webui.retrieval.loaders.tavily import TavilyLoader
+from open_webui.retrieval.loaders.external import ExternalLoader
 from open_webui.constants import ERROR_MESSAGES
 from open_webui.config import (
     ENABLE_RAG_LOCAL_WEB_FETCH,
@@ -35,6 +36,8 @@ from open_webui.config import (
     FIRECRAWL_API_KEY,
     TAVILY_API_KEY,
     TAVILY_EXTRACT_DEPTH,
+    EXTERNAL_WEB_LOADER_URL,
+    EXTERNAL_WEB_LOADER_API_KEY,
 )
 from open_webui.env import SRC_LOG_LEVELS
 
@@ -619,6 +622,11 @@ def get_web_loader(
         web_loader_args["api_key"] = TAVILY_API_KEY.value
         web_loader_args["extract_depth"] = TAVILY_EXTRACT_DEPTH.value
 
+    if WEB_LOADER_ENGINE.value == "external":
+        WebLoaderClass = ExternalLoader
+        web_loader_args["external_url"] = EXTERNAL_WEB_LOADER_URL.value
+        web_loader_args["external_api_key"] = EXTERNAL_WEB_LOADER_API_KEY.value
+
     if WebLoaderClass:
         web_loader = WebLoaderClass(**web_loader_args)
 

+ 33 - 0
backend/open_webui/routers/retrieval.py

@@ -61,6 +61,7 @@ from open_webui.retrieval.web.bing import search_bing
 from open_webui.retrieval.web.exa import search_exa
 from open_webui.retrieval.web.perplexity import search_perplexity
 from open_webui.retrieval.web.sougou import search_sougou
+from open_webui.retrieval.web.external import search_external
 
 from open_webui.retrieval.utils import (
     get_embedding_function,
@@ -418,6 +419,10 @@ async def get_rag_config(request: Request, user=Depends(get_admin_user)):
             "FIRECRAWL_API_KEY": request.app.state.config.FIRECRAWL_API_KEY,
             "FIRECRAWL_API_BASE_URL": request.app.state.config.FIRECRAWL_API_BASE_URL,
             "TAVILY_EXTRACT_DEPTH": request.app.state.config.TAVILY_EXTRACT_DEPTH,
+            "EXTERNAL_WEB_SEARCH_URL": request.app.state.config.EXTERNAL_WEB_SEARCH_URL,
+            "EXTERNAL_WEB_SEARCH_API_KEY": request.app.state.config.EXTERNAL_WEB_SEARCH_API_KEY,
+            "EXTERNAL_WEB_LOADER_URL": request.app.state.config.EXTERNAL_WEB_LOADER_URL,
+            "EXTERNAL_WEB_LOADER_API_KEY": request.app.state.config.EXTERNAL_WEB_LOADER_API_KEY,
             "YOUTUBE_LOADER_LANGUAGE": request.app.state.config.YOUTUBE_LOADER_LANGUAGE,
             "YOUTUBE_LOADER_PROXY_URL": request.app.state.config.YOUTUBE_LOADER_PROXY_URL,
             "YOUTUBE_LOADER_TRANSLATION": request.app.state.YOUTUBE_LOADER_TRANSLATION,
@@ -463,6 +468,10 @@ class WebConfig(BaseModel):
     FIRECRAWL_API_KEY: Optional[str] = None
     FIRECRAWL_API_BASE_URL: Optional[str] = None
     TAVILY_EXTRACT_DEPTH: Optional[str] = None
+    EXTERNAL_WEB_SEARCH_URL: Optional[str] = None
+    EXTERNAL_WEB_SEARCH_API_KEY: Optional[str] = None
+    EXTERNAL_WEB_LOADER_URL: Optional[str] = None
+    EXTERNAL_WEB_LOADER_API_KEY: Optional[str] = None
     YOUTUBE_LOADER_LANGUAGE: Optional[List[str]] = None
     YOUTUBE_LOADER_PROXY_URL: Optional[str] = None
     YOUTUBE_LOADER_TRANSLATION: Optional[str] = None
@@ -697,6 +706,18 @@ async def update_rag_config(
         request.app.state.config.FIRECRAWL_API_BASE_URL = (
             form_data.web.FIRECRAWL_API_BASE_URL
         )
+        request.app.state.config.EXTERNAL_WEB_SEARCH_URL = (
+            form_data.web.EXTERNAL_WEB_SEARCH_URL
+        )
+        request.app.state.config.EXTERNAL_WEB_SEARCH_API_KEY = (
+            form_data.web.EXTERNAL_WEB_SEARCH_API_KEY
+        )
+        request.app.state.config.EXTERNAL_WEB_LOADER_URL = (
+            form_data.web.EXTERNAL_WEB_LOADER_URL
+        )
+        request.app.state.config.EXTERNAL_WEB_LOADER_API_KEY = (
+            form_data.web.EXTERNAL_WEB_LOADER_API_KEY
+        )
         request.app.state.config.TAVILY_EXTRACT_DEPTH = (
             form_data.web.TAVILY_EXTRACT_DEPTH
         )
@@ -778,6 +799,10 @@ async def update_rag_config(
             "FIRECRAWL_API_KEY": request.app.state.config.FIRECRAWL_API_KEY,
             "FIRECRAWL_API_BASE_URL": request.app.state.config.FIRECRAWL_API_BASE_URL,
             "TAVILY_EXTRACT_DEPTH": request.app.state.config.TAVILY_EXTRACT_DEPTH,
+            "EXTERNAL_WEB_SEARCH_URL": request.app.state.config.EXTERNAL_WEB_SEARCH_URL,
+            "EXTERNAL_WEB_SEARCH_API_KEY": request.app.state.config.EXTERNAL_WEB_SEARCH_API_KEY,
+            "EXTERNAL_WEB_LOADER_URL": request.app.state.config.EXTERNAL_WEB_LOADER_URL,
+            "EXTERNAL_WEB_LOADER_API_KEY": request.app.state.config.EXTERNAL_WEB_LOADER_API_KEY,
             "YOUTUBE_LOADER_LANGUAGE": request.app.state.config.YOUTUBE_LOADER_LANGUAGE,
             "YOUTUBE_LOADER_PROXY_URL": request.app.state.config.YOUTUBE_LOADER_PROXY_URL,
             "YOUTUBE_LOADER_TRANSLATION": request.app.state.YOUTUBE_LOADER_TRANSLATION,
@@ -1465,6 +1490,14 @@ def search_web(request: Request, engine: str, query: str) -> list[SearchResult]:
             raise Exception(
                 "No SOUGOU_API_SID or SOUGOU_API_SK found in environment variables"
             )
+    elif engine == "external":
+        return search_external(
+            request.app.state.config.EXTERNAL_WEB_SEARCH_URL,
+            request.app.state.config.EXTERNAL_WEB_SEARCH_API_KEY,
+            query,
+            request.app.state.config.WEB_SEARCH_RESULT_COUNT,
+            request.app.state.config.WEB_SEARCH_DOMAIN_FILTER_LIST,
+        )
     else:
         raise Exception("No search engine API key found in environment variables")
 

+ 65 - 2
src/lib/components/admin/Settings/WebSearch.svelte

@@ -30,9 +30,10 @@
 		'bing',
 		'exa',
 		'perplexity',
-		'sougou'
+		'sougou',
+		'external'
 	];
-	let webLoaderEngines = ['playwright', 'firecrawl', 'tavily'];
+	let webLoaderEngines = ['playwright', 'firecrawl', 'tavily', 'external'];
 
 	let webConfig = null;
 
@@ -431,6 +432,37 @@
 									/>
 								</div>
 							</div>
+						{:else if webConfig.WEB_SEARCH_ENGINE === 'external'}
+							<div class="mb-2.5 flex w-full flex-col">
+								<div>
+									<div class=" self-center text-xs font-medium mb-1">
+										{$i18n.t('External Web Search URL')}
+									</div>
+
+									<div class="flex w-full">
+										<div class="flex-1">
+											<input
+												class="w-full rounded-lg py-2 px-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-hidden"
+												type="text"
+												placeholder={$i18n.t('Enter External Web Search URL')}
+												bind:value={webConfig.EXTERNAL_WEB_SEARCH_URL}
+												autocomplete="off"
+											/>
+										</div>
+									</div>
+								</div>
+
+								<div class="mt-2">
+									<div class=" self-center text-xs font-medium mb-1">
+										{$i18n.t('External Web Search API Key')}
+									</div>
+
+									<SensitiveInput
+										placeholder={$i18n.t('Enter External Web Search API Key')}
+										bind:value={webConfig.EXTERNAL_WEB_SEARCH_API_KEY}
+									/>
+								</div>
+							</div>
 						{/if}
 					{/if}
 
@@ -652,6 +684,37 @@
 								</div>
 							{/if}
 						</div>
+					{:else if webConfig.WEB_LOADER_ENGINE === 'external'}
+						<div class="mb-2.5 flex w-full flex-col">
+							<div>
+								<div class=" self-center text-xs font-medium mb-1">
+									{$i18n.t('External Web Loader URL')}
+								</div>
+
+								<div class="flex w-full">
+									<div class="flex-1">
+										<input
+											class="w-full rounded-lg py-2 px-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-hidden"
+											type="text"
+											placeholder={$i18n.t('Enter External Web Loader URL')}
+											bind:value={webConfig.EXTERNAL_WEB_LOADER_URL}
+											autocomplete="off"
+										/>
+									</div>
+								</div>
+							</div>
+
+							<div class="mt-2">
+								<div class=" self-center text-xs font-medium mb-1">
+									{$i18n.t('External Web Loader API Key')}
+								</div>
+
+								<SensitiveInput
+									placeholder={$i18n.t('Enter External Web Loader API Key')}
+									bind:value={webConfig.EXTERNAL_WEB_LOADER_API_KEY}
+								/>
+							</div>
+						</div>
 					{/if}
 
 					<div class="  mb-2.5 flex w-full justify-between">