
feat: Backend for Self-Hosted/External Web Search/Loader Engines

tth37 · 3 months ago
commit 839ba22c90

backend/open_webui/config.py (+23 -0)

@@ -2236,6 +2236,29 @@ FIRECRAWL_API_BASE_URL = PersistentConfig(
     os.environ.get("FIRECRAWL_API_BASE_URL", "https://api.firecrawl.dev"),
 )
 
+EXTERNAL_WEB_SEARCH_URL = PersistentConfig(
+    "EXTERNAL_WEB_SEARCH_URL",
+    "rag.web.search.external_web_search_url",
+    os.environ.get("EXTERNAL_WEB_SEARCH_URL", ""),
+)
+
+EXTERNAL_WEB_SEARCH_API_KEY = PersistentConfig(
+    "EXTERNAL_WEB_SEARCH_API_KEY",
+    "rag.web.search.external_web_search_api_key",
+    os.environ.get("EXTERNAL_WEB_SEARCH_API_KEY", ""),
+)
+
+EXTERNAL_WEB_LOADER_URL = PersistentConfig(
+    "EXTERNAL_WEB_LOADER_URL",
+    "rag.web.loader.external_web_loader_url",
+    os.environ.get("EXTERNAL_WEB_LOADER_URL", ""),
+)
+
+EXTERNAL_WEB_LOADER_API_KEY = PersistentConfig(
+    "EXTERNAL_WEB_LOADER_API_KEY",
+    "rag.web.loader.external_web_loader_api_key",
+    os.environ.get("EXTERNAL_WEB_LOADER_API_KEY", ""),
+)
 
 ####################################
 # Images
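
For a self-hosted setup, the four values above would typically be supplied through the environment before startup (they are also persisted under the rag.web.* config keys shown). A minimal sketch with placeholder endpoints and keys; note that the existing engine selectors (e.g. WEB_LOADER_ENGINE and the web search engine setting) still have to be pointed at "external" to activate the new code paths:

EXTERNAL_WEB_SEARCH_URL="https://search.internal.example/search"
EXTERNAL_WEB_SEARCH_API_KEY="<search-service-token>"
EXTERNAL_WEB_LOADER_URL="https://loader.internal.example/load"
EXTERNAL_WEB_LOADER_API_KEY="<loader-service-token>"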

backend/open_webui/main.py (+8 -0)

@@ -245,6 +245,10 @@ from open_webui.config import (
     ENABLE_GOOGLE_DRIVE_INTEGRATION,
     ENABLE_ONEDRIVE_INTEGRATION,
     UPLOAD_DIR,
+    EXTERNAL_WEB_SEARCH_URL,
+    EXTERNAL_WEB_SEARCH_API_KEY,
+    EXTERNAL_WEB_LOADER_URL,
+    EXTERNAL_WEB_LOADER_API_KEY,
     # WebUI
     WEBUI_AUTH,
     WEBUI_NAME,
@@ -667,6 +671,10 @@ app.state.config.EXA_API_KEY = EXA_API_KEY
 app.state.config.PERPLEXITY_API_KEY = PERPLEXITY_API_KEY
 app.state.config.SOUGOU_API_SID = SOUGOU_API_SID
 app.state.config.SOUGOU_API_SK = SOUGOU_API_SK
+app.state.config.EXTERNAL_WEB_SEARCH_URL = EXTERNAL_WEB_SEARCH_URL
+app.state.config.EXTERNAL_WEB_SEARCH_API_KEY = EXTERNAL_WEB_SEARCH_API_KEY
+app.state.config.EXTERNAL_WEB_LOADER_URL = EXTERNAL_WEB_LOADER_URL
+app.state.config.EXTERNAL_WEB_LOADER_API_KEY = EXTERNAL_WEB_LOADER_API_KEY
 
 
 app.state.config.PLAYWRIGHT_WS_URL = PLAYWRIGHT_WS_URL

backend/open_webui/retrieval/loaders/external.py (+56 -0)

@@ -0,0 +1,56 @@
+import requests
+import logging
+from typing import Iterator, List, Union
+
+from langchain_core.document_loaders import BaseLoader
+from langchain_core.documents import Document
+from open_webui.env import SRC_LOG_LEVELS
+
+log = logging.getLogger(__name__)
+log.setLevel(SRC_LOG_LEVELS["RAG"])
+
+
+class ExternalLoader(BaseLoader):
+    def __init__(
+        self,
+        web_paths: Union[str, List[str]],
+        external_url: str,
+        external_api_key: str,
+        continue_on_failure: bool = True,
+        **kwargs,
+    ) -> None:
+        if not web_paths:
+            raise ValueError("At least one URL must be provided.")
+
+        self.external_url = external_url
+        self.external_api_key = external_api_key
+        self.urls = web_paths if isinstance(web_paths, list) else [web_paths]
+        self.continue_on_failure = continue_on_failure
+
+    def lazy_load(self) -> Iterator[Document]:
+        batch_size = 20
+        for i in range(0, len(self.urls), batch_size):
+            urls = self.urls[i : i + batch_size]
+            try:
+                response = requests.get(
+                    self.external_url,
+                    headers={
+                        "User-Agent": "Open WebUI (https://github.com/open-webui/open-webui) RAG Bot",
+                        "Authorization": f"Bearer {self.external_api_key}",
+                    },
+                    params={
+                        "urls": urls,
+                    }
+                )
+                response.raise_for_status()
+                results = response.json()
+                for result in results:
+                    yield Document(
+                        page_content=result.get("page_content", ""),
+                        metadata=result.get("metadata", {}),
+                    )
+            except Exception as e:
+                if self.continue_on_failure:
+                    log.error(f"Error extracting content from batch {urls}: {e}")
+                else:
+                    raise e
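
ExternalLoader assumes a specific contract on the self-hosted loader service: a GET request carrying each batch of up to 20 URLs as repeated urls query parameters plus a bearer token, answered with a JSON array of objects exposing page_content and metadata. A minimal sketch of a compatible endpoint, not part of this commit; the route name, token handling, and extraction stub are illustrative assumptions:

from typing import List

from fastapi import FastAPI, Header, HTTPException, Query

app = FastAPI()
EXPECTED_TOKEN = "change-me"  # hypothetical shared secret for this sketch


@app.get("/load")
def load(
    urls: List[str] = Query(...),  # ExternalLoader sends each batch as repeated ?urls=...
    authorization: str = Header(default=""),
):
    if authorization != f"Bearer {EXPECTED_TOKEN}":
        raise HTTPException(status_code=401, detail="invalid token")
    # Fetch and extract each page however the deployment prefers; stubbed here.
    return [
        {"page_content": f"extracted text for {url}", "metadata": {"source": url}}
        for url in urls
    ]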

backend/open_webui/retrieval/web/external.py (+45 -0)

@@ -0,0 +1,45 @@
+import logging
+from typing import Optional, List
+
+import requests
+from open_webui.retrieval.web.main import SearchResult, get_filtered_results
+from open_webui.env import SRC_LOG_LEVELS
+
+log = logging.getLogger(__name__)
+log.setLevel(SRC_LOG_LEVELS["RAG"])
+
+
+def search_external(
+    external_url: str,
+    external_api_key: str,
+    query: str,
+    count: int,
+    filter_list: Optional[List[str]] = None,
+) -> List[SearchResult]:
+    try:
+        response = requests.get(
+            external_url,
+            headers={
+                "User-Agent": "Open WebUI (https://github.com/open-webui/open-webui) RAG Bot",
+                "Authorization": f"Bearer {external_api_key}",
+            },
+            params={
+                "query": query,
+                "count": count,
+            }
+        )
+        response.raise_for_status()
+        results = response.json()
+        if filter_list:
+            results = get_filtered_results(results, filter_list)
+        return [
+            SearchResult(
+                link=result.get("link"),
+                title=result.get("title"),
+                snippet=result.get("snippet"),
+            )
+            for result in results[:count]
+        ]
+    except Exception as e:
+        log.error(f"Error in External search: {e}")
+        return []
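
search_external assumes a similar contract on the search side: a GET request with query and count parameters plus a bearer token, answered with a JSON array of objects carrying link, title, and snippet (domain filtering is then applied on the caller's side via get_filtered_results). A matching sketch, again with an illustrative route and stubbed results:

from fastapi import FastAPI, Header, HTTPException

app = FastAPI()
EXPECTED_TOKEN = "change-me"  # hypothetical shared secret for this sketch


@app.get("/search")
def search(query: str, count: int = 10, authorization: str = Header(default="")):
    if authorization != f"Bearer {EXPECTED_TOKEN}":
        raise HTTPException(status_code=401, detail="invalid token")
    # Query whatever search backend the deployment uses; stubbed here.
    return [
        {
            "link": f"https://example.com/result-{i}",
            "title": f"Result {i} for '{query}'",
            "snippet": f"Stub snippet {i} for '{query}'",
        }
        for i in range(count)
    ]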

backend/open_webui/retrieval/web/utils.py (+8 -0)

@@ -25,6 +25,7 @@ from langchain_community.document_loaders.firecrawl import FireCrawlLoader
 from langchain_community.document_loaders.base import BaseLoader
 from langchain_core.documents import Document
 from open_webui.retrieval.loaders.tavily import TavilyLoader
+from open_webui.retrieval.loaders.external import ExternalLoader
 from open_webui.constants import ERROR_MESSAGES
 from open_webui.config import (
     ENABLE_RAG_LOCAL_WEB_FETCH,
@@ -35,6 +36,8 @@ from open_webui.config import (
     FIRECRAWL_API_KEY,
     TAVILY_API_KEY,
     TAVILY_EXTRACT_DEPTH,
+    EXTERNAL_WEB_LOADER_URL,
+    EXTERNAL_WEB_LOADER_API_KEY,
 )
 from open_webui.env import SRC_LOG_LEVELS
 
@@ -619,6 +622,11 @@ def get_web_loader(
         web_loader_args["api_key"] = TAVILY_API_KEY.value
         web_loader_args["extract_depth"] = TAVILY_EXTRACT_DEPTH.value
 
+    if WEB_LOADER_ENGINE.value == "external":
+        WebLoaderClass = ExternalLoader
+        web_loader_args["external_url"] = EXTERNAL_WEB_LOADER_URL.value
+        web_loader_args["external_api_key"] = EXTERNAL_WEB_LOADER_API_KEY.value
+
     if WebLoaderClass:
         web_loader = WebLoaderClass(**web_loader_args)
 

backend/open_webui/routers/retrieval.py (+9 -0)

@@ -61,6 +61,7 @@ from open_webui.retrieval.web.bing import search_bing
 from open_webui.retrieval.web.exa import search_exa
 from open_webui.retrieval.web.perplexity import search_perplexity
 from open_webui.retrieval.web.sougou import search_sougou
+from open_webui.retrieval.web.external import search_external
 
 from open_webui.retrieval.utils import (
     get_embedding_function,
@@ -1465,6 +1466,14 @@ def search_web(request: Request, engine: str, query: str) -> list[SearchResult]:
             raise Exception(
                 "No SOUGOU_API_SID or SOUGOU_API_SK found in environment variables"
             )
+    elif engine == "external":
+        return search_external(
+            request.app.state.config.EXTERNAL_WEB_SEARCH_URL,
+            request.app.state.config.EXTERNAL_WEB_SEARCH_API_KEY,
+            query,
+            request.app.state.config.WEB_SEARCH_RESULT_COUNT,
+            request.app.state.config.WEB_SEARCH_DOMAIN_FILTER_LIST,
+        )
     else:
         raise Exception("No search engine API key found in environment variables")