Browse Source

refactor: replace requests with the Firecrawl SDK in search, and use the Firecrawl SDK in scrape instead of langchain_community FireCrawlLoader

wei840222 4 months ago
parent
commit
7a3f4d85f6

+ 9 - 18
backend/open_webui/retrieval/web/firecrawl.py

@@ -1,11 +1,11 @@
 import logging
 from typing import Optional, List
-from urllib.parse import urljoin
 
-import requests
 from open_webui.retrieval.web.main import SearchResult, get_filtered_results
 from open_webui.env import SRC_LOG_LEVELS
 
+from firecrawl import Firecrawl
+
 log = logging.getLogger(__name__)
 log.setLevel(SRC_LOG_LEVELS["RAG"])
 
@@ -18,27 +18,18 @@ def search_firecrawl(
     filter_list: Optional[List[str]] = None,
 ) -> List[SearchResult]:
     try:
-        firecrawl_search_url = urljoin(firecrawl_url, "/v1/search")
-        response = requests.post(
-            firecrawl_search_url,
-            headers={
-                "User-Agent": "Open WebUI (https://github.com/open-webui/open-webui) RAG Bot",
-                "Authorization": f"Bearer {firecrawl_api_key}",
-            },
-            json={
-                "query": query,
-                "limit": count,
-            },
+        firecrawl = Firecrawl(api_key=firecrawl_api_key, api_url=firecrawl_url)
+        response = firecrawl.search(
+            query=query, limit=count, ignore_invalid_urls=True, timeout=count * 3
         )
-        response.raise_for_status()
-        results = response.json().get("data", [])
+        results = response.web
         if filter_list:
             results = get_filtered_results(results, filter_list)
         results = [
             SearchResult(
-                link=result.get("url"),
-                title=result.get("title"),
-                snippet=result.get("description"),
+                link=result.url,
+                title=result.title,
+                snippet=result.description,
             )
             for result in results[:count]
         ]
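For context, a minimal sketch of the new call path in `search_firecrawl`: the client construction, the `search` arguments, and the `response.web` result shape all follow the diff above, while the API key, endpoint, and query are placeholders.

```python
from firecrawl import Firecrawl

# Placeholders: substitute a real key and, if self-hosted, your own endpoint.
firecrawl = Firecrawl(api_key="fc-...", api_url="https://api.firecrawl.dev")

count = 5
response = firecrawl.search(
    query="open webui rag",
    limit=count,
    ignore_invalid_urls=True,
    timeout=count * 3,  # mirrors the per-result timeout heuristic in the diff
)

# Results are typed objects rather than raw JSON, so fields are attributes,
# not dict keys -- hence the .url/.title/.description mapping above.
for result in response.web[:count]:
    print(result.url, result.title, result.description)
```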

+ 76 - 43
backend/open_webui/retrieval/web/utils.py

@@ -4,7 +4,6 @@ import socket
 import ssl
 import urllib.parse
 import urllib.request
-from collections import defaultdict
 from datetime import datetime, time, timedelta
 from typing import (
     Any,
@@ -21,7 +20,6 @@ import aiohttp
 import certifi
 import validators
 from langchain_community.document_loaders import PlaywrightURLLoader, WebBaseLoader
-from langchain_community.document_loaders.firecrawl import FireCrawlLoader
 from langchain_community.document_loaders.base import BaseLoader
 from langchain_core.documents import Document
 from open_webui.retrieval.loaders.tavily import TavilyLoader
@@ -39,7 +37,9 @@ from open_webui.config import (
     EXTERNAL_WEB_LOADER_URL,
     EXTERNAL_WEB_LOADER_API_KEY,
 )
-from open_webui.env import SRC_LOG_LEVELS, AIOHTTP_CLIENT_SESSION_SSL
+from open_webui.env import SRC_LOG_LEVELS
+
+from firecrawl import Firecrawl
 
 log = logging.getLogger(__name__)
 log.setLevel(SRC_LOG_LEVELS["RAG"])
@@ -189,13 +189,12 @@ class SafeFireCrawlLoader(BaseLoader, RateLimitMixin, URLProcessingMixin):
                 (uses FIRE_CRAWL_API_KEY environment variable if not provided).
             api_url: Base URL for FireCrawl API. Defaults to official API endpoint.
             mode: Operation mode selection:
-                - 'crawl': Website crawling mode (default)
-                - 'scrape': Direct page scraping
+                - 'crawl': Website crawling mode
+                - 'scrape': Direct page scraping (default)
                 - 'map': Site map generation
             proxy: Proxy override settings for the FireCrawl API.
             params: The parameters to pass to the Firecrawl API.
-                Examples include crawlerOptions.
-                For more details, visit: https://github.com/mendableai/firecrawl-py
+                For more details, visit: https://docs.firecrawl.dev/sdks/python#batch-scrape
         """
         proxy_server = proxy.get("server") if proxy else None
         if trust_env and not proxy_server:
@@ -215,50 +214,84 @@ class SafeFireCrawlLoader(BaseLoader, RateLimitMixin, URLProcessingMixin):
         self.api_key = api_key
         self.api_url = api_url
         self.mode = mode
-        self.params = params
+        self.params = params or {}
 
     def lazy_load(self) -> Iterator[Document]:
-        """Load documents concurrently using FireCrawl."""
-        for url in self.web_paths:
-            try:
-                self._safe_process_url_sync(url)
-                loader = FireCrawlLoader(
-                    url=url,
-                    api_key=self.api_key,
-                    api_url=self.api_url,
-                    mode=self.mode,
-                    params=self.params,
+        """Load documents using FireCrawl batch_scrape."""
+        log.debug(
+            "Starting FireCrawl batch scrape for %d URLs, mode: %s, params: %s",
+            len(self.web_paths),
+            self.mode,
+            self.params,
+        )
+        try:
+            firecrawl = Firecrawl(api_key=self.api_key, api_url=self.api_url)
+            result = firecrawl.batch_scrape(
+                self.web_paths,
+                formats=["markdown"],
+                skip_tls_verification=not self.verify_ssl,
+                ignore_invalid_urls=True,
+                remove_base64_images=True,
+                max_age=300000,  # 5 minutes https://docs.firecrawl.dev/features/fast-scraping#common-maxage-values
+                wait_timeout=len(self.web_paths) * 3,
+                **self.params,
+            )
+
+            if result.status != "completed":
+                raise RuntimeError(
+                    f"FireCrawl batch scrape did not complete successfully. result: {result}"
                 )
-                for document in loader.lazy_load():
-                    if not document.metadata.get("source"):
-                        document.metadata["source"] = document.metadata.get("sourceURL")
-                    yield document
-            except Exception as e:
-                if self.continue_on_failure:
-                    log.exception(f"Error loading {url}: {e}")
-                    continue
+
+            for data in result.data:
+                metadata = data.metadata or {}
+                yield Document(
+                    page_content=data.markdown or "",
+                    metadata={"source": metadata.url or metadata.source_url or ""},
+                )
+
+        except Exception as e:
+            if self.continue_on_failure:
+                log.exception(f"Error extracting content from URLs: {e}")
+            else:
                 raise e
 
     async def alazy_load(self):
         """Async version of lazy_load."""
-        for url in self.web_paths:
-            try:
-                await self._safe_process_url(url)
-                loader = FireCrawlLoader(
-                    url=url,
-                    api_key=self.api_key,
-                    api_url=self.api_url,
-                    mode=self.mode,
-                    params=self.params,
+        log.debug(
+            "Starting FireCrawl batch scrape for %d URLs, mode: %s, params: %s",
+            len(self.web_paths),
+            self.mode,
+            self.params,
+        )
+        try:
+            firecrawl = Firecrawl(api_key=self.api_key, api_url=self.api_url)
+            result = firecrawl.batch_scrape(
+                self.web_paths,
+                formats=["markdown"],
+                skip_tls_verification=not self.verify_ssl,
+                ignore_invalid_urls=True,
+                remove_base64_images=True,
+                max_age=300000,  # 5 minutes https://docs.firecrawl.dev/features/fast-scraping#common-maxage-values
+                wait_timeout=len(self.web_paths) * 3,
+                **self.params,
+            )
+
+            if result.status != "completed":
+                raise RuntimeError(
+                    f"FireCrawl batch scrape did not complete successfully. result: {result}"
                 )
-                async for document in loader.alazy_load():
-                    if not document.metadata.get("source"):
-                        document.metadata["source"] = document.metadata.get("sourceURL")
-                    yield document
-            except Exception as e:
-                if self.continue_on_failure:
-                    log.exception(f"Error loading {url}: {e}")
-                    continue
+
+            for data in result.data:
+                metadata = data.metadata or {}
+                yield Document(
+                    page_content=data.markdown or "",
+                    metadata={"source": metadata.url or metadata.source_url or ""},
+                )
+
+        except Exception as e:
+            if self.continue_on_failure:
+                log.exception(f"Error extracting content from URLs: {e}")
+            else:
                 raise e
 
 
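The per-URL `FireCrawlLoader` loop is replaced by a single `batch_scrape` call covering all of `web_paths`. Below is a minimal sketch of that batch path, using only the arguments and result fields visible in the diff; the URLs and credentials are placeholders.

```python
from firecrawl import Firecrawl
from langchain_core.documents import Document

urls = ["https://example.com/a", "https://example.com/b"]  # placeholders
firecrawl = Firecrawl(api_key="fc-...", api_url="https://api.firecrawl.dev")

result = firecrawl.batch_scrape(
    urls,
    formats=["markdown"],
    skip_tls_verification=False,
    ignore_invalid_urls=True,
    remove_base64_images=True,
    max_age=300000,              # serve cached pages up to 5 minutes old
    wait_timeout=len(urls) * 3,  # scales the wait with the batch size
)

if result.status != "completed":
    raise RuntimeError(f"FireCrawl batch scrape did not complete: {result}")

# Each item carries the scraped markdown plus typed metadata, which the
# loader turns into LangChain Documents.
docs = []
for data in result.data:
    meta = data.metadata
    source = (meta.url or meta.source_url) if meta else ""
    docs.append(Document(page_content=data.markdown or "", metadata={"source": source}))
```

One batch call now covers all `web_paths` instead of N sequential scrapes, which is also why the per-URL rate-limit hooks (`_safe_process_url_sync` / `_safe_process_url`) no longer appear in this path.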

+ 1 - 1
backend/requirements.txt

@@ -133,7 +133,7 @@ pytest-docker~=3.1.1
 ldap3==2.9.1
 
 ## Firecrawl
-firecrawl-py==1.12.0
+firecrawl-py==4.5.0
 
 ## Trace
 opentelemetry-api==1.37.0
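The jump from 1.12.0 to 4.5.0 spans the SDK's major API redesign. Assuming the 1.x package exposed `FirecrawlApp` as its entry point, the import change behind the diffs above looks like:

```python
# firecrawl-py 1.x entry point (assumed from the 1.x SDK)
from firecrawl import FirecrawlApp

# firecrawl-py 4.x entry point, as used in the diffs above
from firecrawl import Firecrawl
```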

+ 1 - 1
pyproject.toml

@@ -151,7 +151,7 @@ all = [
     "oracledb==3.2.0",
 
     "colbert-ai==0.2.21",
-    "firecrawl-py==1.12.0",
+    "firecrawl-py==4.5.0",
 ]
 
 [project.scripts]