import asyncio
import logging
import socket
import ssl
import time
import urllib.parse
import urllib.request
from collections import defaultdict
from datetime import datetime, timedelta
from typing import (
    Any,
    AsyncIterator,
    Dict,
    Iterator,
    List,
    Optional,
    Sequence,
    Union,
    Literal,
)

import aiohttp
import certifi
import validators
from langchain_community.document_loaders import PlaywrightURLLoader, WebBaseLoader
from langchain_community.document_loaders.firecrawl import FireCrawlLoader
from langchain_community.document_loaders.base import BaseLoader
from langchain_core.documents import Document
from open_webui.retrieval.loaders.tavily import TavilyLoader
from open_webui.retrieval.loaders.external_web import ExternalWebLoader
from open_webui.constants import ERROR_MESSAGES
from open_webui.config import (
    ENABLE_RAG_LOCAL_WEB_FETCH,
    PLAYWRIGHT_WS_URL,
    PLAYWRIGHT_TIMEOUT,
    WEB_LOADER_ENGINE,
    FIRECRAWL_API_BASE_URL,
    FIRECRAWL_API_KEY,
    TAVILY_API_KEY,
    TAVILY_EXTRACT_DEPTH,
    EXTERNAL_WEB_LOADER_URL,
    EXTERNAL_WEB_LOADER_API_KEY,
)
from open_webui.env import SRC_LOG_LEVELS, AIOHTTP_CLIENT_SESSION_SSL

log = logging.getLogger(__name__)
log.setLevel(SRC_LOG_LEVELS["RAG"])

def validate_url(url: Union[str, Sequence[str]]):
    if isinstance(url, str):
        if isinstance(validators.url(url), validators.ValidationError):
            raise ValueError(ERROR_MESSAGES.INVALID_URL)
        if not ENABLE_RAG_LOCAL_WEB_FETCH:
            # Local web fetch is disabled, filter out any URLs that resolve to private IP addresses
            parsed_url = urllib.parse.urlparse(url)
            # Get IPv4 and IPv6 addresses
            ipv4_addresses, ipv6_addresses = resolve_hostname(parsed_url.hostname)
            # Check if any of the resolved addresses are private
            # This is technically still vulnerable to DNS rebinding attacks, as we don't control WebBaseLoader
            for ip in ipv4_addresses:
                if validators.ipv4(ip, private=True):
                    raise ValueError(ERROR_MESSAGES.INVALID_URL)
            for ip in ipv6_addresses:
                if validators.ipv6(ip, private=True):
                    raise ValueError(ERROR_MESSAGES.INVALID_URL)
        return True
    elif isinstance(url, Sequence):
        return all(validate_url(u) for u in url)
    else:
        return False


def safe_validate_urls(url: Sequence[str]) -> Sequence[str]:
    valid_urls = []
    for u in url:
        try:
            if validate_url(u):
                valid_urls.append(u)
        except Exception as e:
            log.debug(f"Invalid URL {u}: {str(e)}")
            continue
    return valid_urls

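# Example usage (illustrative sketch; the URLs below are hypothetical):
#
#     urls = safe_validate_urls(["https://example.com/docs", "http://10.0.0.1/admin"])
#     # With ENABLE_RAG_LOCAL_WEB_FETCH disabled, URLs that resolve to private
#     # addresses are dropped, leaving e.g. ["https://example.com/docs"].
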
def resolve_hostname(hostname):
    # Get address information
    addr_info = socket.getaddrinfo(hostname, None)

    # Extract IP addresses from address information
    ipv4_addresses = [info[4][0] for info in addr_info if info[0] == socket.AF_INET]
    ipv6_addresses = [info[4][0] for info in addr_info if info[0] == socket.AF_INET6]

    return ipv4_addresses, ipv6_addresses

def extract_metadata(soup, url):
    metadata = {"source": url}
    if title := soup.find("title"):
        metadata["title"] = title.get_text()
    if description := soup.find("meta", attrs={"name": "description"}):
        metadata["description"] = description.get("content", "No description found.")
    if html := soup.find("html"):
        metadata["language"] = html.get("lang", "No language found.")
    return metadata

def verify_ssl_cert(url: str) -> bool:
    """Verify SSL certificate for the given URL."""
    if not url.startswith("https://"):
        return True

    try:
        hostname = url.split("://")[-1].split("/")[0]
        context = ssl.create_default_context(cafile=certifi.where())
        with context.wrap_socket(socket.socket(), server_hostname=hostname) as s:
            s.connect((hostname, 443))
        return True
    except ssl.SSLError:
        return False
    except Exception as e:
        log.warning(f"SSL verification failed for {url}: {str(e)}")
        return False

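# Example usage (illustrative sketch; hostnames are placeholders):
#
#     verify_ssl_cert("https://example.com")       # True if the chain validates
#     verify_ssl_cert("https://self-signed.test")  # False on ssl.SSLError
#     verify_ssl_cert("http://example.com")        # True (non-HTTPS URLs are skipped)
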
class RateLimitMixin:
    async def _wait_for_rate_limit(self):
        """Wait to respect the rate limit if specified."""
        if self.requests_per_second and self.last_request_time:
            min_interval = timedelta(seconds=1.0 / self.requests_per_second)
            time_since_last = datetime.now() - self.last_request_time
            if time_since_last < min_interval:
                await asyncio.sleep((min_interval - time_since_last).total_seconds())
        self.last_request_time = datetime.now()

    def _sync_wait_for_rate_limit(self):
        """Synchronous version of rate limit wait."""
        if self.requests_per_second and self.last_request_time:
            min_interval = timedelta(seconds=1.0 / self.requests_per_second)
            time_since_last = datetime.now() - self.last_request_time
            if time_since_last < min_interval:
                time.sleep((min_interval - time_since_last).total_seconds())
        self.last_request_time = datetime.now()

class URLProcessingMixin:
    def _verify_ssl_cert(self, url: str) -> bool:
        """Verify SSL certificate for a URL."""
        return verify_ssl_cert(url)

    async def _safe_process_url(self, url: str) -> bool:
        """Perform safety checks before processing a URL."""
        if self.verify_ssl and not self._verify_ssl_cert(url):
            raise ValueError(f"SSL certificate verification failed for {url}")
        await self._wait_for_rate_limit()
        return True

    def _safe_process_url_sync(self, url: str) -> bool:
        """Synchronous version of safety checks."""
        if self.verify_ssl and not self._verify_ssl_cert(url):
            raise ValueError(f"SSL certificate verification failed for {url}")
        self._sync_wait_for_rate_limit()
        return True

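# Note: both mixins assume the host class defines `verify_ssl`,
# `requests_per_second`, and `last_request_time` attributes, as the loader
# classes below do in their __init__ methods.
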
class SafeFireCrawlLoader(BaseLoader, RateLimitMixin, URLProcessingMixin):
    def __init__(
        self,
        web_paths,
        verify_ssl: bool = True,
        trust_env: bool = False,
        requests_per_second: Optional[float] = None,
        continue_on_failure: bool = True,
        api_key: Optional[str] = None,
        api_url: Optional[str] = None,
        mode: Literal["crawl", "scrape", "map"] = "scrape",
        proxy: Optional[Dict[str, str]] = None,
        params: Optional[Dict] = None,
    ):
        """Document loader that wraps FireCrawlLoader with safety checks.

        Processes each URL in sequence, applying SSL verification and rate
        limiting before delegating to a FireCrawlLoader instance.

        Args:
            web_paths: List of URLs/paths to process.
            verify_ssl: If True, verify SSL certificates.
            trust_env: If True, use proxy settings from environment variables.
            requests_per_second: Number of requests per second to limit to.
            continue_on_failure (bool): If True, continue loading other URLs on failure.
            api_key: API key for FireCrawl service. Defaults to None
                (uses FIRE_CRAWL_API_KEY environment variable if not provided).
            api_url: Base URL for FireCrawl API. Defaults to official API endpoint.
            mode: Operation mode selection:
                - 'scrape': Direct page scraping (default)
                - 'crawl': Website crawling mode
                - 'map': Site map generation
            proxy: Proxy override settings for the FireCrawl API.
            params: The parameters to pass to the Firecrawl API.
                Examples include crawlerOptions.
                For more details, visit: https://github.com/mendableai/firecrawl-py
        """
        proxy_server = proxy.get("server") if proxy else None
        if trust_env and not proxy_server:
            env_proxies = urllib.request.getproxies()
            env_proxy_server = env_proxies.get("https") or env_proxies.get("http")
            if env_proxy_server:
                if proxy:
                    proxy["server"] = env_proxy_server
                else:
                    proxy = {"server": env_proxy_server}

        self.web_paths = web_paths
        self.verify_ssl = verify_ssl
        self.requests_per_second = requests_per_second
        self.last_request_time = None
        self.trust_env = trust_env
        self.continue_on_failure = continue_on_failure
        self.api_key = api_key
        self.api_url = api_url
        self.mode = mode
        self.params = params

    def lazy_load(self) -> Iterator[Document]:
        """Load documents from the configured URLs using FireCrawl."""
        for url in self.web_paths:
            try:
                self._safe_process_url_sync(url)
                loader = FireCrawlLoader(
                    url=url,
                    api_key=self.api_key,
                    api_url=self.api_url,
                    mode=self.mode,
                    params=self.params,
                )
                for document in loader.lazy_load():
                    if not document.metadata.get("source"):
                        document.metadata["source"] = document.metadata.get("sourceURL")
                    yield document
            except Exception as e:
                if self.continue_on_failure:
                    log.exception(f"Error loading {url}: {e}")
                    continue
                raise e

    async def alazy_load(self):
        """Async version of lazy_load."""
        for url in self.web_paths:
            try:
                await self._safe_process_url(url)
                loader = FireCrawlLoader(
                    url=url,
                    api_key=self.api_key,
                    api_url=self.api_url,
                    mode=self.mode,
                    params=self.params,
                )
                async for document in loader.alazy_load():
                    if not document.metadata.get("source"):
                        document.metadata["source"] = document.metadata.get("sourceURL")
                    yield document
            except Exception as e:
                if self.continue_on_failure:
                    log.exception(f"Error loading {url}: {e}")
                    continue
                raise e

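# Example usage (illustrative sketch; the API key and URL are placeholders):
#
#     loader = SafeFireCrawlLoader(
#         web_paths=["https://example.com"],
#         api_key="fc-...",
#         mode="scrape",
#         requests_per_second=2,
#     )
#     docs = list(loader.lazy_load())
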
class SafeTavilyLoader(BaseLoader, RateLimitMixin, URLProcessingMixin):
    def __init__(
        self,
        web_paths: Union[str, List[str]],
        api_key: str,
        extract_depth: Literal["basic", "advanced"] = "basic",
        continue_on_failure: bool = True,
        requests_per_second: Optional[float] = None,
        verify_ssl: bool = True,
        trust_env: bool = False,
        proxy: Optional[Dict[str, str]] = None,
    ):
        """Initialize SafeTavilyLoader with rate limiting and SSL verification support.

        Args:
            web_paths: List of URLs/paths to process.
            api_key: The Tavily API key.
            extract_depth: Depth of extraction ("basic" or "advanced").
            continue_on_failure: Whether to continue if extraction of a URL fails.
            requests_per_second: Number of requests per second to limit to.
            verify_ssl: If True, verify SSL certificates.
            trust_env: If True, use proxy settings from environment variables.
            proxy: Optional proxy configuration.
        """
        # Initialize proxy configuration if using environment variables
        proxy_server = proxy.get("server") if proxy else None
        if trust_env and not proxy_server:
            env_proxies = urllib.request.getproxies()
            env_proxy_server = env_proxies.get("https") or env_proxies.get("http")
            if env_proxy_server:
                if proxy:
                    proxy["server"] = env_proxy_server
                else:
                    proxy = {"server": env_proxy_server}

        # Store parameters for creating TavilyLoader instances
        self.web_paths = web_paths if isinstance(web_paths, list) else [web_paths]
        self.api_key = api_key
        self.extract_depth = extract_depth
        self.continue_on_failure = continue_on_failure
        self.verify_ssl = verify_ssl
        self.trust_env = trust_env
        self.proxy = proxy

        # Add rate limiting
        self.requests_per_second = requests_per_second
        self.last_request_time = None

    def lazy_load(self) -> Iterator[Document]:
        """Load documents with rate limiting support, delegating to TavilyLoader."""
        valid_urls = []
        for url in self.web_paths:
            try:
                self._safe_process_url_sync(url)
                valid_urls.append(url)
            except Exception as e:
                log.warning(f"SSL verification failed for {url}: {str(e)}")
                if not self.continue_on_failure:
                    raise e

        if not valid_urls:
            if self.continue_on_failure:
                log.warning("No valid URLs to process after SSL verification")
                return
            raise ValueError("No valid URLs to process after SSL verification")

        try:
            loader = TavilyLoader(
                urls=valid_urls,
                api_key=self.api_key,
                extract_depth=self.extract_depth,
                continue_on_failure=self.continue_on_failure,
            )
            yield from loader.lazy_load()
        except Exception as e:
            if self.continue_on_failure:
                log.exception(f"Error extracting content from URLs: {e}")
            else:
                raise e

    async def alazy_load(self) -> AsyncIterator[Document]:
        """Async version with rate limiting and SSL verification."""
        valid_urls = []
        for url in self.web_paths:
            try:
                await self._safe_process_url(url)
                valid_urls.append(url)
            except Exception as e:
                log.warning(f"SSL verification failed for {url}: {str(e)}")
                if not self.continue_on_failure:
                    raise e

        if not valid_urls:
            if self.continue_on_failure:
                log.warning("No valid URLs to process after SSL verification")
                return
            raise ValueError("No valid URLs to process after SSL verification")

        try:
            loader = TavilyLoader(
                urls=valid_urls,
                api_key=self.api_key,
                extract_depth=self.extract_depth,
                continue_on_failure=self.continue_on_failure,
            )
            async for document in loader.alazy_load():
                yield document
        except Exception as e:
            if self.continue_on_failure:
                log.exception(f"Error loading URLs: {e}")
            else:
                raise e

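# Example usage (illustrative sketch; the API key is a placeholder):
#
#     loader = SafeTavilyLoader(
#         web_paths="https://example.com",
#         api_key="tvly-...",
#         extract_depth="basic",
#     )
#     docs = [doc async for doc in loader.alazy_load()]  # inside an async context
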
class SafePlaywrightURLLoader(PlaywrightURLLoader, RateLimitMixin, URLProcessingMixin):
    """Load HTML pages safely with Playwright, supporting SSL verification, rate limiting, and remote browser connection.

    Attributes:
        web_paths (List[str]): List of URLs to load.
        verify_ssl (bool): If True, verify SSL certificates.
        trust_env (bool): If True, use proxy settings from environment variables.
        requests_per_second (Optional[float]): Number of requests per second to limit to.
        continue_on_failure (bool): If True, continue loading other URLs on failure.
        headless (bool): If True, the browser will run in headless mode.
        proxy (dict): Proxy override settings for the Playwright session.
        playwright_ws_url (Optional[str]): WebSocket endpoint URI for remote browser connection.
        playwright_timeout (Optional[int]): Maximum operation time in milliseconds.
    """

    def __init__(
        self,
        web_paths: List[str],
        verify_ssl: bool = True,
        trust_env: bool = False,
        requests_per_second: Optional[float] = None,
        continue_on_failure: bool = True,
        headless: bool = True,
        remove_selectors: Optional[List[str]] = None,
        proxy: Optional[Dict[str, str]] = None,
        playwright_ws_url: Optional[str] = None,
        playwright_timeout: Optional[int] = 10000,
    ):
        """Initialize with additional safety parameters and remote browser support."""
        proxy_server = proxy.get("server") if proxy else None
        if trust_env and not proxy_server:
            env_proxies = urllib.request.getproxies()
            env_proxy_server = env_proxies.get("https") or env_proxies.get("http")
            if env_proxy_server:
                if proxy:
                    proxy["server"] = env_proxy_server
                else:
                    proxy = {"server": env_proxy_server}

        # We'll set headless to False if using playwright_ws_url since it's handled by the remote browser
        super().__init__(
            urls=web_paths,
            continue_on_failure=continue_on_failure,
            headless=headless if playwright_ws_url is None else False,
            remove_selectors=remove_selectors,
            proxy=proxy,
        )
        self.verify_ssl = verify_ssl
        self.requests_per_second = requests_per_second
        self.last_request_time = None
        self.playwright_ws_url = playwright_ws_url
        self.trust_env = trust_env
        self.playwright_timeout = playwright_timeout

    def lazy_load(self) -> Iterator[Document]:
        """Safely load URLs synchronously with support for remote browser."""
        from playwright.sync_api import sync_playwright

        with sync_playwright() as p:
            # Use remote browser if ws_endpoint is provided, otherwise use local browser
            if self.playwright_ws_url:
                browser = p.chromium.connect(self.playwright_ws_url)
            else:
                browser = p.chromium.launch(headless=self.headless, proxy=self.proxy)

            for url in self.urls:
                try:
                    self._safe_process_url_sync(url)
                    page = browser.new_page()
                    response = page.goto(url, timeout=self.playwright_timeout)
                    if response is None:
                        raise ValueError(f"page.goto() returned None for url {url}")

                    text = self.evaluator.evaluate(page, browser, response)
                    metadata = {"source": url}
                    yield Document(page_content=text, metadata=metadata)
                except Exception as e:
                    if self.continue_on_failure:
                        log.exception(f"Error loading {url}: {e}")
                        continue
                    raise e
            browser.close()

    async def alazy_load(self) -> AsyncIterator[Document]:
        """Safely load URLs asynchronously with support for remote browser."""
        from playwright.async_api import async_playwright

        async with async_playwright() as p:
            # Use remote browser if ws_endpoint is provided, otherwise use local browser
            if self.playwright_ws_url:
                browser = await p.chromium.connect(self.playwright_ws_url)
            else:
                browser = await p.chromium.launch(
                    headless=self.headless, proxy=self.proxy
                )

            for url in self.urls:
                try:
                    await self._safe_process_url(url)
                    page = await browser.new_page()
                    response = await page.goto(url, timeout=self.playwright_timeout)
                    if response is None:
                        raise ValueError(f"page.goto() returned None for url {url}")

                    text = await self.evaluator.evaluate_async(page, browser, response)
                    metadata = {"source": url}
                    yield Document(page_content=text, metadata=metadata)
                except Exception as e:
                    if self.continue_on_failure:
                        log.exception(f"Error loading {url}: {e}")
                        continue
                    raise e
            await browser.close()

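# Example usage (illustrative sketch; the WebSocket endpoint is a placeholder):
#
#     loader = SafePlaywrightURLLoader(
#         web_paths=["https://example.com"],
#         playwright_ws_url="ws://playwright:3000",  # omit to launch a local browser
#         requests_per_second=2,
#     )
#     docs = list(loader.lazy_load())
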
class SafeWebBaseLoader(WebBaseLoader):
    """WebBaseLoader with enhanced error handling for URLs."""

    def __init__(self, trust_env: bool = False, *args, **kwargs):
        """Initialize SafeWebBaseLoader

        Args:
            trust_env (bool, optional): set to True if using proxy to make web requests, for example
                using http(s)_proxy environment variables. Defaults to False.
        """
        super().__init__(*args, **kwargs)
        self.trust_env = trust_env

    async def _fetch(
        self, url: str, retries: int = 3, cooldown: int = 2, backoff: float = 1.5
    ) -> str:
        async with aiohttp.ClientSession(trust_env=self.trust_env) as session:
            for i in range(retries):
                try:
                    kwargs: Dict = dict(
                        headers=self.session.headers,
                        cookies=self.session.cookies.get_dict(),
                    )
                    if not self.session.verify:
                        kwargs["ssl"] = False

                    async with session.get(
                        url,
                        **(self.requests_kwargs | kwargs),
                        allow_redirects=False,
                    ) as response:
                        if self.raise_for_status:
                            response.raise_for_status()
                        return await response.text()
                except aiohttp.ClientConnectionError as e:
                    if i == retries - 1:
                        raise
                    else:
                        log.warning(
                            f"Error fetching {url} with attempt "
                            f"{i + 1}/{retries}: {e}. Retrying..."
                        )
                        await asyncio.sleep(cooldown * backoff**i)
        raise ValueError("retry count exceeded")

    def _unpack_fetch_results(
        self, results: Any, urls: List[str], parser: Union[str, None] = None
    ) -> List[Any]:
        """Unpack fetch results into BeautifulSoup objects."""
        from bs4 import BeautifulSoup

        final_results = []
        for i, result in enumerate(results):
            url = urls[i]
            if parser is None:
                if url.endswith(".xml"):
                    parser = "xml"
                else:
                    parser = self.default_parser
                self._check_parser(parser)
            final_results.append(BeautifulSoup(result, parser, **self.bs_kwargs))
        return final_results

    async def ascrape_all(
        self, urls: List[str], parser: Union[str, None] = None
    ) -> List[Any]:
        """Async fetch all urls, then return soups for all results."""
        results = await self.fetch_all(urls)
        return self._unpack_fetch_results(results, urls, parser=parser)

    def lazy_load(self) -> Iterator[Document]:
        """Lazy load text from the url(s) in web_path with error handling."""
        for path in self.web_paths:
            try:
                soup = self._scrape(path, bs_kwargs=self.bs_kwargs)
                text = soup.get_text(**self.bs_get_text_kwargs)

                # Build metadata
                metadata = extract_metadata(soup, path)

                yield Document(page_content=text, metadata=metadata)
            except Exception as e:
                # Log the error and continue with the next URL
                log.exception(f"Error loading {path}: {e}")

    async def alazy_load(self) -> AsyncIterator[Document]:
        """Async lazy load text from the url(s) in web_path."""
        results = await self.ascrape_all(self.web_paths)
        for path, soup in zip(self.web_paths, results):
            text = soup.get_text(**self.bs_get_text_kwargs)
            metadata = {"source": path}
            if title := soup.find("title"):
                metadata["title"] = title.get_text()
            if description := soup.find("meta", attrs={"name": "description"}):
                metadata["description"] = description.get(
                    "content", "No description found."
                )
            if html := soup.find("html"):
                metadata["language"] = html.get("lang", "No language found.")
            yield Document(page_content=text, metadata=metadata)

    async def aload(self) -> list[Document]:
        """Load data into Document objects."""
        return [document async for document in self.alazy_load()]

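# Example usage (illustrative sketch; the URL is a placeholder):
#
#     loader = SafeWebBaseLoader(web_paths=["https://example.com"], trust_env=True)
#     docs = await loader.aload()  # inside an async context
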
def get_web_loader(
    urls: Union[str, Sequence[str]],
    verify_ssl: bool = True,
    requests_per_second: int = 2,
    trust_env: bool = False,
):
    # Check if the URLs are valid
    safe_urls = safe_validate_urls([urls] if isinstance(urls, str) else urls)

    web_loader_args = {
        "web_paths": safe_urls,
        "verify_ssl": verify_ssl,
        "requests_per_second": requests_per_second,
        "continue_on_failure": True,
        "trust_env": trust_env,
    }

    WebLoaderClass = None
    if WEB_LOADER_ENGINE.value == "" or WEB_LOADER_ENGINE.value == "safe_web":
        WebLoaderClass = SafeWebBaseLoader

    if WEB_LOADER_ENGINE.value == "playwright":
        WebLoaderClass = SafePlaywrightURLLoader
        web_loader_args["playwright_timeout"] = PLAYWRIGHT_TIMEOUT.value
        if PLAYWRIGHT_WS_URL.value:
            web_loader_args["playwright_ws_url"] = PLAYWRIGHT_WS_URL.value

    if WEB_LOADER_ENGINE.value == "firecrawl":
        WebLoaderClass = SafeFireCrawlLoader
        web_loader_args["api_key"] = FIRECRAWL_API_KEY.value
        web_loader_args["api_url"] = FIRECRAWL_API_BASE_URL.value

    if WEB_LOADER_ENGINE.value == "tavily":
        WebLoaderClass = SafeTavilyLoader
        web_loader_args["api_key"] = TAVILY_API_KEY.value
        web_loader_args["extract_depth"] = TAVILY_EXTRACT_DEPTH.value

    if WEB_LOADER_ENGINE.value == "external":
        WebLoaderClass = ExternalWebLoader
        web_loader_args["external_url"] = EXTERNAL_WEB_LOADER_URL.value
        web_loader_args["external_api_key"] = EXTERNAL_WEB_LOADER_API_KEY.value

    if WebLoaderClass:
        web_loader = WebLoaderClass(**web_loader_args)

        log.debug(
            "Using WEB_LOADER_ENGINE %s for %s URLs",
            web_loader.__class__.__name__,
            len(safe_urls),
        )

        return web_loader
    else:
        raise ValueError(
            f"Invalid WEB_LOADER_ENGINE: {WEB_LOADER_ENGINE.value}. "
            "Please set it to 'safe_web', 'playwright', 'firecrawl', 'tavily', or 'external'."
        )
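

# Example usage (illustrative sketch; the concrete loader returned depends on
# the WEB_LOADER_ENGINE config value):
#
#     loader = get_web_loader(["https://example.com"], requests_per_second=2)
#     docs = loader.load()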