external.py 1.9 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556
  1. import requests
  2. import logging
  3. from typing import Iterator, List, Union
  4. from langchain_core.document_loaders import BaseLoader
  5. from langchain_core.documents import Document
  6. from open_webui.env import SRC_LOG_LEVELS
  7. log = logging.getLogger(__name__)
  8. log.setLevel(SRC_LOG_LEVELS["RAG"])
  9. class ExternalLoader(BaseLoader):
  10. def __init__(
  11. self,
  12. web_paths: Union[str, List[str]],
  13. external_url: str,
  14. external_api_key: str,
  15. continue_on_failure: bool = True,
  16. **kwargs,
  17. ) -> None:
  18. if not web_paths:
  19. raise ValueError("At least one URL must be provided.")
  20. self.external_url = external_url
  21. self.external_api_key = external_api_key
  22. self.urls = web_paths if isinstance(web_paths, list) else [web_paths]
  23. self.continue_on_failure = continue_on_failure
  24. def lazy_load(self) -> Iterator[Document]:
  25. batch_size = 20
  26. for i in range(0, len(self.urls), batch_size):
  27. urls = self.urls[i : i + batch_size]
  28. try:
  29. response = requests.get(
  30. self.external_url,
  31. headers={
  32. "User-Agent": "Open WebUI (https://github.com/open-webui/open-webui) RAG Bot",
  33. "Authorization": f"Bearer {self.external_api_key}",
  34. },
  35. params={
  36. "urls": urls,
  37. }
  38. )
  39. response.raise_for_status()
  40. results = response.json()
  41. for result in results:
  42. yield Document(
  43. page_content=result.get("page_content", ""),
  44. metadata=result.get("metadata", {}),
  45. )
  46. except Exception as e:
  47. if self.continue_on_failure:
  48. log.error(f"Error extracting content from batch {urls}: {e}")
  49. else:
  50. raise e