external.py 1.8 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253
  1. import requests
  2. import logging
  3. from typing import Iterator, List, Union
  4. from langchain_core.document_loaders import BaseLoader
  5. from langchain_core.documents import Document
  6. from open_webui.env import SRC_LOG_LEVELS
  7. log = logging.getLogger(__name__)
  8. log.setLevel(SRC_LOG_LEVELS["RAG"])
  9. class ExternalLoader(BaseLoader):
  10. def __init__(
  11. self,
  12. web_paths: Union[str, List[str]],
  13. external_url: str,
  14. external_api_key: str,
  15. continue_on_failure: bool = True,
  16. **kwargs,
  17. ) -> None:
  18. self.external_url = external_url
  19. self.external_api_key = external_api_key
  20. self.urls = web_paths if isinstance(web_paths, list) else [web_paths]
  21. self.continue_on_failure = continue_on_failure
  22. def lazy_load(self) -> Iterator[Document]:
  23. batch_size = 20
  24. for i in range(0, len(self.urls), batch_size):
  25. urls = self.urls[i : i + batch_size]
  26. try:
  27. response = requests.post(
  28. self.external_url,
  29. headers={
  30. "User-Agent": "Open WebUI (https://github.com/open-webui/open-webui) RAG Bot",
  31. "Authorization": f"Bearer {self.external_api_key}",
  32. },
  33. json={
  34. "urls": urls,
  35. },
  36. )
  37. response.raise_for_status()
  38. results = response.json()
  39. for result in results:
  40. yield Document(
  41. page_content=result.get("page_content", ""),
  42. metadata=result.get("metadata", {}),
  43. )
  44. except Exception as e:
  45. if self.continue_on_failure:
  46. log.error(f"Error extracting content from batch {urls}: {e}")
  47. else:
  48. raise e