external_document.py 2.8 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091
  1. import requests
  2. import logging, os
  3. from typing import Iterator, List, Union
  4. from urllib.parse import quote
  5. from langchain_core.document_loaders import BaseLoader
  6. from langchain_core.documents import Document
  7. from open_webui.utils.headers import include_user_info_headers
  8. from open_webui.env import SRC_LOG_LEVELS
# Module-level logger for this loader; its level is read from the "RAG"
# entry of SRC_LOG_LEVELS.
log = logging.getLogger(__name__)
log.setLevel(SRC_LOG_LEVELS["RAG"])
  11. class ExternalDocumentLoader(BaseLoader):
  12. def __init__(
  13. self,
  14. file_path,
  15. url: str,
  16. api_key: str,
  17. mime_type=None,
  18. user=None,
  19. **kwargs,
  20. ) -> None:
  21. self.url = url
  22. self.api_key = api_key
  23. self.file_path = file_path
  24. self.mime_type = mime_type
  25. self.user = user
  26. def load(self) -> List[Document]:
  27. with open(self.file_path, "rb") as f:
  28. data = f.read()
  29. headers = {}
  30. if self.mime_type is not None:
  31. headers["Content-Type"] = self.mime_type
  32. if self.api_key is not None:
  33. headers["Authorization"] = f"Bearer {self.api_key}"
  34. try:
  35. headers["X-Filename"] = quote(os.path.basename(self.file_path))
  36. except:
  37. pass
  38. if self.user is not None:
  39. headers = include_user_info_headers(headers, self.user)
  40. url = self.url
  41. if url.endswith("/"):
  42. url = url[:-1]
  43. try:
  44. response = requests.put(f"{url}/process", data=data, headers=headers)
  45. except Exception as e:
  46. log.error(f"Error connecting to endpoint: {e}")
  47. raise Exception(f"Error connecting to endpoint: {e}")
  48. if response.ok:
  49. response_data = response.json()
  50. if response_data:
  51. if isinstance(response_data, dict):
  52. return [
  53. Document(
  54. page_content=response_data.get("page_content"),
  55. metadata=response_data.get("metadata"),
  56. )
  57. ]
  58. elif isinstance(response_data, list):
  59. documents = []
  60. for document in response_data:
  61. documents.append(
  62. Document(
  63. page_content=document.get("page_content"),
  64. metadata=document.get("metadata"),
  65. )
  66. )
  67. return documents
  68. else:
  69. raise Exception("Error loading document: Unable to parse content")
  70. else:
  71. raise Exception("Error loading document: No content returned")
  72. else:
  73. raise Exception(
  74. f"Error loading document: {response.status_code} {response.text}"
  75. )