external_document.py 2.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384
  1. import requests
  2. import logging, os
  3. from typing import Iterator, List, Union
  4. from urllib.parse import quote
  5. from langchain_core.document_loaders import BaseLoader
  6. from langchain_core.documents import Document
  7. from open_webui.env import SRC_LOG_LEVELS
  8. log = logging.getLogger(__name__)
  9. log.setLevel(SRC_LOG_LEVELS["RAG"])
  10. class ExternalDocumentLoader(BaseLoader):
  11. def __init__(
  12. self,
  13. file_path,
  14. url: str,
  15. api_key: str,
  16. mime_type=None,
  17. **kwargs,
  18. ) -> None:
  19. self.url = url
  20. self.api_key = api_key
  21. self.file_path = file_path
  22. self.mime_type = mime_type
  23. def load(self) -> List[Document]:
  24. with open(self.file_path, "rb") as f:
  25. data = f.read()
  26. headers = {}
  27. if self.mime_type is not None:
  28. headers["Content-Type"] = self.mime_type
  29. if self.api_key is not None:
  30. headers["Authorization"] = f"Bearer {self.api_key}"
  31. try:
  32. headers["X-Filename"] = quote(os.path.basename(self.file_path))
  33. except:
  34. pass
  35. url = self.url
  36. if url.endswith("/"):
  37. url = url[:-1]
  38. try:
  39. response = requests.put(f"{url}/process", data=data, headers=headers)
  40. except Exception as e:
  41. log.error(f"Error connecting to endpoint: {e}")
  42. raise Exception(f"Error connecting to endpoint: {e}")
  43. if response.ok:
  44. response_data = response.json()
  45. if response_data:
  46. if isinstance(response_data, dict):
  47. return [
  48. Document(
  49. page_content=response_data.get("page_content"),
  50. metadata=response_data.get("metadata"),
  51. )
  52. ]
  53. elif isinstance(response_data, list):
  54. documents = []
  55. for document in response_data:
  56. documents.append(
  57. Document(
  58. page_content=document.get("page_content"),
  59. metadata=document.get("metadata"),
  60. )
  61. )
  62. return documents
  63. else:
  64. raise Exception("Error loading document: Unable to parse content")
  65. else:
  66. raise Exception("Error loading document: No content returned")
  67. else:
  68. raise Exception(
  69. f"Error loading document: {response.status_code} {response.text}"
  70. )