main.py 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435
  1. import requests
  2. import logging
  3. import ftfy
  4. import sys
  5. import json
  6. from azure.identity import DefaultAzureCredential
  7. from langchain_community.document_loaders import (
  8. AzureAIDocumentIntelligenceLoader,
  9. BSHTMLLoader,
  10. CSVLoader,
  11. Docx2txtLoader,
  12. OutlookMessageLoader,
  13. PyPDFLoader,
  14. TextLoader,
  15. UnstructuredEPubLoader,
  16. UnstructuredExcelLoader,
  17. UnstructuredODTLoader,
  18. UnstructuredPowerPointLoader,
  19. UnstructuredRSTLoader,
  20. UnstructuredXMLLoader,
  21. YoutubeLoader,
  22. )
  23. from langchain_core.documents import Document
  24. from open_webui.retrieval.loaders.external_document import ExternalDocumentLoader
  25. from open_webui.retrieval.loaders.mistral import MistralLoader
  26. from open_webui.retrieval.loaders.datalab_marker import DatalabMarkerLoader
  27. from open_webui.retrieval.loaders.mineru import MinerULoader
  28. from open_webui.env import SRC_LOG_LEVELS, GLOBAL_LOG_LEVEL
  29. logging.basicConfig(stream=sys.stdout, level=GLOBAL_LOG_LEVEL)
  30. log = logging.getLogger(__name__)
  31. log.setLevel(SRC_LOG_LEVELS["RAG"])
  32. known_source_ext = [
  33. "go",
  34. "py",
  35. "java",
  36. "sh",
  37. "bat",
  38. "ps1",
  39. "cmd",
  40. "js",
  41. "ts",
  42. "css",
  43. "cpp",
  44. "hpp",
  45. "h",
  46. "c",
  47. "cs",
  48. "sql",
  49. "log",
  50. "ini",
  51. "pl",
  52. "pm",
  53. "r",
  54. "dart",
  55. "dockerfile",
  56. "env",
  57. "php",
  58. "hs",
  59. "hsc",
  60. "lua",
  61. "nginxconf",
  62. "conf",
  63. "m",
  64. "mm",
  65. "plsql",
  66. "perl",
  67. "rb",
  68. "rs",
  69. "db2",
  70. "scala",
  71. "bash",
  72. "swift",
  73. "vue",
  74. "svelte",
  75. "ex",
  76. "exs",
  77. "erl",
  78. "tsx",
  79. "jsx",
  80. "hs",
  81. "lhs",
  82. "json",
  83. ]
  84. class TikaLoader:
  85. def __init__(self, url, file_path, mime_type=None, extract_images=None):
  86. self.url = url
  87. self.file_path = file_path
  88. self.mime_type = mime_type
  89. self.extract_images = extract_images
  90. def load(self) -> list[Document]:
  91. with open(self.file_path, "rb") as f:
  92. data = f.read()
  93. if self.mime_type is not None:
  94. headers = {"Content-Type": self.mime_type}
  95. else:
  96. headers = {}
  97. if self.extract_images == True:
  98. headers["X-Tika-PDFextractInlineImages"] = "true"
  99. endpoint = self.url
  100. if not endpoint.endswith("/"):
  101. endpoint += "/"
  102. endpoint += "tika/text"
  103. r = requests.put(endpoint, data=data, headers=headers)
  104. if r.ok:
  105. raw_metadata = r.json()
  106. text = raw_metadata.get("X-TIKA:content", "<No text content found>").strip()
  107. if "Content-Type" in raw_metadata:
  108. headers["Content-Type"] = raw_metadata["Content-Type"]
  109. log.debug("Tika extracted text: %s", text)
  110. return [Document(page_content=text, metadata=headers)]
  111. else:
  112. raise Exception(f"Error calling Tika: {r.reason}")
  113. class DoclingLoader:
  114. def __init__(self, url, file_path=None, mime_type=None, params=None):
  115. self.url = url.rstrip("/")
  116. self.file_path = file_path
  117. self.mime_type = mime_type
  118. self.params = params or {}
  119. def load(self) -> list[Document]:
  120. with open(self.file_path, "rb") as f:
  121. files = {
  122. "files": (
  123. self.file_path,
  124. f,
  125. self.mime_type or "application/octet-stream",
  126. )
  127. }
  128. params = {"image_export_mode": "placeholder"}
  129. if self.params:
  130. if self.params.get("do_picture_description"):
  131. params["do_picture_description"] = self.params.get(
  132. "do_picture_description"
  133. )
  134. picture_description_mode = self.params.get(
  135. "picture_description_mode", ""
  136. ).lower()
  137. if picture_description_mode == "local" and self.params.get(
  138. "picture_description_local", {}
  139. ):
  140. params["picture_description_local"] = json.dumps(
  141. self.params.get("picture_description_local", {})
  142. )
  143. elif picture_description_mode == "api" and self.params.get(
  144. "picture_description_api", {}
  145. ):
  146. params["picture_description_api"] = json.dumps(
  147. self.params.get("picture_description_api", {})
  148. )
  149. params["do_ocr"] = self.params.get("do_ocr")
  150. params["force_ocr"] = self.params.get("force_ocr")
  151. if (
  152. self.params.get("do_ocr")
  153. and self.params.get("ocr_engine")
  154. and self.params.get("ocr_lang")
  155. ):
  156. params["ocr_engine"] = self.params.get("ocr_engine")
  157. params["ocr_lang"] = [
  158. lang.strip()
  159. for lang in self.params.get("ocr_lang").split(",")
  160. if lang.strip()
  161. ]
  162. if self.params.get("pdf_backend"):
  163. params["pdf_backend"] = self.params.get("pdf_backend")
  164. if self.params.get("table_mode"):
  165. params["table_mode"] = self.params.get("table_mode")
  166. if self.params.get("pipeline"):
  167. params["pipeline"] = self.params.get("pipeline")
  168. endpoint = f"{self.url}/v1/convert/file"
  169. r = requests.post(endpoint, files=files, data=params)
  170. if r.ok:
  171. result = r.json()
  172. document_data = result.get("document", {})
  173. text = document_data.get("md_content", "<No text content found>")
  174. metadata = {"Content-Type": self.mime_type} if self.mime_type else {}
  175. log.debug("Docling extracted text: %s", text)
  176. return [Document(page_content=text, metadata=metadata)]
  177. else:
  178. error_msg = f"Error calling Docling API: {r.reason}"
  179. if r.text:
  180. try:
  181. error_data = r.json()
  182. if "detail" in error_data:
  183. error_msg += f" - {error_data['detail']}"
  184. except Exception:
  185. error_msg += f" - {r.text}"
  186. raise Exception(f"Error calling Docling: {error_msg}")
  187. class Loader:
  188. def __init__(self, engine: str = "", **kwargs):
  189. self.engine = engine
  190. self.user = kwargs.get("user", None)
  191. self.kwargs = kwargs
  192. def load(
  193. self, filename: str, file_content_type: str, file_path: str
  194. ) -> list[Document]:
  195. loader = self._get_loader(filename, file_content_type, file_path)
  196. docs = loader.load()
  197. return [
  198. Document(
  199. page_content=ftfy.fix_text(doc.page_content), metadata=doc.metadata
  200. )
  201. for doc in docs
  202. ]
  203. def _is_text_file(self, file_ext: str, file_content_type: str) -> bool:
  204. return file_ext in known_source_ext or (
  205. file_content_type
  206. and file_content_type.find("text/") >= 0
  207. # Avoid text/html files being detected as text
  208. and not file_content_type.find("html") >= 0
  209. )
  210. def _get_loader(self, filename: str, file_content_type: str, file_path: str):
  211. file_ext = filename.split(".")[-1].lower()
  212. if (
  213. self.engine == "external"
  214. and self.kwargs.get("EXTERNAL_DOCUMENT_LOADER_URL")
  215. and self.kwargs.get("EXTERNAL_DOCUMENT_LOADER_API_KEY")
  216. ):
  217. loader = ExternalDocumentLoader(
  218. file_path=file_path,
  219. url=self.kwargs.get("EXTERNAL_DOCUMENT_LOADER_URL"),
  220. api_key=self.kwargs.get("EXTERNAL_DOCUMENT_LOADER_API_KEY"),
  221. mime_type=file_content_type,
  222. user=self.user,
  223. )
  224. elif self.engine == "tika" and self.kwargs.get("TIKA_SERVER_URL"):
  225. if self._is_text_file(file_ext, file_content_type):
  226. loader = TextLoader(file_path, autodetect_encoding=True)
  227. else:
  228. loader = TikaLoader(
  229. url=self.kwargs.get("TIKA_SERVER_URL"),
  230. file_path=file_path,
  231. extract_images=self.kwargs.get("PDF_EXTRACT_IMAGES"),
  232. )
  233. elif (
  234. self.engine == "datalab_marker"
  235. and self.kwargs.get("DATALAB_MARKER_API_KEY")
  236. and file_ext
  237. in [
  238. "pdf",
  239. "xls",
  240. "xlsx",
  241. "ods",
  242. "doc",
  243. "docx",
  244. "odt",
  245. "ppt",
  246. "pptx",
  247. "odp",
  248. "html",
  249. "epub",
  250. "png",
  251. "jpeg",
  252. "jpg",
  253. "webp",
  254. "gif",
  255. "tiff",
  256. ]
  257. ):
  258. api_base_url = self.kwargs.get("DATALAB_MARKER_API_BASE_URL", "")
  259. if not api_base_url or api_base_url.strip() == "":
  260. api_base_url = "https://www.datalab.to/api/v1/marker" # https://github.com/open-webui/open-webui/pull/16867#issuecomment-3218424349
  261. loader = DatalabMarkerLoader(
  262. file_path=file_path,
  263. api_key=self.kwargs["DATALAB_MARKER_API_KEY"],
  264. api_base_url=api_base_url,
  265. additional_config=self.kwargs.get("DATALAB_MARKER_ADDITIONAL_CONFIG"),
  266. use_llm=self.kwargs.get("DATALAB_MARKER_USE_LLM", False),
  267. skip_cache=self.kwargs.get("DATALAB_MARKER_SKIP_CACHE", False),
  268. force_ocr=self.kwargs.get("DATALAB_MARKER_FORCE_OCR", False),
  269. paginate=self.kwargs.get("DATALAB_MARKER_PAGINATE", False),
  270. strip_existing_ocr=self.kwargs.get(
  271. "DATALAB_MARKER_STRIP_EXISTING_OCR", False
  272. ),
  273. disable_image_extraction=self.kwargs.get(
  274. "DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION", False
  275. ),
  276. format_lines=self.kwargs.get("DATALAB_MARKER_FORMAT_LINES", False),
  277. output_format=self.kwargs.get(
  278. "DATALAB_MARKER_OUTPUT_FORMAT", "markdown"
  279. ),
  280. )
  281. elif self.engine == "docling" and self.kwargs.get("DOCLING_SERVER_URL"):
  282. if self._is_text_file(file_ext, file_content_type):
  283. loader = TextLoader(file_path, autodetect_encoding=True)
  284. else:
  285. # Build params for DoclingLoader
  286. params = self.kwargs.get("DOCLING_PARAMS", {})
  287. if not isinstance(params, dict):
  288. try:
  289. params = json.loads(params)
  290. except json.JSONDecodeError:
  291. log.error("Invalid DOCLING_PARAMS format, expected JSON object")
  292. params = {}
  293. loader = DoclingLoader(
  294. url=self.kwargs.get("DOCLING_SERVER_URL"),
  295. file_path=file_path,
  296. mime_type=file_content_type,
  297. params=params,
  298. )
  299. elif (
  300. self.engine == "document_intelligence"
  301. and self.kwargs.get("DOCUMENT_INTELLIGENCE_ENDPOINT") != ""
  302. and (
  303. file_ext in ["pdf", "docx", "ppt", "pptx"]
  304. or file_content_type
  305. in [
  306. "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
  307. "application/vnd.ms-powerpoint",
  308. "application/vnd.openxmlformats-officedocument.presentationml.presentation",
  309. ]
  310. )
  311. ):
  312. if self.kwargs.get("DOCUMENT_INTELLIGENCE_KEY") != "":
  313. loader = AzureAIDocumentIntelligenceLoader(
  314. file_path=file_path,
  315. api_endpoint=self.kwargs.get("DOCUMENT_INTELLIGENCE_ENDPOINT"),
  316. api_key=self.kwargs.get("DOCUMENT_INTELLIGENCE_KEY"),
  317. )
  318. else:
  319. loader = AzureAIDocumentIntelligenceLoader(
  320. file_path=file_path,
  321. api_endpoint=self.kwargs.get("DOCUMENT_INTELLIGENCE_ENDPOINT"),
  322. azure_credential=DefaultAzureCredential(),
  323. )
  324. elif self.engine == "mineru" and file_ext in [
  325. "pdf"
  326. ]: # MinerU currently only supports PDF
  327. loader = MinerULoader(
  328. file_path=file_path,
  329. api_mode=self.kwargs.get("MINERU_API_MODE", "local"),
  330. api_url=self.kwargs.get("MINERU_API_URL", "http://localhost:8000"),
  331. api_key=self.kwargs.get("MINERU_API_KEY", ""),
  332. params=self.kwargs.get("MINERU_PARAMS", {}),
  333. )
  334. elif (
  335. self.engine == "mistral_ocr"
  336. and self.kwargs.get("MISTRAL_OCR_API_KEY") != ""
  337. and file_ext
  338. in ["pdf"] # Mistral OCR currently only supports PDF and images
  339. ):
  340. loader = MistralLoader(
  341. base_url=self.kwargs.get("MISTRAL_OCR_API_BASE_URL"),
  342. api_key=self.kwargs.get("MISTRAL_OCR_API_KEY"),
  343. file_path=file_path,
  344. )
  345. else:
  346. if file_ext == "pdf":
  347. loader = PyPDFLoader(
  348. file_path, extract_images=self.kwargs.get("PDF_EXTRACT_IMAGES")
  349. )
  350. elif file_ext == "csv":
  351. loader = CSVLoader(file_path, autodetect_encoding=True)
  352. elif file_ext == "rst":
  353. loader = UnstructuredRSTLoader(file_path, mode="elements")
  354. elif file_ext == "xml":
  355. loader = UnstructuredXMLLoader(file_path)
  356. elif file_ext in ["htm", "html"]:
  357. loader = BSHTMLLoader(file_path, open_encoding="unicode_escape")
  358. elif file_ext == "md":
  359. loader = TextLoader(file_path, autodetect_encoding=True)
  360. elif file_content_type == "application/epub+zip":
  361. loader = UnstructuredEPubLoader(file_path)
  362. elif (
  363. file_content_type
  364. == "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
  365. or file_ext == "docx"
  366. ):
  367. loader = Docx2txtLoader(file_path)
  368. elif file_content_type in [
  369. "application/vnd.ms-excel",
  370. "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
  371. ] or file_ext in ["xls", "xlsx"]:
  372. loader = UnstructuredExcelLoader(file_path)
  373. elif file_content_type in [
  374. "application/vnd.ms-powerpoint",
  375. "application/vnd.openxmlformats-officedocument.presentationml.presentation",
  376. ] or file_ext in ["ppt", "pptx"]:
  377. loader = UnstructuredPowerPointLoader(file_path)
  378. elif file_ext == "msg":
  379. loader = OutlookMessageLoader(file_path)
  380. elif file_ext == "odt":
  381. loader = UnstructuredODTLoader(file_path)
  382. elif self._is_text_file(file_ext, file_content_type):
  383. loader = TextLoader(file_path, autodetect_encoding=True)
  384. else:
  385. loader = TextLoader(file_path, autodetect_encoding=True)
  386. return loader