Переглянути джерело

Merge pull request #12486 from FabioPolito24/text-file-handling-docling

fix: text file handling with docling
Timothy Jaeryang Baek 4 тижнів тому
батько
коміт
ef787e4a79
1 змінений файл: 15 рядків додано та 11 видалено
  1. 15 11
      backend/open_webui/retrieval/loaders/main.py

+ 15 - 11
backend/open_webui/retrieval/loaders/main.py

@@ -184,13 +184,16 @@ class Loader:
             for doc in docs
         ]
 
+    def _is_text_file(self, file_ext: str, file_content_type: str) -> bool:
+        return file_ext in known_source_ext or (
+            file_content_type and file_content_type.find("text/") >= 0
+        )
+
     def _get_loader(self, filename: str, file_content_type: str, file_path: str):
         file_ext = filename.split(".")[-1].lower()
 
         if self.engine == "tika" and self.kwargs.get("TIKA_SERVER_URL"):
-            if file_ext in known_source_ext or (
-                file_content_type and file_content_type.find("text/") >= 0
-            ):
+            if self._is_text_file(file_ext, file_content_type):
                 loader = TextLoader(file_path, autodetect_encoding=True)
             else:
                 loader = TikaLoader(
@@ -199,11 +202,14 @@ class Loader:
                     mime_type=file_content_type,
                 )
         elif self.engine == "docling" and self.kwargs.get("DOCLING_SERVER_URL"):
-            loader = DoclingLoader(
-                url=self.kwargs.get("DOCLING_SERVER_URL"),
-                file_path=file_path,
-                mime_type=file_content_type,
-            )
+            if self._is_text_file(file_ext, file_content_type):
+                loader = TextLoader(file_path, autodetect_encoding=True)
+            else:
+                loader = DoclingLoader(
+                    url=self.kwargs.get("DOCLING_SERVER_URL"),
+                    file_path=file_path,
+                    mime_type=file_content_type,
+                )
         elif (
             self.engine == "document_intelligence"
             and self.kwargs.get("DOCUMENT_INTELLIGENCE_ENDPOINT") != ""
@@ -269,9 +275,7 @@ class Loader:
                 loader = UnstructuredPowerPointLoader(file_path)
             elif file_ext == "msg":
                 loader = OutlookMessageLoader(file_path)
-            elif file_ext in known_source_ext or (
-                file_content_type and file_content_type.find("text/") >= 0
-            ):
+            elif self._is_text_file(file_ext, file_content_type):
                 loader = TextLoader(file_path, autodetect_encoding=True)
             else:
                 loader = TextLoader(file_path, autodetect_encoding=True)