Quellcode durchsuchen

fix: tikaloader extract images

Timothy Jaeryang Baek vor 5 Monaten
Ursprung
Commit
27da31dc83
1 geänderte Dateien mit 5 neuen und 2 gelöschten Zeilen
  1. 5 2
      backend/open_webui/retrieval/loaders/main.py

+ 5 - 2
backend/open_webui/retrieval/loaders/main.py

@@ -85,11 +85,13 @@ known_source_ext = [
 
 
 class TikaLoader:
-    def __init__(self, url, file_path, mime_type=None):
+    def __init__(self, url, file_path, mime_type=None, extract_images=None):
         self.url = url
         self.file_path = file_path
         self.mime_type = mime_type
 
+        self.extract_images = extract_images
+
     def load(self) -> list[Document]:
         with open(self.file_path, "rb") as f:
             data = f.read()
@@ -99,7 +101,7 @@ class TikaLoader:
         else:
             headers = {}
 
-        if self.kwargs.get("PDF_EXTRACT_IMAGES") == True:
+        if self.extract_images == True:
             headers["X-Tika-PDFextractInlineImages"] = "true"
 
         endpoint = self.url
@@ -213,6 +215,7 @@ class Loader:
                     url=self.kwargs.get("TIKA_SERVER_URL"),
                     file_path=file_path,
                     mime_type=file_content_type,
+                    extract_images=self.kwargs.get("PDF_EXTRACT_IMAGES"),
                 )
         elif self.engine == "docling" and self.kwargs.get("DOCLING_SERVER_URL"):
             if self._is_text_file(file_ext, file_content_type):