Browse Source

feat: update marker api

Hisma 2 tháng trước
mục cha
commit
f31cc07a9d

+ 10 - 4
backend/open_webui/config.py

@@ -2018,10 +2018,16 @@ DATALAB_MARKER_API_KEY = PersistentConfig(
     os.environ.get("DATALAB_MARKER_API_KEY", ""),
 )
 
-DATALAB_MARKER_LANGS = PersistentConfig(
-    "DATALAB_MARKER_LANGS",
-    "rag.datalab_marker_langs",
-    os.environ.get("DATALAB_MARKER_LANGS", ""),
+DATALAB_MARKER_API_BASE_URL = PersistentConfig(
+    "DATALAB_MARKER_API_BASE_URL",
+    "rag.datalab_marker_api_base_url",
+    os.environ.get("DATALAB_MARKER_API_BASE_URL", ""),
+)
+
+DATALAB_MARKER_ADDITIONAL_CONFIG = PersistentConfig(
+    "DATALAB_MARKER_ADDITIONAL_CONFIG",
+    "rag.datalab_marker_additional_config",
+    os.environ.get("DATALAB_MARKER_ADDITIONAL_CONFIG", ""),
 )
 
 DATALAB_MARKER_USE_LLM = PersistentConfig(

+ 4 - 2
backend/open_webui/main.py

@@ -227,7 +227,8 @@ from open_webui.config import (
     CHUNK_SIZE,
     CONTENT_EXTRACTION_ENGINE,
     DATALAB_MARKER_API_KEY,
-    DATALAB_MARKER_LANGS,
+    DATALAB_MARKER_API_BASE_URL,
+    DATALAB_MARKER_ADDITIONAL_CONFIG,
     DATALAB_MARKER_SKIP_CACHE,
     DATALAB_MARKER_FORCE_OCR,
     DATALAB_MARKER_PAGINATE,
@@ -767,7 +768,8 @@ app.state.config.ENABLE_WEB_LOADER_SSL_VERIFICATION = ENABLE_WEB_LOADER_SSL_VERI
 
 app.state.config.CONTENT_EXTRACTION_ENGINE = CONTENT_EXTRACTION_ENGINE
 app.state.config.DATALAB_MARKER_API_KEY = DATALAB_MARKER_API_KEY
-app.state.config.DATALAB_MARKER_LANGS = DATALAB_MARKER_LANGS
+app.state.config.DATALAB_MARKER_API_BASE_URL = DATALAB_MARKER_API_BASE_URL
+app.state.config.DATALAB_MARKER_ADDITIONAL_CONFIG = DATALAB_MARKER_ADDITIONAL_CONFIG
 app.state.config.DATALAB_MARKER_SKIP_CACHE = DATALAB_MARKER_SKIP_CACHE
 app.state.config.DATALAB_MARKER_FORCE_OCR = DATALAB_MARKER_FORCE_OCR
 app.state.config.DATALAB_MARKER_PAGINATE = DATALAB_MARKER_PAGINATE

+ 84 - 67
backend/open_webui/retrieval/loaders/datalab_marker.py

@@ -15,7 +15,8 @@ class DatalabMarkerLoader:
         self,
         file_path: str,
         api_key: str,
-        langs: Optional[str] = None,
+        api_base_url: str,
+        additional_config: Optional[str] = None,
         use_llm: bool = False,
         skip_cache: bool = False,
         force_ocr: bool = False,
@@ -26,7 +27,8 @@ class DatalabMarkerLoader:
     ):
         self.file_path = file_path
         self.api_key = api_key
-        self.langs = langs
+        self.api_base_url = api_base_url
+        self.additional_config = additional_config
         self.use_llm = use_llm
         self.skip_cache = skip_cache
         self.force_ocr = force_ocr
@@ -60,7 +62,7 @@ class DatalabMarkerLoader:
         return mime_map.get(ext, "application/octet-stream")
 
     def check_marker_request_status(self, request_id: str) -> dict:
-        url = f"https://www.datalab.to/api/v1/marker/{request_id}"
+        url = f"{self.api_base_url}/{request_id}"
         headers = {"X-Api-Key": self.api_key}
         try:
             response = requests.get(url, headers=headers)
@@ -81,13 +83,12 @@ class DatalabMarkerLoader:
             )
 
     def load(self) -> List[Document]:
-        url = "https://www.datalab.to/api/v1/marker"
+        url = self.api_base_url
         filename = os.path.basename(self.file_path)
         mime_type = self._get_mime_type(filename)
         headers = {"X-Api-Key": self.api_key}
 
         form_data = {
-            "langs": self.langs,
             "use_llm": str(self.use_llm).lower(),
             "skip_cache": str(self.skip_cache).lower(),
             "force_ocr": str(self.force_ocr).lower(),
@@ -97,6 +98,9 @@ class DatalabMarkerLoader:
             "output_format": self.output_format,
         }
 
+        if self.additional_config and self.additional_config.strip():
+            form_data["additional_config"] = self.additional_config
+
         log.info(
             f"Datalab Marker POST request parameters: {{'filename': '{filename}', 'mime_type': '{mime_type}', **{form_data}}}"
         )
@@ -133,74 +137,87 @@ class DatalabMarkerLoader:
 
         check_url = result.get("request_check_url")
         request_id = result.get("request_id")
-        if not check_url:
-            raise HTTPException(
-                status.HTTP_502_BAD_GATEWAY, detail="No request_check_url returned."
-            )
+        
+        # Check if this is a direct response (self-hosted) or polling response (DataLab)
+        if check_url:
+            # DataLab polling pattern
+            for _ in range(300):  # 300 polls with a 2 s sleep each: up to ~10 minutes of waiting
+                time.sleep(2)
+                try:
+                    poll_response = requests.get(check_url, headers=headers)
+                    poll_response.raise_for_status()
+                    poll_result = poll_response.json()
+                except (requests.HTTPError, ValueError) as e:
+                    raw_body = poll_response.text
+                    log.error(f"Polling error: {e}, response body: {raw_body}")
+                    raise HTTPException(
+                        status.HTTP_502_BAD_GATEWAY, detail=f"Polling failed: {e}"
+                    )
 
-        for _ in range(300):  # Up to 10 minutes
-            time.sleep(2)
-            try:
-                poll_response = requests.get(check_url, headers=headers)
-                poll_response.raise_for_status()
-                poll_result = poll_response.json()
-            except (requests.HTTPError, ValueError) as e:
-                raw_body = poll_response.text
-                log.error(f"Polling error: {e}, response body: {raw_body}")
-                raise HTTPException(
-                    status.HTTP_502_BAD_GATEWAY, detail=f"Polling failed: {e}"
-                )
+                status_val = poll_result.get("status")
+                success_val = poll_result.get("success")
 
-            status_val = poll_result.get("status")
-            success_val = poll_result.get("success")
-
-            if status_val == "complete":
-                summary = {
-                    k: poll_result.get(k)
-                    for k in (
-                        "status",
-                        "output_format",
-                        "success",
-                        "error",
-                        "page_count",
-                        "total_cost",
+                if status_val == "complete":
+                    summary = {
+                        k: poll_result.get(k)
+                        for k in (
+                            "status",
+                            "output_format",
+                            "success",
+                            "error",
+                            "page_count",
+                            "total_cost",
+                        )
+                    }
+                    log.info(
+                        f"Marker processing completed successfully: {json.dumps(summary, indent=2)}"
                     )
-                }
-                log.info(
-                    f"Marker processing completed successfully: {json.dumps(summary, indent=2)}"
-                )
-                break
+                    break
 
-            if status_val == "failed" or success_val is False:
-                log.error(
-                    f"Marker poll failed full response: {json.dumps(poll_result, indent=2)}"
-                )
-                error_msg = (
-                    poll_result.get("error")
-                    or "Marker returned failure without error message"
+                if status_val == "failed" or success_val is False:
+                    log.error(
+                        f"Marker poll failed full response: {json.dumps(poll_result, indent=2)}"
+                    )
+                    error_msg = (
+                        poll_result.get("error")
+                        or "Marker returned failure without error message"
+                    )
+                    raise HTTPException(
+                        status.HTTP_400_BAD_REQUEST,
+                        detail=f"Marker processing failed: {error_msg}",
+                    )
+            else:
+                raise HTTPException(
+                    status.HTTP_504_GATEWAY_TIMEOUT, detail="Marker processing timed out"
                 )
+
+            if not poll_result.get("success", False):
+                error_msg = poll_result.get("error") or "Unknown processing error"
                 raise HTTPException(
                     status.HTTP_400_BAD_REQUEST,
-                    detail=f"Marker processing failed: {error_msg}",
+                    detail=f"Final processing failed: {error_msg}",
                 )
-        else:
-            raise HTTPException(
-                status.HTTP_504_GATEWAY_TIMEOUT, detail="Marker processing timed out"
-            )
 
-        if not poll_result.get("success", False):
-            error_msg = poll_result.get("error") or "Unknown processing error"
-            raise HTTPException(
-                status.HTTP_400_BAD_REQUEST,
-                detail=f"Final processing failed: {error_msg}",
-            )
-
-        content_key = self.output_format.lower()
-        raw_content = poll_result.get(content_key)
+            # DataLab format - content in format-specific fields
+            content_key = self.output_format.lower()
+            raw_content = poll_result.get(content_key)
+            final_result = poll_result
+        else:
+            # Self-hosted direct response - content in "output" field
+            if "output" in result:
+                log.info("Self-hosted Marker returned direct response without polling")
+                raw_content = result.get("output")
+                final_result = result
+            else:
+                available_fields = list(result.keys()) if isinstance(result, dict) else "non-dict response"
+                raise HTTPException(
+                    status.HTTP_502_BAD_GATEWAY, 
+                    detail=f"Custom Marker endpoint returned success but no 'output' field found. Available fields: {available_fields}. Expected either 'request_check_url' for polling or 'output' field for direct response."
+                )
 
-        if content_key == "json":
+        if self.output_format.lower() == "json":
             full_text = json.dumps(raw_content, indent=2)
-        elif content_key in {"markdown", "html"}:
+        elif self.output_format.lower() in {"markdown", "html"}:
             full_text = str(raw_content).strip()
         else:
             raise HTTPException(
@@ -211,14 +228,14 @@ class DatalabMarkerLoader:
         if not full_text:
             raise HTTPException(
                 status.HTTP_400_BAD_REQUEST,
-                detail="Datalab Marker returned empty content",
+                detail="Marker returned empty content",
             )
 
         marker_output_dir = os.path.join("/app/backend/data/uploads", "marker_output")
         os.makedirs(marker_output_dir, exist_ok=True)
 
         file_ext_map = {"markdown": "md", "json": "json", "html": "html"}
-        file_ext = file_ext_map.get(content_key, "txt")
+        file_ext = file_ext_map.get(self.output_format.lower(), "txt")
         output_filename = f"{os.path.splitext(filename)[0]}.{file_ext}"
         output_path = os.path.join(marker_output_dir, output_filename)
 
@@ -231,13 +248,13 @@ class DatalabMarkerLoader:
 
         metadata = {
             "source": filename,
-            "output_format": poll_result.get("output_format", self.output_format),
-            "page_count": poll_result.get("page_count", 0),
+            "output_format": final_result.get("output_format", self.output_format),
+            "page_count": final_result.get("page_count", 0),
             "processed_with_llm": self.use_llm,
             "request_id": request_id or "",
         }
 
-        images = poll_result.get("images", {})
+        images = final_result.get("images", {})
         if images:
             metadata["image_count"] = len(images)
             metadata["images"] = json.dumps(list(images.keys()))

+ 6 - 1
backend/open_webui/retrieval/loaders/main.py

@@ -281,10 +281,15 @@ class Loader:
                 "tiff",
             ]
         ):
+            api_base_url = self.kwargs.get("DATALAB_MARKER_API_BASE_URL", "")
+            if not api_base_url or api_base_url.strip() == "":
+                api_base_url = "https://www.datalab.to/api/v1/marker"
+
             loader = DatalabMarkerLoader(
                 file_path=file_path,
                 api_key=self.kwargs["DATALAB_MARKER_API_KEY"],
-                langs=self.kwargs.get("DATALAB_MARKER_LANGS"),
+                api_base_url=api_base_url,
+                additional_config=self.kwargs.get("DATALAB_MARKER_ADDITIONAL_CONFIG"),
                 use_llm=self.kwargs.get("DATALAB_MARKER_USE_LLM", False),
                 skip_cache=self.kwargs.get("DATALAB_MARKER_SKIP_CACHE", False),
                 force_ocr=self.kwargs.get("DATALAB_MARKER_FORCE_OCR", False),

+ 17 - 8
backend/open_webui/routers/retrieval.py

@@ -401,7 +401,8 @@ async def get_rag_config(request: Request, user=Depends(get_admin_user)):
         "CONTENT_EXTRACTION_ENGINE": request.app.state.config.CONTENT_EXTRACTION_ENGINE,
         "PDF_EXTRACT_IMAGES": request.app.state.config.PDF_EXTRACT_IMAGES,
         "DATALAB_MARKER_API_KEY": request.app.state.config.DATALAB_MARKER_API_KEY,
-        "DATALAB_MARKER_LANGS": request.app.state.config.DATALAB_MARKER_LANGS,
+        "DATALAB_MARKER_API_BASE_URL": request.app.state.config.DATALAB_MARKER_API_BASE_URL,
+        "DATALAB_MARKER_ADDITIONAL_CONFIG": request.app.state.config.DATALAB_MARKER_ADDITIONAL_CONFIG,
         "DATALAB_MARKER_SKIP_CACHE": request.app.state.config.DATALAB_MARKER_SKIP_CACHE,
         "DATALAB_MARKER_FORCE_OCR": request.app.state.config.DATALAB_MARKER_FORCE_OCR,
         "DATALAB_MARKER_PAGINATE": request.app.state.config.DATALAB_MARKER_PAGINATE,
@@ -566,7 +567,8 @@ class ConfigForm(BaseModel):
     CONTENT_EXTRACTION_ENGINE: Optional[str] = None
     PDF_EXTRACT_IMAGES: Optional[bool] = None
     DATALAB_MARKER_API_KEY: Optional[str] = None
-    DATALAB_MARKER_LANGS: Optional[str] = None
+    DATALAB_MARKER_API_BASE_URL: Optional[str] = None
+    DATALAB_MARKER_ADDITIONAL_CONFIG: Optional[str] = None
     DATALAB_MARKER_SKIP_CACHE: Optional[bool] = None
     DATALAB_MARKER_FORCE_OCR: Optional[bool] = None
     DATALAB_MARKER_PAGINATE: Optional[bool] = None
@@ -683,10 +685,15 @@ async def update_rag_config(
         if form_data.DATALAB_MARKER_API_KEY is not None
         else request.app.state.config.DATALAB_MARKER_API_KEY
     )
-    request.app.state.config.DATALAB_MARKER_LANGS = (
-        form_data.DATALAB_MARKER_LANGS
-        if form_data.DATALAB_MARKER_LANGS is not None
-        else request.app.state.config.DATALAB_MARKER_LANGS
+    request.app.state.config.DATALAB_MARKER_API_BASE_URL = (
+        form_data.DATALAB_MARKER_API_BASE_URL
+        if form_data.DATALAB_MARKER_API_BASE_URL is not None
+        else request.app.state.config.DATALAB_MARKER_API_BASE_URL
+    )
+    request.app.state.config.DATALAB_MARKER_ADDITIONAL_CONFIG = (
+        form_data.DATALAB_MARKER_ADDITIONAL_CONFIG
+        if form_data.DATALAB_MARKER_ADDITIONAL_CONFIG is not None
+        else request.app.state.config.DATALAB_MARKER_ADDITIONAL_CONFIG
     )
     request.app.state.config.DATALAB_MARKER_SKIP_CACHE = (
         form_data.DATALAB_MARKER_SKIP_CACHE
@@ -1006,7 +1013,8 @@ async def update_rag_config(
         "CONTENT_EXTRACTION_ENGINE": request.app.state.config.CONTENT_EXTRACTION_ENGINE,
         "PDF_EXTRACT_IMAGES": request.app.state.config.PDF_EXTRACT_IMAGES,
         "DATALAB_MARKER_API_KEY": request.app.state.config.DATALAB_MARKER_API_KEY,
-        "DATALAB_MARKER_LANGS": request.app.state.config.DATALAB_MARKER_LANGS,
+        "DATALAB_MARKER_API_BASE_URL": request.app.state.config.DATALAB_MARKER_API_BASE_URL,
+        "DATALAB_MARKER_ADDITIONAL_CONFIG": request.app.state.config.DATALAB_MARKER_ADDITIONAL_CONFIG,
         "DATALAB_MARKER_SKIP_CACHE": request.app.state.config.DATALAB_MARKER_SKIP_CACHE,
         "DATALAB_MARKER_FORCE_OCR": request.app.state.config.DATALAB_MARKER_FORCE_OCR,
         "DATALAB_MARKER_PAGINATE": request.app.state.config.DATALAB_MARKER_PAGINATE,
@@ -1406,7 +1414,8 @@ def process_file(
                 loader = Loader(
                     engine=request.app.state.config.CONTENT_EXTRACTION_ENGINE,
                     DATALAB_MARKER_API_KEY=request.app.state.config.DATALAB_MARKER_API_KEY,
-                    DATALAB_MARKER_LANGS=request.app.state.config.DATALAB_MARKER_LANGS,
+                    DATALAB_MARKER_API_BASE_URL=request.app.state.config.DATALAB_MARKER_API_BASE_URL,
+                    DATALAB_MARKER_ADDITIONAL_CONFIG=request.app.state.config.DATALAB_MARKER_ADDITIONAL_CONFIG,
                     DATALAB_MARKER_SKIP_CACHE=request.app.state.config.DATALAB_MARKER_SKIP_CACHE,
                     DATALAB_MARKER_FORCE_OCR=request.app.state.config.DATALAB_MARKER_FORCE_OCR,
                     DATALAB_MARKER_PAGINATE=request.app.state.config.DATALAB_MARKER_PAGINATE,

+ 51 - 11
src/lib/components/admin/Settings/Documents.svelte

@@ -170,6 +170,19 @@
 			return;
 		}
 
+		if (
+			RAGConfig.CONTENT_EXTRACTION_ENGINE === 'datalab_marker' &&
+			RAGConfig.DATALAB_MARKER_ADDITIONAL_CONFIG &&
+			RAGConfig.DATALAB_MARKER_ADDITIONAL_CONFIG.trim() !== ''
+		) {
+			try {
+				JSON.parse(RAGConfig.DATALAB_MARKER_ADDITIONAL_CONFIG);
+			} catch (e) {
+				toast.error($i18n.t('Invalid JSON format in Additional Config'));
+				return;
+			}
+		}
+
 		if (
 			RAGConfig.CONTENT_EXTRACTION_ENGINE === 'document_intelligence' &&
 			(RAGConfig.DOCUMENT_INTELLIGENCE_ENDPOINT === '' ||
@@ -243,6 +256,11 @@
 			2
 		);
 
+		// Set default API Base URL if empty
+		if (!config.DATALAB_MARKER_API_BASE_URL) {
+			config.DATALAB_MARKER_API_BASE_URL = 'https://www.datalab.to/api/v1/marker';
+		}
+
 		RAGConfig = config;
 	});
 </script>
@@ -337,6 +355,19 @@
 							</div>
 						{:else if RAGConfig.CONTENT_EXTRACTION_ENGINE === 'datalab_marker'}
 							<div class="my-0.5 flex gap-2 pr-2">
+								<Tooltip
+									content={$i18n.t(
+										'API Base URL for Datalab Marker service. Defaults to: https://www.datalab.to/api/v1/marker'
+									)}
+									placement="top-start"
+									className="w-full"
+								>
+									<input
+										class="flex-1 w-full text-sm bg-transparent outline-hidden"
+										placeholder={$i18n.t('Enter Datalab Marker API Base URL')}
+										bind:value={RAGConfig.DATALAB_MARKER_API_BASE_URL}
+									/>
+								</Tooltip>
 								<SensitiveInput
 									placeholder={$i18n.t('Enter Datalab Marker API Key')}
 									required={false}
@@ -344,24 +375,33 @@
 								/>
 							</div>
 
-							<div class="flex justify-between w-full mt-2">
-								<div class="text-xs font-medium">
-									{$i18n.t('Languages')}
+							<div class="flex flex-col gap-2 mt-2">
+								<div class=" flex flex-col w-full justify-between">
+									<div class=" mb-1 text-xs font-medium">
+										{$i18n.t('Additional Config')}
+									</div>
+									<div class="flex w-full items-center relative">
+										<Tooltip
+											content={$i18n.t(
+												'Additional configuration options for marker. This should be a JSON string with key-value pairs. For example, \'{"key": "value"}\'. Supported keys include: disable_links, keep_pageheader_in_output, keep_pagefooter_in_output, filter_blank_pages, drop_repeated_text, layout_coverage_threshold, merge_threshold, height_tolerance, gap_threshold, image_threshold, min_line_length, level_count, default_level'
+											)}
+											placement="top-start"
+											className="w-full"
+										>
+											<Textarea
+												bind:value={RAGConfig.DATALAB_MARKER_ADDITIONAL_CONFIG}
+												placeholder={$i18n.t('Enter JSON config (e.g., {"disable_links": true})')}
+											/>
+										</Tooltip>
+									</div>
 								</div>
-
-								<input
-									class="text-sm bg-transparent outline-hidden"
-									type="text"
-									bind:value={RAGConfig.DATALAB_MARKER_LANGS}
-									placeholder={$i18n.t('e.g.) en,fr,de')}
-								/>
 							</div>
 
 							<div class="flex justify-between w-full mt-2">
 								<div class="self-center text-xs font-medium">
 									<Tooltip
 										content={$i18n.t(
-											'Significantly improves accuracy by using an LLM to enhance tables, forms, inline math, and layout detection. Will increase latency. Defaults to True.'
+											'Significantly improves accuracy by using an LLM to enhance tables, forms, inline math, and layout detection. Will increase latency. Defaults to False.'
 										)}
 										placement="top-start"
 									>