Quellcode durchsuchen

Merge pull request #15903 from Hisma/marker-api-update

feat: Add configurable API URL (for self-hosting) and additional_config parameter for Datalab Marker API
Tim Jaeryang Baek vor 2 Monaten
Ursprung
Commit
5db60ca34f

+ 16 - 4
backend/open_webui/config.py

@@ -2032,10 +2032,16 @@ DATALAB_MARKER_API_KEY = PersistentConfig(
     os.environ.get("DATALAB_MARKER_API_KEY", ""),
 )
 
-DATALAB_MARKER_LANGS = PersistentConfig(
-    "DATALAB_MARKER_LANGS",
-    "rag.datalab_marker_langs",
-    os.environ.get("DATALAB_MARKER_LANGS", ""),
+DATALAB_MARKER_API_BASE_URL = PersistentConfig(
+    "DATALAB_MARKER_API_BASE_URL",
+    "rag.datalab_marker_api_base_url",
+    os.environ.get("DATALAB_MARKER_API_BASE_URL", ""),
+)
+
+DATALAB_MARKER_ADDITIONAL_CONFIG = PersistentConfig(
+    "DATALAB_MARKER_ADDITIONAL_CONFIG",
+    "rag.datalab_marker_additional_config",
+    os.environ.get("DATALAB_MARKER_ADDITIONAL_CONFIG", ""),
 )
 
 DATALAB_MARKER_USE_LLM = PersistentConfig(
@@ -2075,6 +2081,12 @@ DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION = PersistentConfig(
     == "true",
 )
 
+DATALAB_MARKER_FORMAT_LINES = PersistentConfig(
+    "DATALAB_MARKER_FORMAT_LINES",
+    "rag.datalab_marker_format_lines",
+    os.environ.get("DATALAB_MARKER_FORMAT_LINES", "false").lower() == "true",
+)
+
 DATALAB_MARKER_OUTPUT_FORMAT = PersistentConfig(
     "DATALAB_MARKER_OUTPUT_FORMAT",
     "rag.datalab_marker_output_format",

+ 6 - 2
backend/open_webui/main.py

@@ -226,12 +226,14 @@ from open_webui.config import (
     CHUNK_SIZE,
     CONTENT_EXTRACTION_ENGINE,
     DATALAB_MARKER_API_KEY,
-    DATALAB_MARKER_LANGS,
+    DATALAB_MARKER_API_BASE_URL,
+    DATALAB_MARKER_ADDITIONAL_CONFIG,
     DATALAB_MARKER_SKIP_CACHE,
     DATALAB_MARKER_FORCE_OCR,
     DATALAB_MARKER_PAGINATE,
     DATALAB_MARKER_STRIP_EXISTING_OCR,
     DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION,
+    DATALAB_MARKER_FORMAT_LINES,
     DATALAB_MARKER_OUTPUT_FORMAT,
     DATALAB_MARKER_USE_LLM,
     EXTERNAL_DOCUMENT_LOADER_URL,
@@ -771,7 +773,8 @@ app.state.config.ENABLE_WEB_LOADER_SSL_VERIFICATION = ENABLE_WEB_LOADER_SSL_VERI
 
 app.state.config.CONTENT_EXTRACTION_ENGINE = CONTENT_EXTRACTION_ENGINE
 app.state.config.DATALAB_MARKER_API_KEY = DATALAB_MARKER_API_KEY
-app.state.config.DATALAB_MARKER_LANGS = DATALAB_MARKER_LANGS
+app.state.config.DATALAB_MARKER_API_BASE_URL = DATALAB_MARKER_API_BASE_URL
+app.state.config.DATALAB_MARKER_ADDITIONAL_CONFIG = DATALAB_MARKER_ADDITIONAL_CONFIG
 app.state.config.DATALAB_MARKER_SKIP_CACHE = DATALAB_MARKER_SKIP_CACHE
 app.state.config.DATALAB_MARKER_FORCE_OCR = DATALAB_MARKER_FORCE_OCR
 app.state.config.DATALAB_MARKER_PAGINATE = DATALAB_MARKER_PAGINATE
@@ -779,6 +782,7 @@ app.state.config.DATALAB_MARKER_STRIP_EXISTING_OCR = DATALAB_MARKER_STRIP_EXISTI
 app.state.config.DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION = (
     DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION
 )
+app.state.config.DATALAB_MARKER_FORMAT_LINES = DATALAB_MARKER_FORMAT_LINES
 app.state.config.DATALAB_MARKER_USE_LLM = DATALAB_MARKER_USE_LLM
 app.state.config.DATALAB_MARKER_OUTPUT_FORMAT = DATALAB_MARKER_OUTPUT_FORMAT
 app.state.config.EXTERNAL_DOCUMENT_LOADER_URL = EXTERNAL_DOCUMENT_LOADER_URL

+ 92 - 67
backend/open_webui/retrieval/loaders/datalab_marker.py

@@ -15,24 +15,28 @@ class DatalabMarkerLoader:
         self,
         file_path: str,
         api_key: str,
-        langs: Optional[str] = None,
+        api_base_url: str,
+        additional_config: Optional[str] = None,
         use_llm: bool = False,
         skip_cache: bool = False,
         force_ocr: bool = False,
         paginate: bool = False,
         strip_existing_ocr: bool = False,
         disable_image_extraction: bool = False,
+        format_lines: bool = False,
         output_format: str = None,
     ):
         self.file_path = file_path
         self.api_key = api_key
-        self.langs = langs
+        self.api_base_url = api_base_url
+        self.additional_config = additional_config
         self.use_llm = use_llm
         self.skip_cache = skip_cache
         self.force_ocr = force_ocr
         self.paginate = paginate
         self.strip_existing_ocr = strip_existing_ocr
         self.disable_image_extraction = disable_image_extraction
+        self.format_lines = format_lines
         self.output_format = output_format
 
     def _get_mime_type(self, filename: str) -> str:
@@ -60,7 +64,7 @@ class DatalabMarkerLoader:
         return mime_map.get(ext, "application/octet-stream")
 
     def check_marker_request_status(self, request_id: str) -> dict:
-        url = f"https://www.datalab.to/api/v1/marker/{request_id}"
+        url = f"{self.api_base_url}/{request_id}"
         headers = {"X-Api-Key": self.api_key}
         try:
             response = requests.get(url, headers=headers)
@@ -81,22 +85,25 @@ class DatalabMarkerLoader:
             )
 
     def load(self) -> List[Document]:
-        url = "https://www.datalab.to/api/v1/marker"
+        url = self.api_base_url
         filename = os.path.basename(self.file_path)
         mime_type = self._get_mime_type(filename)
         headers = {"X-Api-Key": self.api_key}
 
         form_data = {
-            "langs": self.langs,
             "use_llm": str(self.use_llm).lower(),
             "skip_cache": str(self.skip_cache).lower(),
             "force_ocr": str(self.force_ocr).lower(),
             "paginate": str(self.paginate).lower(),
             "strip_existing_ocr": str(self.strip_existing_ocr).lower(),
             "disable_image_extraction": str(self.disable_image_extraction).lower(),
+            "format_lines": str(self.format_lines).lower(),
             "output_format": self.output_format,
         }
 
+        if self.additional_config and self.additional_config.strip():
+            form_data["additional_config"] = self.additional_config
+
         log.info(
             f"Datalab Marker POST request parameters: {{'filename': '{filename}', 'mime_type': '{mime_type}', **{form_data}}}"
         )
@@ -133,74 +140,92 @@ class DatalabMarkerLoader:
 
         check_url = result.get("request_check_url")
         request_id = result.get("request_id")
-        if not check_url:
-            raise HTTPException(
-                status.HTTP_502_BAD_GATEWAY, detail="No request_check_url returned."
-            )
 
-        for _ in range(300):  # Up to 10 minutes
-            time.sleep(2)
-            try:
-                poll_response = requests.get(check_url, headers=headers)
-                poll_response.raise_for_status()
-                poll_result = poll_response.json()
-            except (requests.HTTPError, ValueError) as e:
-                raw_body = poll_response.text
-                log.error(f"Polling error: {e}, response body: {raw_body}")
-                raise HTTPException(
-                    status.HTTP_502_BAD_GATEWAY, detail=f"Polling failed: {e}"
-                )
+        # A response with request_check_url must be polled (Datalab); one without it is treated as a direct result (self-hosted)
+        if check_url:
+            # Datalab pattern: poll request_check_url until the job completes, fails, or times out
+            for _ in range(300):  # Up to 10 minutes
+                time.sleep(2)
+                try:
+                    poll_response = requests.get(check_url, headers=headers)
+                    poll_response.raise_for_status()
+                    poll_result = poll_response.json()
+                except (requests.HTTPError, ValueError) as e:
+                    raw_body = poll_response.text
+                    log.error(f"Polling error: {e}, response body: {raw_body}")
+                    raise HTTPException(
+                        status.HTTP_502_BAD_GATEWAY, detail=f"Polling failed: {e}"
+                    )
+
+                status_val = poll_result.get("status")
+                success_val = poll_result.get("success")
 
-            status_val = poll_result.get("status")
-            success_val = poll_result.get("success")
-
-            if status_val == "complete":
-                summary = {
-                    k: poll_result.get(k)
-                    for k in (
-                        "status",
-                        "output_format",
-                        "success",
-                        "error",
-                        "page_count",
-                        "total_cost",
+                if status_val == "complete":
+                    summary = {
+                        k: poll_result.get(k)
+                        for k in (
+                            "status",
+                            "output_format",
+                            "success",
+                            "error",
+                            "page_count",
+                            "total_cost",
+                        )
+                    }
+                    log.info(
+                        f"Marker processing completed successfully: {json.dumps(summary, indent=2)}"
                     )
-                }
-                log.info(
-                    f"Marker processing completed successfully: {json.dumps(summary, indent=2)}"
-                )
-                break
+                    break
 
-            if status_val == "failed" or success_val is False:
-                log.error(
-                    f"Marker poll failed full response: {json.dumps(poll_result, indent=2)}"
-                )
-                error_msg = (
-                    poll_result.get("error")
-                    or "Marker returned failure without error message"
+                if status_val == "failed" or success_val is False:
+                    log.error(
+                        f"Marker poll failed full response: {json.dumps(poll_result, indent=2)}"
+                    )
+                    error_msg = (
+                        poll_result.get("error")
+                        or "Marker returned failure without error message"
+                    )
+                    raise HTTPException(
+                        status.HTTP_400_BAD_REQUEST,
+                        detail=f"Marker processing failed: {error_msg}",
+                    )
+            else:
+                raise HTTPException(
+                    status.HTTP_504_GATEWAY_TIMEOUT,
+                    detail="Marker processing timed out",
                 )
+
+            if not poll_result.get("success", False):
+                error_msg = poll_result.get("error") or "Unknown processing error"
                 raise HTTPException(
                     status.HTTP_400_BAD_REQUEST,
-                    detail=f"Marker processing failed: {error_msg}",
+                    detail=f"Final processing failed: {error_msg}",
                 )
-        else:
-            raise HTTPException(
-                status.HTTP_504_GATEWAY_TIMEOUT, detail="Marker processing timed out"
-            )
-
-        if not poll_result.get("success", False):
-            error_msg = poll_result.get("error") or "Unknown processing error"
-            raise HTTPException(
-                status.HTTP_400_BAD_REQUEST,
-                detail=f"Final processing failed: {error_msg}",
-            )
 
-        content_key = self.output_format.lower()
-        raw_content = poll_result.get(content_key)
+            # Datalab response: content is stored under the key named after the output format (e.g. "markdown")
+            content_key = self.output_format.lower()
+            raw_content = poll_result.get(content_key)
+            final_result = poll_result
+        else:
+            # Self-hosted direct response - content in "output" field
+            if "output" in result:
+                log.info("Self-hosted Marker returned direct response without polling")
+                raw_content = result.get("output")
+                final_result = result
+            else:
+                available_fields = (
+                    list(result.keys())
+                    if isinstance(result, dict)
+                    else "non-dict response"
+                )
+                raise HTTPException(
+                    status.HTTP_502_BAD_GATEWAY,
+                    detail=f"Custom Marker endpoint returned success but no 'output' field found. Available fields: {available_fields}. Expected either 'request_check_url' for polling or 'output' field for direct response.",
+                )
 
-        if content_key == "json":
+        if self.output_format.lower() == "json":
             full_text = json.dumps(raw_content, indent=2)
-        elif content_key in {"markdown", "html"}:
+        elif self.output_format.lower() in {"markdown", "html"}:
             full_text = str(raw_content).strip()
         else:
             raise HTTPException(
@@ -211,14 +236,14 @@ class DatalabMarkerLoader:
         if not full_text:
             raise HTTPException(
                 status.HTTP_400_BAD_REQUEST,
-                detail="Datalab Marker returned empty content",
+                detail="Marker returned empty content",
             )
 
         marker_output_dir = os.path.join("/app/backend/data/uploads", "marker_output")
         os.makedirs(marker_output_dir, exist_ok=True)
 
         file_ext_map = {"markdown": "md", "json": "json", "html": "html"}
-        file_ext = file_ext_map.get(content_key, "txt")
+        file_ext = file_ext_map.get(self.output_format.lower(), "txt")
         output_filename = f"{os.path.splitext(filename)[0]}.{file_ext}"
         output_path = os.path.join(marker_output_dir, output_filename)
 
@@ -231,13 +256,13 @@ class DatalabMarkerLoader:
 
         metadata = {
             "source": filename,
-            "output_format": poll_result.get("output_format", self.output_format),
-            "page_count": poll_result.get("page_count", 0),
+            "output_format": final_result.get("output_format", self.output_format),
+            "page_count": final_result.get("page_count", 0),
             "processed_with_llm": self.use_llm,
             "request_id": request_id or "",
         }
 
-        images = poll_result.get("images", {})
+        images = final_result.get("images", {})
         if images:
             metadata["image_count"] = len(images)
             metadata["images"] = json.dumps(list(images.keys()))

+ 7 - 1
backend/open_webui/retrieval/loaders/main.py

@@ -281,10 +281,15 @@ class Loader:
                 "tiff",
             ]
         ):
+            api_base_url = self.kwargs.get("DATALAB_MARKER_API_BASE_URL", "")
+            if not api_base_url or api_base_url.strip() == "":
+                api_base_url = "https://www.datalab.to/api/v1/marker"
+
             loader = DatalabMarkerLoader(
                 file_path=file_path,
                 api_key=self.kwargs["DATALAB_MARKER_API_KEY"],
-                langs=self.kwargs.get("DATALAB_MARKER_LANGS"),
+                api_base_url=api_base_url,
+                additional_config=self.kwargs.get("DATALAB_MARKER_ADDITIONAL_CONFIG"),
                 use_llm=self.kwargs.get("DATALAB_MARKER_USE_LLM", False),
                 skip_cache=self.kwargs.get("DATALAB_MARKER_SKIP_CACHE", False),
                 force_ocr=self.kwargs.get("DATALAB_MARKER_FORCE_OCR", False),
@@ -295,6 +300,7 @@ class Loader:
                 disable_image_extraction=self.kwargs.get(
                     "DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION", False
                 ),
+                format_lines=self.kwargs.get("DATALAB_MARKER_FORMAT_LINES", False),
                 output_format=self.kwargs.get(
                     "DATALAB_MARKER_OUTPUT_FORMAT", "markdown"
                 ),

+ 25 - 8
backend/open_webui/routers/retrieval.py

@@ -401,12 +401,14 @@ async def get_rag_config(request: Request, user=Depends(get_admin_user)):
         "CONTENT_EXTRACTION_ENGINE": request.app.state.config.CONTENT_EXTRACTION_ENGINE,
         "PDF_EXTRACT_IMAGES": request.app.state.config.PDF_EXTRACT_IMAGES,
         "DATALAB_MARKER_API_KEY": request.app.state.config.DATALAB_MARKER_API_KEY,
-        "DATALAB_MARKER_LANGS": request.app.state.config.DATALAB_MARKER_LANGS,
+        "DATALAB_MARKER_API_BASE_URL": request.app.state.config.DATALAB_MARKER_API_BASE_URL,
+        "DATALAB_MARKER_ADDITIONAL_CONFIG": request.app.state.config.DATALAB_MARKER_ADDITIONAL_CONFIG,
         "DATALAB_MARKER_SKIP_CACHE": request.app.state.config.DATALAB_MARKER_SKIP_CACHE,
         "DATALAB_MARKER_FORCE_OCR": request.app.state.config.DATALAB_MARKER_FORCE_OCR,
         "DATALAB_MARKER_PAGINATE": request.app.state.config.DATALAB_MARKER_PAGINATE,
         "DATALAB_MARKER_STRIP_EXISTING_OCR": request.app.state.config.DATALAB_MARKER_STRIP_EXISTING_OCR,
         "DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION": request.app.state.config.DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION,
+        "DATALAB_MARKER_FORMAT_LINES": request.app.state.config.DATALAB_MARKER_FORMAT_LINES,
         "DATALAB_MARKER_USE_LLM": request.app.state.config.DATALAB_MARKER_USE_LLM,
         "DATALAB_MARKER_OUTPUT_FORMAT": request.app.state.config.DATALAB_MARKER_OUTPUT_FORMAT,
         "EXTERNAL_DOCUMENT_LOADER_URL": request.app.state.config.EXTERNAL_DOCUMENT_LOADER_URL,
@@ -566,12 +568,14 @@ class ConfigForm(BaseModel):
     CONTENT_EXTRACTION_ENGINE: Optional[str] = None
     PDF_EXTRACT_IMAGES: Optional[bool] = None
     DATALAB_MARKER_API_KEY: Optional[str] = None
-    DATALAB_MARKER_LANGS: Optional[str] = None
+    DATALAB_MARKER_API_BASE_URL: Optional[str] = None
+    DATALAB_MARKER_ADDITIONAL_CONFIG: Optional[str] = None
     DATALAB_MARKER_SKIP_CACHE: Optional[bool] = None
     DATALAB_MARKER_FORCE_OCR: Optional[bool] = None
     DATALAB_MARKER_PAGINATE: Optional[bool] = None
     DATALAB_MARKER_STRIP_EXISTING_OCR: Optional[bool] = None
     DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION: Optional[bool] = None
+    DATALAB_MARKER_FORMAT_LINES: Optional[bool] = None
     DATALAB_MARKER_USE_LLM: Optional[bool] = None
     DATALAB_MARKER_OUTPUT_FORMAT: Optional[str] = None
     EXTERNAL_DOCUMENT_LOADER_URL: Optional[str] = None
@@ -683,10 +687,15 @@ async def update_rag_config(
         if form_data.DATALAB_MARKER_API_KEY is not None
         else request.app.state.config.DATALAB_MARKER_API_KEY
     )
-    request.app.state.config.DATALAB_MARKER_LANGS = (
-        form_data.DATALAB_MARKER_LANGS
-        if form_data.DATALAB_MARKER_LANGS is not None
-        else request.app.state.config.DATALAB_MARKER_LANGS
+    request.app.state.config.DATALAB_MARKER_API_BASE_URL = (
+        form_data.DATALAB_MARKER_API_BASE_URL
+        if form_data.DATALAB_MARKER_API_BASE_URL is not None
+        else request.app.state.config.DATALAB_MARKER_API_BASE_URL
+    )
+    request.app.state.config.DATALAB_MARKER_ADDITIONAL_CONFIG = (
+        form_data.DATALAB_MARKER_ADDITIONAL_CONFIG
+        if form_data.DATALAB_MARKER_ADDITIONAL_CONFIG is not None
+        else request.app.state.config.DATALAB_MARKER_ADDITIONAL_CONFIG
     )
     request.app.state.config.DATALAB_MARKER_SKIP_CACHE = (
         form_data.DATALAB_MARKER_SKIP_CACHE
@@ -713,6 +722,11 @@ async def update_rag_config(
         if form_data.DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION is not None
         else request.app.state.config.DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION
     )
+    request.app.state.config.DATALAB_MARKER_FORMAT_LINES = (
+        form_data.DATALAB_MARKER_FORMAT_LINES
+        if form_data.DATALAB_MARKER_FORMAT_LINES is not None
+        else request.app.state.config.DATALAB_MARKER_FORMAT_LINES
+    )
     request.app.state.config.DATALAB_MARKER_OUTPUT_FORMAT = (
         form_data.DATALAB_MARKER_OUTPUT_FORMAT
         if form_data.DATALAB_MARKER_OUTPUT_FORMAT is not None
@@ -1006,7 +1020,8 @@ async def update_rag_config(
         "CONTENT_EXTRACTION_ENGINE": request.app.state.config.CONTENT_EXTRACTION_ENGINE,
         "PDF_EXTRACT_IMAGES": request.app.state.config.PDF_EXTRACT_IMAGES,
         "DATALAB_MARKER_API_KEY": request.app.state.config.DATALAB_MARKER_API_KEY,
-        "DATALAB_MARKER_LANGS": request.app.state.config.DATALAB_MARKER_LANGS,
+        "DATALAB_MARKER_API_BASE_URL": request.app.state.config.DATALAB_MARKER_API_BASE_URL,
+        "DATALAB_MARKER_ADDITIONAL_CONFIG": request.app.state.config.DATALAB_MARKER_ADDITIONAL_CONFIG,
         "DATALAB_MARKER_SKIP_CACHE": request.app.state.config.DATALAB_MARKER_SKIP_CACHE,
         "DATALAB_MARKER_FORCE_OCR": request.app.state.config.DATALAB_MARKER_FORCE_OCR,
         "DATALAB_MARKER_PAGINATE": request.app.state.config.DATALAB_MARKER_PAGINATE,
@@ -1393,12 +1408,14 @@ def process_file(
                 loader = Loader(
                     engine=request.app.state.config.CONTENT_EXTRACTION_ENGINE,
                     DATALAB_MARKER_API_KEY=request.app.state.config.DATALAB_MARKER_API_KEY,
-                    DATALAB_MARKER_LANGS=request.app.state.config.DATALAB_MARKER_LANGS,
+                    DATALAB_MARKER_API_BASE_URL=request.app.state.config.DATALAB_MARKER_API_BASE_URL,
+                    DATALAB_MARKER_ADDITIONAL_CONFIG=request.app.state.config.DATALAB_MARKER_ADDITIONAL_CONFIG,
                     DATALAB_MARKER_SKIP_CACHE=request.app.state.config.DATALAB_MARKER_SKIP_CACHE,
                     DATALAB_MARKER_FORCE_OCR=request.app.state.config.DATALAB_MARKER_FORCE_OCR,
                     DATALAB_MARKER_PAGINATE=request.app.state.config.DATALAB_MARKER_PAGINATE,
                     DATALAB_MARKER_STRIP_EXISTING_OCR=request.app.state.config.DATALAB_MARKER_STRIP_EXISTING_OCR,
                     DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION=request.app.state.config.DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION,
+                    DATALAB_MARKER_FORMAT_LINES=request.app.state.config.DATALAB_MARKER_FORMAT_LINES,
                     DATALAB_MARKER_USE_LLM=request.app.state.config.DATALAB_MARKER_USE_LLM,
                     DATALAB_MARKER_OUTPUT_FORMAT=request.app.state.config.DATALAB_MARKER_OUTPUT_FORMAT,
                     EXTERNAL_DOCUMENT_LOADER_URL=request.app.state.config.EXTERNAL_DOCUMENT_LOADER_URL,

+ 68 - 15
src/lib/components/admin/Settings/Documents.svelte

@@ -170,6 +170,19 @@
 			return;
 		}
 
+		if (
+			RAGConfig.CONTENT_EXTRACTION_ENGINE === 'datalab_marker' &&
+			RAGConfig.DATALAB_MARKER_ADDITIONAL_CONFIG &&
+			RAGConfig.DATALAB_MARKER_ADDITIONAL_CONFIG.trim() !== ''
+		) {
+			try {
+				JSON.parse(RAGConfig.DATALAB_MARKER_ADDITIONAL_CONFIG);
+			} catch (e) {
+				toast.error($i18n.t('Invalid JSON format in Additional Config'));
+				return;
+			}
+		}
+
 		if (
 			RAGConfig.CONTENT_EXTRACTION_ENGINE === 'document_intelligence' &&
 			(RAGConfig.DOCUMENT_INTELLIGENCE_ENDPOINT === '' ||
@@ -195,10 +208,6 @@
 			ALLOWED_FILE_EXTENSIONS: RAGConfig.ALLOWED_FILE_EXTENSIONS.split(',')
 				.map((ext) => ext.trim())
 				.filter((ext) => ext !== ''),
-			DATALAB_MARKER_LANGS: RAGConfig.DATALAB_MARKER_LANGS.split(',')
-				.map((code) => code.trim())
-				.filter((code) => code !== '')
-				.join(', '),
 			DOCLING_PICTURE_DESCRIPTION_LOCAL: JSON.parse(
 				RAGConfig.DOCLING_PICTURE_DESCRIPTION_LOCAL || '{}'
 			),
@@ -243,6 +252,11 @@
 			2
 		);
 
+		// Set default API Base URL if empty
+		if (!config.DATALAB_MARKER_API_BASE_URL) {
+			config.DATALAB_MARKER_API_BASE_URL = 'https://www.datalab.to/api/v1/marker';
+		}
+
 		RAGConfig = config;
 	});
 </script>
@@ -336,6 +350,21 @@
 								</div>
 							</div>
 						{:else if RAGConfig.CONTENT_EXTRACTION_ENGINE === 'datalab_marker'}
+							<div class="my-0.5 flex gap-2 pr-2">
+								<Tooltip
+									content={$i18n.t(
+										'API Base URL for Datalab Marker service. Defaults to: https://www.datalab.to/api/v1/marker'
+									)}
+									placement="top-start"
+									className="w-full"
+								>
+									<input
+										class="flex-1 w-full text-sm bg-transparent outline-hidden"
+										placeholder={$i18n.t('Enter Datalab Marker API Base URL')}
+										bind:value={RAGConfig.DATALAB_MARKER_API_BASE_URL}
+									/>
+								</Tooltip>
+							</div>
 							<div class="my-0.5 flex gap-2 pr-2">
 								<SensitiveInput
 									placeholder={$i18n.t('Enter Datalab Marker API Key')}
@@ -344,24 +373,33 @@
 								/>
 							</div>
 
-							<div class="flex justify-between w-full mt-2">
-								<div class="text-xs font-medium">
-									{$i18n.t('Languages')}
+							<div class="flex flex-col gap-2 mt-2">
+								<div class=" flex flex-col w-full justify-between">
+									<div class=" mb-1 text-xs font-medium">
+										{$i18n.t('Additional Config')}
+									</div>
+									<div class="flex w-full items-center relative">
+										<Tooltip
+											content={$i18n.t(
+												'Additional configuration options for marker. This should be a JSON string with key-value pairs. For example, \'{"key": "value"}\'. Supported keys include: disable_links, keep_pageheader_in_output, keep_pagefooter_in_output, filter_blank_pages, drop_repeated_text, layout_coverage_threshold, merge_threshold, height_tolerance, gap_threshold, image_threshold, min_line_length, level_count, default_level'
+											)}
+											placement="top-start"
+											className="w-full"
+										>
+											<Textarea
+												bind:value={RAGConfig.DATALAB_MARKER_ADDITIONAL_CONFIG}
+												placeholder={$i18n.t('Enter JSON config (e.g., {"disable_links": true})')}
+											/>
+										</Tooltip>
+									</div>
 								</div>
-
-								<input
-									class="text-sm bg-transparent outline-hidden"
-									type="text"
-									bind:value={RAGConfig.DATALAB_MARKER_LANGS}
-									placeholder={$i18n.t('e.g.) en,fr,de')}
-								/>
 							</div>
 
 							<div class="flex justify-between w-full mt-2">
 								<div class="self-center text-xs font-medium">
 									<Tooltip
 										content={$i18n.t(
-											'Significantly improves accuracy by using an LLM to enhance tables, forms, inline math, and layout detection. Will increase latency. Defaults to True.'
+											'Significantly improves accuracy by using an LLM to enhance tables, forms, inline math, and layout detection. Will increase latency. Defaults to False.'
 										)}
 										placement="top-start"
 									>
@@ -445,6 +483,21 @@
 									<Switch bind:state={RAGConfig.DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION} />
 								</div>
 							</div>
+							<div class="flex justify-between w-full mt-2">
+								<div class="self-center text-xs font-medium">
+									<Tooltip
+										content={$i18n.t(
+											'Format the lines in the output. Defaults to False. If set to True, the lines will be formatted to detect inline math and styles.'
+										)}
+										placement="top-start"
+									>
+										{$i18n.t('Format Lines')}
+									</Tooltip>
+								</div>
+								<div class="flex items-center">
+									<Switch bind:state={RAGConfig.DATALAB_MARKER_FORMAT_LINES} />
+								</div>
+							</div>
 							<div class="flex justify-between w-full mt-2">
 								<div class="self-center text-xs font-medium">
 									<Tooltip