Browse Source

refac: docling picture describe params

Timothy Jaeryang Baek 4 months ago
parent
commit
0cd400f5ee

+ 22 - 37
backend/open_webui/config.py

@@ -1264,9 +1264,7 @@ def validate_cors_origin(origin):
 # To test CORS_ALLOW_ORIGIN locally, you can set something like
 # CORS_ALLOW_ORIGIN=http://localhost:5173;http://localhost:8080
 # in your .env file depending on your frontend port, 5173 in this case.
-CORS_ALLOW_ORIGIN = os.environ.get(
-    "CORS_ALLOW_ORIGIN", "*"
-).split(";")
+CORS_ALLOW_ORIGIN = os.environ.get("CORS_ALLOW_ORIGIN", "*").split(";")
 
 if CORS_ALLOW_ORIGIN == ["*"]:
     log.warning(
@@ -1278,6 +1276,7 @@ else:
     for origin in CORS_ALLOW_ORIGIN:
         validate_cors_origin(origin)
 
+
 class BannerModel(BaseModel):
     id: str
     type: str
@@ -1974,48 +1973,34 @@ DOCLING_PICTURE_DESCRIPTION_MODE = PersistentConfig(
     os.getenv("DOCLING_PICTURE_DESCRIPTION_MODE", ""),
 )
 
-DOCLING_PICTURE_DESCRIPTION_LOCAL_REPO_ID = PersistentConfig(
-    "DOCLING_PICTURE_DESCRIPTION_LOCAL_REPO_ID",
-    "rag.docling_picture_description_local_repo_id",
-    os.getenv("DOCLING_PICTURE_DESCRIPTION_LOCAL_REPO_ID", "HuggingFaceTB/SmolVLM-256M-Instruct"),
-)
 
-DOCLING_PICTURE_DESCRIPTION_LOCAL_MAX_TOKENS = PersistentConfig(
-    "DOCLING_PICTURE_DESCRIPTION_LOCAL_MAX_TOKENS",
-    "rag.docling_picture_description_local_max_tokens",
-    int(os.getenv("DOCLING_PICTURE_DESCRIPTION_LOCAL_MAX_TOKENS", 200)),
-)
+docling_picture_description_local = os.getenv("DOCLING_PICTURE_DESCRIPTION_LOCAL", "")
+try:
+    docling_picture_description_local = json.loads(docling_picture_description_local)
+except json.JSONDecodeError:
+    docling_picture_description_local = {}
 
-DOCLING_PICTURE_DESCRIPTION_LOCAL_PROMPT = PersistentConfig(
-    "DOCLING_PICTURE_DESCRIPTION_LOCAL_PROMPT",
-    "rag.docling_picture_description_local_prompt",
-    os.getenv(
-        "DOCLING_PICTURE_DESCRIPTION_LOCAL_PROMPT",
-        "Describe this image in a few sentences.",
-    )
-)
 
-DOCLING_PICTURE_DESCRIPTION_API_URL = PersistentConfig(
-    "DOCLING_PICTURE_DESCRIPTION_API_URL",
-    "rag.docling_picture_description_api_url",
-    os.getenv("DOCLING_PICTURE_DESCRIPTION_API_URL", ""),
+DOCLING_PICTURE_DESCRIPTION_LOCAL = PersistentConfig(
+    "DOCLING_PICTURE_DESCRIPTION_LOCAL",
+    "rag.docling_picture_description_local",
+    docling_picture_description_local,
 )
 
-DOCLING_PICTURE_DESCRIPTION_API_MODEL = PersistentConfig(   
-    "DOCLING_PICTURE_DESCRIPTION_API_MODEL",
-    "rag.docling_picture_description_api_model",
-    os.getenv("DOCLING_PICTURE_DESCRIPTION_API_MODEL", ""),
-)
+docling_picture_description_api = os.getenv("DOCLING_PICTURE_DESCRIPTION_API", "")
+try:
+    docling_picture_description_api = json.loads(docling_picture_description_api)
+except json.JSONDecodeError:
+    docling_picture_description_api = {}
 
-DOCLING_PICTURE_DESCRIPTION_API_PROMPT = PersistentConfig(
-    "DOCLING_PICTURE_DESCRIPTION_API_PROMPT",
-    "rag.docling_picture_description_api_prompt",
-    os.getenv(
-        "DOCLING_PICTURE_DESCRIPTION_API_PROMPT",
-        "Describe this image in a few sentences.",
-    )
+
+DOCLING_PICTURE_DESCRIPTION_API = PersistentConfig(
+    "DOCLING_PICTURE_DESCRIPTION_API",
+    "rag.docling_picture_description_api",
+    docling_picture_description_api,
 )
 
+
 DOCUMENT_INTELLIGENCE_ENDPOINT = PersistentConfig(
     "DOCUMENT_INTELLIGENCE_ENDPOINT",
     "rag.document_intelligence_endpoint",

+ 4 - 12
backend/open_webui/main.py

@@ -232,12 +232,8 @@ from open_webui.config import (
     DOCLING_OCR_LANG,
     DOCLING_DO_PICTURE_DESCRIPTION,
     DOCLING_PICTURE_DESCRIPTION_MODE,
-    DOCLING_PICTURE_DESCRIPTION_LOCAL_REPO_ID,
-    DOCLING_PICTURE_DESCRIPTION_LOCAL_MAX_TOKENS,
-    DOCLING_PICTURE_DESCRIPTION_LOCAL_PROMPT,
-    DOCLING_PICTURE_DESCRIPTION_API_URL,
-    DOCLING_PICTURE_DESCRIPTION_API_MODEL,
-    DOCLING_PICTURE_DESCRIPTION_API_PROMPT,
+    DOCLING_PICTURE_DESCRIPTION_LOCAL,
+    DOCLING_PICTURE_DESCRIPTION_API,
     DOCUMENT_INTELLIGENCE_ENDPOINT,
     DOCUMENT_INTELLIGENCE_KEY,
     MISTRAL_OCR_API_KEY,
@@ -709,12 +705,8 @@ app.state.config.DOCLING_OCR_ENGINE = DOCLING_OCR_ENGINE
 app.state.config.DOCLING_OCR_LANG = DOCLING_OCR_LANG
 app.state.config.DOCLING_DO_PICTURE_DESCRIPTION = DOCLING_DO_PICTURE_DESCRIPTION
 app.state.config.DOCLING_PICTURE_DESCRIPTION_MODE = DOCLING_PICTURE_DESCRIPTION_MODE
-app.state.config.DOCLING_PICTURE_DESCRIPTION_LOCAL_REPO_ID = DOCLING_PICTURE_DESCRIPTION_LOCAL_REPO_ID
-app.state.config.DOCLING_PICTURE_DESCRIPTION_LOCAL_MAX_TOKENS = DOCLING_PICTURE_DESCRIPTION_LOCAL_MAX_TOKENS
-app.state.config.DOCLING_PICTURE_DESCRIPTION_LOCAL_PROMPT = DOCLING_PICTURE_DESCRIPTION_LOCAL_PROMPT
-app.state.config.DOCLING_PICTURE_DESCRIPTION_API_URL = DOCLING_PICTURE_DESCRIPTION_API_URL
-app.state.config.DOCLING_PICTURE_DESCRIPTION_API_MODEL = DOCLING_PICTURE_DESCRIPTION_API_MODEL
-app.state.config.DOCLING_PICTURE_DESCRIPTION_API_PROMPT = DOCLING_PICTURE_DESCRIPTION_API_PROMPT
+app.state.config.DOCLING_PICTURE_DESCRIPTION_LOCAL = DOCLING_PICTURE_DESCRIPTION_LOCAL
+app.state.config.DOCLING_PICTURE_DESCRIPTION_API = DOCLING_PICTURE_DESCRIPTION_API
 app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT = DOCUMENT_INTELLIGENCE_ENDPOINT
 app.state.config.DOCUMENT_INTELLIGENCE_KEY = DOCUMENT_INTELLIGENCE_KEY
 app.state.config.MISTRAL_OCR_API_KEY = MISTRAL_OCR_API_KEY

+ 25 - 47
backend/open_webui/retrieval/loaders/main.py

@@ -155,40 +155,23 @@ class DoclingLoader:
                         "do_picture_description"
                     )
 
-                    picture_description_mode = self.params.get("picture_description_mode", "").lower()
-
-                    if picture_description_mode == "local":
-
-                        params["picture_description_local"] = json.dumps({
-                            "repo_id": self.params.get(
-                                "picture_description_local_repo_id", "HuggingFaceTB/SmolVLM-256M-Instruct"
-                            ),
-                            "generation_config": {
-                                "max_new_tokens": self.params.get(
-                                    "picture_description_local_max_tokens", 200
-                                )
-                            },
-                            "prompt": self.params.get(
-                                "picture_description_local_prompt", "Describe this image in a few sentences."
-                            )
-                        })
-
-                    elif picture_description_mode == "api":
-
-                        params["picture_description_api"] = json.dumps({
-                            "url": self.params.get(
-                                "picture_description_api_url", ""
-                            ),
-                            "params": {
-                                "model": self.params.get(
-                                    "picture_description_api_model", ""
-                                )
-                            },
-                            "timeout": 30,
-                            "prompt": self.params.get(
-                                "picture_description_api_prompt", "Describe this image in a few sentences."
-                            )
-                        })
+                    picture_description_mode = self.params.get(
+                        "picture_description_mode", ""
+                    ).lower()
+
+                    if picture_description_mode == "local" and self.params.get(
+                        "picture_description_local", {}
+                    ):
+                        params["picture_description_local"] = self.params.get(
+                            "picture_description_local", {}
+                        )
+
+                    elif picture_description_mode == "api" and self.params.get(
+                        "picture_description_api", {}
+                    ):
+                        params["picture_description_api"] = self.params.get(
+                            "picture_description_api", {}
+                        )
 
                 if self.params.get("ocr_engine") and self.params.get("ocr_lang"):
                     params["ocr_engine"] = self.params.get("ocr_engine")
@@ -318,24 +301,19 @@ class Loader:
                 loader = TextLoader(file_path, autodetect_encoding=True)
             else:
                 # Build params for DoclingLoader
-                params = {
-                    "ocr_engine": self.kwargs.get("DOCLING_OCR_ENGINE"),
-                    "ocr_lang": self.kwargs.get("DOCLING_OCR_LANG"),
-                    "do_picture_description": self.kwargs.get("DOCLING_DO_PICTURE_DESCRIPTION"),
-                    "picture_description_mode": self.kwargs.get("DOCLING_PICTURE_DESCRIPTION_MODE"),
-                    "picture_description_local_repo_id": self.kwargs.get("DOCLING_PICTURE_DESCRIPTION_LOCAL_REPO_ID"),
-                    "picture_description_local_max_tokens": self.kwargs.get("DOCLING_PICTURE_DESCRIPTION_LOCAL_MAX_TOKENS"),
-                    "picture_description_local_prompt": self.kwargs.get("DOCLING_PICTURE_DESCRIPTION_LOCAL_PROMPT"),
-                    "picture_description_api_url": self.kwargs.get("DOCLING_PICTURE_DESCRIPTION_API_URL"),
-                    "picture_description_api_model": self.kwargs.get("DOCLING_PICTURE_DESCRIPTION_API_MODEL"),
-                    "picture_description_api_prompt": self.kwargs.get("DOCLING_PICTURE_DESCRIPTION_API_PROMPT")
-                }
+                params = self.kwargs.get("DOCLING_PARAMS", {})
+                if not isinstance(params, dict):
+                    try:
+                        params = json.loads(params)
+                    except json.JSONDecodeError:
+                        log.error("Invalid DOCLING_PARAMS format, expected JSON object")
+                        params = {}
 
                 loader = DoclingLoader(
                     url=self.kwargs.get("DOCLING_SERVER_URL"),
                     file_path=file_path,
                     mime_type=file_content_type,
-                    params=params
+                    params=params,
                 )
         elif (
             self.engine == "document_intelligence"

+ 22 - 56
backend/open_webui/routers/retrieval.py

@@ -415,12 +415,8 @@ async def get_rag_config(request: Request, user=Depends(get_admin_user)):
         "DOCLING_OCR_LANG": request.app.state.config.DOCLING_OCR_LANG,
         "DOCLING_DO_PICTURE_DESCRIPTION": request.app.state.config.DOCLING_DO_PICTURE_DESCRIPTION,
         "DOCLING_PICTURE_DESCRIPTION_MODE": request.app.state.config.DOCLING_PICTURE_DESCRIPTION_MODE,
-        "DOCLING_PICTURE_DESCRIPTION_LOCAL_REPO_ID": request.app.state.config.DOCLING_PICTURE_DESCRIPTION_LOCAL_REPO_ID,
-        "DOCLING_PICTURE_DESCRIPTION_LOCAL_MAX_TOKENS": request.app.state.config.DOCLING_PICTURE_DESCRIPTION_LOCAL_MAX_TOKENS,
-        "DOCLING_PICTURE_DESCRIPTION_LOCAL_PROMPT": request.app.state.config.DOCLING_PICTURE_DESCRIPTION_LOCAL_PROMPT,
-        "DOCLING_PICTURE_DESCRIPTION_API_URL": request.app.state.config.DOCLING_PICTURE_DESCRIPTION_API_URL,
-        "DOCLING_PICTURE_DESCRIPTION_API_MODEL": request.app.state.config.DOCLING_PICTURE_DESCRIPTION_API_MODEL,
-        "DOCLING_PICTURE_DESCRIPTION_API_PROMPT": request.app.state.config.DOCLING_PICTURE_DESCRIPTION_API_PROMPT,
+        "DOCLING_PICTURE_DESCRIPTION_LOCAL": request.app.state.config.DOCLING_PICTURE_DESCRIPTION_LOCAL,
+        "DOCLING_PICTURE_DESCRIPTION_API": request.app.state.config.DOCLING_PICTURE_DESCRIPTION_API,
         "DOCUMENT_INTELLIGENCE_ENDPOINT": request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT,
         "DOCUMENT_INTELLIGENCE_KEY": request.app.state.config.DOCUMENT_INTELLIGENCE_KEY,
         "MISTRAL_OCR_API_KEY": request.app.state.config.MISTRAL_OCR_API_KEY,
@@ -583,12 +579,8 @@ class ConfigForm(BaseModel):
     DOCLING_OCR_LANG: Optional[str] = None
     DOCLING_DO_PICTURE_DESCRIPTION: Optional[bool] = None
     DOCLING_PICTURE_DESCRIPTION_MODE: Optional[str] = None
-    DOCLING_PICTURE_DESCRIPTION_LOCAL_REPO_ID: Optional[str] = None
-    DOCLING_PICTURE_DESCRIPTION_LOCAL_MAX_TOKENS: Optional[int] = None
-    DOCLING_PICTURE_DESCRIPTION_LOCAL_PROMPT: Optional[str] = None
-    DOCLING_PICTURE_DESCRIPTION_API_URL: Optional[str] = None
-    DOCLING_PICTURE_DESCRIPTION_API_MODEL: Optional[str] = None
-    DOCLING_PICTURE_DESCRIPTION_API_PROMPT: Optional[str] = None
+    DOCLING_PICTURE_DESCRIPTION_LOCAL: Optional[dict] = None
+    DOCLING_PICTURE_DESCRIPTION_API: Optional[dict] = None
     DOCUMENT_INTELLIGENCE_ENDPOINT: Optional[str] = None
     DOCUMENT_INTELLIGENCE_KEY: Optional[str] = None
     MISTRAL_OCR_API_KEY: Optional[str] = None
@@ -767,35 +759,15 @@ async def update_rag_config(
         if form_data.DOCLING_PICTURE_DESCRIPTION_MODE is not None
         else request.app.state.config.DOCLING_PICTURE_DESCRIPTION_MODE
     )
-    request.app.state.config.DOCLING_PICTURE_DESCRIPTION_LOCAL_REPO_ID = (
-        form_data.DOCLING_PICTURE_DESCRIPTION_LOCAL_REPO_ID
-        if form_data.DOCLING_PICTURE_DESCRIPTION_LOCAL_REPO_ID is not None
-        else request.app.state.config.DOCLING_PICTURE_DESCRIPTION_LOCAL_REPO_ID
+    request.app.state.config.DOCLING_PICTURE_DESCRIPTION_LOCAL = (
+        form_data.DOCLING_PICTURE_DESCRIPTION_LOCAL
+        if form_data.DOCLING_PICTURE_DESCRIPTION_LOCAL is not None
+        else request.app.state.config.DOCLING_PICTURE_DESCRIPTION_LOCAL
     )
-    request.app.state.config.DOCLING_PICTURE_DESCRIPTION_LOCAL_MAX_TOKENS = (
-        form_data.DOCLING_PICTURE_DESCRIPTION_LOCAL_MAX_TOKENS
-        if form_data.DOCLING_PICTURE_DESCRIPTION_LOCAL_MAX_TOKENS is not None
-        else request.app.state.config.DOCLING_PICTURE_DESCRIPTION_LOCAL_MAX_TOKENS
-    )
-    request.app.state.config.DOCLING_PICTURE_DESCRIPTION_LOCAL_PROMPT = (
-        form_data.DOCLING_PICTURE_DESCRIPTION_LOCAL_PROMPT
-        if form_data.DOCLING_PICTURE_DESCRIPTION_LOCAL_PROMPT is not None
-        else request.app.state.config.DOCLING_PICTURE_DESCRIPTION_LOCAL_PROMPT
-    )
-    request.app.state.config.DOCLING_PICTURE_DESCRIPTION_API_URL = (
-        form_data.DOCLING_PICTURE_DESCRIPTION_API_URL
-        if form_data.DOCLING_PICTURE_DESCRIPTION_API_URL is not None
-        else request.app.state.config.DOCLING_PICTURE_DESCRIPTION_API_URL
-    )
-    request.app.state.config.DOCLING_PICTURE_DESCRIPTION_API_MODEL = (
-        form_data.DOCLING_PICTURE_DESCRIPTION_API_MODEL
-        if form_data.DOCLING_PICTURE_DESCRIPTION_API_MODEL is not None
-        else request.app.state.config.DOCLING_PICTURE_DESCRIPTION_API_MODEL
-    )
-    request.app.state.config.DOCLING_PICTURE_DESCRIPTION_API_PROMPT = (
-        form_data.DOCLING_PICTURE_DESCRIPTION_API_PROMPT
-        if form_data.DOCLING_PICTURE_DESCRIPTION_API_PROMPT is not None
-        else request.app.state.config.DOCLING_PICTURE_DESCRIPTION_API_PROMPT
+    request.app.state.config.DOCLING_PICTURE_DESCRIPTION_API = (
+        form_data.DOCLING_PICTURE_DESCRIPTION_API
+        if form_data.DOCLING_PICTURE_DESCRIPTION_API is not None
+        else request.app.state.config.DOCLING_PICTURE_DESCRIPTION_API
     )
 
     request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT = (
@@ -1036,12 +1008,8 @@ async def update_rag_config(
         "DOCLING_OCR_LANG": request.app.state.config.DOCLING_OCR_LANG,
         "DOCLING_DO_PICTURE_DESCRIPTION": request.app.state.config.DOCLING_DO_PICTURE_DESCRIPTION,
         "DOCLING_PICTURE_DESCRIPTION_MODE": request.app.state.config.DOCLING_PICTURE_DESCRIPTION_MODE,
-        "DOCLING_PICTURE_DESCRIPTION_LOCAL_REPO_ID": request.app.state.config.DOCLING_PICTURE_DESCRIPTION_LOCAL_REPO_ID,
-        "DOCLING_PICTURE_DESCRIPTION_LOCAL_MAX_TOKENS": request.app.state.config.DOCLING_PICTURE_DESCRIPTION_LOCAL_MAX_TOKENS,
-        "DOCLING_PICTURE_DESCRIPTION_LOCAL_PROMPT": request.app.state.config.DOCLING_PICTURE_DESCRIPTION_LOCAL_PROMPT,
-        "DOCLING_PICTURE_DESCRIPTION_API_URL": request.app.state.config.DOCLING_PICTURE_DESCRIPTION_API_URL,
-        "DOCLING_PICTURE_DESCRIPTION_API_MODEL": request.app.state.config.DOCLING_PICTURE_DESCRIPTION_API_MODEL,
-        "DOCLING_PICTURE_DESCRIPTION_API_PROMPT": request.app.state.config.DOCLING_PICTURE_DESCRIPTION_API_PROMPT,
+        "DOCLING_PICTURE_DESCRIPTION_LOCAL": request.app.state.config.DOCLING_PICTURE_DESCRIPTION_LOCAL,
+        "DOCLING_PICTURE_DESCRIPTION_API": request.app.state.config.DOCLING_PICTURE_DESCRIPTION_API,
         "DOCUMENT_INTELLIGENCE_ENDPOINT": request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT,
         "DOCUMENT_INTELLIGENCE_KEY": request.app.state.config.DOCUMENT_INTELLIGENCE_KEY,
         "MISTRAL_OCR_API_KEY": request.app.state.config.MISTRAL_OCR_API_KEY,
@@ -1388,16 +1356,14 @@ def process_file(
                     EXTERNAL_DOCUMENT_LOADER_API_KEY=request.app.state.config.EXTERNAL_DOCUMENT_LOADER_API_KEY,
                     TIKA_SERVER_URL=request.app.state.config.TIKA_SERVER_URL,
                     DOCLING_SERVER_URL=request.app.state.config.DOCLING_SERVER_URL,
-                    DOCLING_OCR_ENGINE=request.app.state.config.DOCLING_OCR_ENGINE,
-                    DOCLING_OCR_LANG=request.app.state.config.DOCLING_OCR_LANG,
-                    DOCLING_DO_PICTURE_DESCRIPTION=request.app.state.config.DOCLING_DO_PICTURE_DESCRIPTION,
-                    DOCLING_PICTURE_DESCRIPTION_MODE=request.app.state.config.DOCLING_PICTURE_DESCRIPTION_MODE,
-                    DOCLING_PICTURE_DESCRIPTION_LOCAL_REPO_ID=request.app.state.config.DOCLING_PICTURE_DESCRIPTION_LOCAL_REPO_ID,
-                    DOCLING_PICTURE_DESCRIPTION_LOCAL_MAX_TOKENS=request.app.state.config.DOCLING_PICTURE_DESCRIPTION_LOCAL_MAX_TOKENS,
-                    DOCLING_PICTURE_DESCRIPTION_LOCAL_PROMPT=request.app.state.config.DOCLING_PICTURE_DESCRIPTION_LOCAL_PROMPT,
-                    DOCLING_PICTURE_DESCRIPTION_API_URL=request.app.state.config.DOCLING_PICTURE_DESCRIPTION_API_URL,
-                    DOCLING_PICTURE_DESCRIPTION_API_MODEL=request.app.state.config.DOCLING_PICTURE_DESCRIPTION_API_MODEL,
-                    DOCLING_PICTURE_DESCRIPTION_API_PROMPT=request.app.state.config.DOCLING_PICTURE_DESCRIPTION_API_PROMPT,
+                    DOCLING_PARAMS={
+                        "ocr_engine": request.app.state.config.DOCLING_OCR_ENGINE,
+                        "ocr_lang": request.app.state.config.DOCLING_OCR_LANG,
+                        "do_picture_description": request.app.state.config.DOCLING_DO_PICTURE_DESCRIPTION,
+                        "picture_description_mode": request.app.state.config.DOCLING_PICTURE_DESCRIPTION_MODE,
+                        "picture_description_local": request.app.state.config.DOCLING_PICTURE_DESCRIPTION_LOCAL,
+                        "picture_description_api": request.app.state.config.DOCLING_PICTURE_DESCRIPTION_API,
+                    },
                     PDF_EXTRACT_IMAGES=request.app.state.config.PDF_EXTRACT_IMAGES,
                     DOCUMENT_INTELLIGENCE_ENDPOINT=request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT,
                     DOCUMENT_INTELLIGENCE_KEY=request.app.state.config.DOCUMENT_INTELLIGENCE_KEY,

+ 68 - 124
src/lib/components/admin/Settings/Documents.svelte

@@ -194,17 +194,20 @@
 			await embeddingModelUpdateHandler();
 		}
 
-		RAGConfig.ALLOWED_FILE_EXTENSIONS = (RAGConfig?.ALLOWED_FILE_EXTENSIONS ?? '')
-			.split(',')
-			.map((ext) => ext.trim())
-			.filter((ext) => ext !== '');
-
-		RAGConfig.DATALAB_MARKER_LANGS = RAGConfig.DATALAB_MARKER_LANGS.split(',')
-			.map((code) => code.trim())
-			.filter((code) => code !== '')
-			.join(', ');
-
-		const res = await updateRAGConfig(localStorage.token, RAGConfig);
+		const res = await updateRAGConfig(localStorage.token, {
+			...RAGConfig,
+			ALLOWED_FILE_EXTENSIONS: RAGConfig.ALLOWED_FILE_EXTENSIONS.split(',')
+				.map((ext) => ext.trim())
+				.filter((ext) => ext !== ''),
+			DATALAB_MARKER_LANGS: RAGConfig.DATALAB_MARKER_LANGS.split(',')
+				.map((code) => code.trim())
+				.filter((code) => code !== '')
+				.join(', '),
+			DOCLING_PICTURE_DESCRIPTION_LOCAL: JSON.parse(
+				RAGConfig.DOCLING_PICTURE_DESCRIPTION_LOCAL || '{}'
+			),
+			DOCLING_PICTURE_DESCRIPTION_API: JSON.parse(RAGConfig.DOCLING_PICTURE_DESCRIPTION_API || '{}')
+		});
 		dispatch('save');
 	};
 
@@ -232,6 +235,18 @@
 
 		const config = await getRAGConfig(localStorage.token);
 		config.ALLOWED_FILE_EXTENSIONS = (config?.ALLOWED_FILE_EXTENSIONS ?? []).join(', ');
+
+		config.DOCLING_PICTURE_DESCRIPTION_LOCAL = JSON.stringify(
+			config.DOCLING_PICTURE_DESCRIPTION_LOCAL ?? {},
+			null,
+			2
+		);
+		config.DOCLING_PICTURE_DESCRIPTION_API = JSON.stringify(
+			config.DOCLING_PICTURE_DESCRIPTION_API ?? {},
+			null,
+			2
+		);
+
 		RAGConfig = config;
 	});
 </script>
@@ -511,135 +526,66 @@
 								</div>
 							</div>
 							{#if RAGConfig.DOCLING_DO_PICTURE_DESCRIPTION}
-								<div class="flex w-full mt-2">
-									<div class="flex-1 flex items-center gap-4">
-										<label class="flex items-center gap-1 text-xs font-medium">
-											<Tooltip
-												content={$i18n.t('Use a model locally executed by Docling for picture description.')}
-												placement="top-start"
-											>
-												<input
-													type="radio"
-													name="picture-description-mode"
-													value="local"
-													bind:group={RAGConfig.DOCLING_PICTURE_DESCRIPTION_MODE}
-													checked={RAGConfig.DOCLING_PICTURE_DESCRIPTION_MODE === 'local'}
-												/>
-												<span style="padding-left: 0.5em">{$i18n.t('Local Description')}</span>
-											</Tooltip>
-										</label>
-										<label class="flex items-center gap-1 text-xs font-medium">
-											<Tooltip
-												content={$i18n.t('Use a remote API for picture description.')}
-												placement="top-start"
-											>
-												<input
-													type="radio"
-													name="picture-description-mode"
-													value="api"
-													bind:group={RAGConfig.DOCLING_PICTURE_DESCRIPTION_MODE}
-													checked={RAGConfig.DOCLING_PICTURE_DESCRIPTION_MODE === 'api'}
-												/>
-												<span style="padding-left: 0.5em">{$i18n.t('Remote Description')}</span>
-											</Tooltip>
-										</label>
+								<div class="flex justify-between w-full mt-2">
+									<div class="self-center text-xs font-medium">
+										<Tooltip content={''} placement="top-start">
+											{$i18n.t('Picture Description Mode')}
+										</Tooltip>
+									</div>
+									<div class="">
+										<select
+											class="dark:bg-gray-900 w-fit pr-8 rounded-sm px-2 text-xs bg-transparent outline-hidden text-right"
+											bind:value={RAGConfig.DOCLING_PICTURE_DESCRIPTION_MODE}
+										>
+											<option value="">{$i18n.t('Default')}</option>
+											<option value="local">{$i18n.t('Local')}</option>
+											<option value="api">{$i18n.t('API')}</option>
+										</select>
 									</div>
 								</div>
 
 								{#if RAGConfig.DOCLING_PICTURE_DESCRIPTION_MODE === 'local'}
-									<div class="flex flex-col gap-2 mt-2 ml-4">
-										<div class="flex items-center gap-2">
-											<div class="min-w-fit text-xs font-medium">
-												<Tooltip
-													content={$i18n.t('The HuggingFace repo ID for the local vision-language model.')}
-													placement="top-start"
-												>
-													{$i18n.t('Repo ID')}
-												</Tooltip>
+									<div class="flex flex-col gap-2 mt-2">
+										<div class=" flex flex-col w-full justify-between">
+											<div class=" mb-1 text-xs font-medium">
+												{$i18n.t('Picture Description Local Config')}
 											</div>
-											<input
-												class=" w-full rounded-lg py-1.5 px-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-hidden"
-												placeholder={$i18n.t('HuggingFaceTB/SmolVLM-256M-Instruct')}
-												bind:value={RAGConfig.DOCLING_PICTURE_DESCRIPTION_LOCAL_REPO_ID}
-											/>
-										</div>
-										<div class="flex items-center gap-2">
-											<div class="min-w-fit text-xs font-medium">
+											<div class="flex w-full items-center relative">
 												<Tooltip
-													content={$i18n.t('Maximum number of tokens for the generated description.')}
+													content={$i18n.t(
+														'Options for running a local vision-language model in the picture description. The parameters refer to a model hosted on Hugging Face. This parameter is mutually exclusive with picture_description_api.'
+													)}
 													placement="top-start"
+													className="w-full"
 												>
-													{$i18n.t('Max Tokens')}
+													<Textarea
+														bind:value={RAGConfig.DOCLING_PICTURE_DESCRIPTION_LOCAL}
+														placeholder={$i18n.t('Enter Options in JSON format')}
+													/>
 												</Tooltip>
 											</div>
-											<input
-												class=" w-full rounded-lg py-1.5 px-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-hidden"
-												placeholder={$i18n.t('200')}
-												bind:value={RAGConfig.DOCLING_PICTURE_DESCRIPTION_LOCAL_MAX_TOKENS}
-											/>
-										</div>
-										<div class="flex items-center gap-2">
-											<div class="min-w-fit text-xs font-medium">
-												<Tooltip
-													content={$i18n.t('Prompt to use for describing the image.')}
-													placement="top-start"
-												>
-													{$i18n.t('Prompt')}
-												</Tooltip>
-											</div>
-											<input
-												class=" w-full rounded-lg py-1.5 px-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-hidden"
-												placeholder={$i18n.t('Describe this image in a few sentences.')}
-												bind:value={RAGConfig.DOCLING_PICTURE_DESCRIPTION_LOCAL_PROMPT}
-											/>
 										</div>
 									</div>
 								{:else if RAGConfig.DOCLING_PICTURE_DESCRIPTION_MODE === 'api'}
-									<div class="flex flex-col gap-2 mt-2 ml-4">
-										<div class="flex items-center gap-2">
-											<div class="min-w-fit text-xs font-medium">
-												<Tooltip
-													content={$i18n.t('The remote API endpoint for picture description.')}
-													placement="top-start"
-												>
-													{$i18n.t('URL')}
-												</Tooltip>
-											</div>
-											<input
-												class=" w-full rounded-lg py-1.5 px-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-hidden"
-												placeholder={$i18n.t('Enter Remote API URL')}
-												bind:value={RAGConfig.DOCLING_PICTURE_DESCRIPTION_API_URL}
-											/>
-										</div>
-										<div class="flex items-center gap-2">
-											<div class="min-w-fit text-xs font-medium">
-												<Tooltip
-													content={$i18n.t('The model name to use for remote picture description.')}
-													placement="top-start"
-												>
-													{$i18n.t('Model')}
-												</Tooltip>
+									<div class="flex flex-col gap-2 mt-2">
+										<div class=" flex flex-col w-full justify-between">
+											<div class=" mb-1 text-xs font-medium">
+												{$i18n.t('Picture Description API Config')}
 											</div>
-											<input
-												class=" w-full rounded-lg py-1.5 px-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-hidden"
-												placeholder={$i18n.t('Enter Model Name')}
-												bind:value={RAGConfig.DOCLING_PICTURE_DESCRIPTION_API_MODEL}
-											/>
-										</div>
-										<div class="flex items-center gap-2">
-											<div class="min-w-fit text-xs font-medium">
+											<div class="flex w-full items-center relative">
 												<Tooltip
-													content={$i18n.t('Prompt to use for describing the image via remote API.')}
+													content={$i18n.t(
+														'API details for using a vision-language model in the picture description. This parameter is mutually exclusive with picture_description_local.'
+													)}
 													placement="top-start"
+													className="w-full"
 												>
-													{$i18n.t('Prompt')}
+													<Textarea
+														bind:value={RAGConfig.DOCLING_PICTURE_DESCRIPTION_API}
+														placeholder={$i18n.t('Enter Options in JSON format')}
+													/>
 												</Tooltip>
 											</div>
-											<input
-												class=" w-full rounded-lg py-1.5 px-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-hidden"
-												placeholder={$i18n.t('Describe this image in a few sentences.')}
-												bind:value={RAGConfig.DOCLING_PICTURE_DESCRIPTION_API_PROMPT}
-											/>
 										</div>
 									</div>
 								{/if}
@@ -964,9 +910,7 @@
 							<div class="  mb-2.5 flex w-full justify-between">
 								<div class=" self-center text-xs font-medium">{$i18n.t('Hybrid Search')}</div>
 								<div class="flex items-center relative">
-									<Switch
-										bind:state={RAGConfig.ENABLE_RAG_HYBRID_SEARCH}
-									/>
+									<Switch bind:state={RAGConfig.ENABLE_RAG_HYBRID_SEARCH} />
 								</div>
 							</div>