Timothy Jaeryang Baek 4 months ago
parent
commit
cb4299eb98
2 changed files with 144 additions and 144 deletions
  1. +8 -7
      backend/open_webui/config.py
  2. +136 -137
      src/lib/components/admin/Settings/Documents.svelte

+ 8 - 7
backend/open_webui/config.py

@@ -1863,43 +1863,44 @@ DATALAB_MARKER_LANGS = PersistentConfig(
 DATALAB_MARKER_USE_LLM = PersistentConfig(
     "DATALAB_MARKER_USE_LLM",
     "rag.DATALAB_MARKER_USE_LLM",
-    os.environ.get("DATALAB_MARKER_USE_LLM", "false") == "true",
+    os.environ.get("DATALAB_MARKER_USE_LLM", "false").lower() == "true",
 )
 
 DATALAB_MARKER_SKIP_CACHE = PersistentConfig(
     "DATALAB_MARKER_SKIP_CACHE",
     "rag.datalab_marker_skip_cache",
-    os.environ.get("DATALAB_MARKER_SKIP_CACHE", "false") == "true",
+    os.environ.get("DATALAB_MARKER_SKIP_CACHE", "false").lower() == "true",
 )
 
 DATALAB_MARKER_FORCE_OCR = PersistentConfig(
     "DATALAB_MARKER_FORCE_OCR",
     "rag.datalab_marker_force_ocr",
-    os.environ.get("DATALAB_MARKER_FORCE_OCR", "false") == "true",
+    os.environ.get("DATALAB_MARKER_FORCE_OCR", "false").lower() == "true",
 )
 
 DATALAB_MARKER_PAGINATE = PersistentConfig(
     "DATALAB_MARKER_PAGINATE",
     "rag.datalab_marker_paginate",
-    os.environ.get("DATALAB_MARKER_PAGINATE", "false") == "true",
+    os.environ.get("DATALAB_MARKER_PAGINATE", "false").lower() == "true",
 )
 
 DATALAB_MARKER_STRIP_EXISTING_OCR = PersistentConfig(
     "DATALAB_MARKER_STRIP_EXISTING_OCR",
     "rag.datalab_marker_strip_existing_ocr",
-    os.environ.get("DATALAB_MARKER_STRIP_EXISTING_OCR", "false") == "true",
+    os.environ.get("DATALAB_MARKER_STRIP_EXISTING_OCR", "false").lower() == "true",
 )
 
 DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION = PersistentConfig(
     "DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION",
     "rag.datalab_marker_disable_image_extraction",
-    os.environ.get("DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION", "false") == "true",
+    os.environ.get("DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION", "false").lower()
+    == "true",
 )
 
 DATALAB_MARKER_OUTPUT_FORMAT = PersistentConfig(
     "DATALAB_MARKER_OUTPUT_FORMAT",
     "rag.datalab_marker_output_format",
-    os.environ.get("DATALAB_MARKER_OUTPUT_FORMAT", ""),
+    os.environ.get("DATALAB_MARKER_OUTPUT_FORMAT", "markdown"),
 )
 
 EXTERNAL_DOCUMENT_LOADER_URL = PersistentConfig(

+ 136 - 137
src/lib/components/admin/Settings/Documents.svelte

@@ -58,27 +58,6 @@
 	};
 
 	let RAGConfig = null;
-	let selectedLanguages: string[] = ['en'];
-	let langsHydrated = false;
-
-	const SUPPORTED_LANGUAGES = {
-		"af": "Afrikaans", "am": "Amharic", "ar": "Arabic", "as": "Assamese", "az": "Azerbaijani", "be": "Belarusian",
-		"bg": "Bulgarian", "bn": "Bengali", "br": "Breton", "bs": "Bosnian", "ca": "Catalan", "cs": "Czech",
-		"cy": "Welsh", "da": "Danish", "de": "German", "el": "Greek", "en": "English", "eo": "Esperanto",
-		"es": "Spanish", "et": "Estonian", "eu": "Basque", "fa": "Persian", "fi": "Finnish", "fr": "French",
-		"fy": "Western Frisian", "ga": "Irish", "gd": "Scottish Gaelic", "gl": "Galician", "gu": "Gujarati",
-		"ha": "Hausa", "he": "Hebrew", "hi": "Hindi", "hr": "Croatian", "hu": "Hungarian", "hy": "Armenian",
-		"id": "Indonesian", "is": "Icelandic", "it": "Italian", "ja": "Japanese", "jv": "Javanese", "ka": "Georgian",
-		"kk": "Kazakh", "km": "Khmer", "kn": "Kannada", "ko": "Korean", "ku": "Kurdish", "ky": "Kyrgyz",
-		"la": "Latin", "lo": "Lao", "lt": "Lithuanian", "lv": "Latvian", "mg": "Malagasy", "mk": "Macedonian",
-		"ml": "Malayalam", "mn": "Mongolian", "mr": "Marathi", "ms": "Malay", "my": "Burmese", "ne": "Nepali",
-		"nl": "Dutch", "no": "Norwegian", "om": "Oromo", "or": "Oriya", "pa": "Punjabi", "pl": "Polish",
-		"ps": "Pashto", "pt": "Portuguese", "ro": "Romanian", "ru": "Russian", "sa": "Sanskrit", "sd": "Sindhi",
-		"si": "Sinhala", "sk": "Slovak", "sl": "Slovenian", "so": "Somali", "sq": "Albanian", "sr": "Serbian",
-		"su": "Sundanese", "sv": "Swedish", "sw": "Swahili", "ta": "Tamil", "te": "Telugu", "th": "Thai",
-		"tl": "Tagalog", "tr": "Turkish", "ug": "Uyghur", "uk": "Ukrainian", "ur": "Urdu", "uz": "Uzbek",
-		"vi": "Vietnamese", "xh": "Xhosa", "yi": "Yiddish", "zh": "Chinese", "_math": "Math"
-	};
 
 	const embeddingModelUpdateHandler = async () => {
 		if (embeddingEngine === '' && embeddingModel.split('/').length - 1 > 1) {
@@ -145,10 +124,6 @@
 	};
 
 	const submitHandler = async () => {
-		if (RAGConfig.CONTENT_EXTRACTION_ENGINE === 'datalab_marker' && !RAGConfig.DATALAB_MARKER_API_KEY) {
-			toast.error($i18n.t('Datalab Marker API Key required.'));
-			return;
-		}
 		if (
 			RAGConfig.CONTENT_EXTRACTION_ENGINE === 'external' &&
 			RAGConfig.EXTERNAL_DOCUMENT_LOADER_URL === ''
@@ -175,6 +150,14 @@
 			return;
 		}
 
+		if (
+			RAGConfig.CONTENT_EXTRACTION_ENGINE === 'datalab_marker' &&
+			!RAGConfig.DATALAB_MARKER_API_KEY
+		) {
+			toast.error($i18n.t('Datalab Marker API Key required.'));
+			return;
+		}
+
 		if (
 			RAGConfig.CONTENT_EXTRACTION_ENGINE === 'document_intelligence' &&
 			(RAGConfig.DOCUMENT_INTELLIGENCE_ENDPOINT === '' ||
@@ -200,6 +183,11 @@
 			.map((ext) => ext.trim())
 			.filter((ext) => ext !== '');
 
+		RAGConfig.DATALAB_MARKER_LANGS = RAGConfig.DATALAB_MARKER_LANGS.split(',')
+			.map((code) => code.trim())
+			.filter((code) => code !== '')
+			.join(', ');
+
 		const res = await updateRAGConfig(localStorage.token, RAGConfig);
 		dispatch('save');
 	};
@@ -224,27 +212,8 @@
 
 		const config = await getRAGConfig(localStorage.token);
 		config.ALLOWED_FILE_EXTENSIONS = (config?.ALLOWED_FILE_EXTENSIONS ?? []).join(', ');
-
-		if (!config.DATALAB_MARKER_OUTPUT_FORMAT) {
-			config.DATALAB_MARKER_OUTPUT_FORMAT = 'markdown';
-		}
-
-		if (config.DATALAB_MARKER_LANGS) {
-			selectedLanguages = config.DATALAB_MARKER_LANGS
-				.split(',')
-				.map(code => code.trim())
-				.filter(Boolean);
-		}
-
 		RAGConfig = config;
-		langsHydrated = true;
 	});
-
-	$: if (langsHydrated && RAGConfig) {
-		RAGConfig.DATALAB_MARKER_LANGS = selectedLanguages.length
-			? selectedLanguages.join(',')
-			: 'en';
-	}
 </script>
 
 <ResetUploadDirConfirmDialog
@@ -314,10 +283,10 @@
 									bind:value={RAGConfig.CONTENT_EXTRACTION_ENGINE}
 								>
 									<option value="">{$i18n.t('Default')}</option>
-									<option value="datalab_marker">{ $i18n.t('Datalab Marker API') }</option>
 									<option value="external">{$i18n.t('External')}</option>
 									<option value="tika">{$i18n.t('Tika')}</option>
 									<option value="docling">{$i18n.t('Docling')}</option>
+									<option value="datalab_marker">{$i18n.t('Datalab Marker API')}</option>
 									<option value="document_intelligence">{$i18n.t('Document Intelligence')}</option>
 									<option value="mistral_ocr">{$i18n.t('Mistral OCR')}</option>
 								</select>
@@ -336,106 +305,136 @@
 								</div>
 							</div>
 						{:else if RAGConfig.CONTENT_EXTRACTION_ENGINE === 'datalab_marker'}
-						  <div class="my-0.5 flex gap-2 pr-2">
-							<SensitiveInput
-							  placeholder={$i18n.t('Enter Datalab Marker API Key')}
-							  required={false}
-							  bind:value={RAGConfig.DATALAB_MARKER_API_KEY}
-							/>
-						  </div>
-						  <div class="my-0.5 flex gap-2 pr-2 w-full">
-							<div class="flex flex-col w-full">
-								<label class="text-xs font-medium mb-1">
-								{$i18n.t("OCR language(s). Hold Ctrl (Windows) or Cmd (Mac) to select multiple. If no selection defaults to English")}
-								</label>
-								<select
-								class="w-full text-sm bg-transparent border border-gray-300 dark:border-gray-700 rounded-sm p-1 outline-hidden"
-								multiple
-								size="6"
-								bind:value={selectedLanguages}
-								>
-								{#each Object.entries(SUPPORTED_LANGUAGES) as [code, label]}
-									<option value={code}>{label}</option>
-								{/each}
-								</select>
-							</div>
-					      </div>
-							<div class="mb-1 flex w-full justify-between">
-							<div class="self-center text-xs font-medium">
-								<Tooltip content={$i18n.t('Significantly improves accuracy by using an LLM to enhance tables, forms, inline math, and layout detection. Will increase latency. Defaults to True.')} placement="top-start">
-								{$i18n.t('Use LLM')}
-								</Tooltip>
-							</div>
-							<div class="flex items-center">
-								<Switch bind:state={RAGConfig.DATALAB_MARKER_USE_LLM} />
-							</div>
-							</div>
-							<div class="mb-1 flex w-full justify-between">
-							<div class="self-center text-xs font-medium">
-								<Tooltip content={$i18n.t('Skip the cache and re-run the inference. Defaults to False.')} placement="top-start">
-								{$i18n.t('Skip Cache')}
-								</Tooltip>
-							</div>
-							<div class="flex items-center">
-								<Switch bind:state={RAGConfig.DATALAB_MARKER_SKIP_CACHE} />
-							</div>
-							</div>
-							<div class="mb-1 flex w-full justify-between">
-							<div class="self-center text-xs font-medium">
-								<Tooltip content={$i18n.t('Force OCR on all pages of the PDF. This can lead to worse results if you have good text in your PDFs. Defaults to False.')} placement="top-start">
-								{$i18n.t('Force OCR')}
-								</Tooltip>
-							</div>
-							<div class="flex items-center">
-								<Switch bind:state={RAGConfig.DATALAB_MARKER_FORCE_OCR} />
-							</div>
-							</div>
-							<div class="mb-1 flex w-full justify-between">
-							<div class="self-center text-xs font-medium">
-								<Tooltip content={$i18n.t('Whether to paginate the output. Each page will be separated by a horizontal rule and page number. Defaults to False.')} placement="top-start">
-								{$i18n.t('Paginate')}
-								</Tooltip>
-							</div>
-							<div class="flex items-center">
-								<Switch bind:state={RAGConfig.DATALAB_MARKER_PAGINATE} />
-							</div>
-							</div>
-							<div class="mb-1 flex w-full justify-between">
-							<div class="self-center text-xs font-medium">
-								<Tooltip content={$i18n.t('Strip existing OCR text from the PDF and re-run OCR. Ignored if Force OCR is enabled. Defaults to False.')} placement="top-start">
-								{$i18n.t('Strip Existing OCR')}
-								</Tooltip>
+							<div class="my-0.5 flex gap-2 pr-2">
+								<SensitiveInput
+									placeholder={$i18n.t('Enter Datalab Marker API Key')}
+									required={false}
+									bind:value={RAGConfig.DATALAB_MARKER_API_KEY}
+								/>
 							</div>
-							<div class="flex items-center">
-								<Switch bind:state={RAGConfig.DATALAB_MARKER_STRIP_EXISTING_OCR} />
+
+							<div class="flex justify-between w-full mt-2">
+								<div class="text-xs font-medium">
+									{$i18n.t('Languages')}
+								</div>
+
+								<input
+									class="text-sm bg-transparent outline-hidden"
+									type="text"
+									bind:value={RAGConfig.DATALAB_MARKER_LANGS}
+									placeholder={$i18n.t('e.g.) en,fr,de')}
+								/>
 							</div>
+
+							<div class="flex justify-between w-full mt-2">
+								<div class="self-center text-xs font-medium">
+									<Tooltip
+										content={$i18n.t(
+											'Significantly improves accuracy by using an LLM to enhance tables, forms, inline math, and layout detection. Will increase latency. Defaults to True.'
+										)}
+										placement="top-start"
+									>
+										{$i18n.t('Use LLM')}
+									</Tooltip>
+								</div>
+								<div class="flex items-center">
+									<Switch bind:state={RAGConfig.DATALAB_MARKER_USE_LLM} />
+								</div>
 							</div>
-							<div class="mb-1 flex w-full justify-between">
-							<div class="self-center text-xs font-medium">
-								<Tooltip content={$i18n.t('Disable image extraction from the PDF. If Use LLM is enabled, images will be automatically captioned. Defaults to False.')} placement="top-start">
-								{$i18n.t('Disable Image Extraction')}
-								</Tooltip>
+							<div class="flex justify-between w-full mt-2">
+								<div class="self-center text-xs font-medium">
+									<Tooltip
+										content={$i18n.t('Skip the cache and re-run the inference. Defaults to False.')}
+										placement="top-start"
+									>
+										{$i18n.t('Skip Cache')}
+									</Tooltip>
+								</div>
+								<div class="flex items-center">
+									<Switch bind:state={RAGConfig.DATALAB_MARKER_SKIP_CACHE} />
+								</div>
 							</div>
-							<div class="flex items-center">
-								<Switch bind:state={RAGConfig.DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION} />
+							<div class="flex justify-between w-full mt-2">
+								<div class="self-center text-xs font-medium">
+									<Tooltip
+										content={$i18n.t(
+											'Force OCR on all pages of the PDF. This can lead to worse results if you have good text in your PDFs. Defaults to False.'
+										)}
+										placement="top-start"
+									>
+										{$i18n.t('Force OCR')}
+									</Tooltip>
+								</div>
+								<div class="flex items-center">
+									<Switch bind:state={RAGConfig.DATALAB_MARKER_FORCE_OCR} />
+								</div>
 							</div>
+							<div class="flex justify-between w-full mt-2">
+								<div class="self-center text-xs font-medium">
+									<Tooltip
+										content={$i18n.t(
+											'Whether to paginate the output. Each page will be separated by a horizontal rule and page number. Defaults to False.'
+										)}
+										placement="top-start"
+									>
+										{$i18n.t('Paginate')}
+									</Tooltip>
+								</div>
+								<div class="flex items-center">
+									<Switch bind:state={RAGConfig.DATALAB_MARKER_PAGINATE} />
+								</div>
 							</div>
-							<div class="mb-1 flex w-full justify-between">
-							<div class="self-center text-xs font-medium">
-								<Tooltip content={$i18n.t("The output format for the text. Can be 'json', 'markdown', or 'html'. Defaults to 'markdown'.")} placement="top-start">
-								{$i18n.t('Output Format')}
-								</Tooltip>
+							<div class="flex justify-between w-full mt-2">
+								<div class="self-center text-xs font-medium">
+									<Tooltip
+										content={$i18n.t(
+											'Strip existing OCR text from the PDF and re-run OCR. Ignored if Force OCR is enabled. Defaults to False.'
+										)}
+										placement="top-start"
+									>
+										{$i18n.t('Strip Existing OCR')}
+									</Tooltip>
+								</div>
+								<div class="flex items-center">
+									<Switch bind:state={RAGConfig.DATALAB_MARKER_STRIP_EXISTING_OCR} />
+								</div>
 							</div>
-							<div class="">
-								<select
-									class="dark:bg-gray-900 w-fit pr-8 rounded-sm px-2 text-xs bg-transparent outline-hidden text-right"
-									bind:value={RAGConfig.DATALAB_MARKER_OUTPUT_FORMAT}
-								>
-									<option value="markdown">{$i18n.t('Markdown')}</option>
-									<option value="json">{$i18n.t('JSON')}</option>
-									<option value="html">{$i18n.t('HTML')}</option>
-								</select>
+							<div class="flex justify-between w-full mt-2">
+								<div class="self-center text-xs font-medium">
+									<Tooltip
+										content={$i18n.t(
+											'Disable image extraction from the PDF. If Use LLM is enabled, images will be automatically captioned. Defaults to False.'
+										)}
+										placement="top-start"
+									>
+										{$i18n.t('Disable Image Extraction')}
+									</Tooltip>
+								</div>
+								<div class="flex items-center">
+									<Switch bind:state={RAGConfig.DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION} />
+								</div>
 							</div>
+							<div class="flex justify-between w-full mt-2">
+								<div class="self-center text-xs font-medium">
+									<Tooltip
+										content={$i18n.t(
+											"The output format for the text. Can be 'json', 'markdown', or 'html'. Defaults to 'markdown'."
+										)}
+										placement="top-start"
+									>
+										{$i18n.t('Output Format')}
+									</Tooltip>
+								</div>
+								<div class="">
+									<select
+										class="dark:bg-gray-900 w-fit pr-8 rounded-sm px-2 text-xs bg-transparent outline-hidden text-right"
+										bind:value={RAGConfig.DATALAB_MARKER_OUTPUT_FORMAT}
+									>
+										<option value="markdown">{$i18n.t('Markdown')}</option>
+										<option value="json">{$i18n.t('JSON')}</option>
+										<option value="html">{$i18n.t('HTML')}</option>
+									</select>
+								</div>
 							</div>
 						{:else if RAGConfig.CONTENT_EXTRACTION_ENGINE === 'external'}
 							<div class="my-0.5 flex gap-2 pr-2">