retrieval.py 101 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434
  1. import json
  2. import logging
  3. import mimetypes
  4. import os
  5. import shutil
  6. import asyncio
  7. import re
  8. import uuid
  9. from datetime import datetime
  10. from pathlib import Path
  11. from typing import Iterator, List, Optional, Sequence, Union
  12. from fastapi import (
  13. Depends,
  14. FastAPI,
  15. File,
  16. Form,
  17. HTTPException,
  18. UploadFile,
  19. Request,
  20. status,
  21. APIRouter,
  22. )
  23. from fastapi.middleware.cors import CORSMiddleware
  24. from fastapi.concurrency import run_in_threadpool
  25. from pydantic import BaseModel
  26. import tiktoken
  27. from langchain.text_splitter import RecursiveCharacterTextSplitter, TokenTextSplitter
  28. from langchain_text_splitters import MarkdownHeaderTextSplitter
  29. from langchain_core.documents import Document
  30. from open_webui.models.files import FileModel, Files
  31. from open_webui.models.knowledge import Knowledges
  32. from open_webui.storage.provider import Storage
  33. from open_webui.retrieval.vector.factory import VECTOR_DB_CLIENT
  34. # Document loaders
  35. from open_webui.retrieval.loaders.main import Loader
  36. from open_webui.retrieval.loaders.youtube import YoutubeLoader
  37. # Web search engines
  38. from open_webui.retrieval.web.main import SearchResult
  39. from open_webui.retrieval.web.utils import get_web_loader
  40. from open_webui.retrieval.web.ollama import search_ollama_cloud
  41. from open_webui.retrieval.web.perplexity_search import search_perplexity_search
  42. from open_webui.retrieval.web.brave import search_brave
  43. from open_webui.retrieval.web.kagi import search_kagi
  44. from open_webui.retrieval.web.mojeek import search_mojeek
  45. from open_webui.retrieval.web.bocha import search_bocha
  46. from open_webui.retrieval.web.duckduckgo import search_duckduckgo
  47. from open_webui.retrieval.web.google_pse import search_google_pse
  48. from open_webui.retrieval.web.jina_search import search_jina
  49. from open_webui.retrieval.web.searchapi import search_searchapi
  50. from open_webui.retrieval.web.serpapi import search_serpapi
  51. from open_webui.retrieval.web.searxng import search_searxng
  52. from open_webui.retrieval.web.yacy import search_yacy
  53. from open_webui.retrieval.web.serper import search_serper
  54. from open_webui.retrieval.web.serply import search_serply
  55. from open_webui.retrieval.web.serpstack import search_serpstack
  56. from open_webui.retrieval.web.tavily import search_tavily
  57. from open_webui.retrieval.web.bing import search_bing
  58. from open_webui.retrieval.web.exa import search_exa
  59. from open_webui.retrieval.web.perplexity import search_perplexity
  60. from open_webui.retrieval.web.sougou import search_sougou
  61. from open_webui.retrieval.web.firecrawl import search_firecrawl
  62. from open_webui.retrieval.web.external import search_external
  63. from open_webui.retrieval.utils import (
  64. get_content_from_url,
  65. get_embedding_function,
  66. get_reranking_function,
  67. get_model_path,
  68. query_collection,
  69. query_collection_with_hybrid_search,
  70. query_doc,
  71. query_doc_with_hybrid_search,
  72. )
  73. from open_webui.retrieval.vector.utils import filter_metadata
  74. from open_webui.utils.misc import (
  75. calculate_sha256_string,
  76. )
  77. from open_webui.utils.auth import get_admin_user, get_verified_user
  78. from open_webui.config import (
  79. ENV,
  80. RAG_EMBEDDING_MODEL_AUTO_UPDATE,
  81. RAG_EMBEDDING_MODEL_TRUST_REMOTE_CODE,
  82. RAG_RERANKING_MODEL_AUTO_UPDATE,
  83. RAG_RERANKING_MODEL_TRUST_REMOTE_CODE,
  84. UPLOAD_DIR,
  85. DEFAULT_LOCALE,
  86. RAG_EMBEDDING_CONTENT_PREFIX,
  87. RAG_EMBEDDING_QUERY_PREFIX,
  88. )
  89. from open_webui.env import (
  90. SRC_LOG_LEVELS,
  91. DEVICE_TYPE,
  92. DOCKER,
  93. SENTENCE_TRANSFORMERS_BACKEND,
  94. SENTENCE_TRANSFORMERS_MODEL_KWARGS,
  95. SENTENCE_TRANSFORMERS_CROSS_ENCODER_BACKEND,
  96. SENTENCE_TRANSFORMERS_CROSS_ENCODER_MODEL_KWARGS,
  97. )
  98. from open_webui.constants import ERROR_MESSAGES
  99. log = logging.getLogger(__name__)
  100. log.setLevel(SRC_LOG_LEVELS["RAG"])
  101. ##########################################
  102. #
  103. # Utility functions
  104. #
  105. ##########################################
  106. def get_ef(
  107. engine: str,
  108. embedding_model: str,
  109. auto_update: bool = False,
  110. ):
  111. ef = None
  112. if embedding_model and engine == "":
  113. from sentence_transformers import SentenceTransformer
  114. try:
  115. ef = SentenceTransformer(
  116. get_model_path(embedding_model, auto_update),
  117. device=DEVICE_TYPE,
  118. trust_remote_code=RAG_EMBEDDING_MODEL_TRUST_REMOTE_CODE,
  119. backend=SENTENCE_TRANSFORMERS_BACKEND,
  120. model_kwargs=SENTENCE_TRANSFORMERS_MODEL_KWARGS,
  121. )
  122. except Exception as e:
  123. log.debug(f"Error loading SentenceTransformer: {e}")
  124. return ef
  125. def get_rf(
  126. engine: str = "",
  127. reranking_model: Optional[str] = None,
  128. external_reranker_url: str = "",
  129. external_reranker_api_key: str = "",
  130. auto_update: bool = False,
  131. ):
  132. rf = None
  133. if reranking_model:
  134. if any(model in reranking_model for model in ["jinaai/jina-colbert-v2"]):
  135. try:
  136. from open_webui.retrieval.models.colbert import ColBERT
  137. rf = ColBERT(
  138. get_model_path(reranking_model, auto_update),
  139. env="docker" if DOCKER else None,
  140. )
  141. except Exception as e:
  142. log.error(f"ColBERT: {e}")
  143. raise Exception(ERROR_MESSAGES.DEFAULT(e))
  144. else:
  145. if engine == "external":
  146. try:
  147. from open_webui.retrieval.models.external import ExternalReranker
  148. rf = ExternalReranker(
  149. url=external_reranker_url,
  150. api_key=external_reranker_api_key,
  151. model=reranking_model,
  152. )
  153. except Exception as e:
  154. log.error(f"ExternalReranking: {e}")
  155. raise Exception(ERROR_MESSAGES.DEFAULT(e))
  156. else:
  157. import sentence_transformers
  158. try:
  159. rf = sentence_transformers.CrossEncoder(
  160. get_model_path(reranking_model, auto_update),
  161. device=DEVICE_TYPE,
  162. trust_remote_code=RAG_RERANKING_MODEL_TRUST_REMOTE_CODE,
  163. backend=SENTENCE_TRANSFORMERS_CROSS_ENCODER_BACKEND,
  164. model_kwargs=SENTENCE_TRANSFORMERS_CROSS_ENCODER_MODEL_KWARGS,
  165. )
  166. except Exception as e:
  167. log.error(f"CrossEncoder: {e}")
  168. raise Exception(ERROR_MESSAGES.DEFAULT("CrossEncoder error"))
  169. return rf
  170. ##########################################
  171. #
  172. # API routes
  173. #
  174. ##########################################
  175. router = APIRouter()
  176. class CollectionNameForm(BaseModel):
  177. collection_name: Optional[str] = None
  178. class ProcessUrlForm(CollectionNameForm):
  179. url: str
  180. class SearchForm(BaseModel):
  181. queries: List[str]
  182. @router.get("/")
  183. async def get_status(request: Request):
  184. return {
  185. "status": True,
  186. "chunk_size": request.app.state.config.CHUNK_SIZE,
  187. "chunk_overlap": request.app.state.config.CHUNK_OVERLAP,
  188. "template": request.app.state.config.RAG_TEMPLATE,
  189. "embedding_engine": request.app.state.config.RAG_EMBEDDING_ENGINE,
  190. "embedding_model": request.app.state.config.RAG_EMBEDDING_MODEL,
  191. "reranking_model": request.app.state.config.RAG_RERANKING_MODEL,
  192. "embedding_batch_size": request.app.state.config.RAG_EMBEDDING_BATCH_SIZE,
  193. }
  194. @router.get("/embedding")
  195. async def get_embedding_config(request: Request, user=Depends(get_admin_user)):
  196. return {
  197. "status": True,
  198. "embedding_engine": request.app.state.config.RAG_EMBEDDING_ENGINE,
  199. "embedding_model": request.app.state.config.RAG_EMBEDDING_MODEL,
  200. "embedding_batch_size": request.app.state.config.RAG_EMBEDDING_BATCH_SIZE,
  201. "openai_config": {
  202. "url": request.app.state.config.RAG_OPENAI_API_BASE_URL,
  203. "key": request.app.state.config.RAG_OPENAI_API_KEY,
  204. },
  205. "ollama_config": {
  206. "url": request.app.state.config.RAG_OLLAMA_BASE_URL,
  207. "key": request.app.state.config.RAG_OLLAMA_API_KEY,
  208. },
  209. "azure_openai_config": {
  210. "url": request.app.state.config.RAG_AZURE_OPENAI_BASE_URL,
  211. "key": request.app.state.config.RAG_AZURE_OPENAI_API_KEY,
  212. "version": request.app.state.config.RAG_AZURE_OPENAI_API_VERSION,
  213. },
  214. }
  215. class OpenAIConfigForm(BaseModel):
  216. url: str
  217. key: str
  218. class OllamaConfigForm(BaseModel):
  219. url: str
  220. key: str
  221. class AzureOpenAIConfigForm(BaseModel):
  222. url: str
  223. key: str
  224. version: str
  225. class EmbeddingModelUpdateForm(BaseModel):
  226. openai_config: Optional[OpenAIConfigForm] = None
  227. ollama_config: Optional[OllamaConfigForm] = None
  228. azure_openai_config: Optional[AzureOpenAIConfigForm] = None
  229. embedding_engine: str
  230. embedding_model: str
  231. embedding_batch_size: Optional[int] = 1
  232. @router.post("/embedding/update")
  233. async def update_embedding_config(
  234. request: Request, form_data: EmbeddingModelUpdateForm, user=Depends(get_admin_user)
  235. ):
  236. log.info(
  237. f"Updating embedding model: {request.app.state.config.RAG_EMBEDDING_MODEL} to {form_data.embedding_model}"
  238. )
  239. if request.app.state.config.RAG_EMBEDDING_ENGINE == "":
  240. # unloads current internal embedding model and clears VRAM cache
  241. request.app.state.ef = None
  242. request.app.state.EMBEDDING_FUNCTION = None
  243. import gc
  244. gc.collect()
  245. if DEVICE_TYPE == "cuda":
  246. import torch
  247. if torch.cuda.is_available():
  248. torch.cuda.empty_cache()
  249. try:
  250. request.app.state.config.RAG_EMBEDDING_ENGINE = form_data.embedding_engine
  251. request.app.state.config.RAG_EMBEDDING_MODEL = form_data.embedding_model
  252. if request.app.state.config.RAG_EMBEDDING_ENGINE in [
  253. "ollama",
  254. "openai",
  255. "azure_openai",
  256. ]:
  257. if form_data.openai_config is not None:
  258. request.app.state.config.RAG_OPENAI_API_BASE_URL = (
  259. form_data.openai_config.url
  260. )
  261. request.app.state.config.RAG_OPENAI_API_KEY = (
  262. form_data.openai_config.key
  263. )
  264. if form_data.ollama_config is not None:
  265. request.app.state.config.RAG_OLLAMA_BASE_URL = (
  266. form_data.ollama_config.url
  267. )
  268. request.app.state.config.RAG_OLLAMA_API_KEY = (
  269. form_data.ollama_config.key
  270. )
  271. if form_data.azure_openai_config is not None:
  272. request.app.state.config.RAG_AZURE_OPENAI_BASE_URL = (
  273. form_data.azure_openai_config.url
  274. )
  275. request.app.state.config.RAG_AZURE_OPENAI_API_KEY = (
  276. form_data.azure_openai_config.key
  277. )
  278. request.app.state.config.RAG_AZURE_OPENAI_API_VERSION = (
  279. form_data.azure_openai_config.version
  280. )
  281. request.app.state.config.RAG_EMBEDDING_BATCH_SIZE = (
  282. form_data.embedding_batch_size
  283. )
  284. request.app.state.ef = get_ef(
  285. request.app.state.config.RAG_EMBEDDING_ENGINE,
  286. request.app.state.config.RAG_EMBEDDING_MODEL,
  287. )
  288. request.app.state.EMBEDDING_FUNCTION = get_embedding_function(
  289. request.app.state.config.RAG_EMBEDDING_ENGINE,
  290. request.app.state.config.RAG_EMBEDDING_MODEL,
  291. request.app.state.ef,
  292. (
  293. request.app.state.config.RAG_OPENAI_API_BASE_URL
  294. if request.app.state.config.RAG_EMBEDDING_ENGINE == "openai"
  295. else (
  296. request.app.state.config.RAG_OLLAMA_BASE_URL
  297. if request.app.state.config.RAG_EMBEDDING_ENGINE == "ollama"
  298. else request.app.state.config.RAG_AZURE_OPENAI_BASE_URL
  299. )
  300. ),
  301. (
  302. request.app.state.config.RAG_OPENAI_API_KEY
  303. if request.app.state.config.RAG_EMBEDDING_ENGINE == "openai"
  304. else (
  305. request.app.state.config.RAG_OLLAMA_API_KEY
  306. if request.app.state.config.RAG_EMBEDDING_ENGINE == "ollama"
  307. else request.app.state.config.RAG_AZURE_OPENAI_API_KEY
  308. )
  309. ),
  310. request.app.state.config.RAG_EMBEDDING_BATCH_SIZE,
  311. azure_api_version=(
  312. request.app.state.config.RAG_AZURE_OPENAI_API_VERSION
  313. if request.app.state.config.RAG_EMBEDDING_ENGINE == "azure_openai"
  314. else None
  315. ),
  316. )
  317. return {
  318. "status": True,
  319. "embedding_engine": request.app.state.config.RAG_EMBEDDING_ENGINE,
  320. "embedding_model": request.app.state.config.RAG_EMBEDDING_MODEL,
  321. "embedding_batch_size": request.app.state.config.RAG_EMBEDDING_BATCH_SIZE,
  322. "openai_config": {
  323. "url": request.app.state.config.RAG_OPENAI_API_BASE_URL,
  324. "key": request.app.state.config.RAG_OPENAI_API_KEY,
  325. },
  326. "ollama_config": {
  327. "url": request.app.state.config.RAG_OLLAMA_BASE_URL,
  328. "key": request.app.state.config.RAG_OLLAMA_API_KEY,
  329. },
  330. "azure_openai_config": {
  331. "url": request.app.state.config.RAG_AZURE_OPENAI_BASE_URL,
  332. "key": request.app.state.config.RAG_AZURE_OPENAI_API_KEY,
  333. "version": request.app.state.config.RAG_AZURE_OPENAI_API_VERSION,
  334. },
  335. }
  336. except Exception as e:
  337. log.exception(f"Problem updating embedding model: {e}")
  338. raise HTTPException(
  339. status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
  340. detail=ERROR_MESSAGES.DEFAULT(e),
  341. )
  342. @router.get("/config")
  343. async def get_rag_config(request: Request, user=Depends(get_admin_user)):
  344. return {
  345. "status": True,
  346. # RAG settings
  347. "RAG_TEMPLATE": request.app.state.config.RAG_TEMPLATE,
  348. "TOP_K": request.app.state.config.TOP_K,
  349. "BYPASS_EMBEDDING_AND_RETRIEVAL": request.app.state.config.BYPASS_EMBEDDING_AND_RETRIEVAL,
  350. "RAG_FULL_CONTEXT": request.app.state.config.RAG_FULL_CONTEXT,
  351. # Hybrid search settings
  352. "ENABLE_RAG_HYBRID_SEARCH": request.app.state.config.ENABLE_RAG_HYBRID_SEARCH,
  353. "TOP_K_RERANKER": request.app.state.config.TOP_K_RERANKER,
  354. "RELEVANCE_THRESHOLD": request.app.state.config.RELEVANCE_THRESHOLD,
  355. "HYBRID_BM25_WEIGHT": request.app.state.config.HYBRID_BM25_WEIGHT,
  356. # Content extraction settings
  357. "CONTENT_EXTRACTION_ENGINE": request.app.state.config.CONTENT_EXTRACTION_ENGINE,
  358. "PDF_EXTRACT_IMAGES": request.app.state.config.PDF_EXTRACT_IMAGES,
  359. "DATALAB_MARKER_API_KEY": request.app.state.config.DATALAB_MARKER_API_KEY,
  360. "DATALAB_MARKER_API_BASE_URL": request.app.state.config.DATALAB_MARKER_API_BASE_URL,
  361. "DATALAB_MARKER_ADDITIONAL_CONFIG": request.app.state.config.DATALAB_MARKER_ADDITIONAL_CONFIG,
  362. "DATALAB_MARKER_SKIP_CACHE": request.app.state.config.DATALAB_MARKER_SKIP_CACHE,
  363. "DATALAB_MARKER_FORCE_OCR": request.app.state.config.DATALAB_MARKER_FORCE_OCR,
  364. "DATALAB_MARKER_PAGINATE": request.app.state.config.DATALAB_MARKER_PAGINATE,
  365. "DATALAB_MARKER_STRIP_EXISTING_OCR": request.app.state.config.DATALAB_MARKER_STRIP_EXISTING_OCR,
  366. "DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION": request.app.state.config.DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION,
  367. "DATALAB_MARKER_FORMAT_LINES": request.app.state.config.DATALAB_MARKER_FORMAT_LINES,
  368. "DATALAB_MARKER_USE_LLM": request.app.state.config.DATALAB_MARKER_USE_LLM,
  369. "DATALAB_MARKER_OUTPUT_FORMAT": request.app.state.config.DATALAB_MARKER_OUTPUT_FORMAT,
  370. "EXTERNAL_DOCUMENT_LOADER_URL": request.app.state.config.EXTERNAL_DOCUMENT_LOADER_URL,
  371. "EXTERNAL_DOCUMENT_LOADER_API_KEY": request.app.state.config.EXTERNAL_DOCUMENT_LOADER_API_KEY,
  372. "TIKA_SERVER_URL": request.app.state.config.TIKA_SERVER_URL,
  373. "DOCLING_SERVER_URL": request.app.state.config.DOCLING_SERVER_URL,
  374. "DOCLING_PARAMS": request.app.state.config.DOCLING_PARAMS,
  375. "DOCLING_DO_OCR": request.app.state.config.DOCLING_DO_OCR,
  376. "DOCLING_FORCE_OCR": request.app.state.config.DOCLING_FORCE_OCR,
  377. "DOCLING_OCR_ENGINE": request.app.state.config.DOCLING_OCR_ENGINE,
  378. "DOCLING_OCR_LANG": request.app.state.config.DOCLING_OCR_LANG,
  379. "DOCLING_PDF_BACKEND": request.app.state.config.DOCLING_PDF_BACKEND,
  380. "DOCLING_TABLE_MODE": request.app.state.config.DOCLING_TABLE_MODE,
  381. "DOCLING_PIPELINE": request.app.state.config.DOCLING_PIPELINE,
  382. "DOCLING_DO_PICTURE_DESCRIPTION": request.app.state.config.DOCLING_DO_PICTURE_DESCRIPTION,
  383. "DOCLING_PICTURE_DESCRIPTION_MODE": request.app.state.config.DOCLING_PICTURE_DESCRIPTION_MODE,
  384. "DOCLING_PICTURE_DESCRIPTION_LOCAL": request.app.state.config.DOCLING_PICTURE_DESCRIPTION_LOCAL,
  385. "DOCLING_PICTURE_DESCRIPTION_API": request.app.state.config.DOCLING_PICTURE_DESCRIPTION_API,
  386. "DOCUMENT_INTELLIGENCE_ENDPOINT": request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT,
  387. "DOCUMENT_INTELLIGENCE_KEY": request.app.state.config.DOCUMENT_INTELLIGENCE_KEY,
  388. "MISTRAL_OCR_API_KEY": request.app.state.config.MISTRAL_OCR_API_KEY,
  389. # Reranking settings
  390. "RAG_RERANKING_MODEL": request.app.state.config.RAG_RERANKING_MODEL,
  391. "RAG_RERANKING_ENGINE": request.app.state.config.RAG_RERANKING_ENGINE,
  392. "RAG_EXTERNAL_RERANKER_URL": request.app.state.config.RAG_EXTERNAL_RERANKER_URL,
  393. "RAG_EXTERNAL_RERANKER_API_KEY": request.app.state.config.RAG_EXTERNAL_RERANKER_API_KEY,
  394. # Chunking settings
  395. "TEXT_SPLITTER": request.app.state.config.TEXT_SPLITTER,
  396. "CHUNK_SIZE": request.app.state.config.CHUNK_SIZE,
  397. "CHUNK_OVERLAP": request.app.state.config.CHUNK_OVERLAP,
  398. # File upload settings
  399. "FILE_MAX_SIZE": request.app.state.config.FILE_MAX_SIZE,
  400. "FILE_MAX_COUNT": request.app.state.config.FILE_MAX_COUNT,
  401. "FILE_IMAGE_COMPRESSION_WIDTH": request.app.state.config.FILE_IMAGE_COMPRESSION_WIDTH,
  402. "FILE_IMAGE_COMPRESSION_HEIGHT": request.app.state.config.FILE_IMAGE_COMPRESSION_HEIGHT,
  403. "ALLOWED_FILE_EXTENSIONS": request.app.state.config.ALLOWED_FILE_EXTENSIONS,
  404. # Integration settings
  405. "ENABLE_GOOGLE_DRIVE_INTEGRATION": request.app.state.config.ENABLE_GOOGLE_DRIVE_INTEGRATION,
  406. "ENABLE_ONEDRIVE_INTEGRATION": request.app.state.config.ENABLE_ONEDRIVE_INTEGRATION,
  407. # Web search settings
  408. "web": {
  409. "ENABLE_WEB_SEARCH": request.app.state.config.ENABLE_WEB_SEARCH,
  410. "WEB_SEARCH_ENGINE": request.app.state.config.WEB_SEARCH_ENGINE,
  411. "WEB_SEARCH_TRUST_ENV": request.app.state.config.WEB_SEARCH_TRUST_ENV,
  412. "WEB_SEARCH_RESULT_COUNT": request.app.state.config.WEB_SEARCH_RESULT_COUNT,
  413. "WEB_SEARCH_CONCURRENT_REQUESTS": request.app.state.config.WEB_SEARCH_CONCURRENT_REQUESTS,
  414. "WEB_LOADER_CONCURRENT_REQUESTS": request.app.state.config.WEB_LOADER_CONCURRENT_REQUESTS,
  415. "WEB_SEARCH_DOMAIN_FILTER_LIST": request.app.state.config.WEB_SEARCH_DOMAIN_FILTER_LIST,
  416. "BYPASS_WEB_SEARCH_EMBEDDING_AND_RETRIEVAL": request.app.state.config.BYPASS_WEB_SEARCH_EMBEDDING_AND_RETRIEVAL,
  417. "BYPASS_WEB_SEARCH_WEB_LOADER": request.app.state.config.BYPASS_WEB_SEARCH_WEB_LOADER,
  418. "OLLAMA_CLOUD_WEB_SEARCH_API_KEY": request.app.state.config.OLLAMA_CLOUD_WEB_SEARCH_API_KEY,
  419. "SEARXNG_QUERY_URL": request.app.state.config.SEARXNG_QUERY_URL,
  420. "YACY_QUERY_URL": request.app.state.config.YACY_QUERY_URL,
  421. "YACY_USERNAME": request.app.state.config.YACY_USERNAME,
  422. "YACY_PASSWORD": request.app.state.config.YACY_PASSWORD,
  423. "GOOGLE_PSE_API_KEY": request.app.state.config.GOOGLE_PSE_API_KEY,
  424. "GOOGLE_PSE_ENGINE_ID": request.app.state.config.GOOGLE_PSE_ENGINE_ID,
  425. "BRAVE_SEARCH_API_KEY": request.app.state.config.BRAVE_SEARCH_API_KEY,
  426. "KAGI_SEARCH_API_KEY": request.app.state.config.KAGI_SEARCH_API_KEY,
  427. "MOJEEK_SEARCH_API_KEY": request.app.state.config.MOJEEK_SEARCH_API_KEY,
  428. "BOCHA_SEARCH_API_KEY": request.app.state.config.BOCHA_SEARCH_API_KEY,
  429. "SERPSTACK_API_KEY": request.app.state.config.SERPSTACK_API_KEY,
  430. "SERPSTACK_HTTPS": request.app.state.config.SERPSTACK_HTTPS,
  431. "SERPER_API_KEY": request.app.state.config.SERPER_API_KEY,
  432. "SERPLY_API_KEY": request.app.state.config.SERPLY_API_KEY,
  433. "TAVILY_API_KEY": request.app.state.config.TAVILY_API_KEY,
  434. "SEARCHAPI_API_KEY": request.app.state.config.SEARCHAPI_API_KEY,
  435. "SEARCHAPI_ENGINE": request.app.state.config.SEARCHAPI_ENGINE,
  436. "SERPAPI_API_KEY": request.app.state.config.SERPAPI_API_KEY,
  437. "SERPAPI_ENGINE": request.app.state.config.SERPAPI_ENGINE,
  438. "JINA_API_KEY": request.app.state.config.JINA_API_KEY,
  439. "BING_SEARCH_V7_ENDPOINT": request.app.state.config.BING_SEARCH_V7_ENDPOINT,
  440. "BING_SEARCH_V7_SUBSCRIPTION_KEY": request.app.state.config.BING_SEARCH_V7_SUBSCRIPTION_KEY,
  441. "EXA_API_KEY": request.app.state.config.EXA_API_KEY,
  442. "PERPLEXITY_API_KEY": request.app.state.config.PERPLEXITY_API_KEY,
  443. "PERPLEXITY_MODEL": request.app.state.config.PERPLEXITY_MODEL,
  444. "PERPLEXITY_SEARCH_CONTEXT_USAGE": request.app.state.config.PERPLEXITY_SEARCH_CONTEXT_USAGE,
  445. "SOUGOU_API_SID": request.app.state.config.SOUGOU_API_SID,
  446. "SOUGOU_API_SK": request.app.state.config.SOUGOU_API_SK,
  447. "WEB_LOADER_ENGINE": request.app.state.config.WEB_LOADER_ENGINE,
  448. "ENABLE_WEB_LOADER_SSL_VERIFICATION": request.app.state.config.ENABLE_WEB_LOADER_SSL_VERIFICATION,
  449. "PLAYWRIGHT_WS_URL": request.app.state.config.PLAYWRIGHT_WS_URL,
  450. "PLAYWRIGHT_TIMEOUT": request.app.state.config.PLAYWRIGHT_TIMEOUT,
  451. "FIRECRAWL_API_KEY": request.app.state.config.FIRECRAWL_API_KEY,
  452. "FIRECRAWL_API_BASE_URL": request.app.state.config.FIRECRAWL_API_BASE_URL,
  453. "TAVILY_EXTRACT_DEPTH": request.app.state.config.TAVILY_EXTRACT_DEPTH,
  454. "EXTERNAL_WEB_SEARCH_URL": request.app.state.config.EXTERNAL_WEB_SEARCH_URL,
  455. "EXTERNAL_WEB_SEARCH_API_KEY": request.app.state.config.EXTERNAL_WEB_SEARCH_API_KEY,
  456. "EXTERNAL_WEB_LOADER_URL": request.app.state.config.EXTERNAL_WEB_LOADER_URL,
  457. "EXTERNAL_WEB_LOADER_API_KEY": request.app.state.config.EXTERNAL_WEB_LOADER_API_KEY,
  458. "YOUTUBE_LOADER_LANGUAGE": request.app.state.config.YOUTUBE_LOADER_LANGUAGE,
  459. "YOUTUBE_LOADER_PROXY_URL": request.app.state.config.YOUTUBE_LOADER_PROXY_URL,
  460. "YOUTUBE_LOADER_TRANSLATION": request.app.state.YOUTUBE_LOADER_TRANSLATION,
  461. },
  462. }
class WebConfig(BaseModel):
    # Web search / web loader section of the retrieval config form; it is
    # nested under ConfigForm as the optional ``web`` field.
    #
    # NOTE: when a ``web`` section is submitted, update_rag_config copies every
    # field below onto the app config as-is. A field that is left out (None)
    # therefore overwrites the stored value instead of preserving it.

    # Web search settings
    ENABLE_WEB_SEARCH: Optional[bool] = None
    WEB_SEARCH_ENGINE: Optional[str] = None
    WEB_SEARCH_TRUST_ENV: Optional[bool] = None
    WEB_SEARCH_RESULT_COUNT: Optional[int] = None
    WEB_SEARCH_CONCURRENT_REQUESTS: Optional[int] = None
    WEB_LOADER_CONCURRENT_REQUESTS: Optional[int] = None
    # The only field whose default is an empty list rather than None.
    WEB_SEARCH_DOMAIN_FILTER_LIST: Optional[List[str]] = []
    BYPASS_WEB_SEARCH_EMBEDDING_AND_RETRIEVAL: Optional[bool] = None
    BYPASS_WEB_SEARCH_WEB_LOADER: Optional[bool] = None
    OLLAMA_CLOUD_WEB_SEARCH_API_KEY: Optional[str] = None
    SEARXNG_QUERY_URL: Optional[str] = None
    YACY_QUERY_URL: Optional[str] = None
    YACY_USERNAME: Optional[str] = None
    YACY_PASSWORD: Optional[str] = None
    GOOGLE_PSE_API_KEY: Optional[str] = None
    GOOGLE_PSE_ENGINE_ID: Optional[str] = None
    BRAVE_SEARCH_API_KEY: Optional[str] = None
    KAGI_SEARCH_API_KEY: Optional[str] = None
    MOJEEK_SEARCH_API_KEY: Optional[str] = None
    BOCHA_SEARCH_API_KEY: Optional[str] = None
    SERPSTACK_API_KEY: Optional[str] = None
    SERPSTACK_HTTPS: Optional[bool] = None
    SERPER_API_KEY: Optional[str] = None
    SERPLY_API_KEY: Optional[str] = None
    TAVILY_API_KEY: Optional[str] = None
    SEARCHAPI_API_KEY: Optional[str] = None
    SEARCHAPI_ENGINE: Optional[str] = None
    SERPAPI_API_KEY: Optional[str] = None
    SERPAPI_ENGINE: Optional[str] = None
    JINA_API_KEY: Optional[str] = None
    BING_SEARCH_V7_ENDPOINT: Optional[str] = None
    BING_SEARCH_V7_SUBSCRIPTION_KEY: Optional[str] = None
    EXA_API_KEY: Optional[str] = None
    PERPLEXITY_API_KEY: Optional[str] = None
    PERPLEXITY_MODEL: Optional[str] = None
    PERPLEXITY_SEARCH_CONTEXT_USAGE: Optional[str] = None
    SOUGOU_API_SID: Optional[str] = None
    SOUGOU_API_SK: Optional[str] = None

    # Web loader settings
    WEB_LOADER_ENGINE: Optional[str] = None
    ENABLE_WEB_LOADER_SSL_VERIFICATION: Optional[bool] = None
    PLAYWRIGHT_WS_URL: Optional[str] = None
    PLAYWRIGHT_TIMEOUT: Optional[int] = None
    FIRECRAWL_API_KEY: Optional[str] = None
    FIRECRAWL_API_BASE_URL: Optional[str] = None
    TAVILY_EXTRACT_DEPTH: Optional[str] = None
    EXTERNAL_WEB_SEARCH_URL: Optional[str] = None
    EXTERNAL_WEB_SEARCH_API_KEY: Optional[str] = None
    EXTERNAL_WEB_LOADER_URL: Optional[str] = None
    EXTERNAL_WEB_LOADER_API_KEY: Optional[str] = None
    YOUTUBE_LOADER_LANGUAGE: Optional[List[str]] = None
    YOUTUBE_LOADER_PROXY_URL: Optional[str] = None
    # Unlike the other fields, update_rag_config stores this one on
    # ``request.app.state`` directly rather than on ``request.app.state.config``.
    YOUTUBE_LOADER_TRANSLATION: Optional[str] = None
class ConfigForm(BaseModel):
    # Request body for POST /config/update (see update_rag_config).
    #
    # Every field is optional. For most fields the handler treats None as
    # "keep the currently stored value"; the exceptions are called out below.

    # RAG settings
    RAG_TEMPLATE: Optional[str] = None
    TOP_K: Optional[int] = None
    BYPASS_EMBEDDING_AND_RETRIEVAL: Optional[bool] = None
    RAG_FULL_CONTEXT: Optional[bool] = None

    # Hybrid search settings
    ENABLE_RAG_HYBRID_SEARCH: Optional[bool] = None
    TOP_K_RERANKER: Optional[int] = None
    RELEVANCE_THRESHOLD: Optional[float] = None
    HYBRID_BM25_WEIGHT: Optional[float] = None

    # Content extraction settings
    CONTENT_EXTRACTION_ENGINE: Optional[str] = None
    PDF_EXTRACT_IMAGES: Optional[bool] = None
    DATALAB_MARKER_API_KEY: Optional[str] = None
    DATALAB_MARKER_API_BASE_URL: Optional[str] = None
    DATALAB_MARKER_ADDITIONAL_CONFIG: Optional[str] = None
    DATALAB_MARKER_SKIP_CACHE: Optional[bool] = None
    DATALAB_MARKER_FORCE_OCR: Optional[bool] = None
    DATALAB_MARKER_PAGINATE: Optional[bool] = None
    DATALAB_MARKER_STRIP_EXISTING_OCR: Optional[bool] = None
    DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION: Optional[bool] = None
    DATALAB_MARKER_FORMAT_LINES: Optional[bool] = None
    DATALAB_MARKER_USE_LLM: Optional[bool] = None
    DATALAB_MARKER_OUTPUT_FORMAT: Optional[str] = None
    EXTERNAL_DOCUMENT_LOADER_URL: Optional[str] = None
    EXTERNAL_DOCUMENT_LOADER_API_KEY: Optional[str] = None
    TIKA_SERVER_URL: Optional[str] = None
    DOCLING_SERVER_URL: Optional[str] = None
    DOCLING_PARAMS: Optional[dict] = None
    DOCLING_DO_OCR: Optional[bool] = None
    DOCLING_FORCE_OCR: Optional[bool] = None
    DOCLING_OCR_ENGINE: Optional[str] = None
    DOCLING_OCR_LANG: Optional[str] = None
    DOCLING_PDF_BACKEND: Optional[str] = None
    DOCLING_TABLE_MODE: Optional[str] = None
    DOCLING_PIPELINE: Optional[str] = None
    DOCLING_DO_PICTURE_DESCRIPTION: Optional[bool] = None
    DOCLING_PICTURE_DESCRIPTION_MODE: Optional[str] = None
    DOCLING_PICTURE_DESCRIPTION_LOCAL: Optional[dict] = None
    DOCLING_PICTURE_DESCRIPTION_API: Optional[dict] = None
    DOCUMENT_INTELLIGENCE_ENDPOINT: Optional[str] = None
    DOCUMENT_INTELLIGENCE_KEY: Optional[str] = None
    MISTRAL_OCR_API_KEY: Optional[str] = None

    # Reranking settings
    RAG_RERANKING_MODEL: Optional[str] = None
    RAG_RERANKING_ENGINE: Optional[str] = None
    RAG_EXTERNAL_RERANKER_URL: Optional[str] = None
    RAG_EXTERNAL_RERANKER_API_KEY: Optional[str] = None

    # Chunking settings
    TEXT_SPLITTER: Optional[str] = None
    CHUNK_SIZE: Optional[int] = None
    CHUNK_OVERLAP: Optional[int] = None

    # File upload settings
    # NOTE: the four size/count/compression fields are written to the app
    # config unconditionally by update_rag_config, so leaving one as None
    # replaces the stored value with None. ALLOWED_FILE_EXTENSIONS keeps its
    # stored value when None.
    FILE_MAX_SIZE: Optional[int] = None
    FILE_MAX_COUNT: Optional[int] = None
    FILE_IMAGE_COMPRESSION_WIDTH: Optional[int] = None
    FILE_IMAGE_COMPRESSION_HEIGHT: Optional[int] = None
    ALLOWED_FILE_EXTENSIONS: Optional[List[str]] = None

    # Integration settings
    ENABLE_GOOGLE_DRIVE_INTEGRATION: Optional[bool] = None
    ENABLE_ONEDRIVE_INTEGRATION: Optional[bool] = None

    # Web search settings
    # When None the handler leaves all web settings untouched; when present,
    # every WebConfig field is copied as-is (including None values).
    web: Optional[WebConfig] = None
  580. @router.post("/config/update")
  581. async def update_rag_config(
  582. request: Request, form_data: ConfigForm, user=Depends(get_admin_user)
  583. ):
  584. # RAG settings
  585. request.app.state.config.RAG_TEMPLATE = (
  586. form_data.RAG_TEMPLATE
  587. if form_data.RAG_TEMPLATE is not None
  588. else request.app.state.config.RAG_TEMPLATE
  589. )
  590. request.app.state.config.TOP_K = (
  591. form_data.TOP_K
  592. if form_data.TOP_K is not None
  593. else request.app.state.config.TOP_K
  594. )
  595. request.app.state.config.BYPASS_EMBEDDING_AND_RETRIEVAL = (
  596. form_data.BYPASS_EMBEDDING_AND_RETRIEVAL
  597. if form_data.BYPASS_EMBEDDING_AND_RETRIEVAL is not None
  598. else request.app.state.config.BYPASS_EMBEDDING_AND_RETRIEVAL
  599. )
  600. request.app.state.config.RAG_FULL_CONTEXT = (
  601. form_data.RAG_FULL_CONTEXT
  602. if form_data.RAG_FULL_CONTEXT is not None
  603. else request.app.state.config.RAG_FULL_CONTEXT
  604. )
  605. # Hybrid search settings
  606. request.app.state.config.ENABLE_RAG_HYBRID_SEARCH = (
  607. form_data.ENABLE_RAG_HYBRID_SEARCH
  608. if form_data.ENABLE_RAG_HYBRID_SEARCH is not None
  609. else request.app.state.config.ENABLE_RAG_HYBRID_SEARCH
  610. )
  611. request.app.state.config.TOP_K_RERANKER = (
  612. form_data.TOP_K_RERANKER
  613. if form_data.TOP_K_RERANKER is not None
  614. else request.app.state.config.TOP_K_RERANKER
  615. )
  616. request.app.state.config.RELEVANCE_THRESHOLD = (
  617. form_data.RELEVANCE_THRESHOLD
  618. if form_data.RELEVANCE_THRESHOLD is not None
  619. else request.app.state.config.RELEVANCE_THRESHOLD
  620. )
  621. request.app.state.config.HYBRID_BM25_WEIGHT = (
  622. form_data.HYBRID_BM25_WEIGHT
  623. if form_data.HYBRID_BM25_WEIGHT is not None
  624. else request.app.state.config.HYBRID_BM25_WEIGHT
  625. )
  626. # Content extraction settings
  627. request.app.state.config.CONTENT_EXTRACTION_ENGINE = (
  628. form_data.CONTENT_EXTRACTION_ENGINE
  629. if form_data.CONTENT_EXTRACTION_ENGINE is not None
  630. else request.app.state.config.CONTENT_EXTRACTION_ENGINE
  631. )
  632. request.app.state.config.PDF_EXTRACT_IMAGES = (
  633. form_data.PDF_EXTRACT_IMAGES
  634. if form_data.PDF_EXTRACT_IMAGES is not None
  635. else request.app.state.config.PDF_EXTRACT_IMAGES
  636. )
  637. request.app.state.config.DATALAB_MARKER_API_KEY = (
  638. form_data.DATALAB_MARKER_API_KEY
  639. if form_data.DATALAB_MARKER_API_KEY is not None
  640. else request.app.state.config.DATALAB_MARKER_API_KEY
  641. )
  642. request.app.state.config.DATALAB_MARKER_API_BASE_URL = (
  643. form_data.DATALAB_MARKER_API_BASE_URL
  644. if form_data.DATALAB_MARKER_API_BASE_URL is not None
  645. else request.app.state.config.DATALAB_MARKER_API_BASE_URL
  646. )
  647. request.app.state.config.DATALAB_MARKER_ADDITIONAL_CONFIG = (
  648. form_data.DATALAB_MARKER_ADDITIONAL_CONFIG
  649. if form_data.DATALAB_MARKER_ADDITIONAL_CONFIG is not None
  650. else request.app.state.config.DATALAB_MARKER_ADDITIONAL_CONFIG
  651. )
  652. request.app.state.config.DATALAB_MARKER_SKIP_CACHE = (
  653. form_data.DATALAB_MARKER_SKIP_CACHE
  654. if form_data.DATALAB_MARKER_SKIP_CACHE is not None
  655. else request.app.state.config.DATALAB_MARKER_SKIP_CACHE
  656. )
  657. request.app.state.config.DATALAB_MARKER_FORCE_OCR = (
  658. form_data.DATALAB_MARKER_FORCE_OCR
  659. if form_data.DATALAB_MARKER_FORCE_OCR is not None
  660. else request.app.state.config.DATALAB_MARKER_FORCE_OCR
  661. )
  662. request.app.state.config.DATALAB_MARKER_PAGINATE = (
  663. form_data.DATALAB_MARKER_PAGINATE
  664. if form_data.DATALAB_MARKER_PAGINATE is not None
  665. else request.app.state.config.DATALAB_MARKER_PAGINATE
  666. )
  667. request.app.state.config.DATALAB_MARKER_STRIP_EXISTING_OCR = (
  668. form_data.DATALAB_MARKER_STRIP_EXISTING_OCR
  669. if form_data.DATALAB_MARKER_STRIP_EXISTING_OCR is not None
  670. else request.app.state.config.DATALAB_MARKER_STRIP_EXISTING_OCR
  671. )
  672. request.app.state.config.DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION = (
  673. form_data.DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION
  674. if form_data.DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION is not None
  675. else request.app.state.config.DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION
  676. )
  677. request.app.state.config.DATALAB_MARKER_FORMAT_LINES = (
  678. form_data.DATALAB_MARKER_FORMAT_LINES
  679. if form_data.DATALAB_MARKER_FORMAT_LINES is not None
  680. else request.app.state.config.DATALAB_MARKER_FORMAT_LINES
  681. )
  682. request.app.state.config.DATALAB_MARKER_OUTPUT_FORMAT = (
  683. form_data.DATALAB_MARKER_OUTPUT_FORMAT
  684. if form_data.DATALAB_MARKER_OUTPUT_FORMAT is not None
  685. else request.app.state.config.DATALAB_MARKER_OUTPUT_FORMAT
  686. )
  687. request.app.state.config.DATALAB_MARKER_USE_LLM = (
  688. form_data.DATALAB_MARKER_USE_LLM
  689. if form_data.DATALAB_MARKER_USE_LLM is not None
  690. else request.app.state.config.DATALAB_MARKER_USE_LLM
  691. )
  692. request.app.state.config.EXTERNAL_DOCUMENT_LOADER_URL = (
  693. form_data.EXTERNAL_DOCUMENT_LOADER_URL
  694. if form_data.EXTERNAL_DOCUMENT_LOADER_URL is not None
  695. else request.app.state.config.EXTERNAL_DOCUMENT_LOADER_URL
  696. )
  697. request.app.state.config.EXTERNAL_DOCUMENT_LOADER_API_KEY = (
  698. form_data.EXTERNAL_DOCUMENT_LOADER_API_KEY
  699. if form_data.EXTERNAL_DOCUMENT_LOADER_API_KEY is not None
  700. else request.app.state.config.EXTERNAL_DOCUMENT_LOADER_API_KEY
  701. )
  702. request.app.state.config.TIKA_SERVER_URL = (
  703. form_data.TIKA_SERVER_URL
  704. if form_data.TIKA_SERVER_URL is not None
  705. else request.app.state.config.TIKA_SERVER_URL
  706. )
  707. request.app.state.config.DOCLING_SERVER_URL = (
  708. form_data.DOCLING_SERVER_URL
  709. if form_data.DOCLING_SERVER_URL is not None
  710. else request.app.state.config.DOCLING_SERVER_URL
  711. )
  712. request.app.state.config.DOCLING_PARAMS = (
  713. form_data.DOCLING_PARAMS
  714. if form_data.DOCLING_PARAMS is not None
  715. else request.app.state.config.DOCLING_PARAMS
  716. )
  717. request.app.state.config.DOCLING_DO_OCR = (
  718. form_data.DOCLING_DO_OCR
  719. if form_data.DOCLING_DO_OCR is not None
  720. else request.app.state.config.DOCLING_DO_OCR
  721. )
  722. request.app.state.config.DOCLING_FORCE_OCR = (
  723. form_data.DOCLING_FORCE_OCR
  724. if form_data.DOCLING_FORCE_OCR is not None
  725. else request.app.state.config.DOCLING_FORCE_OCR
  726. )
  727. request.app.state.config.DOCLING_OCR_ENGINE = (
  728. form_data.DOCLING_OCR_ENGINE
  729. if form_data.DOCLING_OCR_ENGINE is not None
  730. else request.app.state.config.DOCLING_OCR_ENGINE
  731. )
  732. request.app.state.config.DOCLING_OCR_LANG = (
  733. form_data.DOCLING_OCR_LANG
  734. if form_data.DOCLING_OCR_LANG is not None
  735. else request.app.state.config.DOCLING_OCR_LANG
  736. )
  737. request.app.state.config.DOCLING_PDF_BACKEND = (
  738. form_data.DOCLING_PDF_BACKEND
  739. if form_data.DOCLING_PDF_BACKEND is not None
  740. else request.app.state.config.DOCLING_PDF_BACKEND
  741. )
  742. request.app.state.config.DOCLING_TABLE_MODE = (
  743. form_data.DOCLING_TABLE_MODE
  744. if form_data.DOCLING_TABLE_MODE is not None
  745. else request.app.state.config.DOCLING_TABLE_MODE
  746. )
  747. request.app.state.config.DOCLING_PIPELINE = (
  748. form_data.DOCLING_PIPELINE
  749. if form_data.DOCLING_PIPELINE is not None
  750. else request.app.state.config.DOCLING_PIPELINE
  751. )
  752. request.app.state.config.DOCLING_DO_PICTURE_DESCRIPTION = (
  753. form_data.DOCLING_DO_PICTURE_DESCRIPTION
  754. if form_data.DOCLING_DO_PICTURE_DESCRIPTION is not None
  755. else request.app.state.config.DOCLING_DO_PICTURE_DESCRIPTION
  756. )
  757. request.app.state.config.DOCLING_PICTURE_DESCRIPTION_MODE = (
  758. form_data.DOCLING_PICTURE_DESCRIPTION_MODE
  759. if form_data.DOCLING_PICTURE_DESCRIPTION_MODE is not None
  760. else request.app.state.config.DOCLING_PICTURE_DESCRIPTION_MODE
  761. )
  762. request.app.state.config.DOCLING_PICTURE_DESCRIPTION_LOCAL = (
  763. form_data.DOCLING_PICTURE_DESCRIPTION_LOCAL
  764. if form_data.DOCLING_PICTURE_DESCRIPTION_LOCAL is not None
  765. else request.app.state.config.DOCLING_PICTURE_DESCRIPTION_LOCAL
  766. )
  767. request.app.state.config.DOCLING_PICTURE_DESCRIPTION_API = (
  768. form_data.DOCLING_PICTURE_DESCRIPTION_API
  769. if form_data.DOCLING_PICTURE_DESCRIPTION_API is not None
  770. else request.app.state.config.DOCLING_PICTURE_DESCRIPTION_API
  771. )
  772. request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT = (
  773. form_data.DOCUMENT_INTELLIGENCE_ENDPOINT
  774. if form_data.DOCUMENT_INTELLIGENCE_ENDPOINT is not None
  775. else request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT
  776. )
  777. request.app.state.config.DOCUMENT_INTELLIGENCE_KEY = (
  778. form_data.DOCUMENT_INTELLIGENCE_KEY
  779. if form_data.DOCUMENT_INTELLIGENCE_KEY is not None
  780. else request.app.state.config.DOCUMENT_INTELLIGENCE_KEY
  781. )
  782. request.app.state.config.MISTRAL_OCR_API_KEY = (
  783. form_data.MISTRAL_OCR_API_KEY
  784. if form_data.MISTRAL_OCR_API_KEY is not None
  785. else request.app.state.config.MISTRAL_OCR_API_KEY
  786. )
  787. # Reranking settings
  788. if request.app.state.config.RAG_RERANKING_ENGINE == "":
  789. # Unloading the internal reranker and clear VRAM memory
  790. request.app.state.rf = None
  791. request.app.state.RERANKING_FUNCTION = None
  792. import gc
  793. gc.collect()
  794. if DEVICE_TYPE == "cuda":
  795. import torch
  796. if torch.cuda.is_available():
  797. torch.cuda.empty_cache()
  798. request.app.state.config.RAG_RERANKING_ENGINE = (
  799. form_data.RAG_RERANKING_ENGINE
  800. if form_data.RAG_RERANKING_ENGINE is not None
  801. else request.app.state.config.RAG_RERANKING_ENGINE
  802. )
  803. request.app.state.config.RAG_EXTERNAL_RERANKER_URL = (
  804. form_data.RAG_EXTERNAL_RERANKER_URL
  805. if form_data.RAG_EXTERNAL_RERANKER_URL is not None
  806. else request.app.state.config.RAG_EXTERNAL_RERANKER_URL
  807. )
  808. request.app.state.config.RAG_EXTERNAL_RERANKER_API_KEY = (
  809. form_data.RAG_EXTERNAL_RERANKER_API_KEY
  810. if form_data.RAG_EXTERNAL_RERANKER_API_KEY is not None
  811. else request.app.state.config.RAG_EXTERNAL_RERANKER_API_KEY
  812. )
  813. log.info(
  814. f"Updating reranking model: {request.app.state.config.RAG_RERANKING_MODEL} to {form_data.RAG_RERANKING_MODEL}"
  815. )
  816. try:
  817. request.app.state.config.RAG_RERANKING_MODEL = (
  818. form_data.RAG_RERANKING_MODEL
  819. if form_data.RAG_RERANKING_MODEL is not None
  820. else request.app.state.config.RAG_RERANKING_MODEL
  821. )
  822. try:
  823. if (
  824. request.app.state.config.ENABLE_RAG_HYBRID_SEARCH
  825. and not request.app.state.config.BYPASS_EMBEDDING_AND_RETRIEVAL
  826. ):
  827. request.app.state.rf = get_rf(
  828. request.app.state.config.RAG_RERANKING_ENGINE,
  829. request.app.state.config.RAG_RERANKING_MODEL,
  830. request.app.state.config.RAG_EXTERNAL_RERANKER_URL,
  831. request.app.state.config.RAG_EXTERNAL_RERANKER_API_KEY,
  832. True,
  833. )
  834. request.app.state.RERANKING_FUNCTION = get_reranking_function(
  835. request.app.state.config.RAG_RERANKING_ENGINE,
  836. request.app.state.config.RAG_RERANKING_MODEL,
  837. request.app.state.rf,
  838. )
  839. except Exception as e:
  840. log.error(f"Error loading reranking model: {e}")
  841. request.app.state.config.ENABLE_RAG_HYBRID_SEARCH = False
  842. except Exception as e:
  843. log.exception(f"Problem updating reranking model: {e}")
  844. raise HTTPException(
  845. status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
  846. detail=ERROR_MESSAGES.DEFAULT(e),
  847. )
  848. # Chunking settings
  849. request.app.state.config.TEXT_SPLITTER = (
  850. form_data.TEXT_SPLITTER
  851. if form_data.TEXT_SPLITTER is not None
  852. else request.app.state.config.TEXT_SPLITTER
  853. )
  854. request.app.state.config.CHUNK_SIZE = (
  855. form_data.CHUNK_SIZE
  856. if form_data.CHUNK_SIZE is not None
  857. else request.app.state.config.CHUNK_SIZE
  858. )
  859. request.app.state.config.CHUNK_OVERLAP = (
  860. form_data.CHUNK_OVERLAP
  861. if form_data.CHUNK_OVERLAP is not None
  862. else request.app.state.config.CHUNK_OVERLAP
  863. )
  864. # File upload settings
  865. request.app.state.config.FILE_MAX_SIZE = form_data.FILE_MAX_SIZE
  866. request.app.state.config.FILE_MAX_COUNT = form_data.FILE_MAX_COUNT
  867. request.app.state.config.FILE_IMAGE_COMPRESSION_WIDTH = (
  868. form_data.FILE_IMAGE_COMPRESSION_WIDTH
  869. )
  870. request.app.state.config.FILE_IMAGE_COMPRESSION_HEIGHT = (
  871. form_data.FILE_IMAGE_COMPRESSION_HEIGHT
  872. )
  873. request.app.state.config.ALLOWED_FILE_EXTENSIONS = (
  874. form_data.ALLOWED_FILE_EXTENSIONS
  875. if form_data.ALLOWED_FILE_EXTENSIONS is not None
  876. else request.app.state.config.ALLOWED_FILE_EXTENSIONS
  877. )
  878. # Integration settings
  879. request.app.state.config.ENABLE_GOOGLE_DRIVE_INTEGRATION = (
  880. form_data.ENABLE_GOOGLE_DRIVE_INTEGRATION
  881. if form_data.ENABLE_GOOGLE_DRIVE_INTEGRATION is not None
  882. else request.app.state.config.ENABLE_GOOGLE_DRIVE_INTEGRATION
  883. )
  884. request.app.state.config.ENABLE_ONEDRIVE_INTEGRATION = (
  885. form_data.ENABLE_ONEDRIVE_INTEGRATION
  886. if form_data.ENABLE_ONEDRIVE_INTEGRATION is not None
  887. else request.app.state.config.ENABLE_ONEDRIVE_INTEGRATION
  888. )
  889. if form_data.web is not None:
  890. # Web search settings
  891. request.app.state.config.ENABLE_WEB_SEARCH = form_data.web.ENABLE_WEB_SEARCH
  892. request.app.state.config.WEB_SEARCH_ENGINE = form_data.web.WEB_SEARCH_ENGINE
  893. request.app.state.config.WEB_SEARCH_TRUST_ENV = (
  894. form_data.web.WEB_SEARCH_TRUST_ENV
  895. )
  896. request.app.state.config.WEB_SEARCH_RESULT_COUNT = (
  897. form_data.web.WEB_SEARCH_RESULT_COUNT
  898. )
  899. request.app.state.config.WEB_SEARCH_CONCURRENT_REQUESTS = (
  900. form_data.web.WEB_SEARCH_CONCURRENT_REQUESTS
  901. )
  902. request.app.state.config.WEB_LOADER_CONCURRENT_REQUESTS = (
  903. form_data.web.WEB_LOADER_CONCURRENT_REQUESTS
  904. )
  905. request.app.state.config.WEB_SEARCH_DOMAIN_FILTER_LIST = (
  906. form_data.web.WEB_SEARCH_DOMAIN_FILTER_LIST
  907. )
  908. request.app.state.config.BYPASS_WEB_SEARCH_EMBEDDING_AND_RETRIEVAL = (
  909. form_data.web.BYPASS_WEB_SEARCH_EMBEDDING_AND_RETRIEVAL
  910. )
  911. request.app.state.config.BYPASS_WEB_SEARCH_WEB_LOADER = (
  912. form_data.web.BYPASS_WEB_SEARCH_WEB_LOADER
  913. )
  914. request.app.state.config.OLLAMA_CLOUD_WEB_SEARCH_API_KEY = (
  915. form_data.web.OLLAMA_CLOUD_WEB_SEARCH_API_KEY
  916. )
  917. request.app.state.config.SEARXNG_QUERY_URL = form_data.web.SEARXNG_QUERY_URL
  918. request.app.state.config.YACY_QUERY_URL = form_data.web.YACY_QUERY_URL
  919. request.app.state.config.YACY_USERNAME = form_data.web.YACY_USERNAME
  920. request.app.state.config.YACY_PASSWORD = form_data.web.YACY_PASSWORD
  921. request.app.state.config.GOOGLE_PSE_API_KEY = form_data.web.GOOGLE_PSE_API_KEY
  922. request.app.state.config.GOOGLE_PSE_ENGINE_ID = (
  923. form_data.web.GOOGLE_PSE_ENGINE_ID
  924. )
  925. request.app.state.config.BRAVE_SEARCH_API_KEY = (
  926. form_data.web.BRAVE_SEARCH_API_KEY
  927. )
  928. request.app.state.config.KAGI_SEARCH_API_KEY = form_data.web.KAGI_SEARCH_API_KEY
  929. request.app.state.config.MOJEEK_SEARCH_API_KEY = (
  930. form_data.web.MOJEEK_SEARCH_API_KEY
  931. )
  932. request.app.state.config.BOCHA_SEARCH_API_KEY = (
  933. form_data.web.BOCHA_SEARCH_API_KEY
  934. )
  935. request.app.state.config.SERPSTACK_API_KEY = form_data.web.SERPSTACK_API_KEY
  936. request.app.state.config.SERPSTACK_HTTPS = form_data.web.SERPSTACK_HTTPS
  937. request.app.state.config.SERPER_API_KEY = form_data.web.SERPER_API_KEY
  938. request.app.state.config.SERPLY_API_KEY = form_data.web.SERPLY_API_KEY
  939. request.app.state.config.TAVILY_API_KEY = form_data.web.TAVILY_API_KEY
  940. request.app.state.config.SEARCHAPI_API_KEY = form_data.web.SEARCHAPI_API_KEY
  941. request.app.state.config.SEARCHAPI_ENGINE = form_data.web.SEARCHAPI_ENGINE
  942. request.app.state.config.SERPAPI_API_KEY = form_data.web.SERPAPI_API_KEY
  943. request.app.state.config.SERPAPI_ENGINE = form_data.web.SERPAPI_ENGINE
  944. request.app.state.config.JINA_API_KEY = form_data.web.JINA_API_KEY
  945. request.app.state.config.BING_SEARCH_V7_ENDPOINT = (
  946. form_data.web.BING_SEARCH_V7_ENDPOINT
  947. )
  948. request.app.state.config.BING_SEARCH_V7_SUBSCRIPTION_KEY = (
  949. form_data.web.BING_SEARCH_V7_SUBSCRIPTION_KEY
  950. )
  951. request.app.state.config.EXA_API_KEY = form_data.web.EXA_API_KEY
  952. request.app.state.config.PERPLEXITY_API_KEY = form_data.web.PERPLEXITY_API_KEY
  953. request.app.state.config.PERPLEXITY_MODEL = form_data.web.PERPLEXITY_MODEL
  954. request.app.state.config.PERPLEXITY_SEARCH_CONTEXT_USAGE = (
  955. form_data.web.PERPLEXITY_SEARCH_CONTEXT_USAGE
  956. )
  957. request.app.state.config.SOUGOU_API_SID = form_data.web.SOUGOU_API_SID
  958. request.app.state.config.SOUGOU_API_SK = form_data.web.SOUGOU_API_SK
  959. # Web loader settings
  960. request.app.state.config.WEB_LOADER_ENGINE = form_data.web.WEB_LOADER_ENGINE
  961. request.app.state.config.ENABLE_WEB_LOADER_SSL_VERIFICATION = (
  962. form_data.web.ENABLE_WEB_LOADER_SSL_VERIFICATION
  963. )
  964. request.app.state.config.PLAYWRIGHT_WS_URL = form_data.web.PLAYWRIGHT_WS_URL
  965. request.app.state.config.PLAYWRIGHT_TIMEOUT = form_data.web.PLAYWRIGHT_TIMEOUT
  966. request.app.state.config.FIRECRAWL_API_KEY = form_data.web.FIRECRAWL_API_KEY
  967. request.app.state.config.FIRECRAWL_API_BASE_URL = (
  968. form_data.web.FIRECRAWL_API_BASE_URL
  969. )
  970. request.app.state.config.EXTERNAL_WEB_SEARCH_URL = (
  971. form_data.web.EXTERNAL_WEB_SEARCH_URL
  972. )
  973. request.app.state.config.EXTERNAL_WEB_SEARCH_API_KEY = (
  974. form_data.web.EXTERNAL_WEB_SEARCH_API_KEY
  975. )
  976. request.app.state.config.EXTERNAL_WEB_LOADER_URL = (
  977. form_data.web.EXTERNAL_WEB_LOADER_URL
  978. )
  979. request.app.state.config.EXTERNAL_WEB_LOADER_API_KEY = (
  980. form_data.web.EXTERNAL_WEB_LOADER_API_KEY
  981. )
  982. request.app.state.config.TAVILY_EXTRACT_DEPTH = (
  983. form_data.web.TAVILY_EXTRACT_DEPTH
  984. )
  985. request.app.state.config.YOUTUBE_LOADER_LANGUAGE = (
  986. form_data.web.YOUTUBE_LOADER_LANGUAGE
  987. )
  988. request.app.state.config.YOUTUBE_LOADER_PROXY_URL = (
  989. form_data.web.YOUTUBE_LOADER_PROXY_URL
  990. )
  991. request.app.state.YOUTUBE_LOADER_TRANSLATION = (
  992. form_data.web.YOUTUBE_LOADER_TRANSLATION
  993. )
  994. return {
  995. "status": True,
  996. # RAG settings
  997. "RAG_TEMPLATE": request.app.state.config.RAG_TEMPLATE,
  998. "TOP_K": request.app.state.config.TOP_K,
  999. "BYPASS_EMBEDDING_AND_RETRIEVAL": request.app.state.config.BYPASS_EMBEDDING_AND_RETRIEVAL,
  1000. "RAG_FULL_CONTEXT": request.app.state.config.RAG_FULL_CONTEXT,
  1001. # Hybrid search settings
  1002. "ENABLE_RAG_HYBRID_SEARCH": request.app.state.config.ENABLE_RAG_HYBRID_SEARCH,
  1003. "TOP_K_RERANKER": request.app.state.config.TOP_K_RERANKER,
  1004. "RELEVANCE_THRESHOLD": request.app.state.config.RELEVANCE_THRESHOLD,
  1005. "HYBRID_BM25_WEIGHT": request.app.state.config.HYBRID_BM25_WEIGHT,
  1006. # Content extraction settings
  1007. "CONTENT_EXTRACTION_ENGINE": request.app.state.config.CONTENT_EXTRACTION_ENGINE,
  1008. "PDF_EXTRACT_IMAGES": request.app.state.config.PDF_EXTRACT_IMAGES,
  1009. "DATALAB_MARKER_API_KEY": request.app.state.config.DATALAB_MARKER_API_KEY,
  1010. "DATALAB_MARKER_API_BASE_URL": request.app.state.config.DATALAB_MARKER_API_BASE_URL,
  1011. "DATALAB_MARKER_ADDITIONAL_CONFIG": request.app.state.config.DATALAB_MARKER_ADDITIONAL_CONFIG,
  1012. "DATALAB_MARKER_SKIP_CACHE": request.app.state.config.DATALAB_MARKER_SKIP_CACHE,
  1013. "DATALAB_MARKER_FORCE_OCR": request.app.state.config.DATALAB_MARKER_FORCE_OCR,
  1014. "DATALAB_MARKER_PAGINATE": request.app.state.config.DATALAB_MARKER_PAGINATE,
  1015. "DATALAB_MARKER_STRIP_EXISTING_OCR": request.app.state.config.DATALAB_MARKER_STRIP_EXISTING_OCR,
  1016. "DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION": request.app.state.config.DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION,
  1017. "DATALAB_MARKER_USE_LLM": request.app.state.config.DATALAB_MARKER_USE_LLM,
  1018. "DATALAB_MARKER_OUTPUT_FORMAT": request.app.state.config.DATALAB_MARKER_OUTPUT_FORMAT,
  1019. "EXTERNAL_DOCUMENT_LOADER_URL": request.app.state.config.EXTERNAL_DOCUMENT_LOADER_URL,
  1020. "EXTERNAL_DOCUMENT_LOADER_API_KEY": request.app.state.config.EXTERNAL_DOCUMENT_LOADER_API_KEY,
  1021. "TIKA_SERVER_URL": request.app.state.config.TIKA_SERVER_URL,
  1022. "DOCLING_SERVER_URL": request.app.state.config.DOCLING_SERVER_URL,
  1023. "DOCLING_PARAMS": request.app.state.config.DOCLING_PARAMS,
  1024. "DOCLING_DO_OCR": request.app.state.config.DOCLING_DO_OCR,
  1025. "DOCLING_FORCE_OCR": request.app.state.config.DOCLING_FORCE_OCR,
  1026. "DOCLING_OCR_ENGINE": request.app.state.config.DOCLING_OCR_ENGINE,
  1027. "DOCLING_OCR_LANG": request.app.state.config.DOCLING_OCR_LANG,
  1028. "DOCLING_PDF_BACKEND": request.app.state.config.DOCLING_PDF_BACKEND,
  1029. "DOCLING_TABLE_MODE": request.app.state.config.DOCLING_TABLE_MODE,
  1030. "DOCLING_PIPELINE": request.app.state.config.DOCLING_PIPELINE,
  1031. "DOCLING_DO_PICTURE_DESCRIPTION": request.app.state.config.DOCLING_DO_PICTURE_DESCRIPTION,
  1032. "DOCLING_PICTURE_DESCRIPTION_MODE": request.app.state.config.DOCLING_PICTURE_DESCRIPTION_MODE,
  1033. "DOCLING_PICTURE_DESCRIPTION_LOCAL": request.app.state.config.DOCLING_PICTURE_DESCRIPTION_LOCAL,
  1034. "DOCLING_PICTURE_DESCRIPTION_API": request.app.state.config.DOCLING_PICTURE_DESCRIPTION_API,
  1035. "DOCUMENT_INTELLIGENCE_ENDPOINT": request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT,
  1036. "DOCUMENT_INTELLIGENCE_KEY": request.app.state.config.DOCUMENT_INTELLIGENCE_KEY,
  1037. "MISTRAL_OCR_API_KEY": request.app.state.config.MISTRAL_OCR_API_KEY,
  1038. # Reranking settings
  1039. "RAG_RERANKING_MODEL": request.app.state.config.RAG_RERANKING_MODEL,
  1040. "RAG_RERANKING_ENGINE": request.app.state.config.RAG_RERANKING_ENGINE,
  1041. "RAG_EXTERNAL_RERANKER_URL": request.app.state.config.RAG_EXTERNAL_RERANKER_URL,
  1042. "RAG_EXTERNAL_RERANKER_API_KEY": request.app.state.config.RAG_EXTERNAL_RERANKER_API_KEY,
  1043. # Chunking settings
  1044. "TEXT_SPLITTER": request.app.state.config.TEXT_SPLITTER,
  1045. "CHUNK_SIZE": request.app.state.config.CHUNK_SIZE,
  1046. "CHUNK_OVERLAP": request.app.state.config.CHUNK_OVERLAP,
  1047. # File upload settings
  1048. "FILE_MAX_SIZE": request.app.state.config.FILE_MAX_SIZE,
  1049. "FILE_MAX_COUNT": request.app.state.config.FILE_MAX_COUNT,
  1050. "FILE_IMAGE_COMPRESSION_WIDTH": request.app.state.config.FILE_IMAGE_COMPRESSION_WIDTH,
  1051. "FILE_IMAGE_COMPRESSION_HEIGHT": request.app.state.config.FILE_IMAGE_COMPRESSION_HEIGHT,
  1052. "ALLOWED_FILE_EXTENSIONS": request.app.state.config.ALLOWED_FILE_EXTENSIONS,
  1053. # Integration settings
  1054. "ENABLE_GOOGLE_DRIVE_INTEGRATION": request.app.state.config.ENABLE_GOOGLE_DRIVE_INTEGRATION,
  1055. "ENABLE_ONEDRIVE_INTEGRATION": request.app.state.config.ENABLE_ONEDRIVE_INTEGRATION,
  1056. # Web search settings
  1057. "web": {
  1058. "ENABLE_WEB_SEARCH": request.app.state.config.ENABLE_WEB_SEARCH,
  1059. "WEB_SEARCH_ENGINE": request.app.state.config.WEB_SEARCH_ENGINE,
  1060. "WEB_SEARCH_TRUST_ENV": request.app.state.config.WEB_SEARCH_TRUST_ENV,
  1061. "WEB_SEARCH_RESULT_COUNT": request.app.state.config.WEB_SEARCH_RESULT_COUNT,
  1062. "WEB_SEARCH_CONCURRENT_REQUESTS": request.app.state.config.WEB_SEARCH_CONCURRENT_REQUESTS,
  1063. "WEB_LOADER_CONCURRENT_REQUESTS": request.app.state.config.WEB_LOADER_CONCURRENT_REQUESTS,
  1064. "WEB_SEARCH_DOMAIN_FILTER_LIST": request.app.state.config.WEB_SEARCH_DOMAIN_FILTER_LIST,
  1065. "BYPASS_WEB_SEARCH_EMBEDDING_AND_RETRIEVAL": request.app.state.config.BYPASS_WEB_SEARCH_EMBEDDING_AND_RETRIEVAL,
  1066. "BYPASS_WEB_SEARCH_WEB_LOADER": request.app.state.config.BYPASS_WEB_SEARCH_WEB_LOADER,
  1067. "OLLAMA_CLOUD_WEB_SEARCH_API_KEY": request.app.state.config.OLLAMA_CLOUD_WEB_SEARCH_API_KEY,
  1068. "SEARXNG_QUERY_URL": request.app.state.config.SEARXNG_QUERY_URL,
  1069. "YACY_QUERY_URL": request.app.state.config.YACY_QUERY_URL,
  1070. "YACY_USERNAME": request.app.state.config.YACY_USERNAME,
  1071. "YACY_PASSWORD": request.app.state.config.YACY_PASSWORD,
  1072. "GOOGLE_PSE_API_KEY": request.app.state.config.GOOGLE_PSE_API_KEY,
  1073. "GOOGLE_PSE_ENGINE_ID": request.app.state.config.GOOGLE_PSE_ENGINE_ID,
  1074. "BRAVE_SEARCH_API_KEY": request.app.state.config.BRAVE_SEARCH_API_KEY,
  1075. "KAGI_SEARCH_API_KEY": request.app.state.config.KAGI_SEARCH_API_KEY,
  1076. "MOJEEK_SEARCH_API_KEY": request.app.state.config.MOJEEK_SEARCH_API_KEY,
  1077. "BOCHA_SEARCH_API_KEY": request.app.state.config.BOCHA_SEARCH_API_KEY,
  1078. "SERPSTACK_API_KEY": request.app.state.config.SERPSTACK_API_KEY,
  1079. "SERPSTACK_HTTPS": request.app.state.config.SERPSTACK_HTTPS,
  1080. "SERPER_API_KEY": request.app.state.config.SERPER_API_KEY,
  1081. "SERPLY_API_KEY": request.app.state.config.SERPLY_API_KEY,
  1082. "TAVILY_API_KEY": request.app.state.config.TAVILY_API_KEY,
  1083. "SEARCHAPI_API_KEY": request.app.state.config.SEARCHAPI_API_KEY,
  1084. "SEARCHAPI_ENGINE": request.app.state.config.SEARCHAPI_ENGINE,
  1085. "SERPAPI_API_KEY": request.app.state.config.SERPAPI_API_KEY,
  1086. "SERPAPI_ENGINE": request.app.state.config.SERPAPI_ENGINE,
  1087. "JINA_API_KEY": request.app.state.config.JINA_API_KEY,
  1088. "BING_SEARCH_V7_ENDPOINT": request.app.state.config.BING_SEARCH_V7_ENDPOINT,
  1089. "BING_SEARCH_V7_SUBSCRIPTION_KEY": request.app.state.config.BING_SEARCH_V7_SUBSCRIPTION_KEY,
  1090. "EXA_API_KEY": request.app.state.config.EXA_API_KEY,
  1091. "PERPLEXITY_API_KEY": request.app.state.config.PERPLEXITY_API_KEY,
  1092. "PERPLEXITY_MODEL": request.app.state.config.PERPLEXITY_MODEL,
  1093. "PERPLEXITY_SEARCH_CONTEXT_USAGE": request.app.state.config.PERPLEXITY_SEARCH_CONTEXT_USAGE,
  1094. "SOUGOU_API_SID": request.app.state.config.SOUGOU_API_SID,
  1095. "SOUGOU_API_SK": request.app.state.config.SOUGOU_API_SK,
  1096. "WEB_LOADER_ENGINE": request.app.state.config.WEB_LOADER_ENGINE,
  1097. "ENABLE_WEB_LOADER_SSL_VERIFICATION": request.app.state.config.ENABLE_WEB_LOADER_SSL_VERIFICATION,
  1098. "PLAYWRIGHT_WS_URL": request.app.state.config.PLAYWRIGHT_WS_URL,
  1099. "PLAYWRIGHT_TIMEOUT": request.app.state.config.PLAYWRIGHT_TIMEOUT,
  1100. "FIRECRAWL_API_KEY": request.app.state.config.FIRECRAWL_API_KEY,
  1101. "FIRECRAWL_API_BASE_URL": request.app.state.config.FIRECRAWL_API_BASE_URL,
  1102. "TAVILY_EXTRACT_DEPTH": request.app.state.config.TAVILY_EXTRACT_DEPTH,
  1103. "EXTERNAL_WEB_SEARCH_URL": request.app.state.config.EXTERNAL_WEB_SEARCH_URL,
  1104. "EXTERNAL_WEB_SEARCH_API_KEY": request.app.state.config.EXTERNAL_WEB_SEARCH_API_KEY,
  1105. "EXTERNAL_WEB_LOADER_URL": request.app.state.config.EXTERNAL_WEB_LOADER_URL,
  1106. "EXTERNAL_WEB_LOADER_API_KEY": request.app.state.config.EXTERNAL_WEB_LOADER_API_KEY,
  1107. "YOUTUBE_LOADER_LANGUAGE": request.app.state.config.YOUTUBE_LOADER_LANGUAGE,
  1108. "YOUTUBE_LOADER_PROXY_URL": request.app.state.config.YOUTUBE_LOADER_PROXY_URL,
  1109. "YOUTUBE_LOADER_TRANSLATION": request.app.state.YOUTUBE_LOADER_TRANSLATION,
  1110. },
  1111. }
  1112. ####################################
  1113. #
  1114. # Document process and retrieval
  1115. #
  1116. ####################################
  1117. def save_docs_to_vector_db(
  1118. request: Request,
  1119. docs,
  1120. collection_name,
  1121. metadata: Optional[dict] = None,
  1122. overwrite: bool = False,
  1123. split: bool = True,
  1124. add: bool = False,
  1125. user=None,
  1126. ) -> bool:
  1127. def _get_docs_info(docs: list[Document]) -> str:
  1128. docs_info = set()
  1129. # Trying to select relevant metadata identifying the document.
  1130. for doc in docs:
  1131. metadata = getattr(doc, "metadata", {})
  1132. doc_name = metadata.get("name", "")
  1133. if not doc_name:
  1134. doc_name = metadata.get("title", "")
  1135. if not doc_name:
  1136. doc_name = metadata.get("source", "")
  1137. if doc_name:
  1138. docs_info.add(doc_name)
  1139. return ", ".join(docs_info)
  1140. log.info(
  1141. f"save_docs_to_vector_db: document {_get_docs_info(docs)} {collection_name}"
  1142. )
  1143. # Check if entries with the same hash (metadata.hash) already exist
  1144. if metadata and "hash" in metadata:
  1145. result = VECTOR_DB_CLIENT.query(
  1146. collection_name=collection_name,
  1147. filter={"hash": metadata["hash"]},
  1148. )
  1149. if result is not None:
  1150. existing_doc_ids = result.ids[0]
  1151. if existing_doc_ids:
  1152. log.info(f"Document with hash {metadata['hash']} already exists")
  1153. raise ValueError(ERROR_MESSAGES.DUPLICATE_CONTENT)
  1154. if split:
  1155. if request.app.state.config.TEXT_SPLITTER in ["", "character"]:
  1156. text_splitter = RecursiveCharacterTextSplitter(
  1157. chunk_size=request.app.state.config.CHUNK_SIZE,
  1158. chunk_overlap=request.app.state.config.CHUNK_OVERLAP,
  1159. add_start_index=True,
  1160. )
  1161. docs = text_splitter.split_documents(docs)
  1162. elif request.app.state.config.TEXT_SPLITTER == "token":
  1163. log.info(
  1164. f"Using token text splitter: {request.app.state.config.TIKTOKEN_ENCODING_NAME}"
  1165. )
  1166. tiktoken.get_encoding(str(request.app.state.config.TIKTOKEN_ENCODING_NAME))
  1167. text_splitter = TokenTextSplitter(
  1168. encoding_name=str(request.app.state.config.TIKTOKEN_ENCODING_NAME),
  1169. chunk_size=request.app.state.config.CHUNK_SIZE,
  1170. chunk_overlap=request.app.state.config.CHUNK_OVERLAP,
  1171. add_start_index=True,
  1172. )
  1173. docs = text_splitter.split_documents(docs)
  1174. elif request.app.state.config.TEXT_SPLITTER == "markdown_header":
  1175. log.info("Using markdown header text splitter")
  1176. # Define headers to split on - covering most common markdown header levels
  1177. headers_to_split_on = [
  1178. ("#", "Header 1"),
  1179. ("##", "Header 2"),
  1180. ("###", "Header 3"),
  1181. ("####", "Header 4"),
  1182. ("#####", "Header 5"),
  1183. ("######", "Header 6"),
  1184. ]
  1185. markdown_splitter = MarkdownHeaderTextSplitter(
  1186. headers_to_split_on=headers_to_split_on,
  1187. strip_headers=False, # Keep headers in content for context
  1188. )
  1189. md_split_docs = []
  1190. for doc in docs:
  1191. md_header_splits = markdown_splitter.split_text(doc.page_content)
  1192. text_splitter = RecursiveCharacterTextSplitter(
  1193. chunk_size=request.app.state.config.CHUNK_SIZE,
  1194. chunk_overlap=request.app.state.config.CHUNK_OVERLAP,
  1195. add_start_index=True,
  1196. )
  1197. md_header_splits = text_splitter.split_documents(md_header_splits)
  1198. # Convert back to Document objects, preserving original metadata
  1199. for split_chunk in md_header_splits:
  1200. headings_list = []
  1201. # Extract header values in order based on headers_to_split_on
  1202. for _, header_meta_key_name in headers_to_split_on:
  1203. if header_meta_key_name in split_chunk.metadata:
  1204. headings_list.append(
  1205. split_chunk.metadata[header_meta_key_name]
  1206. )
  1207. md_split_docs.append(
  1208. Document(
  1209. page_content=split_chunk.page_content,
  1210. metadata={**doc.metadata, "headings": headings_list},
  1211. )
  1212. )
  1213. docs = md_split_docs
  1214. else:
  1215. raise ValueError(ERROR_MESSAGES.DEFAULT("Invalid text splitter"))
  1216. if len(docs) == 0:
  1217. raise ValueError(ERROR_MESSAGES.EMPTY_CONTENT)
  1218. texts = [doc.page_content for doc in docs]
  1219. metadatas = [
  1220. {
  1221. **doc.metadata,
  1222. **(metadata if metadata else {}),
  1223. "embedding_config": {
  1224. "engine": request.app.state.config.RAG_EMBEDDING_ENGINE,
  1225. "model": request.app.state.config.RAG_EMBEDDING_MODEL,
  1226. },
  1227. }
  1228. for doc in docs
  1229. ]
  1230. try:
  1231. if VECTOR_DB_CLIENT.has_collection(collection_name=collection_name):
  1232. log.info(f"collection {collection_name} already exists")
  1233. if overwrite:
  1234. VECTOR_DB_CLIENT.delete_collection(collection_name=collection_name)
  1235. log.info(f"deleting existing collection {collection_name}")
  1236. elif add is False:
  1237. log.info(
  1238. f"collection {collection_name} already exists, overwrite is False and add is False"
  1239. )
  1240. return True
  1241. log.info(f"generating embeddings for {collection_name}")
  1242. embedding_function = get_embedding_function(
  1243. request.app.state.config.RAG_EMBEDDING_ENGINE,
  1244. request.app.state.config.RAG_EMBEDDING_MODEL,
  1245. request.app.state.ef,
  1246. (
  1247. request.app.state.config.RAG_OPENAI_API_BASE_URL
  1248. if request.app.state.config.RAG_EMBEDDING_ENGINE == "openai"
  1249. else (
  1250. request.app.state.config.RAG_OLLAMA_BASE_URL
  1251. if request.app.state.config.RAG_EMBEDDING_ENGINE == "ollama"
  1252. else request.app.state.config.RAG_AZURE_OPENAI_BASE_URL
  1253. )
  1254. ),
  1255. (
  1256. request.app.state.config.RAG_OPENAI_API_KEY
  1257. if request.app.state.config.RAG_EMBEDDING_ENGINE == "openai"
  1258. else (
  1259. request.app.state.config.RAG_OLLAMA_API_KEY
  1260. if request.app.state.config.RAG_EMBEDDING_ENGINE == "ollama"
  1261. else request.app.state.config.RAG_AZURE_OPENAI_API_KEY
  1262. )
  1263. ),
  1264. request.app.state.config.RAG_EMBEDDING_BATCH_SIZE,
  1265. azure_api_version=(
  1266. request.app.state.config.RAG_AZURE_OPENAI_API_VERSION
  1267. if request.app.state.config.RAG_EMBEDDING_ENGINE == "azure_openai"
  1268. else None
  1269. ),
  1270. )
  1271. embeddings = embedding_function(
  1272. list(map(lambda x: x.replace("\n", " "), texts)),
  1273. prefix=RAG_EMBEDDING_CONTENT_PREFIX,
  1274. user=user,
  1275. )
  1276. log.info(f"embeddings generated {len(embeddings)} for {len(texts)} items")
  1277. items = [
  1278. {
  1279. "id": str(uuid.uuid4()),
  1280. "text": text,
  1281. "vector": embeddings[idx],
  1282. "metadata": metadatas[idx],
  1283. }
  1284. for idx, text in enumerate(texts)
  1285. ]
  1286. log.info(f"adding to collection {collection_name}")
  1287. VECTOR_DB_CLIENT.insert(
  1288. collection_name=collection_name,
  1289. items=items,
  1290. )
  1291. log.info(f"added {len(items)} items to collection {collection_name}")
  1292. return True
  1293. except Exception as e:
  1294. log.exception(e)
  1295. raise e
  1296. class ProcessFileForm(BaseModel):
  1297. file_id: str
  1298. content: Optional[str] = None
  1299. collection_name: Optional[str] = None
  1300. @router.post("/process/file")
  1301. def process_file(
  1302. request: Request,
  1303. form_data: ProcessFileForm,
  1304. user=Depends(get_verified_user),
  1305. ):
  1306. if user.role == "admin":
  1307. file = Files.get_file_by_id(form_data.file_id)
  1308. else:
  1309. file = Files.get_file_by_id_and_user_id(form_data.file_id, user.id)
  1310. if file:
  1311. try:
  1312. collection_name = form_data.collection_name
  1313. if collection_name is None:
  1314. collection_name = f"file-{file.id}"
  1315. if form_data.content:
  1316. # Update the content in the file
  1317. # Usage: /files/{file_id}/data/content/update, /files/ (audio file upload pipeline)
  1318. try:
  1319. # /files/{file_id}/data/content/update
  1320. VECTOR_DB_CLIENT.delete_collection(
  1321. collection_name=f"file-{file.id}"
  1322. )
  1323. except:
  1324. # Audio file upload pipeline
  1325. pass
  1326. docs = [
  1327. Document(
  1328. page_content=form_data.content.replace("<br/>", "\n"),
  1329. metadata={
  1330. **file.meta,
  1331. "name": file.filename,
  1332. "created_by": file.user_id,
  1333. "file_id": file.id,
  1334. "source": file.filename,
  1335. },
  1336. )
  1337. ]
  1338. text_content = form_data.content
  1339. elif form_data.collection_name:
  1340. # Check if the file has already been processed and save the content
  1341. # Usage: /knowledge/{id}/file/add, /knowledge/{id}/file/update
  1342. result = VECTOR_DB_CLIENT.query(
  1343. collection_name=f"file-{file.id}", filter={"file_id": file.id}
  1344. )
  1345. if result is not None and len(result.ids[0]) > 0:
  1346. docs = [
  1347. Document(
  1348. page_content=result.documents[0][idx],
  1349. metadata=result.metadatas[0][idx],
  1350. )
  1351. for idx, id in enumerate(result.ids[0])
  1352. ]
  1353. else:
  1354. docs = [
  1355. Document(
  1356. page_content=file.data.get("content", ""),
  1357. metadata={
  1358. **file.meta,
  1359. "name": file.filename,
  1360. "created_by": file.user_id,
  1361. "file_id": file.id,
  1362. "source": file.filename,
  1363. },
  1364. )
  1365. ]
  1366. text_content = file.data.get("content", "")
  1367. else:
  1368. # Process the file and save the content
  1369. # Usage: /files/
  1370. file_path = file.path
  1371. if file_path:
  1372. file_path = Storage.get_file(file_path)
  1373. loader = Loader(
  1374. engine=request.app.state.config.CONTENT_EXTRACTION_ENGINE,
  1375. DATALAB_MARKER_API_KEY=request.app.state.config.DATALAB_MARKER_API_KEY,
  1376. DATALAB_MARKER_API_BASE_URL=request.app.state.config.DATALAB_MARKER_API_BASE_URL,
  1377. DATALAB_MARKER_ADDITIONAL_CONFIG=request.app.state.config.DATALAB_MARKER_ADDITIONAL_CONFIG,
  1378. DATALAB_MARKER_SKIP_CACHE=request.app.state.config.DATALAB_MARKER_SKIP_CACHE,
  1379. DATALAB_MARKER_FORCE_OCR=request.app.state.config.DATALAB_MARKER_FORCE_OCR,
  1380. DATALAB_MARKER_PAGINATE=request.app.state.config.DATALAB_MARKER_PAGINATE,
  1381. DATALAB_MARKER_STRIP_EXISTING_OCR=request.app.state.config.DATALAB_MARKER_STRIP_EXISTING_OCR,
  1382. DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION=request.app.state.config.DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION,
  1383. DATALAB_MARKER_FORMAT_LINES=request.app.state.config.DATALAB_MARKER_FORMAT_LINES,
  1384. DATALAB_MARKER_USE_LLM=request.app.state.config.DATALAB_MARKER_USE_LLM,
  1385. DATALAB_MARKER_OUTPUT_FORMAT=request.app.state.config.DATALAB_MARKER_OUTPUT_FORMAT,
  1386. EXTERNAL_DOCUMENT_LOADER_URL=request.app.state.config.EXTERNAL_DOCUMENT_LOADER_URL,
  1387. EXTERNAL_DOCUMENT_LOADER_API_KEY=request.app.state.config.EXTERNAL_DOCUMENT_LOADER_API_KEY,
  1388. TIKA_SERVER_URL=request.app.state.config.TIKA_SERVER_URL,
  1389. DOCLING_SERVER_URL=request.app.state.config.DOCLING_SERVER_URL,
  1390. DOCLING_PARAMS={
  1391. "do_ocr": request.app.state.config.DOCLING_DO_OCR,
  1392. "force_ocr": request.app.state.config.DOCLING_FORCE_OCR,
  1393. "ocr_engine": request.app.state.config.DOCLING_OCR_ENGINE,
  1394. "ocr_lang": request.app.state.config.DOCLING_OCR_LANG,
  1395. "pdf_backend": request.app.state.config.DOCLING_PDF_BACKEND,
  1396. "table_mode": request.app.state.config.DOCLING_TABLE_MODE,
  1397. "pipeline": request.app.state.config.DOCLING_PIPELINE,
  1398. "do_picture_description": request.app.state.config.DOCLING_DO_PICTURE_DESCRIPTION,
  1399. "picture_description_mode": request.app.state.config.DOCLING_PICTURE_DESCRIPTION_MODE,
  1400. "picture_description_local": request.app.state.config.DOCLING_PICTURE_DESCRIPTION_LOCAL,
  1401. "picture_description_api": request.app.state.config.DOCLING_PICTURE_DESCRIPTION_API,
  1402. **request.app.state.config.DOCLING_PARAMS,
  1403. },
  1404. PDF_EXTRACT_IMAGES=request.app.state.config.PDF_EXTRACT_IMAGES,
  1405. DOCUMENT_INTELLIGENCE_ENDPOINT=request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT,
  1406. DOCUMENT_INTELLIGENCE_KEY=request.app.state.config.DOCUMENT_INTELLIGENCE_KEY,
  1407. MISTRAL_OCR_API_KEY=request.app.state.config.MISTRAL_OCR_API_KEY,
  1408. )
  1409. docs = loader.load(
  1410. file.filename, file.meta.get("content_type"), file_path
  1411. )
  1412. docs = [
  1413. Document(
  1414. page_content=doc.page_content,
  1415. metadata={
  1416. **filter_metadata(doc.metadata),
  1417. "name": file.filename,
  1418. "created_by": file.user_id,
  1419. "file_id": file.id,
  1420. "source": file.filename,
  1421. },
  1422. )
  1423. for doc in docs
  1424. ]
  1425. else:
  1426. docs = [
  1427. Document(
  1428. page_content=file.data.get("content", ""),
  1429. metadata={
  1430. **file.meta,
  1431. "name": file.filename,
  1432. "created_by": file.user_id,
  1433. "file_id": file.id,
  1434. "source": file.filename,
  1435. },
  1436. )
  1437. ]
  1438. text_content = " ".join([doc.page_content for doc in docs])
  1439. log.debug(f"text_content: {text_content}")
  1440. Files.update_file_data_by_id(
  1441. file.id,
  1442. {"content": text_content},
  1443. )
  1444. hash = calculate_sha256_string(text_content)
  1445. Files.update_file_hash_by_id(file.id, hash)
  1446. if request.app.state.config.BYPASS_EMBEDDING_AND_RETRIEVAL:
  1447. Files.update_file_data_by_id(file.id, {"status": "completed"})
  1448. return {
  1449. "status": True,
  1450. "collection_name": None,
  1451. "filename": file.filename,
  1452. "content": text_content,
  1453. }
  1454. else:
  1455. try:
  1456. result = save_docs_to_vector_db(
  1457. request,
  1458. docs=docs,
  1459. collection_name=collection_name,
  1460. metadata={
  1461. "file_id": file.id,
  1462. "name": file.filename,
  1463. "hash": hash,
  1464. },
  1465. add=(True if form_data.collection_name else False),
  1466. user=user,
  1467. )
  1468. log.info(f"added {len(docs)} items to collection {collection_name}")
  1469. if result:
  1470. Files.update_file_metadata_by_id(
  1471. file.id,
  1472. {
  1473. "collection_name": collection_name,
  1474. },
  1475. )
  1476. Files.update_file_data_by_id(
  1477. file.id,
  1478. {"status": "completed"},
  1479. )
  1480. return {
  1481. "status": True,
  1482. "collection_name": collection_name,
  1483. "filename": file.filename,
  1484. "content": text_content,
  1485. }
  1486. else:
  1487. raise Exception("Error saving document to vector database")
  1488. except Exception as e:
  1489. raise e
  1490. except Exception as e:
  1491. log.exception(e)
  1492. Files.update_file_data_by_id(
  1493. file.id,
  1494. {"status": "failed"},
  1495. )
  1496. if "No pandoc was found" in str(e):
  1497. raise HTTPException(
  1498. status_code=status.HTTP_400_BAD_REQUEST,
  1499. detail=ERROR_MESSAGES.PANDOC_NOT_INSTALLED,
  1500. )
  1501. else:
  1502. raise HTTPException(
  1503. status_code=status.HTTP_400_BAD_REQUEST,
  1504. detail=str(e),
  1505. )
  1506. else:
  1507. raise HTTPException(
  1508. status_code=status.HTTP_404_NOT_FOUND, detail=ERROR_MESSAGES.NOT_FOUND
  1509. )
  1510. class ProcessTextForm(BaseModel):
  1511. name: str
  1512. content: str
  1513. collection_name: Optional[str] = None
  1514. @router.post("/process/text")
  1515. def process_text(
  1516. request: Request,
  1517. form_data: ProcessTextForm,
  1518. user=Depends(get_verified_user),
  1519. ):
  1520. collection_name = form_data.collection_name
  1521. if collection_name is None:
  1522. collection_name = calculate_sha256_string(form_data.content)
  1523. docs = [
  1524. Document(
  1525. page_content=form_data.content,
  1526. metadata={"name": form_data.name, "created_by": user.id},
  1527. )
  1528. ]
  1529. text_content = form_data.content
  1530. log.debug(f"text_content: {text_content}")
  1531. result = save_docs_to_vector_db(request, docs, collection_name, user=user)
  1532. if result:
  1533. return {
  1534. "status": True,
  1535. "collection_name": collection_name,
  1536. "content": text_content,
  1537. }
  1538. else:
  1539. raise HTTPException(
  1540. status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
  1541. detail=ERROR_MESSAGES.DEFAULT(),
  1542. )
  1543. @router.post("/process/youtube")
  1544. @router.post("/process/web")
  1545. def process_web(
  1546. request: Request, form_data: ProcessUrlForm, user=Depends(get_verified_user)
  1547. ):
  1548. try:
  1549. collection_name = form_data.collection_name
  1550. if not collection_name:
  1551. collection_name = calculate_sha256_string(form_data.url)[:63]
  1552. content, docs = get_content_from_url(request, form_data.url)
  1553. log.debug(f"text_content: {content}")
  1554. if not request.app.state.config.BYPASS_WEB_SEARCH_EMBEDDING_AND_RETRIEVAL:
  1555. save_docs_to_vector_db(
  1556. request,
  1557. docs,
  1558. collection_name,
  1559. overwrite=True,
  1560. user=user,
  1561. )
  1562. else:
  1563. collection_name = None
  1564. return {
  1565. "status": True,
  1566. "collection_name": collection_name,
  1567. "filename": form_data.url,
  1568. "file": {
  1569. "data": {
  1570. "content": content,
  1571. },
  1572. "meta": {
  1573. "name": form_data.url,
  1574. "source": form_data.url,
  1575. },
  1576. },
  1577. }
  1578. except Exception as e:
  1579. log.exception(e)
  1580. raise HTTPException(
  1581. status_code=status.HTTP_400_BAD_REQUEST,
  1582. detail=ERROR_MESSAGES.DEFAULT(e),
  1583. )
  1584. def search_web(request: Request, engine: str, query: str) -> list[SearchResult]:
  1585. """Search the web using a search engine and return the results as a list of SearchResult objects.
  1586. Will look for a search engine API key in environment variables in the following order:
  1587. - SEARXNG_QUERY_URL
  1588. - YACY_QUERY_URL + YACY_USERNAME + YACY_PASSWORD
  1589. - GOOGLE_PSE_API_KEY + GOOGLE_PSE_ENGINE_ID
  1590. - BRAVE_SEARCH_API_KEY
  1591. - KAGI_SEARCH_API_KEY
  1592. - MOJEEK_SEARCH_API_KEY
  1593. - BOCHA_SEARCH_API_KEY
  1594. - SERPSTACK_API_KEY
  1595. - SERPER_API_KEY
  1596. - SERPLY_API_KEY
  1597. - TAVILY_API_KEY
  1598. - EXA_API_KEY
  1599. - PERPLEXITY_API_KEY
  1600. - SOUGOU_API_SID + SOUGOU_API_SK
  1601. - SEARCHAPI_API_KEY + SEARCHAPI_ENGINE (by default `google`)
  1602. - SERPAPI_API_KEY + SERPAPI_ENGINE (by default `google`)
  1603. Args:
  1604. query (str): The query to search for
  1605. """
  1606. # TODO: add playwright to search the web
  1607. if engine == "ollama_cloud":
  1608. return search_ollama_cloud(
  1609. "https://ollama.com",
  1610. request.app.state.config.OLLAMA_CLOUD_WEB_SEARCH_API_KEY,
  1611. query,
  1612. request.app.state.config.WEB_SEARCH_RESULT_COUNT,
  1613. request.app.state.config.WEB_SEARCH_DOMAIN_FILTER_LIST,
  1614. )
  1615. elif engine == "perplexity_search":
  1616. if request.app.state.config.PERPLEXITY_API_KEY:
  1617. return search_perplexity_search(
  1618. request.app.state.config.PERPLEXITY_API_KEY,
  1619. query,
  1620. request.app.state.config.WEB_SEARCH_RESULT_COUNT,
  1621. request.app.state.config.WEB_SEARCH_DOMAIN_FILTER_LIST,
  1622. )
  1623. else:
  1624. raise Exception("No PERPLEXITY_API_KEY found in environment variables")
  1625. elif engine == "searxng":
  1626. if request.app.state.config.SEARXNG_QUERY_URL:
  1627. return search_searxng(
  1628. request.app.state.config.SEARXNG_QUERY_URL,
  1629. query,
  1630. request.app.state.config.WEB_SEARCH_RESULT_COUNT,
  1631. request.app.state.config.WEB_SEARCH_DOMAIN_FILTER_LIST,
  1632. )
  1633. else:
  1634. raise Exception("No SEARXNG_QUERY_URL found in environment variables")
  1635. elif engine == "yacy":
  1636. if request.app.state.config.YACY_QUERY_URL:
  1637. return search_yacy(
  1638. request.app.state.config.YACY_QUERY_URL,
  1639. request.app.state.config.YACY_USERNAME,
  1640. request.app.state.config.YACY_PASSWORD,
  1641. query,
  1642. request.app.state.config.WEB_SEARCH_RESULT_COUNT,
  1643. request.app.state.config.WEB_SEARCH_DOMAIN_FILTER_LIST,
  1644. )
  1645. else:
  1646. raise Exception("No YACY_QUERY_URL found in environment variables")
  1647. elif engine == "google_pse":
  1648. if (
  1649. request.app.state.config.GOOGLE_PSE_API_KEY
  1650. and request.app.state.config.GOOGLE_PSE_ENGINE_ID
  1651. ):
  1652. return search_google_pse(
  1653. request.app.state.config.GOOGLE_PSE_API_KEY,
  1654. request.app.state.config.GOOGLE_PSE_ENGINE_ID,
  1655. query,
  1656. request.app.state.config.WEB_SEARCH_RESULT_COUNT,
  1657. request.app.state.config.WEB_SEARCH_DOMAIN_FILTER_LIST,
  1658. )
  1659. else:
  1660. raise Exception(
  1661. "No GOOGLE_PSE_API_KEY or GOOGLE_PSE_ENGINE_ID found in environment variables"
  1662. )
  1663. elif engine == "brave":
  1664. if request.app.state.config.BRAVE_SEARCH_API_KEY:
  1665. return search_brave(
  1666. request.app.state.config.BRAVE_SEARCH_API_KEY,
  1667. query,
  1668. request.app.state.config.WEB_SEARCH_RESULT_COUNT,
  1669. request.app.state.config.WEB_SEARCH_DOMAIN_FILTER_LIST,
  1670. )
  1671. else:
  1672. raise Exception("No BRAVE_SEARCH_API_KEY found in environment variables")
  1673. elif engine == "kagi":
  1674. if request.app.state.config.KAGI_SEARCH_API_KEY:
  1675. return search_kagi(
  1676. request.app.state.config.KAGI_SEARCH_API_KEY,
  1677. query,
  1678. request.app.state.config.WEB_SEARCH_RESULT_COUNT,
  1679. request.app.state.config.WEB_SEARCH_DOMAIN_FILTER_LIST,
  1680. )
  1681. else:
  1682. raise Exception("No KAGI_SEARCH_API_KEY found in environment variables")
  1683. elif engine == "mojeek":
  1684. if request.app.state.config.MOJEEK_SEARCH_API_KEY:
  1685. return search_mojeek(
  1686. request.app.state.config.MOJEEK_SEARCH_API_KEY,
  1687. query,
  1688. request.app.state.config.WEB_SEARCH_RESULT_COUNT,
  1689. request.app.state.config.WEB_SEARCH_DOMAIN_FILTER_LIST,
  1690. )
  1691. else:
  1692. raise Exception("No MOJEEK_SEARCH_API_KEY found in environment variables")
  1693. elif engine == "bocha":
  1694. if request.app.state.config.BOCHA_SEARCH_API_KEY:
  1695. return search_bocha(
  1696. request.app.state.config.BOCHA_SEARCH_API_KEY,
  1697. query,
  1698. request.app.state.config.WEB_SEARCH_RESULT_COUNT,
  1699. request.app.state.config.WEB_SEARCH_DOMAIN_FILTER_LIST,
  1700. )
  1701. else:
  1702. raise Exception("No BOCHA_SEARCH_API_KEY found in environment variables")
  1703. elif engine == "serpstack":
  1704. if request.app.state.config.SERPSTACK_API_KEY:
  1705. return search_serpstack(
  1706. request.app.state.config.SERPSTACK_API_KEY,
  1707. query,
  1708. request.app.state.config.WEB_SEARCH_RESULT_COUNT,
  1709. request.app.state.config.WEB_SEARCH_DOMAIN_FILTER_LIST,
  1710. https_enabled=request.app.state.config.SERPSTACK_HTTPS,
  1711. )
  1712. else:
  1713. raise Exception("No SERPSTACK_API_KEY found in environment variables")
  1714. elif engine == "serper":
  1715. if request.app.state.config.SERPER_API_KEY:
  1716. return search_serper(
  1717. request.app.state.config.SERPER_API_KEY,
  1718. query,
  1719. request.app.state.config.WEB_SEARCH_RESULT_COUNT,
  1720. request.app.state.config.WEB_SEARCH_DOMAIN_FILTER_LIST,
  1721. )
  1722. else:
  1723. raise Exception("No SERPER_API_KEY found in environment variables")
  1724. elif engine == "serply":
  1725. if request.app.state.config.SERPLY_API_KEY:
  1726. return search_serply(
  1727. request.app.state.config.SERPLY_API_KEY,
  1728. query,
  1729. request.app.state.config.WEB_SEARCH_RESULT_COUNT,
  1730. filter_list=request.app.state.config.WEB_SEARCH_DOMAIN_FILTER_LIST,
  1731. )
  1732. else:
  1733. raise Exception("No SERPLY_API_KEY found in environment variables")
  1734. elif engine == "duckduckgo":
  1735. return search_duckduckgo(
  1736. query,
  1737. request.app.state.config.WEB_SEARCH_RESULT_COUNT,
  1738. request.app.state.config.WEB_SEARCH_DOMAIN_FILTER_LIST,
  1739. concurrent_requests=request.app.state.config.WEB_SEARCH_CONCURRENT_REQUESTS,
  1740. )
  1741. elif engine == "tavily":
  1742. if request.app.state.config.TAVILY_API_KEY:
  1743. return search_tavily(
  1744. request.app.state.config.TAVILY_API_KEY,
  1745. query,
  1746. request.app.state.config.WEB_SEARCH_RESULT_COUNT,
  1747. request.app.state.config.WEB_SEARCH_DOMAIN_FILTER_LIST,
  1748. )
  1749. else:
  1750. raise Exception("No TAVILY_API_KEY found in environment variables")
  1751. elif engine == "exa":
  1752. if request.app.state.config.EXA_API_KEY:
  1753. return search_exa(
  1754. request.app.state.config.EXA_API_KEY,
  1755. query,
  1756. request.app.state.config.WEB_SEARCH_RESULT_COUNT,
  1757. request.app.state.config.WEB_SEARCH_DOMAIN_FILTER_LIST,
  1758. )
  1759. else:
  1760. raise Exception("No EXA_API_KEY found in environment variables")
  1761. elif engine == "searchapi":
  1762. if request.app.state.config.SEARCHAPI_API_KEY:
  1763. return search_searchapi(
  1764. request.app.state.config.SEARCHAPI_API_KEY,
  1765. request.app.state.config.SEARCHAPI_ENGINE,
  1766. query,
  1767. request.app.state.config.WEB_SEARCH_RESULT_COUNT,
  1768. request.app.state.config.WEB_SEARCH_DOMAIN_FILTER_LIST,
  1769. )
  1770. else:
  1771. raise Exception("No SEARCHAPI_API_KEY found in environment variables")
  1772. elif engine == "serpapi":
  1773. if request.app.state.config.SERPAPI_API_KEY:
  1774. return search_serpapi(
  1775. request.app.state.config.SERPAPI_API_KEY,
  1776. request.app.state.config.SERPAPI_ENGINE,
  1777. query,
  1778. request.app.state.config.WEB_SEARCH_RESULT_COUNT,
  1779. request.app.state.config.WEB_SEARCH_DOMAIN_FILTER_LIST,
  1780. )
  1781. else:
  1782. raise Exception("No SERPAPI_API_KEY found in environment variables")
  1783. elif engine == "jina":
  1784. return search_jina(
  1785. request.app.state.config.JINA_API_KEY,
  1786. query,
  1787. request.app.state.config.WEB_SEARCH_RESULT_COUNT,
  1788. )
  1789. elif engine == "bing":
  1790. return search_bing(
  1791. request.app.state.config.BING_SEARCH_V7_SUBSCRIPTION_KEY,
  1792. request.app.state.config.BING_SEARCH_V7_ENDPOINT,
  1793. str(DEFAULT_LOCALE),
  1794. query,
  1795. request.app.state.config.WEB_SEARCH_RESULT_COUNT,
  1796. request.app.state.config.WEB_SEARCH_DOMAIN_FILTER_LIST,
  1797. )
  1798. elif engine == "exa":
  1799. return search_exa(
  1800. request.app.state.config.EXA_API_KEY,
  1801. query,
  1802. request.app.state.config.WEB_SEARCH_RESULT_COUNT,
  1803. request.app.state.config.WEB_SEARCH_DOMAIN_FILTER_LIST,
  1804. )
  1805. elif engine == "perplexity":
  1806. return search_perplexity(
  1807. request.app.state.config.PERPLEXITY_API_KEY,
  1808. query,
  1809. request.app.state.config.WEB_SEARCH_RESULT_COUNT,
  1810. request.app.state.config.WEB_SEARCH_DOMAIN_FILTER_LIST,
  1811. model=request.app.state.config.PERPLEXITY_MODEL,
  1812. search_context_usage=request.app.state.config.PERPLEXITY_SEARCH_CONTEXT_USAGE,
  1813. )
  1814. elif engine == "sougou":
  1815. if (
  1816. request.app.state.config.SOUGOU_API_SID
  1817. and request.app.state.config.SOUGOU_API_SK
  1818. ):
  1819. return search_sougou(
  1820. request.app.state.config.SOUGOU_API_SID,
  1821. request.app.state.config.SOUGOU_API_SK,
  1822. query,
  1823. request.app.state.config.WEB_SEARCH_RESULT_COUNT,
  1824. request.app.state.config.WEB_SEARCH_DOMAIN_FILTER_LIST,
  1825. )
  1826. else:
  1827. raise Exception(
  1828. "No SOUGOU_API_SID or SOUGOU_API_SK found in environment variables"
  1829. )
  1830. elif engine == "firecrawl":
  1831. return search_firecrawl(
  1832. request.app.state.config.FIRECRAWL_API_BASE_URL,
  1833. request.app.state.config.FIRECRAWL_API_KEY,
  1834. query,
  1835. request.app.state.config.WEB_SEARCH_RESULT_COUNT,
  1836. request.app.state.config.WEB_SEARCH_DOMAIN_FILTER_LIST,
  1837. )
  1838. elif engine == "external":
  1839. return search_external(
  1840. request.app.state.config.EXTERNAL_WEB_SEARCH_URL,
  1841. request.app.state.config.EXTERNAL_WEB_SEARCH_API_KEY,
  1842. query,
  1843. request.app.state.config.WEB_SEARCH_RESULT_COUNT,
  1844. request.app.state.config.WEB_SEARCH_DOMAIN_FILTER_LIST,
  1845. )
  1846. else:
  1847. raise Exception("No search engine API key found in environment variables")
  1848. @router.post("/process/web/search")
  1849. async def process_web_search(
  1850. request: Request, form_data: SearchForm, user=Depends(get_verified_user)
  1851. ):
  1852. urls = []
  1853. result_items = []
  1854. try:
  1855. logging.debug(
  1856. f"trying to web search with {request.app.state.config.WEB_SEARCH_ENGINE, form_data.queries}"
  1857. )
  1858. search_tasks = [
  1859. run_in_threadpool(
  1860. search_web,
  1861. request,
  1862. request.app.state.config.WEB_SEARCH_ENGINE,
  1863. query,
  1864. )
  1865. for query in form_data.queries
  1866. ]
  1867. search_results = await asyncio.gather(*search_tasks)
  1868. for result in search_results:
  1869. if result:
  1870. for item in result:
  1871. if item and item.link:
  1872. result_items.append(item)
  1873. urls.append(item.link)
  1874. urls = list(dict.fromkeys(urls))
  1875. log.debug(f"urls: {urls}")
  1876. except Exception as e:
  1877. log.exception(e)
  1878. raise HTTPException(
  1879. status_code=status.HTTP_400_BAD_REQUEST,
  1880. detail=ERROR_MESSAGES.WEB_SEARCH_ERROR(e),
  1881. )
  1882. try:
  1883. if request.app.state.config.BYPASS_WEB_SEARCH_WEB_LOADER:
  1884. search_results = [
  1885. item for result in search_results for item in result if result
  1886. ]
  1887. docs = [
  1888. Document(
  1889. page_content=result.snippet,
  1890. metadata={
  1891. "source": result.link,
  1892. "title": result.title,
  1893. "snippet": result.snippet,
  1894. "link": result.link,
  1895. },
  1896. )
  1897. for result in search_results
  1898. if hasattr(result, "snippet") and result.snippet is not None
  1899. ]
  1900. else:
  1901. loader = get_web_loader(
  1902. urls,
  1903. verify_ssl=request.app.state.config.ENABLE_WEB_LOADER_SSL_VERIFICATION,
  1904. requests_per_second=request.app.state.config.WEB_LOADER_CONCURRENT_REQUESTS,
  1905. trust_env=request.app.state.config.WEB_SEARCH_TRUST_ENV,
  1906. )
  1907. docs = await loader.aload()
  1908. urls = [
  1909. doc.metadata.get("source") for doc in docs if doc.metadata.get("source")
  1910. ] # only keep the urls returned by the loader
  1911. result_items = [
  1912. dict(item) for item in result_items if item.link in urls
  1913. ] # only keep the search results that have been loaded
  1914. if request.app.state.config.BYPASS_WEB_SEARCH_EMBEDDING_AND_RETRIEVAL:
  1915. return {
  1916. "status": True,
  1917. "collection_name": None,
  1918. "filenames": urls,
  1919. "items": result_items,
  1920. "docs": [
  1921. {
  1922. "content": doc.page_content,
  1923. "metadata": doc.metadata,
  1924. }
  1925. for doc in docs
  1926. ],
  1927. "loaded_count": len(docs),
  1928. }
  1929. else:
  1930. # Create a single collection for all documents
  1931. collection_name = (
  1932. f"web-search-{calculate_sha256_string('-'.join(form_data.queries))}"[
  1933. :63
  1934. ]
  1935. )
  1936. try:
  1937. await run_in_threadpool(
  1938. save_docs_to_vector_db,
  1939. request,
  1940. docs,
  1941. collection_name,
  1942. overwrite=True,
  1943. user=user,
  1944. )
  1945. except Exception as e:
  1946. log.debug(f"error saving docs: {e}")
  1947. return {
  1948. "status": True,
  1949. "collection_names": [collection_name],
  1950. "items": result_items,
  1951. "filenames": urls,
  1952. "loaded_count": len(docs),
  1953. }
  1954. except Exception as e:
  1955. log.exception(e)
  1956. raise HTTPException(
  1957. status_code=status.HTTP_400_BAD_REQUEST,
  1958. detail=ERROR_MESSAGES.DEFAULT(e),
  1959. )
  1960. class QueryDocForm(BaseModel):
  1961. collection_name: str
  1962. query: str
  1963. k: Optional[int] = None
  1964. k_reranker: Optional[int] = None
  1965. r: Optional[float] = None
  1966. hybrid: Optional[bool] = None
  1967. @router.post("/query/doc")
  1968. def query_doc_handler(
  1969. request: Request,
  1970. form_data: QueryDocForm,
  1971. user=Depends(get_verified_user),
  1972. ):
  1973. try:
  1974. if request.app.state.config.ENABLE_RAG_HYBRID_SEARCH and (
  1975. form_data.hybrid is None or form_data.hybrid
  1976. ):
  1977. collection_results = {}
  1978. collection_results[form_data.collection_name] = VECTOR_DB_CLIENT.get(
  1979. collection_name=form_data.collection_name
  1980. )
  1981. return query_doc_with_hybrid_search(
  1982. collection_name=form_data.collection_name,
  1983. collection_result=collection_results[form_data.collection_name],
  1984. query=form_data.query,
  1985. embedding_function=lambda query, prefix: request.app.state.EMBEDDING_FUNCTION(
  1986. query, prefix=prefix, user=user
  1987. ),
  1988. k=form_data.k if form_data.k else request.app.state.config.TOP_K,
  1989. reranking_function=(
  1990. (
  1991. lambda sentences: request.app.state.RERANKING_FUNCTION(
  1992. sentences, user=user
  1993. )
  1994. )
  1995. if request.app.state.RERANKING_FUNCTION
  1996. else None
  1997. ),
  1998. k_reranker=form_data.k_reranker
  1999. or request.app.state.config.TOP_K_RERANKER,
  2000. r=(
  2001. form_data.r
  2002. if form_data.r
  2003. else request.app.state.config.RELEVANCE_THRESHOLD
  2004. ),
  2005. hybrid_bm25_weight=(
  2006. form_data.hybrid_bm25_weight
  2007. if form_data.hybrid_bm25_weight
  2008. else request.app.state.config.HYBRID_BM25_WEIGHT
  2009. ),
  2010. user=user,
  2011. )
  2012. else:
  2013. return query_doc(
  2014. collection_name=form_data.collection_name,
  2015. query_embedding=request.app.state.EMBEDDING_FUNCTION(
  2016. form_data.query, prefix=RAG_EMBEDDING_QUERY_PREFIX, user=user
  2017. ),
  2018. k=form_data.k if form_data.k else request.app.state.config.TOP_K,
  2019. user=user,
  2020. )
  2021. except Exception as e:
  2022. log.exception(e)
  2023. raise HTTPException(
  2024. status_code=status.HTTP_400_BAD_REQUEST,
  2025. detail=ERROR_MESSAGES.DEFAULT(e),
  2026. )
class QueryCollectionsForm(BaseModel):
    """
    Request body for ``POST /query/collection``.

    Only ``collection_names`` and ``query`` are required. The remaining fields
    are optional per-request overrides; when left as ``None`` the handler
    falls back to the corresponding application config value.
    """

    # Names of the vector collections to query.
    collection_names: list[str]
    # Query text (the handler wraps it in a single-element list of queries).
    query: str
    # Number of results to return (config fallback: TOP_K).
    k: Optional[int] = None
    # Number of results kept by the reranker (config fallback: TOP_K_RERANKER).
    k_reranker: Optional[int] = None
    # Relevance threshold (config fallback: RELEVANCE_THRESHOLD).
    r: Optional[float] = None
    # None or True allows hybrid search when it is enabled in config;
    # False forces the non-hybrid query.
    hybrid: Optional[bool] = None
    # BM25 weight for hybrid search (config fallback: HYBRID_BM25_WEIGHT).
    hybrid_bm25_weight: Optional[float] = None
  2035. @router.post("/query/collection")
  2036. def query_collection_handler(
  2037. request: Request,
  2038. form_data: QueryCollectionsForm,
  2039. user=Depends(get_verified_user),
  2040. ):
  2041. try:
  2042. if request.app.state.config.ENABLE_RAG_HYBRID_SEARCH and (
  2043. form_data.hybrid is None or form_data.hybrid
  2044. ):
  2045. return query_collection_with_hybrid_search(
  2046. collection_names=form_data.collection_names,
  2047. queries=[form_data.query],
  2048. embedding_function=lambda query, prefix: request.app.state.EMBEDDING_FUNCTION(
  2049. query, prefix=prefix, user=user
  2050. ),
  2051. k=form_data.k if form_data.k else request.app.state.config.TOP_K,
  2052. reranking_function=(
  2053. (
  2054. lambda sentences: request.app.state.RERANKING_FUNCTION(
  2055. sentences, user=user
  2056. )
  2057. )
  2058. if request.app.state.RERANKING_FUNCTION
  2059. else None
  2060. ),
  2061. k_reranker=form_data.k_reranker
  2062. or request.app.state.config.TOP_K_RERANKER,
  2063. r=(
  2064. form_data.r
  2065. if form_data.r
  2066. else request.app.state.config.RELEVANCE_THRESHOLD
  2067. ),
  2068. hybrid_bm25_weight=(
  2069. form_data.hybrid_bm25_weight
  2070. if form_data.hybrid_bm25_weight
  2071. else request.app.state.config.HYBRID_BM25_WEIGHT
  2072. ),
  2073. )
  2074. else:
  2075. return query_collection(
  2076. collection_names=form_data.collection_names,
  2077. queries=[form_data.query],
  2078. embedding_function=lambda query, prefix: request.app.state.EMBEDDING_FUNCTION(
  2079. query, prefix=prefix, user=user
  2080. ),
  2081. k=form_data.k if form_data.k else request.app.state.config.TOP_K,
  2082. )
  2083. except Exception as e:
  2084. log.exception(e)
  2085. raise HTTPException(
  2086. status_code=status.HTTP_400_BAD_REQUEST,
  2087. detail=ERROR_MESSAGES.DEFAULT(e),
  2088. )
  2089. ####################################
  2090. #
  2091. # Vector DB operations
  2092. #
  2093. ####################################
class DeleteForm(BaseModel):
    """Request body for ``POST /delete``."""

    # Vector collection to delete entries from.
    collection_name: str
    # ID of the file whose stored hash selects the entries to delete.
    file_id: str
  2097. @router.post("/delete")
  2098. def delete_entries_from_collection(form_data: DeleteForm, user=Depends(get_admin_user)):
  2099. try:
  2100. if VECTOR_DB_CLIENT.has_collection(collection_name=form_data.collection_name):
  2101. file = Files.get_file_by_id(form_data.file_id)
  2102. hash = file.hash
  2103. VECTOR_DB_CLIENT.delete(
  2104. collection_name=form_data.collection_name,
  2105. metadata={"hash": hash},
  2106. )
  2107. return {"status": True}
  2108. else:
  2109. return {"status": False}
  2110. except Exception as e:
  2111. log.exception(e)
  2112. return {"status": False}
@router.post("/reset/db")
def reset_vector_db(user=Depends(get_admin_user)):
    """
    Reset the vector database and delete all knowledge records.

    Calls ``VECTOR_DB_CLIENT.reset()`` and then
    ``Knowledges.delete_all_knowledge()``. Nothing is returned, and exceptions
    from either call are not caught here.
    """
    VECTOR_DB_CLIENT.reset()
    Knowledges.delete_all_knowledge()
  2117. @router.post("/reset/uploads")
  2118. def reset_upload_dir(user=Depends(get_admin_user)) -> bool:
  2119. folder = f"{UPLOAD_DIR}"
  2120. try:
  2121. # Check if the directory exists
  2122. if os.path.exists(folder):
  2123. # Iterate over all the files and directories in the specified directory
  2124. for filename in os.listdir(folder):
  2125. file_path = os.path.join(folder, filename)
  2126. try:
  2127. if os.path.isfile(file_path) or os.path.islink(file_path):
  2128. os.unlink(file_path) # Remove the file or link
  2129. elif os.path.isdir(file_path):
  2130. shutil.rmtree(file_path) # Remove the directory
  2131. except Exception as e:
  2132. log.exception(f"Failed to delete {file_path}. Reason: {e}")
  2133. else:
  2134. log.warning(f"The directory {folder} does not exist")
  2135. except Exception as e:
  2136. log.exception(f"Failed to process the directory {folder}. Reason: {e}")
  2137. return True
  2138. if ENV == "dev":
  2139. @router.get("/ef/{text}")
  2140. async def get_embeddings(request: Request, text: Optional[str] = "Hello World!"):
  2141. return {
  2142. "result": request.app.state.EMBEDDING_FUNCTION(
  2143. text, prefix=RAG_EMBEDDING_QUERY_PREFIX
  2144. )
  2145. }
class BatchProcessFilesForm(BaseModel):
    """Request body for ``POST /process/files/batch``."""

    # Files to turn into documents; the handler reads each file's
    # data["content"], meta, filename, user_id and id.
    files: List[FileModel]
    # Vector collection the resulting documents are saved into.
    collection_name: str
class BatchProcessFilesResult(BaseModel):
    """Outcome of processing a single file within a batch."""

    # ID of the file this result refers to.
    file_id: str
    # Required. The batch handler uses "prepared", "completed" and "failed".
    status: str
    # Error message when processing failed; None otherwise.
    error: Optional[str] = None
class BatchProcessFilesResponse(BaseModel):
    """Response body of ``POST /process/files/batch``."""

    # One entry per file that was successfully prepared; its status is later
    # updated to reflect the outcome of the vector DB save.
    results: List[BatchProcessFilesResult]
    # One entry per failure (per-file preparation or the batch save).
    errors: List[BatchProcessFilesResult]
  2156. @router.post("/process/files/batch")
  2157. def process_files_batch(
  2158. request: Request,
  2159. form_data: BatchProcessFilesForm,
  2160. user=Depends(get_verified_user),
  2161. ) -> BatchProcessFilesResponse:
  2162. """
  2163. Process a batch of files and save them to the vector database.
  2164. """
  2165. results: List[BatchProcessFilesResult] = []
  2166. errors: List[BatchProcessFilesResult] = []
  2167. collection_name = form_data.collection_name
  2168. # Prepare all documents first
  2169. all_docs: List[Document] = []
  2170. for file in form_data.files:
  2171. try:
  2172. text_content = file.data.get("content", "")
  2173. docs: List[Document] = [
  2174. Document(
  2175. page_content=text_content.replace("<br/>", "\n"),
  2176. metadata={
  2177. **file.meta,
  2178. "name": file.filename,
  2179. "created_by": file.user_id,
  2180. "file_id": file.id,
  2181. "source": file.filename,
  2182. },
  2183. )
  2184. ]
  2185. hash = calculate_sha256_string(text_content)
  2186. Files.update_file_hash_by_id(file.id, hash)
  2187. Files.update_file_data_by_id(file.id, {"content": text_content})
  2188. all_docs.extend(docs)
  2189. results.append(BatchProcessFilesResult(file_id=file.id, status="prepared"))
  2190. except Exception as e:
  2191. log.error(f"process_files_batch: Error processing file {file.id}: {str(e)}")
  2192. errors.append(
  2193. BatchProcessFilesResult(file_id=file.id, status="failed", error=str(e))
  2194. )
  2195. # Save all documents in one batch
  2196. if all_docs:
  2197. try:
  2198. save_docs_to_vector_db(
  2199. request=request,
  2200. docs=all_docs,
  2201. collection_name=collection_name,
  2202. add=True,
  2203. user=user,
  2204. )
  2205. # Update all files with collection name
  2206. for result in results:
  2207. Files.update_file_metadata_by_id(
  2208. result.file_id, {"collection_name": collection_name}
  2209. )
  2210. result.status = "completed"
  2211. except Exception as e:
  2212. log.error(
  2213. f"process_files_batch: Error saving documents to vector DB: {str(e)}"
  2214. )
  2215. for result in results:
  2216. result.status = "failed"
  2217. errors.append(
  2218. BatchProcessFilesResult(file_id=result.file_id, error=str(e))
  2219. )
  2220. return BatchProcessFilesResponse(results=results, errors=errors)