files.py 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567
  1. import logging
  2. import os
  3. import uuid
  4. from fnmatch import fnmatch
  5. from functools import lru_cache
  6. from pathlib import Path
  7. from typing import Optional
  8. from urllib.parse import quote
  9. from fastapi import (
  10. APIRouter,
  11. Depends,
  12. File,
  13. HTTPException,
  14. Request,
  15. UploadFile,
  16. status,
  17. Query,
  18. )
  19. from fastapi.responses import FileResponse, StreamingResponse
  20. from open_webui.constants import ERROR_MESSAGES
  21. from open_webui.env import SRC_LOG_LEVELS
  22. from open_webui.models.files import (
  23. FileForm,
  24. FileModel,
  25. FileModelResponse,
  26. Files,
  27. )
  28. from open_webui.models.knowledge import Knowledges
  29. from open_webui.routers.knowledge import get_knowledge, get_knowledge_list
  30. from open_webui.routers.retrieval import ProcessFileForm, process_file
  31. from open_webui.routers.audio import transcribe
  32. from open_webui.storage.provider import Storage
  33. from open_webui.utils.auth import get_admin_user, get_verified_user
  34. from pydantic import BaseModel
  # Module-level logger; its level is taken from the "MODELS" entry of SRC_LOG_LEVELS.
  log = logging.getLogger(__name__)
  log.setLevel(SRC_LOG_LEVELS["MODELS"])

  # Router on which every endpoint defined in this module is registered.
  router = APIRouter()
  38. ############################
  39. # Check if the current user has access to a file through any knowledge bases the user may be in.
  40. ############################
  def has_access_to_file(
      file_id: Optional[str], access_type: str, user=Depends(get_verified_user)
  ) -> bool:
      """Report whether `user` can reach a file through one of their knowledge bases.

      The file's ``meta["collection_name"]`` is compared with the ids of the
      knowledge bases returned by
      ``Knowledges.get_knowledge_bases_by_user_id(user.id, access_type)``.

      Args:
          file_id: Id of the file to look up.
          access_type: Access level forwarded to the knowledge-base lookup
              (callers in this module pass "read" or "write").
          user: The user whose knowledge bases are consulted.

      Returns:
          True when one of the user's knowledge bases has the id stored in the
          file's ``collection_name``; False otherwise, including when the file
          has no ``meta`` or no ``collection_name``.

      Raises:
          HTTPException: 404 when no file exists for `file_id`.
      """
      target = Files.get_file_by_id(file_id)
      log.debug(f"Checking if user has {access_type} access to file")

      if not target:
          raise HTTPException(
              status_code=status.HTTP_404_NOT_FOUND,
              detail=ERROR_MESSAGES.NOT_FOUND,
          )

      collection_id = target.meta.get("collection_name") if target.meta else None
      if not collection_id:
          # No collection recorded on the file: nothing to match against.
          return False

      candidates = Knowledges.get_knowledge_bases_by_user_id(user.id, access_type)
      return any(kb.id == collection_id for kb in candidates)
  62. ############################
  63. # Get all files for a user (memoized; only the most recent lookup is cached)
  64. ############################
  @lru_cache(maxsize=1)
  def get_all_files_for_user(user_id: str, admin: bool):
      """Return every file for admins, otherwise only the files owned by `user_id`.

      The result is memoized by ``lru_cache(maxsize=1)``, so only the value for
      the most recent ``(user_id, admin)`` pair is kept.

      NOTE(review): nothing in this module clears the cache, so a repeated call
      with the same arguments returns the earlier result even if files were
      added or removed in between. Confirm this staleness is acceptable.
      """
      return Files.get_files() if admin else Files.get_files_by_user_id(user_id)
  71. ############################
  72. # Upload File
  73. ############################
  @router.post("/", response_model=FileModelResponse)
  def upload_file(
      request: Request,
      file: UploadFile = File(...),
      user=Depends(get_verified_user),
      file_metadata: dict = {},
      process: bool = Query(True),
  ):
      # Store an uploaded file, create its database record and, when `process`
      # is true, run it through transcription / content processing.
      #
      # Returns the stored file record. If processing fails, the record is still
      # returned, with an extra "error" field describing the failure.
      # Responds with HTTP 400 when the upload itself fails.
      #
      # `file_metadata` keeps its original `{}` default; it is only read here,
      # never mutated.
      log.info(f"file.content_type: {file.content_type}")
      try:
          # Drop any directory components supplied by the client.
          name = os.path.basename(file.filename)

          # replace filename with uuid: the uuid is both the record id and the
          # prefix of the stored filename.
          file_id = str(uuid.uuid4())
          contents, file_path = Storage.upload_file(file.file, f"{file_id}_{name}")

          file_item = Files.insert_new_file(
              user.id,
              FileForm(
                  **{
                      "id": file_id,
                      "filename": name,
                      "path": file_path,
                      "meta": {
                          "name": name,
                          "content_type": file.content_type,
                          "size": len(contents),
                          "data": file_metadata,
                      },
                  }
              ),
          )
          if not file_item:
              # Fail before processing: the error path below dereferences
              # file_item, which previously blew up with AttributeError.
              raise HTTPException(
                  status_code=status.HTTP_400_BAD_REQUEST,
                  detail=ERROR_MESSAGES.DEFAULT("Error uploading file"),
              )

          if process:
              try:
                  if file.content_type in [
                      "audio/mpeg",
                      "audio/wav",
                      "audio/ogg",
                      "audio/x-m4a",
                  ]:
                      # Audio: transcribe first, then process the transcript text.
                      file_path = Storage.get_file(file_path)
                      result = transcribe(request, file_path)
                      process_file(
                          request,
                          ProcessFileForm(file_id=file_id, content=result.get("text", "")),
                          user=user,
                      )
                  elif file.content_type not in ["image/png", "image/jpeg", "image/gif"]:
                      # Images are stored but not processed.
                      process_file(request, ProcessFileForm(file_id=file_id), user=user)

                  # Re-read the record so the response reflects what processing stored.
                  file_item = Files.get_file_by_id(id=file_id)
              except Exception as e:
                  # Processing is best effort: keep the upload and report the error.
                  log.exception(e)
                  log.error(f"Error processing file: {file_item.id}")
                  file_item = FileModelResponse(
                      **{
                          **file_item.model_dump(),
                          "error": str(e.detail) if hasattr(e, "detail") else str(e),
                      }
                  )

          if file_item:
              return file_item

          raise HTTPException(
              status_code=status.HTTP_400_BAD_REQUEST,
              detail=ERROR_MESSAGES.DEFAULT("Error uploading file"),
          )
      except HTTPException:
          # Already a well-formed HTTP error; do not re-wrap it in the generic
          # handler below, which would replace its detail message.
          raise
      except Exception as e:
          log.exception(e)
          raise HTTPException(
              status_code=status.HTTP_400_BAD_REQUEST,
              detail=ERROR_MESSAGES.DEFAULT(e),
          )
  147. ############################
  148. # List Files
  149. ############################
  @router.get("/", response_model=list[FileModelResponse])
  async def list_files(user=Depends(get_verified_user), content: bool = Query(True)):
      # List all files for admins, or only the caller's own files otherwise.
      # With content=false the "content" entry is removed from each file's data
      # before the list is returned.
      if user.role == "admin":
          files = Files.get_files()
      else:
          files = Files.get_files_by_user_id(user.id)

      if not content:
          for file in files:
              # pop() tolerates records whose data has no "content" key, and the
              # truthiness check skips records with no data at all; the previous
              # `del file.data["content"]` raised for both.
              if file.data:
                  file.data.pop("content", None)

      return files
  160. ############################
  161. # Search Files
  162. ############################
  def _normalize_filename(text: str) -> str:
      # Bring a filename or pattern to one canonical, lower-case form so that
      # visually identical names compare equal regardless of how their accented
      # characters were encoded (composed vs. decomposed).
      import unicodedata

      return unicodedata.normalize("NFC", text).lower()


  @router.get("/search", response_model=list[FileModelResponse])
  async def search_files(
      filename: str = Query(..., description="Filename pattern to search for. Supports wildcards such as '*.pdf'"),
      user=Depends(get_verified_user)
  ):
      # Return the files visible to the caller whose filename matches the
      # shell-style wildcard pattern `filename` (case-insensitive).
      # Responds with HTTP 404 when nothing matches.
      #
      # The previous body called `normalize_text`, which is neither imported nor
      # defined in this module, so every request failed with NameError.
      # NOTE(review): NFC normalization is an assumption about what that helper
      # was meant to do; confirm the intended form.

      # Retrieve files from cache.
      # NOTE(review): get_all_files_for_user is memoized and nothing in this
      # module invalidates it, so recently uploaded or deleted files may not be
      # reflected here.
      files = get_all_files_for_user(user.id, user.role == "admin")

      # Normalize the pattern once; each candidate name is normalized the same way.
      normalized_pattern = _normalize_filename(filename)
      matching_files = [
          file for file in files
          if fnmatch(_normalize_filename(file.filename), normalized_pattern)
      ]

      if not matching_files:
          raise HTTPException(
              status_code=status.HTTP_404_NOT_FOUND,
              detail="No files found matching the pattern."
          )

      return matching_files
  182. ############################
  183. # Delete All Files
  184. ############################
  @router.delete("/all")
  async def delete_all_files(user=Depends(get_admin_user)):
      # Admin-only: delete every file record, then every stored file.
      # Responds with HTTP 400 if either step fails.
      if not Files.delete_all_files():
          raise HTTPException(
              status_code=status.HTTP_400_BAD_REQUEST,
              detail=ERROR_MESSAGES.DEFAULT("Error deleting files"),
          )

      # Records are gone; now remove the stored files themselves.
      try:
          Storage.delete_all_files()
      except Exception as e:
          log.exception(e)
          log.error("Error deleting files")
          raise HTTPException(
              status_code=status.HTTP_400_BAD_REQUEST,
              detail=ERROR_MESSAGES.DEFAULT("Error deleting files"),
          )

      return {"message": "All files deleted successfully"}
  204. ############################
  205. # Get File By Id
  206. ############################
  @router.get("/{id}", response_model=Optional[FileModel])
  async def get_file_by_id(id: str, user=Depends(get_verified_user)):
      # Return the file record for `id` when the caller owns it, is an admin, or
      # has "read" access through a knowledge base. A missing file and a denied
      # request both answer with the same HTTP 404.
      record = Files.get_file_by_id(id)
      if not record:
          raise HTTPException(
              status_code=status.HTTP_404_NOT_FOUND,
              detail=ERROR_MESSAGES.NOT_FOUND,
          )

      allowed = (
          record.user_id == user.id
          or user.role == "admin"
          or has_access_to_file(id, "read", user)
      )
      if not allowed:
          raise HTTPException(
              status_code=status.HTTP_404_NOT_FOUND,
              detail=ERROR_MESSAGES.NOT_FOUND,
          )

      return record
  226. ############################
  227. # Get File Data Content By Id
  228. ############################
  @router.get("/{id}/data/content")
  async def get_file_data_content_by_id(id: str, user=Depends(get_verified_user)):
      # Return {"content": ...} with the "content" entry of the file's data
      # (empty string when the entry is absent). Access requires ownership, the
      # admin role, or "read" access through a knowledge base; a missing file and
      # a denied request both answer with HTTP 404.
      record = Files.get_file_by_id(id)
      if not record:
          raise HTTPException(
              status_code=status.HTTP_404_NOT_FOUND,
              detail=ERROR_MESSAGES.NOT_FOUND,
          )

      allowed = (
          record.user_id == user.id
          or user.role == "admin"
          or has_access_to_file(id, "read", user)
      )
      if not allowed:
          raise HTTPException(
              status_code=status.HTTP_404_NOT_FOUND,
              detail=ERROR_MESSAGES.NOT_FOUND,
          )

      return {"content": record.data.get("content", "")}
  248. ############################
  249. # Update File Data Content By Id
  250. ############################
  class ContentForm(BaseModel):
      # Request body for the "/{id}/data/content/update" endpoint.
      # `content` is the replacement text handed to process_file for the file.
      content: str
  @router.post("/{id}/data/content/update")
  async def update_file_data_content_by_id(
      request: Request, id: str, form_data: ContentForm, user=Depends(get_verified_user)
  ):
      # Re-process the file with the submitted content and return the content
      # stored afterwards as {"content": ...}. Access requires ownership, the
      # admin role, or "write" access through a knowledge base; a missing file
      # and a denied request both answer with HTTP 404.
      record = Files.get_file_by_id(id)
      if not record:
          raise HTTPException(
              status_code=status.HTTP_404_NOT_FOUND,
              detail=ERROR_MESSAGES.NOT_FOUND,
          )

      may_write = (
          record.user_id == user.id
          or user.role == "admin"
          or has_access_to_file(id, "write", user)
      )
      if not may_write:
          raise HTTPException(
              status_code=status.HTTP_404_NOT_FOUND,
              detail=ERROR_MESSAGES.NOT_FOUND,
          )

      try:
          update_form = ProcessFileForm(file_id=id, content=form_data.content)
          process_file(request, update_form, user=user)
          # Reload so the response reflects what processing stored.
          record = Files.get_file_by_id(id=id)
      except Exception as e:
          # Processing failures are only logged; the previously loaded record's
          # content is returned below.
          log.exception(e)
          log.error(f"Error processing file: {record.id}")

      return {"content": record.data.get("content", "")}
  284. ############################
  285. # Get File Content By Id
  286. ############################
  @router.get("/{id}/content")
  async def get_file_content_by_id(
      id: str, user=Depends(get_verified_user), attachment: bool = Query(False)
  ):
      # Serve the stored file for `id`.
      #
      # Content-Disposition rules:
      #   * attachment=true          -> always "attachment"
      #   * PDF (by type or suffix)  -> "inline", media type forced to application/pdf
      #   * text/plain               -> no Content-Disposition header
      #   * anything else            -> "attachment"
      #
      # Access requires ownership, the admin role, or "read" access through a
      # knowledge base. A missing record, a denied request and a missing stored
      # file all answer with HTTP 404; unexpected failures answer with HTTP 400.
      file = Files.get_file_by_id(id)
      if not file:
          raise HTTPException(
              status_code=status.HTTP_404_NOT_FOUND,
              detail=ERROR_MESSAGES.NOT_FOUND,
          )

      if not (
          file.user_id == user.id
          or user.role == "admin"
          or has_access_to_file(id, "read", user)
      ):
          raise HTTPException(
              status_code=status.HTTP_404_NOT_FOUND,
              detail=ERROR_MESSAGES.NOT_FOUND,
          )

      try:
          file_path = Path(Storage.get_file(file.path))

          # Check that the path handed back by Storage is an existing file.
          if not file_path.is_file():
              raise HTTPException(
                  status_code=status.HTTP_404_NOT_FOUND,
                  detail=ERROR_MESSAGES.NOT_FOUND,
              )

          # Handle Unicode filenames: percent-encode for the RFC 5987
          # `filename*=UTF-8''...` form.
          filename = file.meta.get("name", file.filename)
          encoded_filename = quote(filename)
          content_type = file.meta.get("content_type")

          headers = {}
          if attachment:
              headers["Content-Disposition"] = (
                  f"attachment; filename*=UTF-8''{encoded_filename}"
              )
          elif content_type == "application/pdf" or filename.lower().endswith(".pdf"):
              headers["Content-Disposition"] = (
                  f"inline; filename*=UTF-8''{encoded_filename}"
              )
              content_type = "application/pdf"
          elif content_type != "text/plain":
              headers["Content-Disposition"] = (
                  f"attachment; filename*=UTF-8''{encoded_filename}"
              )

          return FileResponse(file_path, headers=headers, media_type=content_type)
      except HTTPException:
          # Keep the 404 raised above; the generic handler below would otherwise
          # turn it into a 400.
          raise
      except Exception as e:
          log.exception(e)
          log.error("Error getting file content")
          raise HTTPException(
              status_code=status.HTTP_400_BAD_REQUEST,
              detail=ERROR_MESSAGES.DEFAULT("Error getting file content"),
          )
  @router.get("/{id}/content/html")
  async def get_html_file_content_by_id(id: str, user=Depends(get_verified_user)):
      # Serve the stored file for `id` as a plain FileResponse (no custom headers).
      #
      # Access requires ownership, the admin role, or "read" access through a
      # knowledge base. A missing record, a denied request and a missing stored
      # file all answer with HTTP 404; unexpected failures answer with HTTP 400.
      file = Files.get_file_by_id(id)
      if not file:
          raise HTTPException(
              status_code=status.HTTP_404_NOT_FOUND,
              detail=ERROR_MESSAGES.NOT_FOUND,
          )

      if not (
          file.user_id == user.id
          or user.role == "admin"
          or has_access_to_file(id, "read", user)
      ):
          raise HTTPException(
              status_code=status.HTTP_404_NOT_FOUND,
              detail=ERROR_MESSAGES.NOT_FOUND,
          )

      try:
          file_path = Path(Storage.get_file(file.path))

          # Check that the path handed back by Storage is an existing file.
          if not file_path.is_file():
              raise HTTPException(
                  status_code=status.HTTP_404_NOT_FOUND,
                  detail=ERROR_MESSAGES.NOT_FOUND,
              )

          log.info(f"file_path: {file_path}")
          return FileResponse(file_path)
      except HTTPException:
          # Keep the 404 raised above; the generic handler below would otherwise
          # turn it into a 400.
          raise
      except Exception as e:
          log.exception(e)
          log.error("Error getting file content")
          raise HTTPException(
              status_code=status.HTTP_400_BAD_REQUEST,
              detail=ERROR_MESSAGES.DEFAULT("Error getting file content"),
          )
  @router.get("/{id}/content/{file_name}")
  async def get_file_content_by_id(id: str, user=Depends(get_verified_user)):
      # Download the file for `id` as an attachment. The trailing {file_name}
      # path segment is not read by this handler.
      #
      # When the record has a storage path the stored file is returned; otherwise
      # the extracted text content is streamed back as text/plain.
      # Access requires ownership, the admin role, or "read" access through a
      # knowledge base; a missing record, a denied request and a missing stored
      # file all answer with HTTP 404.
      #
      # NOTE(review): this definition reuses the name of the "/{id}/content"
      # handler defined earlier in the module and therefore rebinds that
      # module-level name; consider giving it a distinct name.
      file = Files.get_file_by_id(id)
      if not file:
          raise HTTPException(
              status_code=status.HTTP_404_NOT_FOUND,
              detail=ERROR_MESSAGES.NOT_FOUND,
          )

      if not (
          file.user_id == user.id
          or user.role == "admin"
          or has_access_to_file(id, "read", user)
      ):
          raise HTTPException(
              status_code=status.HTTP_404_NOT_FOUND,
              detail=ERROR_MESSAGES.NOT_FOUND,
          )

      # Handle Unicode filenames: percent-encode for the RFC 5987
      # `filename*=UTF-8''...` form.
      filename = file.meta.get("name", file.filename)
      encoded_filename = quote(filename)
      headers = {
          "Content-Disposition": f"attachment; filename*=UTF-8''{encoded_filename}"
      }

      if file.path:
          file_path = Path(Storage.get_file(file.path))

          # Check that the path handed back by Storage is an existing file.
          if file_path.is_file():
              return FileResponse(file_path, headers=headers)

          raise HTTPException(
              status_code=status.HTTP_404_NOT_FOUND,
              detail=ERROR_MESSAGES.NOT_FOUND,
          )

      # File path doesn't exist, return the content as .txt if possible.
      # The extracted text is read from `file.data`, as in the other handlers of
      # this module; the previous `file.content` attribute is not used anywhere
      # else here.
      file_content = (file.data or {}).get("content", "")

      # Create a generator that encodes the file content
      def generator():
          yield file_content.encode("utf-8")

      return StreamingResponse(
          generator(),
          media_type="text/plain",
          headers=headers,
      )
  433. ############################
  434. # Delete File By Id
  435. ############################
  @router.delete("/{id}")
  async def delete_file_by_id(id: str, user=Depends(get_verified_user)):
      # Delete the file record for `id`, then its stored file.
      # Access requires ownership, the admin role, or "write" access through a
      # knowledge base; a missing file and a denied request both answer with
      # HTTP 404. A failed record or storage deletion answers with HTTP 400.
      record = Files.get_file_by_id(id)
      if not record:
          raise HTTPException(
              status_code=status.HTTP_404_NOT_FOUND,
              detail=ERROR_MESSAGES.NOT_FOUND,
          )

      may_delete = (
          record.user_id == user.id
          or user.role == "admin"
          or has_access_to_file(id, "write", user)
      )
      if not may_delete:
          raise HTTPException(
              status_code=status.HTTP_404_NOT_FOUND,
              detail=ERROR_MESSAGES.NOT_FOUND,
          )

      # We should add Chroma cleanup here
      if not Files.delete_file_by_id(id):
          raise HTTPException(
              status_code=status.HTTP_400_BAD_REQUEST,
              detail=ERROR_MESSAGES.DEFAULT("Error deleting file"),
          )

      # The record is gone; now remove the stored file itself.
      try:
          Storage.delete_file(record.path)
      except Exception as e:
          log.exception(e)
          log.error("Error deleting files")
          raise HTTPException(
              status_code=status.HTTP_400_BAD_REQUEST,
              detail=ERROR_MESSAGES.DEFAULT("Error deleting files"),
          )

      return {"message": "File deleted successfully"}