files.py 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563
  1. import logging
  2. import os
  3. import uuid
  4. from fnmatch import fnmatch
  5. from functools import lru_cache
  6. from pathlib import Path
  7. from typing import Optional
  8. from urllib.parse import quote
  9. from fastapi import (
  10. APIRouter,
  11. Depends,
  12. File,
  13. HTTPException,
  14. Request,
  15. UploadFile,
  16. status,
  17. Query,
  18. )
  19. from fastapi.responses import FileResponse, StreamingResponse
  20. from open_webui.constants import ERROR_MESSAGES
  21. from open_webui.env import SRC_LOG_LEVELS
  22. from open_webui.models.files import (
  23. FileForm,
  24. FileModel,
  25. FileModelResponse,
  26. Files,
  27. )
  28. from open_webui.models.knowledge import Knowledges
  29. from open_webui.routers.knowledge import get_knowledge, get_knowledge_list
  30. from open_webui.routers.retrieval import ProcessFileForm, process_file
  31. from open_webui.routers.audio import transcribe
  32. from open_webui.storage.provider import Storage
  33. from open_webui.utils.auth import get_admin_user, get_verified_user
  34. from pydantic import BaseModel
# Module-level logger; its level is taken from the "MODELS" entry of SRC_LOG_LEVELS.
log = logging.getLogger(__name__)
log.setLevel(SRC_LOG_LEVELS["MODELS"])

# Router on which the endpoint decorators in this module register their routes.
router = APIRouter()
  38. ############################
  39. # Check if the current user has access to a file through any knowledge bases the user may be in.
  40. ############################
  41. def has_access_to_file(
  42. file_id: Optional[str], access_type: str, user=Depends(get_verified_user)
  43. ) -> bool:
  44. file = Files.get_file_by_id(file_id)
  45. log.debug(f"Checking if user has {access_type} access to file")
  46. if not file:
  47. raise HTTPException(
  48. status_code=status.HTTP_404_NOT_FOUND,
  49. detail=ERROR_MESSAGES.NOT_FOUND,
  50. )
  51. has_access = False
  52. knowledge_base_id = file.meta.get("collection_name") if file.meta else None
  53. if knowledge_base_id:
  54. knowledge_bases = Knowledges.get_knowledge_bases_by_user_id(
  55. user.id, access_type
  56. )
  57. for knowledge_base in knowledge_bases:
  58. if knowledge_base.id == knowledge_base_id:
  59. has_access = True
  60. break
  61. return has_access
  62. ############################
  63. # Get all files for user, with 1 cache
  64. ############################
  65. @lru_cache(maxsize=1)
  66. def get_all_files_for_user(user_id: str, admin: bool):
  67. if admin:
  68. return Files.get_files()
  69. else:
  70. return Files.get_files_by_user_id(user_id)
  71. ############################
  72. # Upload File
  73. ############################
  74. @router.post("/", response_model=FileModelResponse)
  75. def upload_file(
  76. request: Request,
  77. file: UploadFile = File(...),
  78. user=Depends(get_verified_user),
  79. file_metadata: dict = {},
  80. process: bool = Query(True),
  81. ):
  82. log.info(f"file.content_type: {file.content_type}")
  83. try:
  84. unsanitized_filename = file.filename
  85. filename = os.path.basename(unsanitized_filename)
  86. # replace filename with uuid
  87. id = str(uuid.uuid4())
  88. name = filename
  89. filename = f"{id}_{filename}"
  90. contents, file_path = Storage.upload_file(file.file, filename)
  91. file_item = Files.insert_new_file(
  92. user.id,
  93. FileForm(
  94. **{
  95. "id": id,
  96. "filename": name,
  97. "path": file_path,
  98. "meta": {
  99. "name": name,
  100. "content_type": file.content_type,
  101. "size": len(contents),
  102. "data": file_metadata,
  103. },
  104. }
  105. ),
  106. )
  107. if process:
  108. try:
  109. if file.content_type in [
  110. "audio/mpeg",
  111. "audio/wav",
  112. "audio/ogg",
  113. "audio/x-m4a",
  114. ]:
  115. file_path = Storage.get_file(file_path)
  116. result = transcribe(request, file_path)
  117. process_file(
  118. request,
  119. ProcessFileForm(file_id=id, content=result.get("text", "")),
  120. user=user,
  121. )
  122. elif file.content_type not in ["image/png", "image/jpeg", "image/gif"]:
  123. process_file(request, ProcessFileForm(file_id=id), user=user)
  124. file_item = Files.get_file_by_id(id=id)
  125. except Exception as e:
  126. log.exception(e)
  127. log.error(f"Error processing file: {file_item.id}")
  128. file_item = FileModelResponse(
  129. **{
  130. **file_item.model_dump(),
  131. "error": str(e.detail) if hasattr(e, "detail") else str(e),
  132. }
  133. )
  134. if file_item:
  135. return file_item
  136. else:
  137. raise HTTPException(
  138. status_code=status.HTTP_400_BAD_REQUEST,
  139. detail=ERROR_MESSAGES.DEFAULT("Error uploading file"),
  140. )
  141. except Exception as e:
  142. log.exception(e)
  143. raise HTTPException(
  144. status_code=status.HTTP_400_BAD_REQUEST,
  145. detail=ERROR_MESSAGES.DEFAULT(e),
  146. )
  147. ############################
  148. # List Files
  149. ############################
  150. @router.get("/", response_model=list[FileModelResponse])
  151. async def list_files(user=Depends(get_verified_user), content: bool = Query(True)):
  152. if user.role == "admin":
  153. files = Files.get_files()
  154. else:
  155. files = Files.get_files_by_user_id(user.id)
  156. if not content:
  157. for file in files:
  158. del file.data["content"]
  159. return files
  160. ############################
  161. # Search Files
  162. ############################
  163. @router.get("/search", response_model=list[FileModelResponse])
  164. async def search_files(
  165. filename: str = Query(..., description="Filename pattern to search for. Supports wildcards such as '*.pdf'"),
  166. user=Depends(get_verified_user)
  167. ):
  168. # Retrieve files from cache
  169. files = get_all_files_for_user(user.id, user.role == "admin")
  170. # Get matching files
  171. matching_files = [file for file in files if fnmatch(file.filename.lower(), filename.lower())]
  172. if not matching_files:
  173. raise HTTPException(
  174. status_code=status.HTTP_404_NOT_FOUND,
  175. detail="No files found matching the pattern."
  176. )
  177. return matching_files
  178. ############################
  179. # Delete All Files
  180. ############################
  181. @router.delete("/all")
  182. async def delete_all_files(user=Depends(get_admin_user)):
  183. result = Files.delete_all_files()
  184. if result:
  185. try:
  186. Storage.delete_all_files()
  187. except Exception as e:
  188. log.exception(e)
  189. log.error("Error deleting files")
  190. raise HTTPException(
  191. status_code=status.HTTP_400_BAD_REQUEST,
  192. detail=ERROR_MESSAGES.DEFAULT("Error deleting files"),
  193. )
  194. return {"message": "All files deleted successfully"}
  195. else:
  196. raise HTTPException(
  197. status_code=status.HTTP_400_BAD_REQUEST,
  198. detail=ERROR_MESSAGES.DEFAULT("Error deleting files"),
  199. )
  200. ############################
  201. # Get File By Id
  202. ############################
  203. @router.get("/{id}", response_model=Optional[FileModel])
  204. async def get_file_by_id(id: str, user=Depends(get_verified_user)):
  205. file = Files.get_file_by_id(id)
  206. if not file:
  207. raise HTTPException(
  208. status_code=status.HTTP_404_NOT_FOUND,
  209. detail=ERROR_MESSAGES.NOT_FOUND,
  210. )
  211. if (
  212. file.user_id == user.id
  213. or user.role == "admin"
  214. or has_access_to_file(id, "read", user)
  215. ):
  216. return file
  217. else:
  218. raise HTTPException(
  219. status_code=status.HTTP_404_NOT_FOUND,
  220. detail=ERROR_MESSAGES.NOT_FOUND,
  221. )
  222. ############################
  223. # Get File Data Content By Id
  224. ############################
  225. @router.get("/{id}/data/content")
  226. async def get_file_data_content_by_id(id: str, user=Depends(get_verified_user)):
  227. file = Files.get_file_by_id(id)
  228. if not file:
  229. raise HTTPException(
  230. status_code=status.HTTP_404_NOT_FOUND,
  231. detail=ERROR_MESSAGES.NOT_FOUND,
  232. )
  233. if (
  234. file.user_id == user.id
  235. or user.role == "admin"
  236. or has_access_to_file(id, "read", user)
  237. ):
  238. return {"content": file.data.get("content", "")}
  239. else:
  240. raise HTTPException(
  241. status_code=status.HTTP_404_NOT_FOUND,
  242. detail=ERROR_MESSAGES.NOT_FOUND,
  243. )
  244. ############################
  245. # Update File Data Content By Id
  246. ############################
class ContentForm(BaseModel):
    """Request body for the content-update endpoint: the new text for a file."""

    # Text forwarded to `process_file` as the file's content.
    content: str
  249. @router.post("/{id}/data/content/update")
  250. async def update_file_data_content_by_id(
  251. request: Request, id: str, form_data: ContentForm, user=Depends(get_verified_user)
  252. ):
  253. file = Files.get_file_by_id(id)
  254. if not file:
  255. raise HTTPException(
  256. status_code=status.HTTP_404_NOT_FOUND,
  257. detail=ERROR_MESSAGES.NOT_FOUND,
  258. )
  259. if (
  260. file.user_id == user.id
  261. or user.role == "admin"
  262. or has_access_to_file(id, "write", user)
  263. ):
  264. try:
  265. process_file(
  266. request,
  267. ProcessFileForm(file_id=id, content=form_data.content),
  268. user=user,
  269. )
  270. file = Files.get_file_by_id(id=id)
  271. except Exception as e:
  272. log.exception(e)
  273. log.error(f"Error processing file: {file.id}")
  274. return {"content": file.data.get("content", "")}
  275. else:
  276. raise HTTPException(
  277. status_code=status.HTTP_404_NOT_FOUND,
  278. detail=ERROR_MESSAGES.NOT_FOUND,
  279. )
  280. ############################
  281. # Get File Content By Id
  282. ############################
  283. @router.get("/{id}/content")
  284. async def get_file_content_by_id(
  285. id: str, user=Depends(get_verified_user), attachment: bool = Query(False)
  286. ):
  287. file = Files.get_file_by_id(id)
  288. if not file:
  289. raise HTTPException(
  290. status_code=status.HTTP_404_NOT_FOUND,
  291. detail=ERROR_MESSAGES.NOT_FOUND,
  292. )
  293. if (
  294. file.user_id == user.id
  295. or user.role == "admin"
  296. or has_access_to_file(id, "read", user)
  297. ):
  298. try:
  299. file_path = Storage.get_file(file.path)
  300. file_path = Path(file_path)
  301. # Check if the file already exists in the cache
  302. if file_path.is_file():
  303. # Handle Unicode filenames
  304. filename = file.meta.get("name", file.filename)
  305. encoded_filename = quote(filename) # RFC5987 encoding
  306. content_type = file.meta.get("content_type")
  307. filename = file.meta.get("name", file.filename)
  308. encoded_filename = quote(filename)
  309. headers = {}
  310. if attachment:
  311. headers["Content-Disposition"] = (
  312. f"attachment; filename*=UTF-8''{encoded_filename}"
  313. )
  314. else:
  315. if content_type == "application/pdf" or filename.lower().endswith(
  316. ".pdf"
  317. ):
  318. headers["Content-Disposition"] = (
  319. f"inline; filename*=UTF-8''{encoded_filename}"
  320. )
  321. content_type = "application/pdf"
  322. elif content_type != "text/plain":
  323. headers["Content-Disposition"] = (
  324. f"attachment; filename*=UTF-8''{encoded_filename}"
  325. )
  326. return FileResponse(file_path, headers=headers, media_type=content_type)
  327. else:
  328. raise HTTPException(
  329. status_code=status.HTTP_404_NOT_FOUND,
  330. detail=ERROR_MESSAGES.NOT_FOUND,
  331. )
  332. except Exception as e:
  333. log.exception(e)
  334. log.error("Error getting file content")
  335. raise HTTPException(
  336. status_code=status.HTTP_400_BAD_REQUEST,
  337. detail=ERROR_MESSAGES.DEFAULT("Error getting file content"),
  338. )
  339. else:
  340. raise HTTPException(
  341. status_code=status.HTTP_404_NOT_FOUND,
  342. detail=ERROR_MESSAGES.NOT_FOUND,
  343. )
  344. @router.get("/{id}/content/html")
  345. async def get_html_file_content_by_id(id: str, user=Depends(get_verified_user)):
  346. file = Files.get_file_by_id(id)
  347. if not file:
  348. raise HTTPException(
  349. status_code=status.HTTP_404_NOT_FOUND,
  350. detail=ERROR_MESSAGES.NOT_FOUND,
  351. )
  352. if (
  353. file.user_id == user.id
  354. or user.role == "admin"
  355. or has_access_to_file(id, "read", user)
  356. ):
  357. try:
  358. file_path = Storage.get_file(file.path)
  359. file_path = Path(file_path)
  360. # Check if the file already exists in the cache
  361. if file_path.is_file():
  362. log.info(f"file_path: {file_path}")
  363. return FileResponse(file_path)
  364. else:
  365. raise HTTPException(
  366. status_code=status.HTTP_404_NOT_FOUND,
  367. detail=ERROR_MESSAGES.NOT_FOUND,
  368. )
  369. except Exception as e:
  370. log.exception(e)
  371. log.error("Error getting file content")
  372. raise HTTPException(
  373. status_code=status.HTTP_400_BAD_REQUEST,
  374. detail=ERROR_MESSAGES.DEFAULT("Error getting file content"),
  375. )
  376. else:
  377. raise HTTPException(
  378. status_code=status.HTTP_404_NOT_FOUND,
  379. detail=ERROR_MESSAGES.NOT_FOUND,
  380. )
  381. @router.get("/{id}/content/{file_name}")
  382. async def get_file_content_by_id(id: str, user=Depends(get_verified_user)):
  383. file = Files.get_file_by_id(id)
  384. if not file:
  385. raise HTTPException(
  386. status_code=status.HTTP_404_NOT_FOUND,
  387. detail=ERROR_MESSAGES.NOT_FOUND,
  388. )
  389. if (
  390. file.user_id == user.id
  391. or user.role == "admin"
  392. or has_access_to_file(id, "read", user)
  393. ):
  394. file_path = file.path
  395. # Handle Unicode filenames
  396. filename = file.meta.get("name", file.filename)
  397. encoded_filename = quote(filename) # RFC5987 encoding
  398. headers = {
  399. "Content-Disposition": f"attachment; filename*=UTF-8''{encoded_filename}"
  400. }
  401. if file_path:
  402. file_path = Storage.get_file(file_path)
  403. file_path = Path(file_path)
  404. # Check if the file already exists in the cache
  405. if file_path.is_file():
  406. return FileResponse(file_path, headers=headers)
  407. else:
  408. raise HTTPException(
  409. status_code=status.HTTP_404_NOT_FOUND,
  410. detail=ERROR_MESSAGES.NOT_FOUND,
  411. )
  412. else:
  413. # File path doesn’t exist, return the content as .txt if possible
  414. file_content = file.content.get("content", "")
  415. file_name = file.filename
  416. # Create a generator that encodes the file content
  417. def generator():
  418. yield file_content.encode("utf-8")
  419. return StreamingResponse(
  420. generator(),
  421. media_type="text/plain",
  422. headers=headers,
  423. )
  424. else:
  425. raise HTTPException(
  426. status_code=status.HTTP_404_NOT_FOUND,
  427. detail=ERROR_MESSAGES.NOT_FOUND,
  428. )
  429. ############################
  430. # Delete File By Id
  431. ############################
  432. @router.delete("/{id}")
  433. async def delete_file_by_id(id: str, user=Depends(get_verified_user)):
  434. file = Files.get_file_by_id(id)
  435. if not file:
  436. raise HTTPException(
  437. status_code=status.HTTP_404_NOT_FOUND,
  438. detail=ERROR_MESSAGES.NOT_FOUND,
  439. )
  440. if (
  441. file.user_id == user.id
  442. or user.role == "admin"
  443. or has_access_to_file(id, "write", user)
  444. ):
  445. # We should add Chroma cleanup here
  446. result = Files.delete_file_by_id(id)
  447. if result:
  448. try:
  449. Storage.delete_file(file.path)
  450. except Exception as e:
  451. log.exception(e)
  452. log.error("Error deleting files")
  453. raise HTTPException(
  454. status_code=status.HTTP_400_BAD_REQUEST,
  455. detail=ERROR_MESSAGES.DEFAULT("Error deleting files"),
  456. )
  457. return {"message": "File deleted successfully"}
  458. else:
  459. raise HTTPException(
  460. status_code=status.HTTP_400_BAD_REQUEST,
  461. detail=ERROR_MESSAGES.DEFAULT("Error deleting file"),
  462. )
  463. else:
  464. raise HTTPException(
  465. status_code=status.HTTP_404_NOT_FOUND,
  466. detail=ERROR_MESSAGES.NOT_FOUND,
  467. )