files.py 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489
  1. import logging
  2. import os
  3. import uuid
  4. from pathlib import Path
  5. from typing import Optional
  6. from urllib.parse import quote
  7. from fastapi import APIRouter, Depends, File, HTTPException, Request, UploadFile, status
  8. from fastapi.responses import FileResponse, StreamingResponse
  9. from open_webui.constants import ERROR_MESSAGES
  10. from open_webui.env import SRC_LOG_LEVELS
  11. from open_webui.models.files import (
  12. FileForm,
  13. FileModel,
  14. FileModelResponse,
  15. Files,
  16. )
  17. from open_webui.routers.knowledge import get_knowledge, get_knowledge_list
  18. from open_webui.routers.retrieval import ProcessFileForm, process_file
  19. from open_webui.routers.audio import transcribe
  20. from open_webui.storage.provider import Storage
  21. from open_webui.utils.auth import get_admin_user, get_verified_user
  22. from pydantic import BaseModel
  23. log = logging.getLogger(__name__)
  24. log.setLevel(SRC_LOG_LEVELS["MODELS"])
  25. router = APIRouter()
  26. ############################
  27. # Check if the current user has access to a file through any knowledge bases the user may be in.
  28. ############################
  29. async def check_user_has_access_to_file_via_any_knowledge_base(file_id: Optional[str], access_type: str, user=Depends(get_verified_user)) -> bool:
  30. file = Files.get_file_by_id(file_id)
  31. log.debug(f"Checking if user has {access_type} access to file")
  32. if not file:
  33. raise HTTPException(
  34. status_code=status.HTTP_404_NOT_FOUND,
  35. detail=ERROR_MESSAGES.NOT_FOUND,
  36. )
  37. has_access = False
  38. knowledge_base_id = file.meta.get("collection_name") if file.meta else None
  39. log.debug(f"Knowledge base associated with file: {knowledge_base_id}")
  40. if knowledge_base_id:
  41. if access_type == "read":
  42. user_access = await get_knowledge(user=user) # get_knowledge checks for read access
  43. elif access_type == "write":
  44. user_access = await get_knowledge_list(user=user) # get_knowledge_list checks for write access
  45. else:
  46. user_access = list()
  47. for knowledge_base in user_access:
  48. if knowledge_base.id == knowledge_base_id:
  49. log.debug(f"User knowledge base with {access_type} access {knowledge_base.id} == File knowledge base {knowledge_base_id}")
  50. has_access = True
  51. break
  52. log.debug(f"Does user have {access_type} access to file: {has_access}")
  53. return has_access
  54. ############################
  55. # Upload File
  56. ############################
  57. @router.post("/", response_model=FileModelResponse)
  58. def upload_file(
  59. request: Request,
  60. file: UploadFile = File(...),
  61. user=Depends(get_verified_user),
  62. file_metadata: dict = {},
  63. ):
  64. log.info(f"file.content_type: {file.content_type}")
  65. try:
  66. unsanitized_filename = file.filename
  67. filename = os.path.basename(unsanitized_filename)
  68. # replace filename with uuid
  69. id = str(uuid.uuid4())
  70. name = filename
  71. filename = f"{id}_{filename}"
  72. contents, file_path = Storage.upload_file(file.file, filename)
  73. file_item = Files.insert_new_file(
  74. user.id,
  75. FileForm(
  76. **{
  77. "id": id,
  78. "filename": name,
  79. "path": file_path,
  80. "meta": {
  81. "name": name,
  82. "content_type": file.content_type,
  83. "size": len(contents),
  84. "data": file_metadata,
  85. },
  86. }
  87. ),
  88. )
  89. try:
  90. if file.content_type in [
  91. "audio/mpeg",
  92. "audio/wav",
  93. "audio/ogg",
  94. "audio/x-m4a",
  95. ]:
  96. file_path = Storage.get_file(file_path)
  97. result = transcribe(request, file_path)
  98. process_file(
  99. request,
  100. ProcessFileForm(file_id=id, content=result.get("text", "")),
  101. user=user,
  102. )
  103. else:
  104. process_file(request, ProcessFileForm(file_id=id), user=user)
  105. file_item = Files.get_file_by_id(id=id)
  106. except Exception as e:
  107. log.exception(e)
  108. log.error(f"Error processing file: {file_item.id}")
  109. file_item = FileModelResponse(
  110. **{
  111. **file_item.model_dump(),
  112. "error": str(e.detail) if hasattr(e, "detail") else str(e),
  113. }
  114. )
  115. if file_item:
  116. return file_item
  117. else:
  118. raise HTTPException(
  119. status_code=status.HTTP_400_BAD_REQUEST,
  120. detail=ERROR_MESSAGES.DEFAULT("Error uploading file"),
  121. )
  122. except Exception as e:
  123. log.exception(e)
  124. raise HTTPException(
  125. status_code=status.HTTP_400_BAD_REQUEST,
  126. detail=ERROR_MESSAGES.DEFAULT(e),
  127. )
  128. ############################
  129. # List Files
  130. ############################
  131. @router.get("/", response_model=list[FileModelResponse])
  132. async def list_files(user=Depends(get_verified_user)):
  133. if user.role == "admin":
  134. files = Files.get_files()
  135. else:
  136. files = Files.get_files_by_user_id(user.id)
  137. return files
  138. ############################
  139. # Delete All Files
  140. ############################
  141. @router.delete("/all")
  142. async def delete_all_files(user=Depends(get_admin_user)):
  143. result = Files.delete_all_files()
  144. if result:
  145. try:
  146. Storage.delete_all_files()
  147. except Exception as e:
  148. log.exception(e)
  149. log.error("Error deleting files")
  150. raise HTTPException(
  151. status_code=status.HTTP_400_BAD_REQUEST,
  152. detail=ERROR_MESSAGES.DEFAULT("Error deleting files"),
  153. )
  154. return {"message": "All files deleted successfully"}
  155. else:
  156. raise HTTPException(
  157. status_code=status.HTTP_400_BAD_REQUEST,
  158. detail=ERROR_MESSAGES.DEFAULT("Error deleting files"),
  159. )
  160. ############################
  161. # Get File By Id
  162. ############################
  163. @router.get("/{id}", response_model=Optional[FileModel])
  164. async def get_file_by_id(id: str, user=Depends(get_verified_user)):
  165. file = Files.get_file_by_id(id)
  166. if not file:
  167. raise HTTPException(
  168. status_code=status.HTTP_404_NOT_FOUND,
  169. detail=ERROR_MESSAGES.NOT_FOUND,
  170. )
  171. has_read_access: bool = await check_user_has_access_to_file_via_any_knowledge_base(id, "read", user)
  172. if file.user_id == user.id or user.role == "admin" or has_read_access:
  173. return file
  174. else:
  175. raise HTTPException(
  176. status_code=status.HTTP_404_NOT_FOUND,
  177. detail=ERROR_MESSAGES.NOT_FOUND,
  178. )
  179. ############################
  180. # Get File Data Content By Id
  181. ############################
  182. @router.get("/{id}/data/content")
  183. async def get_file_data_content_by_id(id: str, user=Depends(get_verified_user)):
  184. file = Files.get_file_by_id(id)
  185. if not file:
  186. raise HTTPException(
  187. status_code=status.HTTP_404_NOT_FOUND,
  188. detail=ERROR_MESSAGES.NOT_FOUND,
  189. )
  190. has_read_access: bool = await check_user_has_access_to_file_via_any_knowledge_base(id, "read", user)
  191. if file.user_id == user.id or user.role == "admin" or has_read_access:
  192. return {"content": file.data.get("content", "")}
  193. else:
  194. raise HTTPException(
  195. status_code=status.HTTP_404_NOT_FOUND,
  196. detail=ERROR_MESSAGES.NOT_FOUND,
  197. )
  198. ############################
  199. # Update File Data Content By Id
  200. ############################
  201. class ContentForm(BaseModel):
  202. content: str
  203. @router.post("/{id}/data/content/update")
  204. async def update_file_data_content_by_id(
  205. request: Request, id: str, form_data: ContentForm, user=Depends(get_verified_user)
  206. ):
  207. file = Files.get_file_by_id(id)
  208. if not file:
  209. raise HTTPException(
  210. status_code=status.HTTP_404_NOT_FOUND,
  211. detail=ERROR_MESSAGES.NOT_FOUND,
  212. )
  213. has_write_access: bool = await check_user_has_access_to_file_via_any_knowledge_base(id, "write", user)
  214. if file.user_id == user.id or user.role == "admin" or has_write_access:
  215. try:
  216. process_file(
  217. request,
  218. ProcessFileForm(file_id=id, content=form_data.content),
  219. user=user,
  220. )
  221. file = Files.get_file_by_id(id=id)
  222. except Exception as e:
  223. log.exception(e)
  224. log.error(f"Error processing file: {file.id}")
  225. return {"content": file.data.get("content", "")}
  226. else:
  227. raise HTTPException(
  228. status_code=status.HTTP_404_NOT_FOUND,
  229. detail=ERROR_MESSAGES.NOT_FOUND,
  230. )
  231. ############################
  232. # Get File Content By Id
  233. ############################
  234. @router.get("/{id}/content")
  235. async def get_file_content_by_id(id: str, user=Depends(get_verified_user)):
  236. file = Files.get_file_by_id(id)
  237. if not file:
  238. raise HTTPException(
  239. status_code=status.HTTP_404_NOT_FOUND,
  240. detail=ERROR_MESSAGES.NOT_FOUND,
  241. )
  242. has_read_access: bool = await check_user_has_access_to_file_via_any_knowledge_base(id, "read", user)
  243. if file.user_id == user.id or user.role == "admin" or has_read_access:
  244. try:
  245. file_path = Storage.get_file(file.path)
  246. file_path = Path(file_path)
  247. # Check if the file already exists in the cache
  248. if file_path.is_file():
  249. # Handle Unicode filenames
  250. filename = file.meta.get("name", file.filename)
  251. encoded_filename = quote(filename) # RFC5987 encoding
  252. content_type = file.meta.get("content_type")
  253. filename = file.meta.get("name", file.filename)
  254. encoded_filename = quote(filename)
  255. headers = {}
  256. if content_type == "application/pdf" or filename.lower().endswith(
  257. ".pdf"
  258. ):
  259. headers["Content-Disposition"] = (
  260. f"inline; filename*=UTF-8''{encoded_filename}"
  261. )
  262. content_type = "application/pdf"
  263. elif content_type != "text/plain":
  264. headers["Content-Disposition"] = (
  265. f"attachment; filename*=UTF-8''{encoded_filename}"
  266. )
  267. return FileResponse(file_path, headers=headers, media_type=content_type)
  268. else:
  269. raise HTTPException(
  270. status_code=status.HTTP_404_NOT_FOUND,
  271. detail=ERROR_MESSAGES.NOT_FOUND,
  272. )
  273. except Exception as e:
  274. log.exception(e)
  275. log.error("Error getting file content")
  276. raise HTTPException(
  277. status_code=status.HTTP_400_BAD_REQUEST,
  278. detail=ERROR_MESSAGES.DEFAULT("Error getting file content"),
  279. )
  280. else:
  281. raise HTTPException(
  282. status_code=status.HTTP_404_NOT_FOUND,
  283. detail=ERROR_MESSAGES.NOT_FOUND,
  284. )
  285. @router.get("/{id}/content/html")
  286. async def get_html_file_content_by_id(id: str, user=Depends(get_verified_user)):
  287. file = Files.get_file_by_id(id)
  288. if not file:
  289. raise HTTPException(
  290. status_code=status.HTTP_404_NOT_FOUND,
  291. detail=ERROR_MESSAGES.NOT_FOUND,
  292. )
  293. has_read_access: bool = await check_user_has_access_to_file_via_any_knowledge_base(id, "read", user)
  294. if file.user_id == user.id or user.role == "admin" or has_read_access:
  295. try:
  296. file_path = Storage.get_file(file.path)
  297. file_path = Path(file_path)
  298. # Check if the file already exists in the cache
  299. if file_path.is_file():
  300. log.info(f"file_path: {file_path}")
  301. return FileResponse(file_path)
  302. else:
  303. raise HTTPException(
  304. status_code=status.HTTP_404_NOT_FOUND,
  305. detail=ERROR_MESSAGES.NOT_FOUND,
  306. )
  307. except Exception as e:
  308. log.exception(e)
  309. log.error("Error getting file content")
  310. raise HTTPException(
  311. status_code=status.HTTP_400_BAD_REQUEST,
  312. detail=ERROR_MESSAGES.DEFAULT("Error getting file content"),
  313. )
  314. else:
  315. raise HTTPException(
  316. status_code=status.HTTP_404_NOT_FOUND,
  317. detail=ERROR_MESSAGES.NOT_FOUND,
  318. )
  319. @router.get("/{id}/content/{file_name}")
  320. async def get_file_content_by_id(id: str, user=Depends(get_verified_user)):
  321. file = Files.get_file_by_id(id)
  322. if not file:
  323. raise HTTPException(
  324. status_code=status.HTTP_404_NOT_FOUND,
  325. detail=ERROR_MESSAGES.NOT_FOUND,
  326. )
  327. has_read_access: bool = await check_user_has_access_to_file_via_any_knowledge_base(id, "read", user)
  328. if file.user_id == user.id or user.role == "admin" or has_read_access:
  329. file_path = file.path
  330. # Handle Unicode filenames
  331. filename = file.meta.get("name", file.filename)
  332. encoded_filename = quote(filename) # RFC5987 encoding
  333. headers = {
  334. "Content-Disposition": f"attachment; filename*=UTF-8''{encoded_filename}"
  335. }
  336. if file_path:
  337. file_path = Storage.get_file(file_path)
  338. file_path = Path(file_path)
  339. # Check if the file already exists in the cache
  340. if file_path.is_file():
  341. return FileResponse(file_path, headers=headers)
  342. else:
  343. raise HTTPException(
  344. status_code=status.HTTP_404_NOT_FOUND,
  345. detail=ERROR_MESSAGES.NOT_FOUND,
  346. )
  347. else:
  348. # File path doesn’t exist, return the content as .txt if possible
  349. file_content = file.content.get("content", "")
  350. file_name = file.filename
  351. # Create a generator that encodes the file content
  352. def generator():
  353. yield file_content.encode("utf-8")
  354. return StreamingResponse(
  355. generator(),
  356. media_type="text/plain",
  357. headers=headers,
  358. )
  359. else:
  360. raise HTTPException(
  361. status_code=status.HTTP_404_NOT_FOUND,
  362. detail=ERROR_MESSAGES.NOT_FOUND,
  363. )
  364. ############################
  365. # Delete File By Id
  366. ############################
  367. @router.delete("/{id}")
  368. async def delete_file_by_id(id: str, user=Depends(get_verified_user)):
  369. file = Files.get_file_by_id(id)
  370. if not file:
  371. raise HTTPException(
  372. status_code=status.HTTP_404_NOT_FOUND,
  373. detail=ERROR_MESSAGES.NOT_FOUND,
  374. )
  375. has_write_access: bool = await check_user_has_access_to_file_via_any_knowledge_base(id, "write", user)
  376. if file.user_id == user.id or user.role == "admin" or has_write_access:
  377. # We should add Chroma cleanup here
  378. result = Files.delete_file_by_id(id)
  379. if result:
  380. try:
  381. Storage.delete_file(file.path)
  382. except Exception as e:
  383. log.exception(e)
  384. log.error("Error deleting files")
  385. raise HTTPException(
  386. status_code=status.HTTP_400_BAD_REQUEST,
  387. detail=ERROR_MESSAGES.DEFAULT("Error deleting files"),
  388. )
  389. return {"message": "File deleted successfully"}
  390. else:
  391. raise HTTPException(
  392. status_code=status.HTTP_400_BAD_REQUEST,
  393. detail=ERROR_MESSAGES.DEFAULT("Error deleting file"),
  394. )
  395. else:
  396. raise HTTPException(
  397. status_code=status.HTTP_404_NOT_FOUND,
  398. detail=ERROR_MESSAGES.NOT_FOUND,
  399. )