# files.py
  1. import logging
  2. import os
  3. import uuid
  4. from pathlib import Path
  5. from typing import Optional
  6. from urllib.parse import quote
  7. from fastapi import (
  8. APIRouter,
  9. Depends,
  10. File,
  11. HTTPException,
  12. Request,
  13. UploadFile,
  14. status,
  15. Query,
  16. )
  17. from fastapi.responses import FileResponse, StreamingResponse
  18. from open_webui.constants import ERROR_MESSAGES
  19. from open_webui.env import SRC_LOG_LEVELS
  20. from open_webui.models.files import (
  21. FileForm,
  22. FileModel,
  23. FileModelResponse,
  24. Files,
  25. )
  26. from open_webui.routers.knowledge import get_knowledge, get_knowledge_list
  27. from open_webui.routers.retrieval import ProcessFileForm, process_file
  28. from open_webui.routers.audio import transcribe
  29. from open_webui.storage.provider import Storage
  30. from open_webui.utils.auth import get_admin_user, get_verified_user
  31. from pydantic import BaseModel
# Module-level logger; its level is taken from the "MODELS" entry of SRC_LOG_LEVELS.
log = logging.getLogger(__name__)
log.setLevel(SRC_LOG_LEVELS["MODELS"])

# Router on which every endpoint in this module is registered.
router = APIRouter()
  35. ############################
  36. # Check if the current user has access to a file through any knowledge bases the user may be in.
  37. ############################
  38. async def check_user_has_access_to_file_via_any_knowledge_base(file_id: Optional[str], access_type: str, user=Depends(get_verified_user)) -> bool:
  39. file = Files.get_file_by_id(file_id)
  40. log.debug(f"Checking if user has {access_type} access to file")
  41. if not file:
  42. raise HTTPException(
  43. status_code=status.HTTP_404_NOT_FOUND,
  44. detail=ERROR_MESSAGES.NOT_FOUND,
  45. )
  46. has_access = False
  47. knowledge_base_id = file.meta.get("collection_name") if file.meta else None
  48. log.debug(f"Knowledge base associated with file: {knowledge_base_id}")
  49. if knowledge_base_id:
  50. if access_type == "read":
  51. user_access = await get_knowledge(user=user) # get_knowledge checks for read access
  52. elif access_type == "write":
  53. user_access = await get_knowledge_list(user=user) # get_knowledge_list checks for write access
  54. else:
  55. user_access = list()
  56. for knowledge_base in user_access:
  57. if knowledge_base.id == knowledge_base_id:
  58. log.debug(f"User knowledge base with {access_type} access {knowledge_base.id} == File knowledge base {knowledge_base_id}")
  59. has_access = True
  60. break
  61. log.debug(f"Does user have {access_type} access to file: {has_access}")
  62. return has_access
  63. ############################
  64. # Upload File
  65. ############################
  66. @router.post("/", response_model=FileModelResponse)
  67. def upload_file(
  68. request: Request,
  69. file: UploadFile = File(...),
  70. user=Depends(get_verified_user),
  71. file_metadata: dict = {},
  72. process: bool = Query(True),
  73. ):
  74. log.info(f"file.content_type: {file.content_type}")
  75. try:
  76. unsanitized_filename = file.filename
  77. filename = os.path.basename(unsanitized_filename)
  78. # replace filename with uuid
  79. id = str(uuid.uuid4())
  80. name = filename
  81. filename = f"{id}_{filename}"
  82. contents, file_path = Storage.upload_file(file.file, filename)
  83. file_item = Files.insert_new_file(
  84. user.id,
  85. FileForm(
  86. **{
  87. "id": id,
  88. "filename": name,
  89. "path": file_path,
  90. "meta": {
  91. "name": name,
  92. "content_type": file.content_type,
  93. "size": len(contents),
  94. "data": file_metadata,
  95. },
  96. }
  97. ),
  98. )
  99. if process:
  100. try:
  101. if file.content_type in [
  102. "audio/mpeg",
  103. "audio/wav",
  104. "audio/ogg",
  105. "audio/x-m4a",
  106. ]:
  107. file_path = Storage.get_file(file_path)
  108. result = transcribe(request, file_path)
  109. process_file(
  110. request,
  111. ProcessFileForm(file_id=id, content=result.get("text", "")),
  112. user=user,
  113. )
  114. elif file.content_type not in ["image/png", "image/jpeg", "image/gif"]:
  115. process_file(request, ProcessFileForm(file_id=id), user=user)
  116. file_item = Files.get_file_by_id(id=id)
  117. except Exception as e:
  118. log.exception(e)
  119. log.error(f"Error processing file: {file_item.id}")
  120. file_item = FileModelResponse(
  121. **{
  122. **file_item.model_dump(),
  123. "error": str(e.detail) if hasattr(e, "detail") else str(e),
  124. }
  125. )
  126. if file_item:
  127. return file_item
  128. else:
  129. raise HTTPException(
  130. status_code=status.HTTP_400_BAD_REQUEST,
  131. detail=ERROR_MESSAGES.DEFAULT("Error uploading file"),
  132. )
  133. except Exception as e:
  134. log.exception(e)
  135. raise HTTPException(
  136. status_code=status.HTTP_400_BAD_REQUEST,
  137. detail=ERROR_MESSAGES.DEFAULT(e),
  138. )
  139. ############################
  140. # List Files
  141. ############################
  142. @router.get("/", response_model=list[FileModelResponse])
  143. async def list_files(user=Depends(get_verified_user)):
  144. if user.role == "admin":
  145. files = Files.get_files()
  146. else:
  147. files = Files.get_files_by_user_id(user.id)
  148. return files
  149. ############################
  150. # Delete All Files
  151. ############################
  152. @router.delete("/all")
  153. async def delete_all_files(user=Depends(get_admin_user)):
  154. result = Files.delete_all_files()
  155. if result:
  156. try:
  157. Storage.delete_all_files()
  158. except Exception as e:
  159. log.exception(e)
  160. log.error("Error deleting files")
  161. raise HTTPException(
  162. status_code=status.HTTP_400_BAD_REQUEST,
  163. detail=ERROR_MESSAGES.DEFAULT("Error deleting files"),
  164. )
  165. return {"message": "All files deleted successfully"}
  166. else:
  167. raise HTTPException(
  168. status_code=status.HTTP_400_BAD_REQUEST,
  169. detail=ERROR_MESSAGES.DEFAULT("Error deleting files"),
  170. )
  171. ############################
  172. # Get File By Id
  173. ############################
  174. @router.get("/{id}", response_model=Optional[FileModel])
  175. async def get_file_by_id(id: str, user=Depends(get_verified_user)):
  176. file = Files.get_file_by_id(id)
  177. if not file:
  178. raise HTTPException(
  179. status_code=status.HTTP_404_NOT_FOUND,
  180. detail=ERROR_MESSAGES.NOT_FOUND,
  181. )
  182. has_read_access: bool = await check_user_has_access_to_file_via_any_knowledge_base(id, "read", user)
  183. if file.user_id == user.id or user.role == "admin" or has_read_access:
  184. return file
  185. else:
  186. raise HTTPException(
  187. status_code=status.HTTP_404_NOT_FOUND,
  188. detail=ERROR_MESSAGES.NOT_FOUND,
  189. )
  190. ############################
  191. # Get File Data Content By Id
  192. ############################
  193. @router.get("/{id}/data/content")
  194. async def get_file_data_content_by_id(id: str, user=Depends(get_verified_user)):
  195. file = Files.get_file_by_id(id)
  196. if not file:
  197. raise HTTPException(
  198. status_code=status.HTTP_404_NOT_FOUND,
  199. detail=ERROR_MESSAGES.NOT_FOUND,
  200. )
  201. has_read_access: bool = await check_user_has_access_to_file_via_any_knowledge_base(id, "read", user)
  202. if file.user_id == user.id or user.role == "admin" or has_read_access:
  203. return {"content": file.data.get("content", "")}
  204. else:
  205. raise HTTPException(
  206. status_code=status.HTTP_404_NOT_FOUND,
  207. detail=ERROR_MESSAGES.NOT_FOUND,
  208. )
  209. ############################
  210. # Update File Data Content By Id
  211. ############################
  212. class ContentForm(BaseModel):
  213. content: str
  214. @router.post("/{id}/data/content/update")
  215. async def update_file_data_content_by_id(
  216. request: Request, id: str, form_data: ContentForm, user=Depends(get_verified_user)
  217. ):
  218. file = Files.get_file_by_id(id)
  219. if not file:
  220. raise HTTPException(
  221. status_code=status.HTTP_404_NOT_FOUND,
  222. detail=ERROR_MESSAGES.NOT_FOUND,
  223. )
  224. has_write_access: bool = await check_user_has_access_to_file_via_any_knowledge_base(id, "write", user)
  225. if file.user_id == user.id or user.role == "admin" or has_write_access:
  226. try:
  227. process_file(
  228. request,
  229. ProcessFileForm(file_id=id, content=form_data.content),
  230. user=user,
  231. )
  232. file = Files.get_file_by_id(id=id)
  233. except Exception as e:
  234. log.exception(e)
  235. log.error(f"Error processing file: {file.id}")
  236. return {"content": file.data.get("content", "")}
  237. else:
  238. raise HTTPException(
  239. status_code=status.HTTP_404_NOT_FOUND,
  240. detail=ERROR_MESSAGES.NOT_FOUND,
  241. )
  242. ############################
  243. # Get File Content By Id
  244. ############################
  245. @router.get("/{id}/content")
  246. async def get_file_content_by_id(
  247. id: str, user=Depends(get_verified_user), attachment: bool = Query(False)
  248. ):
  249. file = Files.get_file_by_id(id)
  250. if not file:
  251. raise HTTPException(
  252. status_code=status.HTTP_404_NOT_FOUND,
  253. detail=ERROR_MESSAGES.NOT_FOUND,
  254. )
  255. has_read_access: bool = await check_user_has_access_to_file_via_any_knowledge_base(id, "read", user)
  256. if file.user_id == user.id or user.role == "admin" or has_read_access:
  257. try:
  258. file_path = Storage.get_file(file.path)
  259. file_path = Path(file_path)
  260. # Check if the file already exists in the cache
  261. if file_path.is_file():
  262. # Handle Unicode filenames
  263. filename = file.meta.get("name", file.filename)
  264. encoded_filename = quote(filename) # RFC5987 encoding
  265. content_type = file.meta.get("content_type")
  266. filename = file.meta.get("name", file.filename)
  267. encoded_filename = quote(filename)
  268. headers = {}
  269. if attachment:
  270. headers["Content-Disposition"] = (
  271. f"attachment; filename*=UTF-8''{encoded_filename}"
  272. )
  273. else:
  274. if content_type == "application/pdf" or filename.lower().endswith(
  275. ".pdf"
  276. ):
  277. headers["Content-Disposition"] = (
  278. f"inline; filename*=UTF-8''{encoded_filename}"
  279. )
  280. content_type = "application/pdf"
  281. elif content_type != "text/plain":
  282. headers["Content-Disposition"] = (
  283. f"attachment; filename*=UTF-8''{encoded_filename}"
  284. )
  285. return FileResponse(file_path, headers=headers, media_type=content_type)
  286. else:
  287. raise HTTPException(
  288. status_code=status.HTTP_404_NOT_FOUND,
  289. detail=ERROR_MESSAGES.NOT_FOUND,
  290. )
  291. except Exception as e:
  292. log.exception(e)
  293. log.error("Error getting file content")
  294. raise HTTPException(
  295. status_code=status.HTTP_400_BAD_REQUEST,
  296. detail=ERROR_MESSAGES.DEFAULT("Error getting file content"),
  297. )
  298. else:
  299. raise HTTPException(
  300. status_code=status.HTTP_404_NOT_FOUND,
  301. detail=ERROR_MESSAGES.NOT_FOUND,
  302. )
  303. @router.get("/{id}/content/html")
  304. async def get_html_file_content_by_id(id: str, user=Depends(get_verified_user)):
  305. file = Files.get_file_by_id(id)
  306. if not file:
  307. raise HTTPException(
  308. status_code=status.HTTP_404_NOT_FOUND,
  309. detail=ERROR_MESSAGES.NOT_FOUND,
  310. )
  311. has_read_access: bool = await check_user_has_access_to_file_via_any_knowledge_base(id, "read", user)
  312. if file.user_id == user.id or user.role == "admin" or has_read_access:
  313. try:
  314. file_path = Storage.get_file(file.path)
  315. file_path = Path(file_path)
  316. # Check if the file already exists in the cache
  317. if file_path.is_file():
  318. log.info(f"file_path: {file_path}")
  319. return FileResponse(file_path)
  320. else:
  321. raise HTTPException(
  322. status_code=status.HTTP_404_NOT_FOUND,
  323. detail=ERROR_MESSAGES.NOT_FOUND,
  324. )
  325. except Exception as e:
  326. log.exception(e)
  327. log.error("Error getting file content")
  328. raise HTTPException(
  329. status_code=status.HTTP_400_BAD_REQUEST,
  330. detail=ERROR_MESSAGES.DEFAULT("Error getting file content"),
  331. )
  332. else:
  333. raise HTTPException(
  334. status_code=status.HTTP_404_NOT_FOUND,
  335. detail=ERROR_MESSAGES.NOT_FOUND,
  336. )
  337. @router.get("/{id}/content/{file_name}")
  338. async def get_file_content_by_id(id: str, user=Depends(get_verified_user)):
  339. file = Files.get_file_by_id(id)
  340. if not file:
  341. raise HTTPException(
  342. status_code=status.HTTP_404_NOT_FOUND,
  343. detail=ERROR_MESSAGES.NOT_FOUND,
  344. )
  345. has_read_access: bool = await check_user_has_access_to_file_via_any_knowledge_base(id, "read", user)
  346. if file.user_id == user.id or user.role == "admin" or has_read_access:
  347. file_path = file.path
  348. # Handle Unicode filenames
  349. filename = file.meta.get("name", file.filename)
  350. encoded_filename = quote(filename) # RFC5987 encoding
  351. headers = {
  352. "Content-Disposition": f"attachment; filename*=UTF-8''{encoded_filename}"
  353. }
  354. if file_path:
  355. file_path = Storage.get_file(file_path)
  356. file_path = Path(file_path)
  357. # Check if the file already exists in the cache
  358. if file_path.is_file():
  359. return FileResponse(file_path, headers=headers)
  360. else:
  361. raise HTTPException(
  362. status_code=status.HTTP_404_NOT_FOUND,
  363. detail=ERROR_MESSAGES.NOT_FOUND,
  364. )
  365. else:
  366. # File path doesn’t exist, return the content as .txt if possible
  367. file_content = file.content.get("content", "")
  368. file_name = file.filename
  369. # Create a generator that encodes the file content
  370. def generator():
  371. yield file_content.encode("utf-8")
  372. return StreamingResponse(
  373. generator(),
  374. media_type="text/plain",
  375. headers=headers,
  376. )
  377. else:
  378. raise HTTPException(
  379. status_code=status.HTTP_404_NOT_FOUND,
  380. detail=ERROR_MESSAGES.NOT_FOUND,
  381. )
  382. ############################
  383. # Delete File By Id
  384. ############################
  385. @router.delete("/{id}")
  386. async def delete_file_by_id(id: str, user=Depends(get_verified_user)):
  387. file = Files.get_file_by_id(id)
  388. if not file:
  389. raise HTTPException(
  390. status_code=status.HTTP_404_NOT_FOUND,
  391. detail=ERROR_MESSAGES.NOT_FOUND,
  392. )
  393. has_write_access: bool = await check_user_has_access_to_file_via_any_knowledge_base(id, "write", user)
  394. if file.user_id == user.id or user.role == "admin" or has_write_access:
  395. # We should add Chroma cleanup here
  396. result = Files.delete_file_by_id(id)
  397. if result:
  398. try:
  399. Storage.delete_file(file.path)
  400. except Exception as e:
  401. log.exception(e)
  402. log.error("Error deleting files")
  403. raise HTTPException(
  404. status_code=status.HTTP_400_BAD_REQUEST,
  405. detail=ERROR_MESSAGES.DEFAULT("Error deleting files"),
  406. )
  407. return {"message": "File deleted successfully"}
  408. else:
  409. raise HTTPException(
  410. status_code=status.HTTP_400_BAD_REQUEST,
  411. detail=ERROR_MESSAGES.DEFAULT("Error deleting file"),
  412. )
  413. else:
  414. raise HTTPException(
  415. status_code=status.HTTP_404_NOT_FOUND,
  416. detail=ERROR_MESSAGES.NOT_FOUND,
  417. )