files.py 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568
  1. import logging
  2. import os
  3. import uuid
  4. from fnmatch import fnmatch
  5. from functools import lru_cache
  6. from pathlib import Path
  7. from typing import Optional
  8. from urllib.parse import quote
  9. from fastapi import (
  10. APIRouter,
  11. Depends,
  12. File,
  13. HTTPException,
  14. Request,
  15. UploadFile,
  16. status,
  17. Query,
  18. )
  19. from fastapi.responses import FileResponse, StreamingResponse
  20. from open_webui.constants import ERROR_MESSAGES
  21. from open_webui.env import SRC_LOG_LEVELS
  22. from open_webui.models.files import (
  23. FileForm,
  24. FileModel,
  25. FileModelResponse,
  26. Files,
  27. )
  28. from open_webui.models.knowledge import Knowledges
  29. from open_webui.routers.knowledge import get_knowledge, get_knowledge_list
  30. from open_webui.routers.retrieval import ProcessFileForm, process_file
  31. from open_webui.routers.audio import transcribe
  32. from open_webui.storage.provider import Storage
  33. from open_webui.utils.auth import get_admin_user, get_verified_user
  34. from pydantic import BaseModel
  35. log = logging.getLogger(__name__)
  36. log.setLevel(SRC_LOG_LEVELS["MODELS"])
  37. router = APIRouter()
  38. ############################
  39. # Check if the current user has access to a file through any knowledge bases the user may be in.
  40. ############################
  41. def has_access_to_file(
  42. file_id: Optional[str], access_type: str, user=Depends(get_verified_user)
  43. ) -> bool:
  44. file = Files.get_file_by_id(file_id)
  45. log.debug(f"Checking if user has {access_type} access to file")
  46. if not file:
  47. raise HTTPException(
  48. status_code=status.HTTP_404_NOT_FOUND,
  49. detail=ERROR_MESSAGES.NOT_FOUND,
  50. )
  51. has_access = False
  52. knowledge_base_id = file.meta.get("collection_name") if file.meta else None
  53. if knowledge_base_id:
  54. knowledge_bases = Knowledges.get_knowledge_bases_by_user_id(
  55. user.id, access_type
  56. )
  57. for knowledge_base in knowledge_bases:
  58. if knowledge_base.id == knowledge_base_id:
  59. has_access = True
  60. break
  61. return has_access
  62. ############################
  63. # Get all files for user, with 1 cache
  64. ############################
  65. @lru_cache(maxsize=1)
  66. def get_all_files_for_user(user_id: str, admin: bool):
  67. if admin:
  68. return Files.get_files()
  69. else:
  70. return Files.get_files_by_user_id(user_id)
  71. ############################
  72. # Upload File
  73. ############################
  74. @router.post("/", response_model=FileModelResponse)
  75. def upload_file(
  76. request: Request,
  77. file: UploadFile = File(...),
  78. user=Depends(get_verified_user),
  79. file_metadata: dict = {},
  80. process: bool = Query(True),
  81. ):
  82. log.info(f"file.content_type: {file.content_type}")
  83. try:
  84. unsanitized_filename = file.filename
  85. filename = os.path.basename(unsanitized_filename)
  86. # replace filename with uuid
  87. id = str(uuid.uuid4())
  88. name = filename
  89. filename = f"{id}_{filename}"
  90. contents, file_path = Storage.upload_file(file.file, filename)
  91. file_item = Files.insert_new_file(
  92. user.id,
  93. FileForm(
  94. **{
  95. "id": id,
  96. "filename": name,
  97. "path": file_path,
  98. "meta": {
  99. "name": name,
  100. "content_type": file.content_type,
  101. "size": len(contents),
  102. "data": file_metadata,
  103. },
  104. }
  105. ),
  106. )
  107. if process:
  108. try:
  109. if file.content_type in [
  110. "audio/mpeg",
  111. "audio/wav",
  112. "audio/ogg",
  113. "audio/x-m4a",
  114. ]:
  115. file_path = Storage.get_file(file_path)
  116. result = transcribe(request, file_path)
  117. process_file(
  118. request,
  119. ProcessFileForm(file_id=id, content=result.get("text", "")),
  120. user=user,
  121. )
  122. elif file.content_type not in ["image/png", "image/jpeg", "image/gif"]:
  123. process_file(request, ProcessFileForm(file_id=id), user=user)
  124. file_item = Files.get_file_by_id(id=id)
  125. except Exception as e:
  126. log.exception(e)
  127. log.error(f"Error processing file: {file_item.id}")
  128. file_item = FileModelResponse(
  129. **{
  130. **file_item.model_dump(),
  131. "error": str(e.detail) if hasattr(e, "detail") else str(e),
  132. }
  133. )
  134. if file_item:
  135. return file_item
  136. else:
  137. raise HTTPException(
  138. status_code=status.HTTP_400_BAD_REQUEST,
  139. detail=ERROR_MESSAGES.DEFAULT("Error uploading file"),
  140. )
  141. except Exception as e:
  142. log.exception(e)
  143. raise HTTPException(
  144. status_code=status.HTTP_400_BAD_REQUEST,
  145. detail=ERROR_MESSAGES.DEFAULT(e),
  146. )
  147. ############################
  148. # List Files
  149. ############################
  150. @router.get("/", response_model=list[FileModelResponse])
  151. async def list_files(user=Depends(get_verified_user), content: bool = Query(True)):
  152. if user.role == "admin":
  153. files = Files.get_files()
  154. else:
  155. files = Files.get_files_by_user_id(user.id)
  156. if not content:
  157. for file in files:
  158. del file.data["content"]
  159. return files
  160. ############################
  161. # Search Files
  162. ############################
  163. @router.get("/search", response_model=list[FileModelResponse])
  164. async def search_files(
  165. filename: str = Query(
  166. ...,
  167. description="Filename pattern to search for. Supports wildcards such as '*.txt'"
  168. ),
  169. user=Depends(get_verified_user)
  170. ):
  171. # Retrieve files from cache
  172. files = get_all_files_for_user(user.id, user.role == "admin")
  173. # Get matching files
  174. matching_files = [
  175. file for file in files if fnmatch(file.filename.lower(), filename.lower())
  176. ]
  177. if not matching_files:
  178. raise HTTPException(
  179. status_code=status.HTTP_404_NOT_FOUND,
  180. detail="No files found matching the pattern.",
  181. )
  182. return matching_files
  183. ############################
  184. # Delete All Files
  185. ############################
  186. @router.delete("/all")
  187. async def delete_all_files(user=Depends(get_admin_user)):
  188. result = Files.delete_all_files()
  189. if result:
  190. try:
  191. Storage.delete_all_files()
  192. except Exception as e:
  193. log.exception(e)
  194. log.error("Error deleting files")
  195. raise HTTPException(
  196. status_code=status.HTTP_400_BAD_REQUEST,
  197. detail=ERROR_MESSAGES.DEFAULT("Error deleting files"),
  198. )
  199. return {"message": "All files deleted successfully"}
  200. else:
  201. raise HTTPException(
  202. status_code=status.HTTP_400_BAD_REQUEST,
  203. detail=ERROR_MESSAGES.DEFAULT("Error deleting files"),
  204. )
  205. ############################
  206. # Get File By Id
  207. ############################
  208. @router.get("/{id}", response_model=Optional[FileModel])
  209. async def get_file_by_id(id: str, user=Depends(get_verified_user)):
  210. file = Files.get_file_by_id(id)
  211. if not file:
  212. raise HTTPException(
  213. status_code=status.HTTP_404_NOT_FOUND,
  214. detail=ERROR_MESSAGES.NOT_FOUND,
  215. )
  216. if (
  217. file.user_id == user.id
  218. or user.role == "admin"
  219. or has_access_to_file(id, "read", user)
  220. ):
  221. return file
  222. else:
  223. raise HTTPException(
  224. status_code=status.HTTP_404_NOT_FOUND,
  225. detail=ERROR_MESSAGES.NOT_FOUND,
  226. )
  227. ############################
  228. # Get File Data Content By Id
  229. ############################
  230. @router.get("/{id}/data/content")
  231. async def get_file_data_content_by_id(id: str, user=Depends(get_verified_user)):
  232. file = Files.get_file_by_id(id)
  233. if not file:
  234. raise HTTPException(
  235. status_code=status.HTTP_404_NOT_FOUND,
  236. detail=ERROR_MESSAGES.NOT_FOUND,
  237. )
  238. if (
  239. file.user_id == user.id
  240. or user.role == "admin"
  241. or has_access_to_file(id, "read", user)
  242. ):
  243. return {"content": file.data.get("content", "")}
  244. else:
  245. raise HTTPException(
  246. status_code=status.HTTP_404_NOT_FOUND,
  247. detail=ERROR_MESSAGES.NOT_FOUND,
  248. )
  249. ############################
  250. # Update File Data Content By Id
  251. ############################
  252. class ContentForm(BaseModel):
  253. content: str
  254. @router.post("/{id}/data/content/update")
  255. async def update_file_data_content_by_id(
  256. request: Request, id: str, form_data: ContentForm, user=Depends(get_verified_user)
  257. ):
  258. file = Files.get_file_by_id(id)
  259. if not file:
  260. raise HTTPException(
  261. status_code=status.HTTP_404_NOT_FOUND,
  262. detail=ERROR_MESSAGES.NOT_FOUND,
  263. )
  264. if (
  265. file.user_id == user.id
  266. or user.role == "admin"
  267. or has_access_to_file(id, "write", user)
  268. ):
  269. try:
  270. process_file(
  271. request,
  272. ProcessFileForm(file_id=id, content=form_data.content),
  273. user=user,
  274. )
  275. file = Files.get_file_by_id(id=id)
  276. except Exception as e:
  277. log.exception(e)
  278. log.error(f"Error processing file: {file.id}")
  279. return {"content": file.data.get("content", "")}
  280. else:
  281. raise HTTPException(
  282. status_code=status.HTTP_404_NOT_FOUND,
  283. detail=ERROR_MESSAGES.NOT_FOUND,
  284. )
  285. ############################
  286. # Get File Content By Id
  287. ############################
  288. @router.get("/{id}/content")
  289. async def get_file_content_by_id(
  290. id: str, user=Depends(get_verified_user), attachment: bool = Query(False)
  291. ):
  292. file = Files.get_file_by_id(id)
  293. if not file:
  294. raise HTTPException(
  295. status_code=status.HTTP_404_NOT_FOUND,
  296. detail=ERROR_MESSAGES.NOT_FOUND,
  297. )
  298. if (
  299. file.user_id == user.id
  300. or user.role == "admin"
  301. or has_access_to_file(id, "read", user)
  302. ):
  303. try:
  304. file_path = Storage.get_file(file.path)
  305. file_path = Path(file_path)
  306. # Check if the file already exists in the cache
  307. if file_path.is_file():
  308. # Handle Unicode filenames
  309. filename = file.meta.get("name", file.filename)
  310. encoded_filename = quote(filename) # RFC5987 encoding
  311. content_type = file.meta.get("content_type")
  312. filename = file.meta.get("name", file.filename)
  313. encoded_filename = quote(filename)
  314. headers = {}
  315. if attachment:
  316. headers["Content-Disposition"] = (
  317. f"attachment; filename*=UTF-8''{encoded_filename}"
  318. )
  319. else:
  320. if content_type == "application/pdf" or filename.lower().endswith(
  321. ".pdf"
  322. ):
  323. headers["Content-Disposition"] = (
  324. f"inline; filename*=UTF-8''{encoded_filename}"
  325. )
  326. content_type = "application/pdf"
  327. elif content_type != "text/plain":
  328. headers["Content-Disposition"] = (
  329. f"attachment; filename*=UTF-8''{encoded_filename}"
  330. )
  331. return FileResponse(file_path, headers=headers, media_type=content_type)
  332. else:
  333. raise HTTPException(
  334. status_code=status.HTTP_404_NOT_FOUND,
  335. detail=ERROR_MESSAGES.NOT_FOUND,
  336. )
  337. except Exception as e:
  338. log.exception(e)
  339. log.error("Error getting file content")
  340. raise HTTPException(
  341. status_code=status.HTTP_400_BAD_REQUEST,
  342. detail=ERROR_MESSAGES.DEFAULT("Error getting file content"),
  343. )
  344. else:
  345. raise HTTPException(
  346. status_code=status.HTTP_404_NOT_FOUND,
  347. detail=ERROR_MESSAGES.NOT_FOUND,
  348. )
  349. @router.get("/{id}/content/html")
  350. async def get_html_file_content_by_id(id: str, user=Depends(get_verified_user)):
  351. file = Files.get_file_by_id(id)
  352. if not file:
  353. raise HTTPException(
  354. status_code=status.HTTP_404_NOT_FOUND,
  355. detail=ERROR_MESSAGES.NOT_FOUND,
  356. )
  357. if (
  358. file.user_id == user.id
  359. or user.role == "admin"
  360. or has_access_to_file(id, "read", user)
  361. ):
  362. try:
  363. file_path = Storage.get_file(file.path)
  364. file_path = Path(file_path)
  365. # Check if the file already exists in the cache
  366. if file_path.is_file():
  367. log.info(f"file_path: {file_path}")
  368. return FileResponse(file_path)
  369. else:
  370. raise HTTPException(
  371. status_code=status.HTTP_404_NOT_FOUND,
  372. detail=ERROR_MESSAGES.NOT_FOUND,
  373. )
  374. except Exception as e:
  375. log.exception(e)
  376. log.error("Error getting file content")
  377. raise HTTPException(
  378. status_code=status.HTTP_400_BAD_REQUEST,
  379. detail=ERROR_MESSAGES.DEFAULT("Error getting file content"),
  380. )
  381. else:
  382. raise HTTPException(
  383. status_code=status.HTTP_404_NOT_FOUND,
  384. detail=ERROR_MESSAGES.NOT_FOUND,
  385. )
  386. @router.get("/{id}/content/{file_name}")
  387. async def get_file_content_by_id(id: str, user=Depends(get_verified_user)):
  388. file = Files.get_file_by_id(id)
  389. if not file:
  390. raise HTTPException(
  391. status_code=status.HTTP_404_NOT_FOUND,
  392. detail=ERROR_MESSAGES.NOT_FOUND,
  393. )
  394. if (
  395. file.user_id == user.id
  396. or user.role == "admin"
  397. or has_access_to_file(id, "read", user)
  398. ):
  399. file_path = file.path
  400. # Handle Unicode filenames
  401. filename = file.meta.get("name", file.filename)
  402. encoded_filename = quote(filename) # RFC5987 encoding
  403. headers = {
  404. "Content-Disposition": f"attachment; filename*=UTF-8''{encoded_filename}"
  405. }
  406. if file_path:
  407. file_path = Storage.get_file(file_path)
  408. file_path = Path(file_path)
  409. # Check if the file already exists in the cache
  410. if file_path.is_file():
  411. return FileResponse(file_path, headers=headers)
  412. else:
  413. raise HTTPException(
  414. status_code=status.HTTP_404_NOT_FOUND,
  415. detail=ERROR_MESSAGES.NOT_FOUND,
  416. )
  417. else:
  418. # File path doesn’t exist, return the content as .txt if possible
  419. file_content = file.content.get("content", "")
  420. file_name = file.filename
  421. # Create a generator that encodes the file content
  422. def generator():
  423. yield file_content.encode("utf-8")
  424. return StreamingResponse(
  425. generator(),
  426. media_type="text/plain",
  427. headers=headers,
  428. )
  429. else:
  430. raise HTTPException(
  431. status_code=status.HTTP_404_NOT_FOUND,
  432. detail=ERROR_MESSAGES.NOT_FOUND,
  433. )
  434. ############################
  435. # Delete File By Id
  436. ############################
  437. @router.delete("/{id}")
  438. async def delete_file_by_id(id: str, user=Depends(get_verified_user)):
  439. file = Files.get_file_by_id(id)
  440. if not file:
  441. raise HTTPException(
  442. status_code=status.HTTP_404_NOT_FOUND,
  443. detail=ERROR_MESSAGES.NOT_FOUND,
  444. )
  445. if (
  446. file.user_id == user.id
  447. or user.role == "admin"
  448. or has_access_to_file(id, "write", user)
  449. ):
  450. # We should add Chroma cleanup here
  451. result = Files.delete_file_by_id(id)
  452. if result:
  453. try:
  454. Storage.delete_file(file.path)
  455. except Exception as e:
  456. log.exception(e)
  457. log.error("Error deleting files")
  458. raise HTTPException(
  459. status_code=status.HTTP_400_BAD_REQUEST,
  460. detail=ERROR_MESSAGES.DEFAULT("Error deleting files"),
  461. )
  462. return {"message": "File deleted successfully"}
  463. else:
  464. raise HTTPException(
  465. status_code=status.HTTP_400_BAD_REQUEST,
  466. detail=ERROR_MESSAGES.DEFAULT("Error deleting file"),
  467. )
  468. else:
  469. raise HTTPException(
  470. status_code=status.HTTP_404_NOT_FOUND,
  471. detail=ERROR_MESSAGES.NOT_FOUND,
  472. )