files.py 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612
  1. import logging
  2. import os
  3. import uuid
  4. from fnmatch import fnmatch
  5. from pathlib import Path
  6. from typing import Optional
  7. from urllib.parse import quote
  8. from fastapi import (
  9. APIRouter,
  10. Depends,
  11. File,
  12. HTTPException,
  13. Request,
  14. UploadFile,
  15. status,
  16. Query,
  17. )
  18. from fastapi.responses import FileResponse, StreamingResponse
  19. from open_webui.constants import ERROR_MESSAGES
  20. from open_webui.env import SRC_LOG_LEVELS
  21. from open_webui.models.users import Users
  22. from open_webui.models.files import (
  23. FileForm,
  24. FileModel,
  25. FileModelResponse,
  26. Files,
  27. )
  28. from open_webui.models.knowledge import Knowledges
  29. from open_webui.routers.knowledge import get_knowledge, get_knowledge_list
  30. from open_webui.routers.retrieval import ProcessFileForm, process_file
  31. from open_webui.routers.audio import transcribe
  32. from open_webui.storage.provider import Storage
  33. from open_webui.utils.auth import get_admin_user, get_verified_user
  34. from pydantic import BaseModel
  35. log = logging.getLogger(__name__)
  36. log.setLevel(SRC_LOG_LEVELS["MODELS"])
  37. router = APIRouter()
  38. ############################
  39. # Check if the current user has access to a file through any knowledge bases the user may be in.
  40. ############################
  41. def has_access_to_file(
  42. file_id: Optional[str], access_type: str, user=Depends(get_verified_user)
  43. ) -> bool:
  44. file = Files.get_file_by_id(file_id)
  45. log.debug(f"Checking if user has {access_type} access to file")
  46. if not file:
  47. raise HTTPException(
  48. status_code=status.HTTP_404_NOT_FOUND,
  49. detail=ERROR_MESSAGES.NOT_FOUND,
  50. )
  51. has_access = False
  52. knowledge_base_id = file.meta.get("collection_name") if file.meta else None
  53. if knowledge_base_id:
  54. knowledge_bases = Knowledges.get_knowledge_bases_by_user_id(
  55. user.id, access_type
  56. )
  57. for knowledge_base in knowledge_bases:
  58. if knowledge_base.id == knowledge_base_id:
  59. has_access = True
  60. break
  61. return has_access
  62. ############################
  63. # Upload File
  64. ############################
  65. @router.post("/", response_model=FileModelResponse)
  66. def upload_file(
  67. request: Request,
  68. file: UploadFile = File(...),
  69. user=Depends(get_verified_user),
  70. file_metadata: dict = None,
  71. process: bool = Query(True),
  72. ):
  73. log.info(f"file.content_type: {file.content_type}")
  74. file_metadata = file_metadata if file_metadata else {}
  75. try:
  76. unsanitized_filename = file.filename
  77. filename = os.path.basename(unsanitized_filename)
  78. file_extension = os.path.splitext(filename)[1]
  79. if request.app.state.config.ALLOWED_FILE_EXTENSIONS:
  80. if file_extension not in request.app.state.config.ALLOWED_FILE_EXTENSIONS:
  81. raise HTTPException(
  82. status_code=status.HTTP_400_BAD_REQUEST,
  83. detail=ERROR_MESSAGES.DEFAULT(
  84. f"File type {file_extension} is not allowed"
  85. ),
  86. )
  87. # replace filename with uuid
  88. id = str(uuid.uuid4())
  89. name = filename
  90. filename = f"{id}_{filename}"
  91. tags = {
  92. "OpenWebUI-User-Email": user.email,
  93. "OpenWebUI-User-Id": user.id,
  94. "OpenWebUI-User-Name": user.name,
  95. "OpenWebUI-File-Id": id,
  96. }
  97. contents, file_path = Storage.upload_file(file.file, filename, tags)
  98. file_item = Files.insert_new_file(
  99. user.id,
  100. FileForm(
  101. **{
  102. "id": id,
  103. "filename": name,
  104. "path": file_path,
  105. "meta": {
  106. "name": name,
  107. "content_type": file.content_type,
  108. "size": len(contents),
  109. "data": file_metadata,
  110. },
  111. }
  112. ),
  113. )
  114. if process:
  115. try:
  116. if file.content_type:
  117. if file.content_type.startswith(
  118. (
  119. "audio/mpeg",
  120. "audio/wav",
  121. "audio/ogg",
  122. "audio/x-m4a",
  123. "audio/webm",
  124. "video/webm",
  125. )
  126. ):
  127. file_path = Storage.get_file(file_path)
  128. result = transcribe(request, file_path)
  129. process_file(
  130. request,
  131. ProcessFileForm(file_id=id, content=result.get("text", "")),
  132. user=user,
  133. )
  134. elif file.content_type not in [
  135. "image/png",
  136. "image/jpeg",
  137. "image/gif",
  138. "video/mp4",
  139. "video/ogg",
  140. "video/quicktime",
  141. ]:
  142. process_file(request, ProcessFileForm(file_id=id), user=user)
  143. else:
  144. log.info(
  145. f"File type {file.content_type} is not provided, but trying to process anyway"
  146. )
  147. process_file(request, ProcessFileForm(file_id=id), user=user)
  148. file_item = Files.get_file_by_id(id=id)
  149. except Exception as e:
  150. log.exception(e)
  151. log.error(f"Error processing file: {file_item.id}")
  152. file_item = FileModelResponse(
  153. **{
  154. **file_item.model_dump(),
  155. "error": str(e.detail) if hasattr(e, "detail") else str(e),
  156. }
  157. )
  158. if file_item:
  159. return file_item
  160. else:
  161. raise HTTPException(
  162. status_code=status.HTTP_400_BAD_REQUEST,
  163. detail=ERROR_MESSAGES.DEFAULT("Error uploading file"),
  164. )
  165. except Exception as e:
  166. log.exception(e)
  167. raise HTTPException(
  168. status_code=status.HTTP_400_BAD_REQUEST,
  169. detail=ERROR_MESSAGES.DEFAULT(e),
  170. )
  171. ############################
  172. # List Files
  173. ############################
  174. @router.get("/", response_model=list[FileModelResponse])
  175. async def list_files(user=Depends(get_verified_user), content: bool = Query(True)):
  176. if user.role == "admin":
  177. files = Files.get_files()
  178. else:
  179. files = Files.get_files_by_user_id(user.id)
  180. if not content:
  181. for file in files:
  182. if "content" in file.data:
  183. del file.data["content"]
  184. return files
  185. ############################
  186. # Search Files
  187. ############################
  188. @router.get("/search", response_model=list[FileModelResponse])
  189. async def search_files(
  190. filename: str = Query(
  191. ...,
  192. description="Filename pattern to search for. Supports wildcards such as '*.txt'",
  193. ),
  194. content: bool = Query(True),
  195. user=Depends(get_verified_user),
  196. ):
  197. """
  198. Search for files by filename with support for wildcard patterns.
  199. """
  200. # Get files according to user role
  201. if user.role == "admin":
  202. files = Files.get_files()
  203. else:
  204. files = Files.get_files_by_user_id(user.id)
  205. # Get matching files
  206. matching_files = [
  207. file for file in files if fnmatch(file.filename.lower(), filename.lower())
  208. ]
  209. if not matching_files:
  210. raise HTTPException(
  211. status_code=status.HTTP_404_NOT_FOUND,
  212. detail="No files found matching the pattern.",
  213. )
  214. if not content:
  215. for file in matching_files:
  216. if "content" in file.data:
  217. del file.data["content"]
  218. return matching_files
  219. ############################
  220. # Delete All Files
  221. ############################
  222. @router.delete("/all")
  223. async def delete_all_files(user=Depends(get_admin_user)):
  224. result = Files.delete_all_files()
  225. if result:
  226. try:
  227. Storage.delete_all_files()
  228. except Exception as e:
  229. log.exception(e)
  230. log.error("Error deleting files")
  231. raise HTTPException(
  232. status_code=status.HTTP_400_BAD_REQUEST,
  233. detail=ERROR_MESSAGES.DEFAULT("Error deleting files"),
  234. )
  235. return {"message": "All files deleted successfully"}
  236. else:
  237. raise HTTPException(
  238. status_code=status.HTTP_400_BAD_REQUEST,
  239. detail=ERROR_MESSAGES.DEFAULT("Error deleting files"),
  240. )
  241. ############################
  242. # Get File By Id
  243. ############################
  244. @router.get("/{id}", response_model=Optional[FileModel])
  245. async def get_file_by_id(id: str, user=Depends(get_verified_user)):
  246. file = Files.get_file_by_id(id)
  247. if not file:
  248. raise HTTPException(
  249. status_code=status.HTTP_404_NOT_FOUND,
  250. detail=ERROR_MESSAGES.NOT_FOUND,
  251. )
  252. if (
  253. file.user_id == user.id
  254. or user.role == "admin"
  255. or has_access_to_file(id, "read", user)
  256. ):
  257. return file
  258. else:
  259. raise HTTPException(
  260. status_code=status.HTTP_404_NOT_FOUND,
  261. detail=ERROR_MESSAGES.NOT_FOUND,
  262. )
  263. ############################
  264. # Get File Data Content By Id
  265. ############################
  266. @router.get("/{id}/data/content")
  267. async def get_file_data_content_by_id(id: str, user=Depends(get_verified_user)):
  268. file = Files.get_file_by_id(id)
  269. if not file:
  270. raise HTTPException(
  271. status_code=status.HTTP_404_NOT_FOUND,
  272. detail=ERROR_MESSAGES.NOT_FOUND,
  273. )
  274. if (
  275. file.user_id == user.id
  276. or user.role == "admin"
  277. or has_access_to_file(id, "read", user)
  278. ):
  279. return {"content": file.data.get("content", "")}
  280. else:
  281. raise HTTPException(
  282. status_code=status.HTTP_404_NOT_FOUND,
  283. detail=ERROR_MESSAGES.NOT_FOUND,
  284. )
  285. ############################
  286. # Update File Data Content By Id
  287. ############################
  288. class ContentForm(BaseModel):
  289. content: str
  290. @router.post("/{id}/data/content/update")
  291. async def update_file_data_content_by_id(
  292. request: Request, id: str, form_data: ContentForm, user=Depends(get_verified_user)
  293. ):
  294. file = Files.get_file_by_id(id)
  295. if not file:
  296. raise HTTPException(
  297. status_code=status.HTTP_404_NOT_FOUND,
  298. detail=ERROR_MESSAGES.NOT_FOUND,
  299. )
  300. if (
  301. file.user_id == user.id
  302. or user.role == "admin"
  303. or has_access_to_file(id, "write", user)
  304. ):
  305. try:
  306. process_file(
  307. request,
  308. ProcessFileForm(file_id=id, content=form_data.content),
  309. user=user,
  310. )
  311. file = Files.get_file_by_id(id=id)
  312. except Exception as e:
  313. log.exception(e)
  314. log.error(f"Error processing file: {file.id}")
  315. return {"content": file.data.get("content", "")}
  316. else:
  317. raise HTTPException(
  318. status_code=status.HTTP_404_NOT_FOUND,
  319. detail=ERROR_MESSAGES.NOT_FOUND,
  320. )
  321. ############################
  322. # Get File Content By Id
  323. ############################
  324. @router.get("/{id}/content")
  325. async def get_file_content_by_id(
  326. id: str, user=Depends(get_verified_user), attachment: bool = Query(False)
  327. ):
  328. file = Files.get_file_by_id(id)
  329. if not file:
  330. raise HTTPException(
  331. status_code=status.HTTP_404_NOT_FOUND,
  332. detail=ERROR_MESSAGES.NOT_FOUND,
  333. )
  334. if (
  335. file.user_id == user.id
  336. or user.role == "admin"
  337. or has_access_to_file(id, "read", user)
  338. ):
  339. try:
  340. file_path = Storage.get_file(file.path)
  341. file_path = Path(file_path)
  342. # Check if the file already exists in the cache
  343. if file_path.is_file():
  344. # Handle Unicode filenames
  345. filename = file.meta.get("name", file.filename)
  346. encoded_filename = quote(filename) # RFC5987 encoding
  347. content_type = file.meta.get("content_type")
  348. filename = file.meta.get("name", file.filename)
  349. encoded_filename = quote(filename)
  350. headers = {}
  351. if attachment:
  352. headers["Content-Disposition"] = (
  353. f"attachment; filename*=UTF-8''{encoded_filename}"
  354. )
  355. else:
  356. if content_type == "application/pdf" or filename.lower().endswith(
  357. ".pdf"
  358. ):
  359. headers["Content-Disposition"] = (
  360. f"inline; filename*=UTF-8''{encoded_filename}"
  361. )
  362. content_type = "application/pdf"
  363. elif content_type != "text/plain":
  364. headers["Content-Disposition"] = (
  365. f"attachment; filename*=UTF-8''{encoded_filename}"
  366. )
  367. return FileResponse(file_path, headers=headers, media_type=content_type)
  368. else:
  369. raise HTTPException(
  370. status_code=status.HTTP_404_NOT_FOUND,
  371. detail=ERROR_MESSAGES.NOT_FOUND,
  372. )
  373. except Exception as e:
  374. log.exception(e)
  375. log.error("Error getting file content")
  376. raise HTTPException(
  377. status_code=status.HTTP_400_BAD_REQUEST,
  378. detail=ERROR_MESSAGES.DEFAULT("Error getting file content"),
  379. )
  380. else:
  381. raise HTTPException(
  382. status_code=status.HTTP_404_NOT_FOUND,
  383. detail=ERROR_MESSAGES.NOT_FOUND,
  384. )
  385. @router.get("/{id}/content/html")
  386. async def get_html_file_content_by_id(id: str, user=Depends(get_verified_user)):
  387. file = Files.get_file_by_id(id)
  388. if not file:
  389. raise HTTPException(
  390. status_code=status.HTTP_404_NOT_FOUND,
  391. detail=ERROR_MESSAGES.NOT_FOUND,
  392. )
  393. file_user = Users.get_user_by_id(file.user_id)
  394. if not file_user.role == "admin":
  395. raise HTTPException(
  396. status_code=status.HTTP_404_NOT_FOUND,
  397. detail=ERROR_MESSAGES.NOT_FOUND,
  398. )
  399. if (
  400. file.user_id == user.id
  401. or user.role == "admin"
  402. or has_access_to_file(id, "read", user)
  403. ):
  404. try:
  405. file_path = Storage.get_file(file.path)
  406. file_path = Path(file_path)
  407. # Check if the file already exists in the cache
  408. if file_path.is_file():
  409. log.info(f"file_path: {file_path}")
  410. return FileResponse(file_path)
  411. else:
  412. raise HTTPException(
  413. status_code=status.HTTP_404_NOT_FOUND,
  414. detail=ERROR_MESSAGES.NOT_FOUND,
  415. )
  416. except Exception as e:
  417. log.exception(e)
  418. log.error("Error getting file content")
  419. raise HTTPException(
  420. status_code=status.HTTP_400_BAD_REQUEST,
  421. detail=ERROR_MESSAGES.DEFAULT("Error getting file content"),
  422. )
  423. else:
  424. raise HTTPException(
  425. status_code=status.HTTP_404_NOT_FOUND,
  426. detail=ERROR_MESSAGES.NOT_FOUND,
  427. )
  428. @router.get("/{id}/content/{file_name}")
  429. async def get_file_content_by_id(id: str, user=Depends(get_verified_user)):
  430. file = Files.get_file_by_id(id)
  431. if not file:
  432. raise HTTPException(
  433. status_code=status.HTTP_404_NOT_FOUND,
  434. detail=ERROR_MESSAGES.NOT_FOUND,
  435. )
  436. if (
  437. file.user_id == user.id
  438. or user.role == "admin"
  439. or has_access_to_file(id, "read", user)
  440. ):
  441. file_path = file.path
  442. # Handle Unicode filenames
  443. filename = file.meta.get("name", file.filename)
  444. encoded_filename = quote(filename) # RFC5987 encoding
  445. headers = {
  446. "Content-Disposition": f"attachment; filename*=UTF-8''{encoded_filename}"
  447. }
  448. if file_path:
  449. file_path = Storage.get_file(file_path)
  450. file_path = Path(file_path)
  451. # Check if the file already exists in the cache
  452. if file_path.is_file():
  453. return FileResponse(file_path, headers=headers)
  454. else:
  455. raise HTTPException(
  456. status_code=status.HTTP_404_NOT_FOUND,
  457. detail=ERROR_MESSAGES.NOT_FOUND,
  458. )
  459. else:
  460. # File path doesn’t exist, return the content as .txt if possible
  461. file_content = file.content.get("content", "")
  462. file_name = file.filename
  463. # Create a generator that encodes the file content
  464. def generator():
  465. yield file_content.encode("utf-8")
  466. return StreamingResponse(
  467. generator(),
  468. media_type="text/plain",
  469. headers=headers,
  470. )
  471. else:
  472. raise HTTPException(
  473. status_code=status.HTTP_404_NOT_FOUND,
  474. detail=ERROR_MESSAGES.NOT_FOUND,
  475. )
  476. ############################
  477. # Delete File By Id
  478. ############################
  479. @router.delete("/{id}")
  480. async def delete_file_by_id(id: str, user=Depends(get_verified_user)):
  481. file = Files.get_file_by_id(id)
  482. if not file:
  483. raise HTTPException(
  484. status_code=status.HTTP_404_NOT_FOUND,
  485. detail=ERROR_MESSAGES.NOT_FOUND,
  486. )
  487. if (
  488. file.user_id == user.id
  489. or user.role == "admin"
  490. or has_access_to_file(id, "write", user)
  491. ):
  492. # We should add Chroma cleanup here
  493. result = Files.delete_file_by_id(id)
  494. if result:
  495. try:
  496. Storage.delete_file(file.path)
  497. except Exception as e:
  498. log.exception(e)
  499. log.error("Error deleting files")
  500. raise HTTPException(
  501. status_code=status.HTTP_400_BAD_REQUEST,
  502. detail=ERROR_MESSAGES.DEFAULT("Error deleting files"),
  503. )
  504. return {"message": "File deleted successfully"}
  505. else:
  506. raise HTTPException(
  507. status_code=status.HTTP_400_BAD_REQUEST,
  508. detail=ERROR_MESSAGES.DEFAULT("Error deleting file"),
  509. )
  510. else:
  511. raise HTTPException(
  512. status_code=status.HTTP_404_NOT_FOUND,
  513. detail=ERROR_MESSAGES.NOT_FOUND,
  514. )