files.py 22 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742
  1. import logging
  2. import os
  3. import uuid
  4. import json
  5. from fnmatch import fnmatch
  6. from pathlib import Path
  7. from typing import Optional
  8. from urllib.parse import quote
  9. import asyncio
  10. from fastapi import (
  11. BackgroundTasks,
  12. APIRouter,
  13. Depends,
  14. File,
  15. Form,
  16. HTTPException,
  17. Request,
  18. UploadFile,
  19. status,
  20. Query,
  21. )
  22. from fastapi.responses import FileResponse, StreamingResponse
  23. from open_webui.constants import ERROR_MESSAGES
  24. from open_webui.env import SRC_LOG_LEVELS
  25. from open_webui.retrieval.vector.factory import VECTOR_DB_CLIENT
  26. from open_webui.models.users import Users
  27. from open_webui.models.files import (
  28. FileForm,
  29. FileModel,
  30. FileModelResponse,
  31. Files,
  32. )
  33. from open_webui.models.knowledge import Knowledges
  34. from open_webui.routers.knowledge import get_knowledge, get_knowledge_list
  35. from open_webui.routers.retrieval import ProcessFileForm, process_file
  36. from open_webui.routers.audio import transcribe
  37. from open_webui.storage.provider import Storage
  38. from open_webui.utils.auth import get_admin_user, get_verified_user
  39. from pydantic import BaseModel
# Module-level logger; its level comes from the "MODELS" entry of SRC_LOG_LEVELS.
log = logging.getLogger(__name__)
log.setLevel(SRC_LOG_LEVELS["MODELS"])

# Router on which every endpoint defined in this module is registered.
router = APIRouter()
  43. ############################
  44. # Check if the current user has access to a file through any knowledge bases the user may be in.
  45. ############################
  46. def has_access_to_file(
  47. file_id: Optional[str], access_type: str, user=Depends(get_verified_user)
  48. ) -> bool:
  49. file = Files.get_file_by_id(file_id)
  50. log.debug(f"Checking if user has {access_type} access to file")
  51. if not file:
  52. raise HTTPException(
  53. status_code=status.HTTP_404_NOT_FOUND,
  54. detail=ERROR_MESSAGES.NOT_FOUND,
  55. )
  56. has_access = False
  57. knowledge_base_id = file.meta.get("collection_name") if file.meta else None
  58. if knowledge_base_id:
  59. knowledge_bases = Knowledges.get_knowledge_bases_by_user_id(
  60. user.id, access_type
  61. )
  62. for knowledge_base in knowledge_bases:
  63. if knowledge_base.id == knowledge_base_id:
  64. has_access = True
  65. break
  66. return has_access
  67. ############################
  68. # Upload File
  69. ############################
  70. def process_uploaded_file(request, file, file_path, file_item, file_metadata, user):
  71. try:
  72. if file.content_type:
  73. stt_supported_content_types = getattr(
  74. request.app.state.config, "STT_SUPPORTED_CONTENT_TYPES", []
  75. )
  76. if any(
  77. fnmatch(file.content_type, content_type)
  78. for content_type in (
  79. stt_supported_content_types
  80. if stt_supported_content_types
  81. and any(t.strip() for t in stt_supported_content_types)
  82. else ["audio/*", "video/webm"]
  83. )
  84. ):
  85. file_path = Storage.get_file(file_path)
  86. result = transcribe(request, file_path, file_metadata)
  87. process_file(
  88. request,
  89. ProcessFileForm(
  90. file_id=file_item.id, content=result.get("text", "")
  91. ),
  92. user=user,
  93. )
  94. elif (not file.content_type.startswith(("image/", "video/"))) or (
  95. request.app.state.config.CONTENT_EXTRACTION_ENGINE == "external"
  96. ):
  97. process_file(request, ProcessFileForm(file_id=file_item.id), user=user)
  98. else:
  99. log.info(
  100. f"File type {file.content_type} is not provided, but trying to process anyway"
  101. )
  102. process_file(request, ProcessFileForm(file_id=file_item.id), user=user)
  103. except Exception as e:
  104. log.error(f"Error processing file: {file_item.id}")
  105. Files.update_file_data_by_id(
  106. file_item.id,
  107. {
  108. "status": "failed",
  109. "error": str(e.detail) if hasattr(e, "detail") else str(e),
  110. },
  111. )
  112. @router.post("/", response_model=FileModelResponse)
  113. def upload_file(
  114. request: Request,
  115. background_tasks: BackgroundTasks,
  116. file: UploadFile = File(...),
  117. metadata: Optional[dict | str] = Form(None),
  118. process: bool = Query(True),
  119. process_in_background: bool = Query(True),
  120. user=Depends(get_verified_user),
  121. ):
  122. return upload_file_handler(
  123. request,
  124. file=file,
  125. metadata=metadata,
  126. process=process,
  127. process_in_background=process_in_background,
  128. user=user,
  129. background_tasks=background_tasks,
  130. )
  131. def upload_file_handler(
  132. request: Request,
  133. file: UploadFile = File(...),
  134. metadata: Optional[dict | str] = Form(None),
  135. process: bool = Query(True),
  136. process_in_background: bool = Query(True),
  137. user=Depends(get_verified_user),
  138. background_tasks: Optional[BackgroundTasks] = None,
  139. ):
  140. log.info(f"file.content_type: {file.content_type}")
  141. if isinstance(metadata, str):
  142. try:
  143. metadata = json.loads(metadata)
  144. except json.JSONDecodeError:
  145. raise HTTPException(
  146. status_code=status.HTTP_400_BAD_REQUEST,
  147. detail=ERROR_MESSAGES.DEFAULT("Invalid metadata format"),
  148. )
  149. file_metadata = metadata if metadata else {}
  150. try:
  151. unsanitized_filename = file.filename
  152. filename = os.path.basename(unsanitized_filename)
  153. file_extension = os.path.splitext(filename)[1]
  154. # Remove the leading dot from the file extension
  155. file_extension = file_extension[1:] if file_extension else ""
  156. if process and request.app.state.config.ALLOWED_FILE_EXTENSIONS:
  157. request.app.state.config.ALLOWED_FILE_EXTENSIONS = [
  158. ext for ext in request.app.state.config.ALLOWED_FILE_EXTENSIONS if ext
  159. ]
  160. if file_extension not in request.app.state.config.ALLOWED_FILE_EXTENSIONS:
  161. raise HTTPException(
  162. status_code=status.HTTP_400_BAD_REQUEST,
  163. detail=ERROR_MESSAGES.DEFAULT(
  164. f"File type {file_extension} is not allowed"
  165. ),
  166. )
  167. # replace filename with uuid
  168. id = str(uuid.uuid4())
  169. name = filename
  170. filename = f"{id}_{filename}"
  171. contents, file_path = Storage.upload_file(
  172. file.file,
  173. filename,
  174. {
  175. "OpenWebUI-User-Email": user.email,
  176. "OpenWebUI-User-Id": user.id,
  177. "OpenWebUI-User-Name": user.name,
  178. "OpenWebUI-File-Id": id,
  179. },
  180. )
  181. file_item = Files.insert_new_file(
  182. user.id,
  183. FileForm(
  184. **{
  185. "id": id,
  186. "filename": name,
  187. "path": file_path,
  188. "data": {
  189. **({"status": "pending"} if process else {}),
  190. },
  191. "meta": {
  192. "name": name,
  193. "content_type": file.content_type,
  194. "size": len(contents),
  195. "data": file_metadata,
  196. },
  197. }
  198. ),
  199. )
  200. if process:
  201. if background_tasks and process_in_background:
  202. background_tasks.add_task(
  203. process_uploaded_file,
  204. request,
  205. file,
  206. file_path,
  207. file_item,
  208. file_metadata,
  209. user,
  210. )
  211. return {"status": True, **file_item.model_dump()}
  212. else:
  213. process_uploaded_file(
  214. request,
  215. file,
  216. file_path,
  217. file_item,
  218. file_metadata,
  219. user,
  220. )
  221. return {"status": True, **file_item.model_dump()}
  222. else:
  223. if file_item:
  224. return file_item
  225. else:
  226. raise HTTPException(
  227. status_code=status.HTTP_400_BAD_REQUEST,
  228. detail=ERROR_MESSAGES.DEFAULT("Error uploading file"),
  229. )
  230. except Exception as e:
  231. log.exception(e)
  232. raise HTTPException(
  233. status_code=status.HTTP_400_BAD_REQUEST,
  234. detail=ERROR_MESSAGES.DEFAULT("Error uploading file"),
  235. )
  236. ############################
  237. # List Files
  238. ############################
  239. @router.get("/", response_model=list[FileModelResponse])
  240. async def list_files(user=Depends(get_verified_user), content: bool = Query(True)):
  241. if user.role == "admin":
  242. files = Files.get_files()
  243. else:
  244. files = Files.get_files_by_user_id(user.id)
  245. if not content:
  246. for file in files:
  247. if "content" in file.data:
  248. del file.data["content"]
  249. return files
  250. ############################
  251. # Search Files
  252. ############################
  253. @router.get("/search", response_model=list[FileModelResponse])
  254. async def search_files(
  255. filename: str = Query(
  256. ...,
  257. description="Filename pattern to search for. Supports wildcards such as '*.txt'",
  258. ),
  259. content: bool = Query(True),
  260. user=Depends(get_verified_user),
  261. ):
  262. """
  263. Search for files by filename with support for wildcard patterns.
  264. """
  265. # Get files according to user role
  266. if user.role == "admin":
  267. files = Files.get_files()
  268. else:
  269. files = Files.get_files_by_user_id(user.id)
  270. # Get matching files
  271. matching_files = [
  272. file for file in files if fnmatch(file.filename.lower(), filename.lower())
  273. ]
  274. if not matching_files:
  275. raise HTTPException(
  276. status_code=status.HTTP_404_NOT_FOUND,
  277. detail="No files found matching the pattern.",
  278. )
  279. if not content:
  280. for file in matching_files:
  281. if "content" in file.data:
  282. del file.data["content"]
  283. return matching_files
  284. ############################
  285. # Delete All Files
  286. ############################
  287. @router.delete("/all")
  288. async def delete_all_files(user=Depends(get_admin_user)):
  289. result = Files.delete_all_files()
  290. if result:
  291. try:
  292. Storage.delete_all_files()
  293. VECTOR_DB_CLIENT.reset()
  294. except Exception as e:
  295. log.exception(e)
  296. log.error("Error deleting files")
  297. raise HTTPException(
  298. status_code=status.HTTP_400_BAD_REQUEST,
  299. detail=ERROR_MESSAGES.DEFAULT("Error deleting files"),
  300. )
  301. return {"message": "All files deleted successfully"}
  302. else:
  303. raise HTTPException(
  304. status_code=status.HTTP_400_BAD_REQUEST,
  305. detail=ERROR_MESSAGES.DEFAULT("Error deleting files"),
  306. )
  307. ############################
  308. # Get File By Id
  309. ############################
  310. @router.get("/{id}", response_model=Optional[FileModel])
  311. async def get_file_by_id(id: str, user=Depends(get_verified_user)):
  312. file = Files.get_file_by_id(id)
  313. if not file:
  314. raise HTTPException(
  315. status_code=status.HTTP_404_NOT_FOUND,
  316. detail=ERROR_MESSAGES.NOT_FOUND,
  317. )
  318. if (
  319. file.user_id == user.id
  320. or user.role == "admin"
  321. or has_access_to_file(id, "read", user)
  322. ):
  323. return file
  324. else:
  325. raise HTTPException(
  326. status_code=status.HTTP_404_NOT_FOUND,
  327. detail=ERROR_MESSAGES.NOT_FOUND,
  328. )
  329. @router.get("/{id}/process/status")
  330. async def get_file_process_status(
  331. id: str, stream: bool = Query(False), user=Depends(get_verified_user)
  332. ):
  333. file = Files.get_file_by_id(id)
  334. if not file:
  335. raise HTTPException(
  336. status_code=status.HTTP_404_NOT_FOUND,
  337. detail=ERROR_MESSAGES.NOT_FOUND,
  338. )
  339. if (
  340. file.user_id == user.id
  341. or user.role == "admin"
  342. or has_access_to_file(id, "read", user)
  343. ):
  344. if stream:
  345. MAX_FILE_PROCESSING_DURATION = 3600 * 2
  346. async def event_stream(file_item):
  347. if file_item:
  348. for _ in range(MAX_FILE_PROCESSING_DURATION):
  349. file_item = Files.get_file_by_id(file_item.id)
  350. if file_item:
  351. data = file_item.model_dump().get("data", {})
  352. status = data.get("status")
  353. if status:
  354. event = {"status": status}
  355. if status == "failed":
  356. event["error"] = data.get("error")
  357. yield f"data: {json.dumps(event)}\n\n"
  358. if status in ("completed", "failed"):
  359. break
  360. else:
  361. # Legacy
  362. break
  363. await asyncio.sleep(0.5)
  364. else:
  365. yield f"data: {json.dumps({'status': 'not_found'})}\n\n"
  366. return StreamingResponse(
  367. event_stream(file),
  368. media_type="text/event-stream",
  369. )
  370. else:
  371. return {"status": file.data.get("status", "pending")}
  372. else:
  373. raise HTTPException(
  374. status_code=status.HTTP_404_NOT_FOUND,
  375. detail=ERROR_MESSAGES.NOT_FOUND,
  376. )
  377. ############################
  378. # Get File Data Content By Id
  379. ############################
  380. @router.get("/{id}/data/content")
  381. async def get_file_data_content_by_id(id: str, user=Depends(get_verified_user)):
  382. file = Files.get_file_by_id(id)
  383. if not file:
  384. raise HTTPException(
  385. status_code=status.HTTP_404_NOT_FOUND,
  386. detail=ERROR_MESSAGES.NOT_FOUND,
  387. )
  388. if (
  389. file.user_id == user.id
  390. or user.role == "admin"
  391. or has_access_to_file(id, "read", user)
  392. ):
  393. return {"content": file.data.get("content", "")}
  394. else:
  395. raise HTTPException(
  396. status_code=status.HTTP_404_NOT_FOUND,
  397. detail=ERROR_MESSAGES.NOT_FOUND,
  398. )
  399. ############################
  400. # Update File Data Content By Id
  401. ############################
# Request body accepted by the "update file data content" endpoint.
class ContentForm(BaseModel):
    # New text for the file; passed to process_file as ProcessFileForm.content.
    content: str
  404. @router.post("/{id}/data/content/update")
  405. async def update_file_data_content_by_id(
  406. request: Request, id: str, form_data: ContentForm, user=Depends(get_verified_user)
  407. ):
  408. file = Files.get_file_by_id(id)
  409. if not file:
  410. raise HTTPException(
  411. status_code=status.HTTP_404_NOT_FOUND,
  412. detail=ERROR_MESSAGES.NOT_FOUND,
  413. )
  414. if (
  415. file.user_id == user.id
  416. or user.role == "admin"
  417. or has_access_to_file(id, "write", user)
  418. ):
  419. try:
  420. process_file(
  421. request,
  422. ProcessFileForm(file_id=id, content=form_data.content),
  423. user=user,
  424. )
  425. file = Files.get_file_by_id(id=id)
  426. except Exception as e:
  427. log.exception(e)
  428. log.error(f"Error processing file: {file.id}")
  429. return {"content": file.data.get("content", "")}
  430. else:
  431. raise HTTPException(
  432. status_code=status.HTTP_404_NOT_FOUND,
  433. detail=ERROR_MESSAGES.NOT_FOUND,
  434. )
  435. ############################
  436. # Get File Content By Id
  437. ############################
  438. @router.get("/{id}/content")
  439. async def get_file_content_by_id(
  440. id: str, user=Depends(get_verified_user), attachment: bool = Query(False)
  441. ):
  442. file = Files.get_file_by_id(id)
  443. if not file:
  444. raise HTTPException(
  445. status_code=status.HTTP_404_NOT_FOUND,
  446. detail=ERROR_MESSAGES.NOT_FOUND,
  447. )
  448. if (
  449. file.user_id == user.id
  450. or user.role == "admin"
  451. or has_access_to_file(id, "read", user)
  452. ):
  453. try:
  454. file_path = Storage.get_file(file.path)
  455. file_path = Path(file_path)
  456. # Check if the file already exists in the cache
  457. if file_path.is_file():
  458. # Handle Unicode filenames
  459. filename = file.meta.get("name", file.filename)
  460. encoded_filename = quote(filename) # RFC5987 encoding
  461. content_type = file.meta.get("content_type")
  462. filename = file.meta.get("name", file.filename)
  463. encoded_filename = quote(filename)
  464. headers = {}
  465. if attachment:
  466. headers["Content-Disposition"] = (
  467. f"attachment; filename*=UTF-8''{encoded_filename}"
  468. )
  469. else:
  470. if content_type == "application/pdf" or filename.lower().endswith(
  471. ".pdf"
  472. ):
  473. headers["Content-Disposition"] = (
  474. f"inline; filename*=UTF-8''{encoded_filename}"
  475. )
  476. content_type = "application/pdf"
  477. elif content_type != "text/plain":
  478. headers["Content-Disposition"] = (
  479. f"attachment; filename*=UTF-8''{encoded_filename}"
  480. )
  481. return FileResponse(file_path, headers=headers, media_type=content_type)
  482. else:
  483. raise HTTPException(
  484. status_code=status.HTTP_404_NOT_FOUND,
  485. detail=ERROR_MESSAGES.NOT_FOUND,
  486. )
  487. except Exception as e:
  488. log.exception(e)
  489. log.error("Error getting file content")
  490. raise HTTPException(
  491. status_code=status.HTTP_400_BAD_REQUEST,
  492. detail=ERROR_MESSAGES.DEFAULT("Error getting file content"),
  493. )
  494. else:
  495. raise HTTPException(
  496. status_code=status.HTTP_404_NOT_FOUND,
  497. detail=ERROR_MESSAGES.NOT_FOUND,
  498. )
  499. @router.get("/{id}/content/html")
  500. async def get_html_file_content_by_id(id: str, user=Depends(get_verified_user)):
  501. file = Files.get_file_by_id(id)
  502. if not file:
  503. raise HTTPException(
  504. status_code=status.HTTP_404_NOT_FOUND,
  505. detail=ERROR_MESSAGES.NOT_FOUND,
  506. )
  507. file_user = Users.get_user_by_id(file.user_id)
  508. if not file_user.role == "admin":
  509. raise HTTPException(
  510. status_code=status.HTTP_404_NOT_FOUND,
  511. detail=ERROR_MESSAGES.NOT_FOUND,
  512. )
  513. if (
  514. file.user_id == user.id
  515. or user.role == "admin"
  516. or has_access_to_file(id, "read", user)
  517. ):
  518. try:
  519. file_path = Storage.get_file(file.path)
  520. file_path = Path(file_path)
  521. # Check if the file already exists in the cache
  522. if file_path.is_file():
  523. log.info(f"file_path: {file_path}")
  524. return FileResponse(file_path)
  525. else:
  526. raise HTTPException(
  527. status_code=status.HTTP_404_NOT_FOUND,
  528. detail=ERROR_MESSAGES.NOT_FOUND,
  529. )
  530. except Exception as e:
  531. log.exception(e)
  532. log.error("Error getting file content")
  533. raise HTTPException(
  534. status_code=status.HTTP_400_BAD_REQUEST,
  535. detail=ERROR_MESSAGES.DEFAULT("Error getting file content"),
  536. )
  537. else:
  538. raise HTTPException(
  539. status_code=status.HTTP_404_NOT_FOUND,
  540. detail=ERROR_MESSAGES.NOT_FOUND,
  541. )
  542. @router.get("/{id}/content/{file_name}")
  543. async def get_file_content_by_id(id: str, user=Depends(get_verified_user)):
  544. file = Files.get_file_by_id(id)
  545. if not file:
  546. raise HTTPException(
  547. status_code=status.HTTP_404_NOT_FOUND,
  548. detail=ERROR_MESSAGES.NOT_FOUND,
  549. )
  550. if (
  551. file.user_id == user.id
  552. or user.role == "admin"
  553. or has_access_to_file(id, "read", user)
  554. ):
  555. file_path = file.path
  556. # Handle Unicode filenames
  557. filename = file.meta.get("name", file.filename)
  558. encoded_filename = quote(filename) # RFC5987 encoding
  559. headers = {
  560. "Content-Disposition": f"attachment; filename*=UTF-8''{encoded_filename}"
  561. }
  562. if file_path:
  563. file_path = Storage.get_file(file_path)
  564. file_path = Path(file_path)
  565. # Check if the file already exists in the cache
  566. if file_path.is_file():
  567. return FileResponse(file_path, headers=headers)
  568. else:
  569. raise HTTPException(
  570. status_code=status.HTTP_404_NOT_FOUND,
  571. detail=ERROR_MESSAGES.NOT_FOUND,
  572. )
  573. else:
  574. # File path doesn’t exist, return the content as .txt if possible
  575. file_content = file.content.get("content", "")
  576. file_name = file.filename
  577. # Create a generator that encodes the file content
  578. def generator():
  579. yield file_content.encode("utf-8")
  580. return StreamingResponse(
  581. generator(),
  582. media_type="text/plain",
  583. headers=headers,
  584. )
  585. else:
  586. raise HTTPException(
  587. status_code=status.HTTP_404_NOT_FOUND,
  588. detail=ERROR_MESSAGES.NOT_FOUND,
  589. )
  590. ############################
  591. # Delete File By Id
  592. ############################
  593. @router.delete("/{id}")
  594. async def delete_file_by_id(id: str, user=Depends(get_verified_user)):
  595. file = Files.get_file_by_id(id)
  596. if not file:
  597. raise HTTPException(
  598. status_code=status.HTTP_404_NOT_FOUND,
  599. detail=ERROR_MESSAGES.NOT_FOUND,
  600. )
  601. if (
  602. file.user_id == user.id
  603. or user.role == "admin"
  604. or has_access_to_file(id, "write", user)
  605. ):
  606. result = Files.delete_file_by_id(id)
  607. if result:
  608. try:
  609. Storage.delete_file(file.path)
  610. VECTOR_DB_CLIENT.delete(collection_name=f"file-{id}")
  611. except Exception as e:
  612. log.exception(e)
  613. log.error("Error deleting files")
  614. raise HTTPException(
  615. status_code=status.HTTP_400_BAD_REQUEST,
  616. detail=ERROR_MESSAGES.DEFAULT("Error deleting files"),
  617. )
  618. return {"message": "File deleted successfully"}
  619. else:
  620. raise HTTPException(
  621. status_code=status.HTTP_400_BAD_REQUEST,
  622. detail=ERROR_MESSAGES.DEFAULT("Error deleting file"),
  623. )
  624. else:
  625. raise HTTPException(
  626. status_code=status.HTTP_404_NOT_FOUND,
  627. detail=ERROR_MESSAGES.NOT_FOUND,
  628. )