Timothy J. Baek 7 months ago
parent
commit
9d2ed3d2be

+ 0 - 2
backend/open_webui/apps/retrieval/loader/main.py

@@ -2,7 +2,6 @@ import requests
 import logging
 import ftfy
 
-
 from langchain_community.document_loaders import (
     BSHTMLLoader,
     CSVLoader,
@@ -24,7 +23,6 @@ from open_webui.env import SRC_LOG_LEVELS
 log = logging.getLogger(__name__)
 log.setLevel(SRC_LOG_LEVELS["RAG"])
 
-
 known_source_ext = [
     "go",
     "py",

+ 10 - 2
backend/open_webui/apps/retrieval/main.py

@@ -725,8 +725,16 @@ def process_file(
             PDF_EXTRACT_IMAGES=app.state.config.PDF_EXTRACT_IMAGES,
         )
         docs = loader.load(file.filename, file.meta.get("content_type"), file_path)
-        raw_content = " ".join([doc.page_content for doc in docs])
-        print(raw_content)
+        raw_text_content = " ".join([doc.page_content for doc in docs])
+
+        Files.update_files_metadata_by_id(
+            form_data.file_id,
+            {
+                "content": {
+                    "text": raw_text_content,
+                }
+            },
+        )
 
         try:
             result = save_docs_to_vector_db(

+ 11 - 0
backend/open_webui/apps/webui/models/files.py

@@ -97,6 +97,17 @@ class FilesTable:
                 for file in db.query(File).filter_by(user_id=user_id).all()
             ]
 
+    def update_files_metadata_by_id(self, id: str, meta: dict) -> Optional[FileModel]:
+        """Shallow-merge ``meta`` into the metadata of the file with this id.
+
+        Top-level keys in ``meta`` replace existing keys of the same name;
+        nested dictionaries are overwritten, not merged.
+
+        Returns the updated ``FileModel``, or ``None`` when no file has the
+        given id or the update fails.
+        """
+        with get_db() as db:
+            try:
+                file = db.query(File).filter_by(id=id).first()
+                if file is None:
+                    # Unknown id: report it explicitly instead of relying on
+                    # an AttributeError being swallowed below.
+                    return None
+
+                # Treat a None meta as empty so the merge cannot raise and
+                # cause the update to be silently dropped.
+                file.meta = {**(file.meta or {}), **meta}
+                db.commit()
+
+                return FileModel.model_validate(file)
+            except Exception:
+                # Best-effort contract kept from the original: callers get
+                # None rather than an exception on failure.
+                return None
+
     def delete_file_by_id(self, id: str) -> bool:
         with get_db() as db:
             try:

+ 13 - 0
backend/open_webui/apps/webui/routers/files.py

@@ -171,6 +171,19 @@ async def get_file_content_by_id(id: str, user=Depends(get_verified_user)):
         )
 
 
+@router.get("/{id}/content/text")
+async def get_file_text_content_by_id(id: str, user=Depends(get_verified_user)):
+    """Return the text stored under ``meta["content"]["text"]`` for a file.
+
+    The response body is ``{"text": <value or None>}``. A 404 is raised when
+    the file does not exist, or when the requester is neither the file's
+    owner nor an admin.
+    """
+    record = Files.get_file_by_id(id)
+    permitted = record and (record.user_id == user.id or user.role == "admin")
+
+    if not permitted:
+        raise HTTPException(
+            status_code=status.HTTP_404_NOT_FOUND,
+            detail=ERROR_MESSAGES.NOT_FOUND,
+        )
+
+    stored_content = record.meta.get("content", {})
+    return {"text": stored_content.get("text", None)}
+
+
 @router.get("/{id}/content/{file_name}", response_model=Optional[FileModel])
 async def get_file_content_by_id(id: str, user=Depends(get_verified_user)):
     file = Files.get_file_by_id(id)