Преглед изворни кода

feat: S3 vector support tested

0xThresh.eth пре 2 месеци
родитељ
комит
d463a29ba1

+ 1 - 1
backend/dev.sh

@@ -1,3 +1,3 @@
-export CORS_ALLOW_ORIGIN=http://localhost:5173/
+export CORS_ALLOW_ORIGIN="http://localhost:5173"
 PORT="${PORT:-8080}"
 uvicorn open_webui.main:app --port $PORT --host 0.0.0.0 --forwarded-allow-ips '*' --reload

+ 3 - 39
backend/open_webui/retrieval/vector/s3/s3vector.py → backend/open_webui/retrieval/vector/dbs/s3vector.py

@@ -38,15 +38,16 @@ class S3VectorClient(VectorDBBase):
 
     def _filter_metadata(self, metadata: Dict[str, Any], item_id: str) -> Dict[str, Any]:
         """
-        Filter metadata to comply with S3 Vector API limit of 10 keys maximum.
+        Filter vector metadata keys to comply with S3 Vector API limit of 10 keys maximum.
         If AWS S3 Vector feature starts supporting more than 10 keys, this should be adjusted, and preferably removed.
+        Limitation is documented here: https://docs.aws.amazon.com/AmazonS3/latest/userguide/s3-vectors-limitations.html
         """
         if not isinstance(metadata, dict) or len(metadata) <= 10:
             return metadata
             
         # Keep only the first 10 keys, prioritizing important ones based on actual Open WebUI metadata            
         important_keys = [
-            'text',             # THE MOST IMPORTANT - the actual document content
+            'text',             # The actual document content
             'file_id',          # File ID
             'source',           # Document source file
             'title',            # Document title
@@ -77,34 +78,6 @@ class S3VectorClient(VectorDBBase):
         log.warning(f"Metadata for key '{item_id}' had {len(metadata)} keys, limited to 10 keys")
         return filtered_metadata
 
-    def _check_for_duplicate_file_collections(self, knowledge_collection_name: str, new_items: List[Dict[str, Any]]) -> None:
-        """
-        Check for existing file-specific collections that might create duplicates.
-        """
-        # Extract file IDs from the new items to find corresponding file collections
-        file_ids = set()
-        for item in new_items:
-            metadata = item.get("metadata", {})
-            file_id = metadata.get("file_id")
-            if file_id:
-                file_ids.add(file_id)
-        
-        # Check for existing file-specific collections
-        duplicate_collections = []
-        for file_id in file_ids:
-            file_collection_name = f"file-{file_id}"
-            if self.has_collection(file_collection_name):
-                duplicate_collections.append(file_collection_name)
-        
-        if duplicate_collections:
-            log.warning(f"Found existing file-specific collections that may contain duplicate vectors: {duplicate_collections}")
-            log.warning(f"Consider manually deleting these collections to avoid duplicate storage:")
-            for collection in duplicate_collections:
-                log.warning(f"  - {collection}")
-            log.warning(f"Continuing with insertion to knowledge collection '{knowledge_collection_name}'")
-        else:
-            log.info(f"No duplicate file-specific collections found for knowledge collection '{knowledge_collection_name}'")
-
     def has_collection(self, collection_name: str) -> bool:
         """
         Check if a vector index (collection) exists in the S3 vector bucket.
@@ -157,9 +130,6 @@ class S3VectorClient(VectorDBBase):
                     distance_metric="cosine",
                 )
             
-            # Check for any existing file-specific collections that might create duplicates
-            self._check_for_duplicate_file_collections(collection_name, items)
-            
             # Prepare vectors for insertion
             vectors = []
             for item in items:
@@ -202,9 +172,6 @@ class S3VectorClient(VectorDBBase):
     def upsert(self, collection_name: str, items: List[Dict[str, Any]]) -> None:
         """
         Insert or update vector items in the S3 Vector index. Create index if it does not exist.
-        
-        Supports both knowledge collections and file-specific collections for compatibility
-        with existing Open WebUI backend logic.
         """
         if not items:
             log.warning("No items to upsert")
@@ -223,9 +190,6 @@ class S3VectorClient(VectorDBBase):
                     distance_metric="cosine",
                 )
             
-            # Check for any existing file-specific collections that might create duplicates
-            self._check_for_duplicate_file_collections(collection_name, items)
-            
             # Prepare vectors for upsert
             vectors = []
             for item in items:

+ 1 - 1
backend/open_webui/retrieval/vector/factory.py

@@ -30,7 +30,7 @@ class Vector:
                 from open_webui.retrieval.vector.dbs.pinecone import PineconeClient
                 return PineconeClient()
             case VectorType.S3VECTOR:
-                from open_webui.retrieval.vector.s3.s3vector import S3VectorClient
+                from open_webui.retrieval.vector.dbs.s3vector import S3VectorClient
                 return S3VectorClient()
             case VectorType.OPENSEARCH:
                 from open_webui.retrieval.vector.dbs.opensearch import OpenSearchClient