瀏覽代碼

refac: metadata handling in vectordb

Timothy Jaeryang Baek 2 月之前
父節點
當前提交
6a17ba5b7a

+ 4 - 2
backend/open_webui/retrieval/vector/dbs/chroma.py

@@ -11,6 +11,8 @@ from open_webui.retrieval.vector.main import (
     SearchResult,
     GetResult,
 )
+from open_webui.retrieval.vector.utils import stringify_metadata
+
 from open_webui.config import (
     CHROMA_DATA_PATH,
     CHROMA_HTTP_HOST,
@@ -144,7 +146,7 @@ class ChromaClient(VectorDBBase):
         ids = [item["id"] for item in items]
         documents = [item["text"] for item in items]
         embeddings = [item["vector"] for item in items]
-        metadatas = [item["metadata"] for item in items]
+        metadatas = [stringify_metadata(item["metadata"]) for item in items]
 
         for batch in create_batches(
             api=self.client,
@@ -164,7 +166,7 @@ class ChromaClient(VectorDBBase):
         ids = [item["id"] for item in items]
         documents = [item["text"] for item in items]
         embeddings = [item["vector"] for item in items]
-        metadatas = [item["metadata"] for item in items]
+        metadatas = [stringify_metadata(item["metadata"]) for item in items]
 
         collection.upsert(
             ids=ids, documents=documents, embeddings=embeddings, metadatas=metadatas

+ 4 - 2
backend/open_webui/retrieval/vector/dbs/milvus.py

@@ -3,6 +3,8 @@ from pymilvus import FieldSchema, DataType
 import json
 import logging
 from typing import Optional
+
+from open_webui.retrieval.vector.utils import stringify_metadata
 from open_webui.retrieval.vector.main import (
     VectorDBBase,
     VectorItem,
@@ -311,7 +313,7 @@ class MilvusClient(VectorDBBase):
                     "id": item["id"],
                     "vector": item["vector"],
                     "data": {"text": item["text"]},
-                    "metadata": item["metadata"],
+                    "metadata": stringify_metadata(item["metadata"]),
                 }
                 for item in items
             ],
@@ -347,7 +349,7 @@ class MilvusClient(VectorDBBase):
                     "id": item["id"],
                     "vector": item["vector"],
                     "data": {"text": item["text"]},
-                    "metadata": item["metadata"],
+                    "metadata": stringify_metadata(item["metadata"]),
                 }
                 for item in items
             ],

+ 5 - 3
backend/open_webui/retrieval/vector/dbs/pgvector.py

@@ -26,6 +26,8 @@ from pgvector.sqlalchemy import Vector
 from sqlalchemy.ext.mutable import MutableDict
 from sqlalchemy.exc import NoSuchTableError
 
+
+from open_webui.retrieval.vector.utils import stringify_metadata
 from open_webui.retrieval.vector.main import (
     VectorDBBase,
     VectorItem,
@@ -235,7 +237,7 @@ class PgvectorClient(VectorDBBase):
                         vector=vector,
                         collection_name=collection_name,
                         text=item["text"],
-                        vmetadata=item["metadata"],
+                        vmetadata=stringify_metadata(item["metadata"]),
                     )
                     new_items.append(new_chunk)
                 self.session.bulk_save_objects(new_items)
@@ -292,7 +294,7 @@ class PgvectorClient(VectorDBBase):
                     if existing:
                         existing.vector = vector
                         existing.text = item["text"]
-                        existing.vmetadata = item["metadata"]
+                        existing.vmetadata = stringify_metadata(item["metadata"])
                         existing.collection_name = (
                             collection_name  # Update collection_name if necessary
                         )
@@ -302,7 +304,7 @@ class PgvectorClient(VectorDBBase):
                             vector=vector,
                             collection_name=collection_name,
                             text=item["text"],
-                            vmetadata=item["metadata"],
+                            vmetadata=stringify_metadata(item["metadata"]),
                         )
                         self.session.add(new_chunk)
                 self.session.commit()

+ 14 - 0
backend/open_webui/retrieval/vector/utils.py

@@ -0,0 +1,14 @@
+from datetime import datetime
+from typing import Any
+
+
+def stringify_metadata(
+    metadata: dict[str, any],
+) -> dict[str, any]:
+    for key, value in metadata.items():
+        if (
+            isinstance(value, datetime)
+            or isinstance(value, list)
+            or isinstance(value, dict)
+        ):
+            metadata[key] = str(value)
+    return metadata

+ 4 - 17
backend/open_webui/routers/retrieval.py

@@ -1229,27 +1229,14 @@ def save_docs_to_vector_db(
         {
             **doc.metadata,
             **(metadata if metadata else {}),
-            "embedding_config": json.dumps(
-                {
-                    "engine": request.app.state.config.RAG_EMBEDDING_ENGINE,
-                    "model": request.app.state.config.RAG_EMBEDDING_MODEL,
-                }
-            ),
+            "embedding_config": {
+                "engine": request.app.state.config.RAG_EMBEDDING_ENGINE,
+                "model": request.app.state.config.RAG_EMBEDDING_MODEL,
+            },
         }
         for doc in docs
     ]
 
-    # ChromaDB does not like datetime formats
-    # for meta-data so convert them to string.
-    for metadata in metadatas:
-        for key, value in metadata.items():
-            if (
-                isinstance(value, datetime)
-                or isinstance(value, list)
-                or isinstance(value, dict)
-            ):
-                metadata[key] = str(value)
-
     try:
         if VECTOR_DB_CLIENT.has_collection(collection_name=collection_name):
             log.info(f"collection {collection_name} already exists")