# milvus.py
  1. from pymilvus import MilvusClient as Client
  2. from pymilvus import FieldSchema, DataType
  3. import json
  4. import logging
  5. from typing import Optional
  6. from open_webui.retrieval.vector.main import VectorItem, SearchResult, GetResult
  7. from open_webui.config import (
  8. MILVUS_URI,
  9. MILVUS_DB,
  10. MILVUS_TOKEN,
  11. )
  12. from open_webui.env import SRC_LOG_LEVELS
  13. log = logging.getLogger(__name__)
  14. log.setLevel(SRC_LOG_LEVELS["RAG"])
  15. class MilvusClient:
  16. def __init__(self):
  17. self.collection_prefix = "open_webui"
  18. if MILVUS_TOKEN is None:
  19. self.client = Client(uri=MILVUS_URI, db_name=MILVUS_DB)
  20. else:
  21. self.client = Client(uri=MILVUS_URI, db_name=MILVUS_DB, token=MILVUS_TOKEN)
  22. def _result_to_get_result(self, result) -> GetResult:
  23. ids = []
  24. documents = []
  25. metadatas = []
  26. for match in result:
  27. _ids = []
  28. _documents = []
  29. _metadatas = []
  30. for item in match:
  31. _ids.append(item.get("id"))
  32. _documents.append(item.get("data", {}).get("text"))
  33. _metadatas.append(item.get("metadata"))
  34. ids.append(_ids)
  35. documents.append(_documents)
  36. metadatas.append(_metadatas)
  37. return GetResult(
  38. **{
  39. "ids": ids,
  40. "documents": documents,
  41. "metadatas": metadatas,
  42. }
  43. )
  44. def _result_to_search_result(self, result) -> SearchResult:
  45. ids = []
  46. distances = []
  47. documents = []
  48. metadatas = []
  49. for match in result:
  50. _ids = []
  51. _distances = []
  52. _documents = []
  53. _metadatas = []
  54. for item in match:
  55. _ids.append(item.get("id"))
  56. # normalize milvus score from [-1, 1] to [0, 1] range
  57. # https://milvus.io/docs/de/metric.md
  58. _dist = (item.get("distance") + 1.0) / 2.0
  59. _distances.append(_dist)
  60. _documents.append(item.get("entity", {}).get("data", {}).get("text"))
  61. _metadatas.append(item.get("entity", {}).get("metadata"))
  62. ids.append(_ids)
  63. distances.append(_distances)
  64. documents.append(_documents)
  65. metadatas.append(_metadatas)
  66. return SearchResult(
  67. **{
  68. "ids": ids,
  69. "distances": distances,
  70. "documents": documents,
  71. "metadatas": metadatas,
  72. }
  73. )
  74. def _create_collection(self, collection_name: str, dimension: int):
  75. schema = self.client.create_schema(
  76. auto_id=False,
  77. enable_dynamic_field=True,
  78. )
  79. schema.add_field(
  80. field_name="id",
  81. datatype=DataType.VARCHAR,
  82. is_primary=True,
  83. max_length=65535,
  84. )
  85. schema.add_field(
  86. field_name="vector",
  87. datatype=DataType.FLOAT_VECTOR,
  88. dim=dimension,
  89. description="vector",
  90. )
  91. schema.add_field(field_name="data", datatype=DataType.JSON, description="data")
  92. schema.add_field(
  93. field_name="metadata", datatype=DataType.JSON, description="metadata"
  94. )
  95. index_params = self.client.prepare_index_params()
  96. index_params.add_index(
  97. field_name="vector",
  98. index_type="HNSW",
  99. metric_type="COSINE",
  100. params={"M": 16, "efConstruction": 100},
  101. )
  102. self.client.create_collection(
  103. collection_name=f"{self.collection_prefix}_{collection_name}",
  104. schema=schema,
  105. index_params=index_params,
  106. )
  107. def has_collection(self, collection_name: str) -> bool:
  108. # Check if the collection exists based on the collection name.
  109. collection_name = collection_name.replace("-", "_")
  110. return self.client.has_collection(
  111. collection_name=f"{self.collection_prefix}_{collection_name}"
  112. )
  113. def delete_collection(self, collection_name: str):
  114. # Delete the collection based on the collection name.
  115. collection_name = collection_name.replace("-", "_")
  116. return self.client.drop_collection(
  117. collection_name=f"{self.collection_prefix}_{collection_name}"
  118. )
  119. def search(
  120. self, collection_name: str, vectors: list[list[float | int]], limit: int
  121. ) -> Optional[SearchResult]:
  122. # Search for the nearest neighbor items based on the vectors and return 'limit' number of results.
  123. collection_name = collection_name.replace("-", "_")
  124. result = self.client.search(
  125. collection_name=f"{self.collection_prefix}_{collection_name}",
  126. data=vectors,
  127. limit=limit,
  128. output_fields=["data", "metadata"],
  129. )
  130. return self._result_to_search_result(result)
  131. def query(self, collection_name: str, filter: dict, limit: Optional[int] = None):
  132. # Construct the filter string for querying
  133. collection_name = collection_name.replace("-", "_")
  134. if not self.has_collection(collection_name):
  135. return None
  136. filter_string = " && ".join(
  137. [
  138. f'metadata["{key}"] == {json.dumps(value)}'
  139. for key, value in filter.items()
  140. ]
  141. )
  142. max_limit = 16383 # The maximum number of records per request
  143. all_results = []
  144. if limit is None:
  145. limit = float("inf") # Use infinity as a placeholder for no limit
  146. # Initialize offset and remaining to handle pagination
  147. offset = 0
  148. remaining = limit
  149. try:
  150. # Loop until there are no more items to fetch or the desired limit is reached
  151. while remaining > 0:
  152. log.info(f"remaining: {remaining}")
  153. current_fetch = min(
  154. max_limit, remaining
  155. ) # Determine how many items to fetch in this iteration
  156. results = self.client.query(
  157. collection_name=f"{self.collection_prefix}_{collection_name}",
  158. filter=filter_string,
  159. output_fields=["*"],
  160. limit=current_fetch,
  161. offset=offset,
  162. )
  163. if not results:
  164. break
  165. all_results.extend(results)
  166. results_count = len(results)
  167. remaining -= (
  168. results_count # Decrease remaining by the number of items fetched
  169. )
  170. offset += results_count
  171. # Break the loop if the results returned are less than the requested fetch count
  172. if results_count < current_fetch:
  173. break
  174. log.debug(all_results)
  175. return self._result_to_get_result([all_results])
  176. except Exception as e:
  177. log.exception(
  178. f"Error querying collection {collection_name} with limit {limit}: {e}"
  179. )
  180. return None
  181. def get(self, collection_name: str) -> Optional[GetResult]:
  182. # Get all the items in the collection.
  183. collection_name = collection_name.replace("-", "_")
  184. result = self.client.query(
  185. collection_name=f"{self.collection_prefix}_{collection_name}",
  186. filter='id != ""',
  187. )
  188. return self._result_to_get_result([result])
  189. def insert(self, collection_name: str, items: list[VectorItem]):
  190. # Insert the items into the collection, if the collection does not exist, it will be created.
  191. collection_name = collection_name.replace("-", "_")
  192. if not self.client.has_collection(
  193. collection_name=f"{self.collection_prefix}_{collection_name}"
  194. ):
  195. self._create_collection(
  196. collection_name=collection_name, dimension=len(items[0]["vector"])
  197. )
  198. return self.client.insert(
  199. collection_name=f"{self.collection_prefix}_{collection_name}",
  200. data=[
  201. {
  202. "id": item["id"],
  203. "vector": item["vector"],
  204. "data": {"text": item["text"]},
  205. "metadata": item["metadata"],
  206. }
  207. for item in items
  208. ],
  209. )
  210. def upsert(self, collection_name: str, items: list[VectorItem]):
  211. # Update the items in the collection, if the items are not present, insert them. If the collection does not exist, it will be created.
  212. collection_name = collection_name.replace("-", "_")
  213. if not self.client.has_collection(
  214. collection_name=f"{self.collection_prefix}_{collection_name}"
  215. ):
  216. self._create_collection(
  217. collection_name=collection_name, dimension=len(items[0]["vector"])
  218. )
  219. return self.client.upsert(
  220. collection_name=f"{self.collection_prefix}_{collection_name}",
  221. data=[
  222. {
  223. "id": item["id"],
  224. "vector": item["vector"],
  225. "data": {"text": item["text"]},
  226. "metadata": item["metadata"],
  227. }
  228. for item in items
  229. ],
  230. )
  231. def delete(
  232. self,
  233. collection_name: str,
  234. ids: Optional[list[str]] = None,
  235. filter: Optional[dict] = None,
  236. ):
  237. # Delete the items from the collection based on the ids.
  238. collection_name = collection_name.replace("-", "_")
  239. if ids:
  240. return self.client.delete(
  241. collection_name=f"{self.collection_prefix}_{collection_name}",
  242. ids=ids,
  243. )
  244. elif filter:
  245. # Convert the filter dictionary to a string using JSON_CONTAINS.
  246. filter_string = " && ".join(
  247. [
  248. f'metadata["{key}"] == {json.dumps(value)}'
  249. for key, value in filter.items()
  250. ]
  251. )
  252. return self.client.delete(
  253. collection_name=f"{self.collection_prefix}_{collection_name}",
  254. filter=filter_string,
  255. )
  256. def reset(self):
  257. # Resets the database. This will delete all collections and item entries.
  258. collection_names = self.client.list_collections()
  259. for collection_name in collection_names:
  260. if collection_name.startswith(self.collection_prefix):
  261. self.client.drop_collection(collection_name=collection_name)