prometheus.service.ts 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353
  1. import axios from 'axios';
  /**
   * Resource usage of a single Milvus pod over the queried time window,
   * as assembled by `PrometheusService.reconstructNodeData`.
   */
  interface IPrometheusNode {
    /** Value of the `pod` label of the underlying Prometheus series. */
    pod: string;
    /** `'coord'` when the container label contains "coord", otherwise `'node'`. */
    type: string;
    /**
     * Memory samples, one per step. `-1` marks steps before the first
     * sample and `-2` marks steps after the last one.
     */
    memory: Array<number>;
    /**
     * Per-step increase of the CPU counter divided by the step length in
     * seconds. Uses the same `-1` / `-2` padding sentinels as `memory`.
     */
    cpu: Array<number>;
  }
  /**
   * Aggregate payload returned by `PrometheusService.getMilvusHealthyData`.
   * Every numeric array holds one entry per step of the queried window.
   */
  interface IPrometheusAllData {
    // --- vector throughput and latency ---
    /** Samples of the insert-vectors counter. */
    totalVectorsCount: Array<number>;
    /** Per-step increase of the search-vectors counter. */
    searchVectorsCount: Array<number>;
    /** Optional; not populated by the code visible in this file. */
    searchFailedVectorsCount?: Array<number>;
    /** Search/query latency quantile per step. */
    sqLatency: Array<number>;

    // --- third-party dependencies: success ratio per step ---
    meta: Array<number>;
    msgstream: Array<number>;
    objstorage: Array<number>;

    // --- per-pod CPU and memory usage, grouped by component ---
    rootNodes: Array<IPrometheusNode>;
    queryNodes: Array<IPrometheusNode>;
    indexNodes: Array<IPrometheusNode>;
    dataNodes: Array<IPrometheusNode>;
  }
  // Prometheus metric names queried by PrometheusService.

  // Process-level resource counters, queried per pod.
  const cpuMetric = 'process_cpu_seconds_total';
  const memoryMetric = 'process_resident_memory_bytes';

  // Proxy-side vector counters and the search/query latency histogram.
  const totalVectorsCountMetric = 'milvus_proxy_insert_vectors_count';
  const searchVectorsCountMetric = 'milvus_proxy_search_vectors_count';
  const sqLatencyMetric = 'milvus_proxy_sq_latency_bucket';

  // Operation counters for third-party dependencies; queried grouped by
  // their `status` label to derive a success ratio.
  const metaMetric = 'milvus_meta_op_count';
  const msgstreamMetric = 'milvus_msgstream_op_count';
  const objstorageMetric = 'milvus_storage_op_count';
  /**
   * Client for the Prometheus HTTP API that gathers Milvus health metrics.
   *
   * Connection settings (`address`, `instance`, `namespace`, `isReady`) are
   * static, so they are shared by every instance once `setPrometheus` has
   * accepted an address.
   *
   * Time-series results are aligned to the requested window with sentinel
   * values: `-1` fills steps before the first returned sample and `-2` fills
   * steps after the last returned sample.
   */
  export class PrometheusService {
    static address: string = '';
    static instance: string = '';
    static namespace: string = '';
    static isReady: boolean = false;

    /** Base URL of the configured Prometheus server. */
    static get url() {
      return `http://${PrometheusService.address}`;
    }

    /** PromQL label selector restricting queries to the configured instance and namespace. */
    static get selector() {
      return (
        '{' +
        `app_kubernetes_io_instance="${PrometheusService.instance}"` +
        `,namespace="${PrometheusService.namespace}"` +
        '}'
      );
    }

    /**
     * Number of whole steps that fit into `spanMs`, clamped to a non-negative
     * finite integer so it is always a valid array length.
     */
    private static countSteps(spanMs: number, step: number): number {
      const steps = Math.floor(spanMs / step);
      return Number.isFinite(steps) && steps > 0 ? steps : 0;
    }

    /**
     * Extracts the series list from a `queryRange` response. A failed query
     * (`{ status: 'failed' }`, no `data`) yields an empty list.
     */
    private static seriesOf(result: any): any[] {
      const series = result?.data?.result;
      return Array.isArray(series) ? series : [];
    }

    /** Samples (`[timestampSeconds, value]` pairs) of the first series, or `[]` when there is none. */
    private static firstSeriesSamples(result: any): any[] {
      const samples = PrometheusService.seriesOf(result)[0]?.values;
      return Array.isArray(samples) ? samples : [];
    }

    /** Differences between consecutive entries; the result is one element shorter than the input. */
    private static deltas(totals: number[]): number[] {
      return totals.slice(1).map((value, i) => value - totals[i]);
    }

    /**
     * Pads `points` so they line up with the `[start, end]` window.
     *
     * @param points  values derived from `samples`
     * @param samples raw `[timestampSeconds, value]` pairs the points came from
     * @returns `points` preceded by `-1` for every step missing before the
     *          first sample and followed by `-2` for every step missing after
     *          the last one; all `-1` when `samples` is empty
     */
    private static padToWindow(
      points: number[],
      samples: any[],
      start: number,
      end: number,
      step: number
    ): number[] {
      if (samples.length === 0) {
        return Array(PrometheusService.countSteps(end - start, step)).fill(-1);
      }
      // Timestamps are fractional seconds; round after scaling so float error
      // (e.g. 1.001 * 1000 === 1000.9999999999999) cannot produce a negative
      // step count.
      const firstMs = Math.round(+samples[0][0] * 1000);
      const lastMs = Math.round(+samples[samples.length - 1][0] * 1000);
      const leading = PrometheusService.countSteps(firstMs - start, step);
      const trailing = PrometheusService.countSteps(end - lastMs, step);
      return [
        ...Array(leading).fill(-1),
        ...points,
        ...Array(trailing).fill(-2),
      ];
    }

    /**
     * Probes the given Prometheus address and, if it is ready, stores the
     * connection settings in the static fields.
     *
     * @returns `{ isReady }` reflecting the outcome of the readiness probe
     */
    async setPrometheus({
      prometheusAddress,
      prometheusInstance,
      prometheusNamespace,
    }: {
      prometheusAddress: string;
      prometheusInstance: string;
      prometheusNamespace: string;
    }) {
      PrometheusService.isReady = await this.checkPrometheus(prometheusAddress);
      if (PrometheusService.isReady) {
        PrometheusService.address = prometheusAddress;
        PrometheusService.instance = prometheusInstance;
        PrometheusService.namespace = prometheusNamespace;
      }
      return {
        isReady: PrometheusService.isReady,
      };
    }

    /**
     * Calls the `/-/ready` endpoint of the given address.
     *
     * @returns `true` only when the endpoint answers with HTTP 200; any
     *          request error is reported as `false`
     */
    async checkPrometheus(prometheusAddress: string): Promise<boolean> {
      try {
        const res = await axios.get(`http://${prometheusAddress}/-/ready`);
        return res?.status === 200;
      } catch {
        return false;
      }
    }

    /**
     * Runs a PromQL range query against the configured server.
     *
     * @param expr  PromQL expression
     * @param start window start, epoch milliseconds
     * @param end   window end, epoch milliseconds
     * @param step  resolution in milliseconds
     * @returns the response body, or `{ status: 'failed' }` when the request fails
     */
    async queryRange(expr: string, start: number, end: number, step: number) {
      const url =
        PrometheusService.url +
        '/api/v1/query_range?query=' +
        // PromQL contains characters such as {, ", +, [ and spaces that are
        // not safe inside a query string.
        encodeURIComponent(expr) +
        `&start=${new Date(+start).toISOString()}` +
        `&end=${new Date(+end).toISOString()}` +
        `&step=${step / 1000}s`;
      try {
        const res = await axios.get(url);
        return res.data;
      } catch {
        return { status: 'failed' };
      }
    }

    /**
     * Returns the samples of `metric` (first sample dropped) aligned to the
     * window. Yields zeros for every step when the query fails or returns no
     * series.
     */
    async getVectorsCount(
      metric: string,
      start: number,
      end: number,
      step: number
    ): Promise<number[]> {
      const expr = `${metric}${PrometheusService.selector}`;
      const result = await this.queryRange(expr, start, end, step);
      const samples = PrometheusService.firstSeriesSamples(result);
      if (samples.length === 0) {
        return Array(PrometheusService.countSteps(end - start, step)).fill(0);
      }
      const points = samples.slice(1).map((s: any) => +s[1]);
      return PrometheusService.padToWindow(points, samples, start, end, step);
    }

    /** Samples of the insert-vectors counter; see `getVectorsCount`. */
    getInsertVectorsCount = (start: number, end: number, step: number) =>
      this.getVectorsCount(totalVectorsCountMetric, start, end, step);

    /**
     * Returns the per-step increase of the search-vectors counter aligned to
     * the window. Yields zeros for every step when the query fails or returns
     * no series.
     */
    async getSearchVectorsCount(
      start: number,
      end: number,
      step: number
    ): Promise<number[]> {
      const expr = `${searchVectorsCountMetric}${PrometheusService.selector}`;
      const result = await this.queryRange(expr, start, end, step);
      const samples = PrometheusService.firstSeriesSamples(result);
      if (samples.length === 0) {
        return Array(PrometheusService.countSteps(end - start, step)).fill(0);
      }
      const totals: number[] = samples.map((s: any) => +s[1]);
      return PrometheusService.padToWindow(
        PrometheusService.deltas(totals),
        samples,
        start,
        end,
        step
      );
    }

    /**
     * Returns the 0.99 latency quantile per step, with non-numeric samples
     * (such as "NaN") reported as 0. Yields zeros for every step when the
     * query fails or returns no series.
     *
     * NOTE(review): unlike the counter-based series, the first sample is kept
     * here, so a fully populated window yields one more entry than the
     * zero-filled fallback. This mirrors the previous behavior.
     */
    async getSQLatency(
      start: number,
      end: number,
      step: number
    ): Promise<number[]> {
      const expr =
        `histogram_quantile(0.99, sum by (le, pod, node_id)` +
        `(rate(${sqLatencyMetric}${PrometheusService.selector}[${
          step / 1000
        }s])))`;
      const result = await this.queryRange(expr, start, end, step);
      const samples = PrometheusService.firstSeriesSamples(result);
      if (samples.length === 0) {
        return Array(PrometheusService.countSteps(end - start, step)).fill(0);
      }
      const points = samples.map((s: any) => {
        const value = Number(s[1]);
        return Number.isNaN(value) ? 0 : value;
      });
      return PrometheusService.padToWindow(points, samples, start, end, step);
    }

    /**
     * Computes the per-step success ratio of `metricName` from its `total`
     * and `success` status series.
     *
     * A step with no operations counts as healthy (ratio 1). Steps without
     * data are reported as `-1`, which includes the whole window when the
     * query fails or has no `total` series. When only the `success` series is
     * missing, its counts are taken as 0.
     */
    async getThirdPartyServiceHealthStatus(
      metricName: string,
      start: number,
      end: number,
      step: number
    ): Promise<number[]> {
      const expr = `sum by (status) (${metricName}${PrometheusService.selector})`;
      const result = await this.queryRange(expr, start, end, step);
      const series = PrometheusService.seriesOf(result);
      const length = PrometheusService.countSteps(end - start, step);

      const increasesFor = (status: string): number[] => {
        const match = series.find((d: any) => d?.metric?.status === status);
        const counts: number[] = (match?.values ?? []).map((s: any) => +s[1]);
        return PrometheusService.deltas(counts);
      };
      const totalSlices = increasesFor('total');
      const successSlices = increasesFor('success');

      const ratios = totalSlices.map((total, i) =>
        total === 0 ? 1 : (successSlices[i] ?? 0) / total
      );
      const missing = Math.max(0, length - ratios.length);
      return [...Array(missing).fill(-1), ...ratios];
    }

    /** Raw CPU counter series for every matching process; `[]` when the query fails. */
    async getInternalNodesCPUData(start: number, end: number, step: number) {
      const expr = `${cpuMetric}${PrometheusService.selector}`;
      const result = await this.queryRange(expr, start, end, step);
      return PrometheusService.seriesOf(result);
    }

    /** Raw resident-memory series for every matching process; `[]` when the query fails. */
    async getInternalNodesMemoryData(start: number, end: number, step: number) {
      const expr = `${memoryMetric}${PrometheusService.selector}`;
      const result = await this.queryRange(expr, start, end, step);
      return PrometheusService.seriesOf(result);
    }

    /**
     * Builds per-pod CPU and memory series for all containers whose name
     * contains `type`.
     *
     * @param cpuNodesData    series list from `getInternalNodesCPUData`
     * @param memoryNodesData series list from `getInternalNodesMemoryData`
     * @param type            substring matched against the `container` label
     * @returns one entry per matching CPU series; memory is looked up by pod
     *          name and reported as all `-1` when no memory series exists
     */
    reconstructNodeData(
      cpuNodesData: any,
      memoryNodesData: any,
      type: string,
      start: number,
      end: number,
      step: number
    ): IPrometheusNode[] {
      const containerOf = (d: any): string =>
        String(d?.metric?.container ?? '');
      const ofType = (seriesList: any): any[] =>
        (Array.isArray(seriesList) ? seriesList : []).filter((d: any) =>
          containerOf(d).includes(type)
        );

      const cpuNodes = ofType(cpuNodesData);
      const memoryNodes = ofType(memoryNodesData);
      const stepSeconds = step / 1000;

      return cpuNodes.map((d: any) => {
        const pod = d.metric.pod;
        const cpuSamples: any[] = Array.isArray(d.values) ? d.values : [];
        const cpuTotals: number[] = cpuSamples.map((v: any) => +v[1]);
        // Counter increase per step, normalized to CPU seconds per second.
        const cpuRates = PrometheusService.deltas(cpuTotals).map(
          v => v / stepSeconds
        );
        const cpu = PrometheusService.padToWindow(
          cpuRates,
          cpuSamples,
          start,
          end,
          step
        );

        const memorySeries = memoryNodes.find(
          (m: any) => m?.metric?.pod === pod
        );
        const memorySamples: any[] = Array.isArray(memorySeries?.values)
          ? memorySeries.values
          : [];
        const paddedMemory = PrometheusService.padToWindow(
          memorySamples.map((v: any) => +v[1]),
          memorySamples,
          start,
          end,
          step
        );
        // With samples present, drop the first padded entry so memory has the
        // same length as the delta-based cpu series.
        const memory =
          memorySamples.length === 0 ? paddedMemory : paddedMemory.slice(1);

        const node: IPrometheusNode = {
          type: containerOf(d).includes('coord') ? 'coord' : 'node',
          pod,
          cpu,
          memory,
        };
        return node;
      });
    }

    /** Fetches CPU and memory series and groups them into root/query/index/data nodes. */
    async getInternalNodesData(start: number, end: number, step: number) {
      const [cpuNodes, memoryNodes] = await Promise.all([
        this.getInternalNodesCPUData(start, end, step),
        this.getInternalNodesMemoryData(start, end, step),
      ]);
      const [rootNodes, queryNodes, indexNodes, dataNodes] = [
        'root',
        'query',
        'index',
        'data',
      ].map((metric: string) =>
        this.reconstructNodeData(cpuNodes, memoryNodes, metric, start, end, step)
      );
      return { rootNodes, queryNodes, indexNodes, dataNodes };
    }

    /**
     * Collects every health metric for the `[start, end]` window.
     *
     * The individual queries are independent of each other and are issued
     * concurrently.
     */
    async getMilvusHealthyData({
      start,
      end,
      step,
    }: {
      start: number;
      end: number;
      step: number;
    }): Promise<IPrometheusAllData> {
      const [
        meta,
        msgstream,
        objstorage,
        totalVectorsCount,
        searchVectorsCount,
        sqLatency,
        nodes,
      ] = await Promise.all([
        this.getThirdPartyServiceHealthStatus(metaMetric, start, end, step),
        this.getThirdPartyServiceHealthStatus(msgstreamMetric, start, end, step),
        this.getThirdPartyServiceHealthStatus(objstorageMetric, start, end, step),
        this.getInsertVectorsCount(start, end, step),
        this.getSearchVectorsCount(start, end, step),
        this.getSQLatency(start, end, step),
        this.getInternalNodesData(start, end, step),
      ]);
      const allData: IPrometheusAllData = {
        totalVectorsCount,
        searchVectorsCount,
        sqLatency,
        meta,
        msgstream,
        objstorage,
        rootNodes: nodes.rootNodes,
        queryNodes: nodes.queryNodes,
        indexNodes: nodes.indexNodes,
        dataNodes: nodes.dataNodes,
      };
      return allData;
    }
  }