@@ -8,109 +8,121 @@ import (
 	"sync/atomic"
 	"time"
 
+	"github.com/shirou/gopsutil/v4/cpu"
+
 	"github.com/uozi-tech/cosy/logger"
 )
 
+// IndexerActivityPoller defines an interface to check if the indexer is busy.
+type IndexerActivityPoller interface {
+	IsBusy() bool
+}
+
 // AdaptiveOptimizer provides intelligent batch size adjustment and CPU monitoring
 type AdaptiveOptimizer struct {
-	config *Config
-	cpuMonitor *CPUMonitor
-	batchSizeController *BatchSizeController
-	performanceHistory *PerformanceHistory
-
+	config *Config
+	cpuMonitor *CPUMonitor
+	batchSizeController *BatchSizeController
+	performanceHistory *PerformanceHistory
+
 	// State
-	running int32
-	ctx context.Context
-	cancel context.CancelFunc
-	wg sync.WaitGroup
-
+	running int32
+	ctx context.Context
+	cancel context.CancelFunc
+	wg sync.WaitGroup
+
 	// Metrics
-	optimizationsMade int64
-	avgThroughput float64
-	avgLatency time.Duration
-	metricsMutex sync.RWMutex
+	optimizationsMade int64
+	avgThroughput float64
+	avgLatency time.Duration
+	metricsMutex sync.RWMutex
+
+	// Callbacks
+	onWorkerCountChange func(oldCount, newCount int)
+
+	// Activity Poller
+	activityPoller IndexerActivityPoller
 }
 
 // CPUMonitor monitors CPU utilization and suggests worker adjustments
 type CPUMonitor struct {
-	targetUtilization float64
-	measurementInterval time.Duration
-	adjustmentThreshold float64
+	targetUtilization float64
+	measurementInterval time.Duration
+	adjustmentThreshold float64
 	maxWorkers int
 	minWorkers int
-
-	currentUtilization float64
-	measurements []float64
-	measurementsMutex sync.RWMutex
+
+	currentUtilization float64
+	measurements []float64
+	measurementsMutex sync.RWMutex
 }
 
 // BatchSizeController dynamically adjusts batch sizes based on performance metrics
 type BatchSizeController struct {
-	baseBatchSize int
-	minBatchSize int
-	maxBatchSize int
-	adjustmentFactor float64
-
-	currentBatchSize int32
-	latencyThreshold time.Duration
-	throughputTarget float64
-
-	adjustmentHistory []BatchAdjustment
-	historyMutex sync.RWMutex
+	baseBatchSize int
+	minBatchSize int
+	maxBatchSize int
+	adjustmentFactor float64
+
+	currentBatchSize int32
+	latencyThreshold time.Duration
+	throughputTarget float64
+
+	adjustmentHistory []BatchAdjustment
+	historyMutex sync.RWMutex
 }
 
 // PerformanceHistory tracks performance metrics for optimization decisions
 type PerformanceHistory struct {
-	samples []PerformanceSample
-	maxSamples int
-	mutex sync.RWMutex
-
-	movingAvgWindow int
-	avgThroughput float64
-	avgLatency time.Duration
+	samples []PerformanceSample
+	maxSamples int
+	mutex sync.RWMutex
+
+	movingAvgWindow int
+	avgThroughput float64
+	avgLatency time.Duration
 }
 
 // PerformanceSample represents a single performance measurement
 type PerformanceSample struct {
-	Timestamp time.Time `json:"timestamp"`
-	Throughput float64 `json:"throughput"`
-	Latency time.Duration `json:"latency"`
-	CPUUsage float64 `json:"cpu_usage"`
-	BatchSize int `json:"batch_size"`
-	WorkerCount int `json:"worker_count"`
+	Timestamp time.Time `json:"timestamp"`
+	Throughput float64 `json:"throughput"`
+	Latency time.Duration `json:"latency"`
+	CPUUsage float64 `json:"cpu_usage"`
+	BatchSize int `json:"batch_size"`
+	WorkerCount int `json:"worker_count"`
 }
 
 // BatchAdjustment represents a batch size adjustment decision
 type BatchAdjustment struct {
-	Timestamp time.Time `json:"timestamp"`
-	OldBatchSize int `json:"old_batch_size"`
-	NewBatchSize int `json:"new_batch_size"`
-	Reason string `json:"reason"`
-	ThroughputImpact float64 `json:"throughput_impact"`
+	Timestamp time.Time `json:"timestamp"`
+	OldBatchSize int `json:"old_batch_size"`
+	NewBatchSize int `json:"new_batch_size"`
+	Reason string `json:"reason"`
+	ThroughputImpact float64 `json:"throughput_impact"`
 }
 
 // NewAdaptiveOptimizer creates a new adaptive optimizer
 func NewAdaptiveOptimizer(config *Config) *AdaptiveOptimizer {
 	ctx, cancel := context.WithCancel(context.Background())
-
-	return &AdaptiveOptimizer{
+
+	ao := &AdaptiveOptimizer{
 		config: config,
 		cpuMonitor: &CPUMonitor{
-			targetUtilization: 0.80, // Target 80% CPU utilization
+			targetUtilization: 0.75, // Target 75% CPU utilization (more conservative)
 			measurementInterval: 5 * time.Second,
-			adjustmentThreshold: 0.15, // Adjust if 15% deviation from target
-			maxWorkers: runtime.NumCPU() * 3,
-			minWorkers: max(2, runtime.NumCPU()/2),
-			measurements: make([]float64, 0, 12), // 1 minute history at 5s intervals
+			adjustmentThreshold: 0.10, // Adjust if 10% deviation from target (more sensitive)
+			maxWorkers: runtime.GOMAXPROCS(0) * 3,
+			minWorkers: max(1, runtime.GOMAXPROCS(0)/8), // Allow aggressive scaling down: 1/8 of available processors or 1, whichever is higher
+			measurements: make([]float64, 0, 12), // 1 minute history at 5s intervals
 		},
 		batchSizeController: &BatchSizeController{
-			baseBatchSize: config.BatchSize,
-			minBatchSize: max(100, config.BatchSize/4),
-			maxBatchSize: config.BatchSize * 3,
-			adjustmentFactor: 0.2, // 20% adjustment steps
-			currentBatchSize: int32(config.BatchSize),
-			latencyThreshold: 5 * time.Second,
-			throughputTarget: 25.0, // Target 25 MB/s
+			baseBatchSize: config.BatchSize,
+			minBatchSize: max(100, config.BatchSize/4),
+			maxBatchSize: config.BatchSize * 3,
+			adjustmentFactor: 0.2, // 20% adjustment steps
+			currentBatchSize: int32(config.BatchSize),
+			latencyThreshold: 5 * time.Second,
+			throughputTarget: 25.0, // Target 25 MB/s
 		},
 		performanceHistory: &PerformanceHistory{
 			samples: make([]PerformanceSample, 0, 120), // 2 minutes of 1s samples
@@ -120,6 +132,13 @@ func NewAdaptiveOptimizer(config *Config) *AdaptiveOptimizer {
 		ctx: ctx,
 		cancel: cancel,
 	}
+
+	// Log initialization parameters for debugging
+	logger.Infof("Adaptive optimizer initialized: workers=[%d, %d, %d] (min, current, max), target_cpu=%.1f%%, threshold=%.1f%%",
+		ao.cpuMonitor.minWorkers, config.WorkerCount, ao.cpuMonitor.maxWorkers,
+		ao.cpuMonitor.targetUtilization*100, ao.cpuMonitor.adjustmentThreshold*100)
+
+	return ao
 }
 
 // Start begins the adaptive optimization process
@@ -128,19 +147,19 @@ func (ao *AdaptiveOptimizer) Start() error {
 		logger.Error("Adaptive optimizer already running")
 		return fmt.Errorf("adaptive optimizer already running")
 	}
-
+
 	// Start CPU monitoring
 	ao.wg.Add(1)
 	go ao.cpuMonitoringLoop()
-
+
 	// Start batch size optimization
 	ao.wg.Add(1)
 	go ao.batchOptimizationLoop()
-
+
 	// Start performance tracking
 	ao.wg.Add(1)
 	go ao.performanceTrackingLoop()
-
+
 	logger.Info("Adaptive optimizer started")
 	return nil
 }
@@ -150,20 +169,20 @@ func (ao *AdaptiveOptimizer) Stop() {
 	if !atomic.CompareAndSwapInt32(&ao.running, 1, 0) {
 		return
 	}
-
+
 	ao.cancel()
 	ao.wg.Wait()
-
+
 	logger.Info("Adaptive optimizer stopped")
 }
 
 // cpuMonitoringLoop continuously monitors CPU utilization
 func (ao *AdaptiveOptimizer) cpuMonitoringLoop() {
 	defer ao.wg.Done()
-
+
 	ticker := time.NewTicker(ao.cpuMonitor.measurementInterval)
 	defer ticker.Stop()
-
+
 	for {
 		select {
 		case <-ticker.C:
@@ -178,7 +197,7 @@ func (ao *AdaptiveOptimizer) cpuMonitoringLoop() {
 func (ao *AdaptiveOptimizer) measureAndAdjustCPU() {
 	// Get current CPU utilization
 	cpuUsage := ao.getCurrentCPUUtilization()
-
+
 	ao.cpuMonitor.measurementsMutex.Lock()
 	ao.cpuMonitor.measurements = append(ao.cpuMonitor.measurements, cpuUsage)
 	if len(ao.cpuMonitor.measurements) > cap(ao.cpuMonitor.measurements) {
@@ -187,19 +206,19 @@ func (ao *AdaptiveOptimizer) measureAndAdjustCPU() {
 	}
 	ao.cpuMonitor.currentUtilization = cpuUsage
 	ao.cpuMonitor.measurementsMutex.Unlock()
-
+
 	// Calculate average CPU utilization
 	ao.cpuMonitor.measurementsMutex.RLock()
 	avgCPU := ao.calculateAverageCPU()
 	ao.cpuMonitor.measurementsMutex.RUnlock()
-
+
 	// Determine if adjustment is needed
 	targetCPU := ao.cpuMonitor.targetUtilization
 	if avgCPU < targetCPU-ao.cpuMonitor.adjustmentThreshold {
 		// CPU underutilized - suggest increasing workers
 		ao.suggestWorkerIncrease(avgCPU, targetCPU)
 	} else if avgCPU > targetCPU+ao.cpuMonitor.adjustmentThreshold {
-		// CPU over-utilized - suggest decreasing workers
+		// CPU over-utilized - suggest decreasing workers
 		ao.suggestWorkerDecrease(avgCPU, targetCPU)
 	}
 }
@@ -207,10 +226,10 @@ func (ao *AdaptiveOptimizer) measureAndAdjustCPU() {
 // batchOptimizationLoop continuously optimizes batch sizes
 func (ao *AdaptiveOptimizer) batchOptimizationLoop() {
 	defer ao.wg.Done()
-
+
 	ticker := time.NewTicker(10 * time.Second) // Adjust batch size every 10 seconds
 	defer ticker.Stop()
-
+
 	for {
 		select {
 		case <-ticker.C:
@@ -228,15 +247,15 @@ func (ao *AdaptiveOptimizer) optimizeBatchSize() {
 		ao.performanceHistory.mutex.RUnlock()
 		return // Not enough data
 	}
-
+
 	recentSamples := ao.performanceHistory.samples[max(0, len(ao.performanceHistory.samples)-5):]
 	avgThroughput := ao.calculateAverageThroughput(recentSamples)
 	avgLatency := ao.calculateAverageLatency(recentSamples)
 	ao.performanceHistory.mutex.RUnlock()
-
+
 	currentBatchSize := int(atomic.LoadInt32(&ao.batchSizeController.currentBatchSize))
 	newBatchSize := ao.calculateOptimalBatchSize(avgThroughput, avgLatency, currentBatchSize)
-
+
 	if newBatchSize != currentBatchSize {
 		ao.adjustBatchSize(currentBatchSize, newBatchSize, avgThroughput, avgLatency)
 		atomic.AddInt64(&ao.optimizationsMade, 1)
@@ -246,21 +265,21 @@ func (ao *AdaptiveOptimizer) optimizeBatchSize() {
 // calculateOptimalBatchSize determines the optimal batch size based on current performance
 func (ao *AdaptiveOptimizer) calculateOptimalBatchSize(throughput float64, latency time.Duration, currentBatch int) int {
 	controller := ao.batchSizeController
-
+
 	// If latency is too high, reduce batch size
 	if latency > controller.latencyThreshold {
 		reduction := int(float64(currentBatch) * controller.adjustmentFactor)
 		newSize := currentBatch - max(50, reduction)
 		return max(controller.minBatchSize, newSize)
 	}
-
+
 	// If throughput is below target and latency is acceptable, increase batch size
 	if throughput < controller.throughputTarget && latency < controller.latencyThreshold/2 {
 		increase := int(float64(currentBatch) * controller.adjustmentFactor)
 		newSize := currentBatch + max(100, increase)
 		return min(controller.maxBatchSize, newSize)
 	}
-
+
 	// Current batch size seems optimal
 	return currentBatch
 }
@@ -268,23 +287,23 @@ func (ao *AdaptiveOptimizer) calculateOptimalBatchSize(throughput float64, laten
 // adjustBatchSize applies the batch size adjustment
 func (ao *AdaptiveOptimizer) adjustBatchSize(oldSize, newSize int, throughput float64, latency time.Duration) {
 	atomic.StoreInt32(&ao.batchSizeController.currentBatchSize, int32(newSize))
-
+
 	var reason string
 	if newSize > oldSize {
 		reason = "Increasing batch size to improve throughput"
 	} else {
 		reason = "Reducing batch size to improve latency"
 	}
-
+
 	// Record adjustment
 	adjustment := BatchAdjustment{
 		Timestamp: time.Now(),
 		OldBatchSize: oldSize,
 		NewBatchSize: newSize,
-		Reason: reason,
+		Reason: reason,
 		ThroughputImpact: throughput,
 	}
-
+
 	ao.batchSizeController.historyMutex.Lock()
 	ao.batchSizeController.adjustmentHistory = append(ao.batchSizeController.adjustmentHistory, adjustment)
 	if len(ao.batchSizeController.adjustmentHistory) > 50 {
@@ -292,17 +311,17 @@ func (ao *AdaptiveOptimizer) adjustBatchSize(oldSize, newSize int, throughput fl
 		ao.batchSizeController.adjustmentHistory = ao.batchSizeController.adjustmentHistory[1:]
 	}
 	ao.batchSizeController.historyMutex.Unlock()
-
+
 	logger.Infof("Batch size adjusted: old_size=%d, new_size=%d, reason=%s", oldSize, newSize, reason)
 }
 
 // performanceTrackingLoop continuously tracks performance metrics
 func (ao *AdaptiveOptimizer) performanceTrackingLoop() {
 	defer ao.wg.Done()
-
+
 	ticker := time.NewTicker(1 * time.Second) // Sample every second
 	defer ticker.Stop()
-
+
 	for {
 		select {
 		case <-ticker.C:
@@ -323,19 +342,29 @@ func (ao *AdaptiveOptimizer) recordPerformanceSample() {
 		BatchSize: int(atomic.LoadInt32(&ao.batchSizeController.currentBatchSize)),
 		WorkerCount: ao.config.WorkerCount,
 	}
-
+
 	ao.performanceHistory.mutex.Lock()
 	ao.performanceHistory.samples = append(ao.performanceHistory.samples, sample)
 	if len(ao.performanceHistory.samples) > ao.performanceHistory.maxSamples {
 		// Remove oldest sample
 		ao.performanceHistory.samples = ao.performanceHistory.samples[1:]
 	}
-
+
 	// Update moving averages
 	ao.updateMovingAverages()
 	ao.performanceHistory.mutex.Unlock()
 }
 
+// SetWorkerCountChangeCallback sets the callback function for worker count changes
+func (ao *AdaptiveOptimizer) SetWorkerCountChangeCallback(callback func(oldCount, newCount int)) {
+	ao.onWorkerCountChange = callback
+}
+
+// SetActivityPoller sets the poller to check for indexer activity.
+func (ao *AdaptiveOptimizer) SetActivityPoller(poller IndexerActivityPoller) {
+	ao.activityPoller = poller
+}
+
 // GetOptimalBatchSize returns the current optimal batch size
 func (ao *AdaptiveOptimizer) GetOptimalBatchSize() int {
 	return int(atomic.LoadInt32(&ao.batchSizeController.currentBatchSize))
@@ -352,13 +381,13 @@ func (ao *AdaptiveOptimizer) GetCPUUtilization() float64 {
 func (ao *AdaptiveOptimizer) GetOptimizationStats() AdaptiveOptimizationStats {
 	ao.metricsMutex.RLock()
 	defer ao.metricsMutex.RUnlock()
-
+
 	return AdaptiveOptimizationStats{
 		OptimizationsMade: atomic.LoadInt64(&ao.optimizationsMade),
 		CurrentBatchSize: int(atomic.LoadInt32(&ao.batchSizeController.currentBatchSize)),
-		AvgThroughput: ao.avgThroughput,
-		AvgLatency: ao.avgLatency,
-		CPUUtilization: ao.cpuMonitor.currentUtilization,
+		AvgThroughput: ao.avgThroughput,
+		AvgLatency: ao.avgLatency,
+		CPUUtilization: ao.cpuMonitor.currentUtilization,
 	}
 }
 
@@ -366,29 +395,34 @@ func (ao *AdaptiveOptimizer) GetOptimizationStats() AdaptiveOptimizationStats {
 type AdaptiveOptimizationStats struct {
 	OptimizationsMade int64 `json:"optimizations_made"`
 	CurrentBatchSize int `json:"current_batch_size"`
-	AvgThroughput float64 `json:"avg_throughput"`
-	AvgLatency time.Duration `json:"avg_latency"`
-	CPUUtilization float64 `json:"cpu_utilization"`
+	AvgThroughput float64 `json:"avg_throughput"`
+	AvgLatency time.Duration `json:"avg_latency"`
+	CPUUtilization float64 `json:"cpu_utilization"`
 }
 
 // Helper functions
 func (ao *AdaptiveOptimizer) getCurrentCPUUtilization() float64 {
-	// This is a simplified implementation
-	// In production, you'd use a proper CPU monitoring library
-	runtime.GC()
-	var m runtime.MemStats
-	runtime.ReadMemStats(&m)
-
-	// Approximate CPU usage based on GC activity and goroutines
-	numGoroutines := float64(runtime.NumGoroutine())
-	numCPU := float64(runtime.NumCPU())
-
-	// Simple heuristic: more goroutines = higher CPU usage
-	utilization := numGoroutines / (numCPU * 10)
-	if utilization > 0.95 {
-		utilization = 0.95
-	}
-	return utilization
+	// Get CPU utilization since the last call.
+	// Interval 0 means non-blocking and compares to the last measurement.
+	// The first call will return 0.
+	percentages, err := cpu.Percent(0, false)
+	if err != nil || len(percentages) == 0 {
+		logger.Warnf("Failed to get real CPU utilization, falling back to goroutine heuristic: %v", err)
+		// Fall back to the old, less accurate method
+		numGoroutines := float64(runtime.NumGoroutine())
+		maxProcs := float64(runtime.GOMAXPROCS(0))
+
+		// Simple heuristic: more goroutines = higher CPU usage
+		utilization := numGoroutines / (maxProcs * 10)
+		if utilization > 0.95 {
+			utilization = 0.95
+		}
+		return utilization
+	}
+
+	// gopsutil returns a slice; with percpu=false, the overall usage is the single element.
+	// The value is a percentage (e.g., 8.3), so we convert it to a 0.0-1.0 scale for our calculations.
+	return percentages[0] / 100.0
 }
 
 func (ao *AdaptiveOptimizer) getCurrentThroughput() float64 {
@@ -409,7 +443,7 @@ func (ao *AdaptiveOptimizer) calculateAverageCPU() float64 {
 	if len(ao.cpuMonitor.measurements) == 0 {
 		return 0
 	}
-
+
 	sum := 0.0
 	for _, cpu := range ao.cpuMonitor.measurements {
 		sum += cpu
@@ -421,7 +455,7 @@ func (ao *AdaptiveOptimizer) calculateAverageThroughput(samples []PerformanceSam
 	if len(samples) == 0 {
 		return 0
 	}
-
+
 	sum := 0.0
 	for _, sample := range samples {
 		sum += sample.Throughput
@@ -433,7 +467,7 @@ func (ao *AdaptiveOptimizer) calculateAverageLatency(samples []PerformanceSample
 	if len(samples) == 0 {
 		return 0
 	}
-
+
 	var sum time.Duration
 	for _, sample := range samples {
 		sum += sample.Latency
@@ -445,22 +479,104 @@ func (ao *AdaptiveOptimizer) updateMovingAverages() {
 	if len(ao.performanceHistory.samples) == 0 {
 		return
 	}
-
+
 	windowSize := min(ao.performanceHistory.movingAvgWindow, len(ao.performanceHistory.samples))
 	recentSamples := ao.performanceHistory.samples[len(ao.performanceHistory.samples)-windowSize:]
-
+
 	ao.avgThroughput = ao.calculateAverageThroughput(recentSamples)
 	ao.avgLatency = ao.calculateAverageLatency(recentSamples)
 }
 
 func (ao *AdaptiveOptimizer) suggestWorkerIncrease(currentCPU, targetCPU float64) {
-	logger.Debug("CPU underutilized, consider increasing workers",
+	// If already at max workers, do nothing.
+	if ao.config.WorkerCount >= ao.cpuMonitor.maxWorkers {
+		return
+	}
+
+	// If the indexer is not busy, don't scale up workers even if CPU is low.
+	if ao.activityPoller != nil && !ao.activityPoller.IsBusy() {
+		return
+	}
+
+	logger.Debug("CPU underutilized, adjusting workers upward",
 		"current_cpu", currentCPU, "target_cpu", targetCPU)
+
+	// Calculate suggested increase (conservative approach)
+	cpuUtilizationGap := targetCPU - currentCPU
+	increaseRatio := cpuUtilizationGap / targetCPU
+
+	// Limit the increase to at most 25% at a time, and always by at least 1 worker
+	maxIncrease := max(1, int(float64(ao.config.WorkerCount)*0.25))
+	suggestedIncrease := max(1, int(float64(ao.config.WorkerCount)*increaseRatio))
+	actualIncrease := min(maxIncrease, suggestedIncrease)
+
+	newWorkerCount := min(ao.cpuMonitor.maxWorkers, ao.config.WorkerCount+actualIncrease)
+
+	if newWorkerCount > ao.config.WorkerCount {
+		oldCount := ao.config.WorkerCount
+		ao.adjustWorkerCount(newWorkerCount)
+		logger.Infof("Increased workers from %d to %d due to CPU underutilization",
+			oldCount, newWorkerCount)
+	}
 }
 
 func (ao *AdaptiveOptimizer) suggestWorkerDecrease(currentCPU, targetCPU float64) {
-	logger.Debug("CPU over-utilized, consider decreasing workers",
+	// If already at min workers, do nothing.
+	if ao.config.WorkerCount <= ao.cpuMonitor.minWorkers {
+		logger.Debugf("Worker count is already at its minimum (%d), skipping decrease.", ao.config.WorkerCount)
+		return
+	}
+
+	logger.Debug("CPU over-utilized, adjusting workers downward",
 		"current_cpu", currentCPU, "target_cpu", targetCPU)
+
+	// Calculate suggested decrease (conservative approach)
+	cpuOverUtilization := currentCPU - targetCPU
+	decreaseRatio := cpuOverUtilization / targetCPU // Ratio of the overshoot relative to the target CPU
+
+	// Limit the decrease to at most 25% at a time, and always by at least 1 worker
+	maxDecrease := max(1, int(float64(ao.config.WorkerCount)*0.25))
+	suggestedDecrease := max(1, int(float64(ao.config.WorkerCount)*decreaseRatio*0.5)) // Halve the ratio for a more conservative decrease
+	actualDecrease := min(maxDecrease, suggestedDecrease)
+
+	newWorkerCount := max(ao.cpuMonitor.minWorkers, ao.config.WorkerCount-actualDecrease)
+
+	logger.Debugf("Worker decrease calculation: current=%d, suggested=%d, min=%d, new=%d",
+		ao.config.WorkerCount, suggestedDecrease, ao.cpuMonitor.minWorkers, newWorkerCount)
+
+	if newWorkerCount < ao.config.WorkerCount {
+		oldCount := ao.config.WorkerCount
+		logger.Debugf("About to adjust worker count from %d to %d", oldCount, newWorkerCount)
+		ao.adjustWorkerCount(newWorkerCount)
+		logger.Infof("Decreased workers from %d to %d due to CPU over-utilization",
+			oldCount, newWorkerCount)
+	} else {
+		logger.Debugf("Worker count adjustment skipped: new=%d not less than current=%d", newWorkerCount, ao.config.WorkerCount)
+	}
+}
+
+// adjustWorkerCount dynamically adjusts the worker count at runtime
+func (ao *AdaptiveOptimizer) adjustWorkerCount(newCount int) {
+	if newCount <= 0 || newCount == ao.config.WorkerCount {
+		logger.Debugf("Skipping worker adjustment: newCount=%d, currentCount=%d", newCount, ao.config.WorkerCount)
+		return
+	}
+
+	logger.Infof("Adjusting worker count from %d to %d", ao.config.WorkerCount, newCount)
+
+	// Update configuration
+	oldCount := ao.config.WorkerCount
+	ao.config.WorkerCount = newCount
+
+	// Notify the indexer about the worker count change.
+	// This would typically trigger a worker pool resize in the parallel indexer.
+	if ao.onWorkerCountChange != nil {
+		logger.Debugf("Calling worker count change callback: %d -> %d", oldCount, newCount)
+		ao.onWorkerCountChange(oldCount, newCount)
+	} else {
+		logger.Warnf("Worker count change callback is nil - worker adjustment will not take effect")
+	}
+
+	// Count the adjustment in the optimization metrics
+	atomic.AddInt64(&ao.optimizationsMade, 1)
 }
 
 // Utility functions
@@ -476,4 +592,4 @@ func min(a, b int) int {
 		return a
 	}
 	return b
-}
+}
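
Note on wiring: this diff adds the extension points (SetActivityPoller, SetWorkerCountChangeCallback) but not their call sites. Below is a minimal sketch of how a caller might connect them, assuming the code above lives in a package imported here as "indexer"; the import path, the Config literal, and the pool-resize comment are illustrative assumptions, not part of this change.

package main

import (
	"sync/atomic"

	indexer "example.com/project/internal/indexer" // hypothetical import path
)

// busyFlagPoller satisfies IndexerActivityPoller with an atomic flag that the
// indexing pipeline flips while work is queued.
type busyFlagPoller struct{ busy atomic.Bool }

func (p *busyFlagPoller) IsBusy() bool { return p.busy.Load() }

func main() {
	cfg := &indexer.Config{WorkerCount: 8, BatchSize: 1000}
	ao := indexer.NewAdaptiveOptimizer(cfg)

	poller := &busyFlagPoller{}
	ao.SetActivityPoller(poller) // scale-ups are skipped while IsBusy() is false

	ao.SetWorkerCountChangeCallback(func(oldCount, newCount int) {
		// A real callback would resize the indexer's worker pool here; without
		// a callback, adjustWorkerCount only updates the config and logs a warning.
	})

	if err := ao.Start(); err != nil {
		return
	}
	defer ao.Stop()

	poller.busy.Store(true) // mark the indexer busy while work is in flight
	// ... run indexing, then poller.busy.Store(false) once the queue drains.
}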
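
Separately, the new getCurrentCPUUtilization depends on cpu.Percent(0, false) reporting usage since the previous call, and the comment in the diff notes that the first call returns 0. If that initial zero reading matters, the caller could prime the measurement once before starting the monitor. A sketch under the same assumptions as above (it uses the same github.com/shirou/gopsutil/v4/cpu import added in this diff):

	// Prime gopsutil's "since last call" snapshot so the first scheduled
	// measurement reflects real utilization instead of 0. The error is
	// deliberately ignored here: on failure, the optimizer already falls
	// back to its goroutine heuristic.
	_, _ = cpu.Percent(0, false)

	if err := ao.Start(); err != nil {
		// handle startup error
	}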