Ver Fonte

requests/images in progress stats and queue time metric

DarthSim há 3 anos atrás
pai
commit
cf2b3c5195

+ 2 - 0
CHANGELOG.md

@@ -7,6 +7,8 @@
 - Add support of JPEG files with differential Huffman coding or arithmetic coding.
 - Add `IMGPROXY_PREFERRED_FORMATS` config.
 - Add `IMGPROXY_REQUESTS_QUEUE_SIZE` config.
+- Add `requests_in_progress` and `images_in_progress` metrics.
+- Add queue segment/span to request traces.
 - Add sending additional metrics to Datadog and `IMGPROXY_DATADOG_ENABLE_ADDITIONAL_METRICS` config.
 - Add sending additional metrics to New Relic.
 

+ 3 - 0
docs/datadog.md

@@ -23,6 +23,7 @@ imgproxy can send its metrics to Datadog. To use this feature, do the following:
 imgproxy will send the following info to Datadog:
 
 * Response time
+* Queue time
 * Image downloading time
 * Image processing time
 * Errors that occurred while downloading and processing image
@@ -31,6 +32,8 @@ imgproxy will send the following info to Datadog:
 
 When the `IMGPROXY_DATADOG_ENABLE_ADDITIONAL_METRICS` environment variable is set to `true` imgproxy will send the following additional metrics to Datadog:
 
+* `imgproxy.requests_in_progress`: the number of requests currently in progress
+* `imgproxy.images_in_progress`: the number of images currently in progress
 * `imgproxy.buffer.size`: a histogram of the download/gzip buffers sizes (in bytes)
 * `imgproxy.buffer.default_size`: calibrated default buffer size (in bytes)
 * `imgproxy.buffer.max_size`: calibrated maximum buffer size (in bytes)

+ 3 - 0
docs/new_relic.md

@@ -11,12 +11,15 @@ imgproxy will send the following info to New Relic:
 
 * CPU and memory usage
 * Response time
+* Queue time
 * Image downloading time
 * Image processing time
 * Errors that occurred while downloading and processing an image
 
 Additionally, imgproxy sends the following metrics over [Metrics API](https://docs.newrelic.com/docs/data-apis/ingest-apis/metric-api/introduction-metric-api/):
 
+* `imgproxy.requests_in_progress`: the number of requests currently in progress
+* `imgproxy.images_in_progress`: the number of images currently in progress
 * `imgproxy.buffer.size`: a summary of the download/gzip buffers sizes (in bytes)
 * `imgproxy.buffer.default_size`: calibrated default buffer size (in bytes)
 * `imgproxy.buffer.max_size`: calibrated maximum buffer size (in bytes)

+ 13 - 4
docs/prometheus.md

@@ -3,16 +3,17 @@
 imgproxy can collect metrics for Prometheus. To use this feature, do the following:
 
 1. Set the `IMGPROXY_PROMETHEUS_BIND` environment variable to the address and port that will be listened to by the Prometheus server. Note that you can't bind the main server and Prometheus to the same port.
-2. _(optional)_ Set the `IMGPROXY_PROMETHEUS_NAMESPACE` to prepend prefix to the names of metrics, i.e. with `IMGPROXY_PROMETHEUS_NAMESPACE=imgproxy` Names will appear like `imgproxy_requests_total`.
+2. _(optional)_ Set the `IMGPROXY_PROMETHEUS_NAMESPACE` to prepend prefix to the names of metrics, i.e. with `IMGPROXY_PROMETHEUS_NAMESPACE=imgproxy` names will appear like `imgproxy_requests_total`.
 3. Collect the metrics from any path on the specified binding.
 
 imgproxy will collect the following metrics:
 
 * `requests_total`: a counter with the total number of HTTP requests imgproxy has processed
 * `errors_total`: a counter of the occurred errors separated by type (timeout, downloading, processing)
-* `request_duration_seconds`: a histogram of the response latency (in seconds)
-* `download_duration_seconds`: a histogram of the source image downloading latency (in seconds)
-* `processing_duration_seconds`: a histogram of the image processing latency (in seconds)
+* `request_duration_seconds`: a histogram of the request latency (in seconds)
+* `request_span_duration_seconds`: a histogram of the request latency (in seconds) separated by span (queue, downloading, processing)
+* `requests_in_progress`: the number of requests currently in progress
+* `images_in_progress`: the number of images currently in progress
 * `buffer_size_bytes`: a histogram of the download/gzip buffers sizes (in bytes)
 * `buffer_default_size_bytes`: calibrated default buffer size (in bytes)
 * `buffer_max_size_bytes`: calibrated maximum buffer size (in bytes)
@@ -20,3 +21,11 @@ imgproxy will collect the following metrics:
 * `vips_max_memory_bytes`: libvips maximum memory usage
 * `vips_allocs`: the number of active vips allocations
 * Some useful Go metrics like memstats and goroutines count
+
+### Deprecated metrics
+
+The following metrics are deprecated and may be removed in future versions. Use `request_span_duration_seconds` instead.
+
+* `download_duration_seconds`: a histogram of the source image downloading latency (in seconds)
+* `processing_duration_seconds`: a histogram of the image processing latency (in seconds)
+

+ 4 - 0
metrics/datadog/datadog.go

@@ -15,6 +15,7 @@ import (
 
 	"github.com/imgproxy/imgproxy/v3/config"
 	"github.com/imgproxy/imgproxy/v3/metrics/errformat"
+	"github.com/imgproxy/imgproxy/v3/metrics/stats"
 	"github.com/imgproxy/imgproxy/v3/version"
 )
 
@@ -176,6 +177,9 @@ func runMetricsCollector() {
 					statsdClient.Gauge(name, f(), nil, 1)
 				}
 			}()
+
+			statsdClient.Gauge("imgproxy.requests_in_progress", stats.RequestsInProgress(), nil, 1)
+			statsdClient.Gauge("imgproxy.images_in_progress", stats.ImagesInProgress(), nil, 1)
 		case <-statsdClientStop:
 			return
 		}

+ 14 - 0
metrics/metrics.go

@@ -46,6 +46,20 @@ func StartRequest(ctx context.Context, rw http.ResponseWriter, r *http.Request)
 	return ctx, cancel, rw
 }
 
+func StartQueueSegment(ctx context.Context) context.CancelFunc {
+	promCancel := prometheus.StartQueueSegment()
+	nrCancel := newrelic.StartSegment(ctx, "Queue")
+	ddCancel := datadog.StartSpan(ctx, "queue")
+
+	cancel := func() {
+		promCancel()
+		nrCancel()
+		ddCancel()
+	}
+
+	return cancel
+}
+
 func StartDownloadingSegment(ctx context.Context) context.CancelFunc {
 	promCancel := prometheus.StartDownloadingSegment()
 	nrCancel := newrelic.StartSegment(ctx, "Downloading image")

+ 13 - 0
metrics/newrelic/newrelic.go

@@ -15,6 +15,7 @@ import (
 
 	"github.com/imgproxy/imgproxy/v3/config"
 	"github.com/imgproxy/imgproxy/v3/metrics/errformat"
+	"github.com/imgproxy/imgproxy/v3/metrics/stats"
 )
 
 type transactionCtxKey struct{}
@@ -246,6 +247,18 @@ func runMetricsCollector() {
 				}
 			}()
 
+			harvester.RecordMetric(telemetry.Gauge{
+				Name:      "imgproxy.requests_in_progress",
+				Value:     stats.RequestsInProgress(),
+				Timestamp: time.Now(),
+			})
+
+			harvester.RecordMetric(telemetry.Gauge{
+				Name:      "imgproxy.images_in_progress",
+				Value:     stats.ImagesInProgress(),
+				Timestamp: time.Now(),
+			})
+
 			harvester.HarvestNow(harvesterCtx)
 		case <-harvesterCtx.Done():
 			return

+ 65 - 13
metrics/prometheus/prometheus.go

@@ -11,20 +11,27 @@ import (
 	log "github.com/sirupsen/logrus"
 
 	"github.com/imgproxy/imgproxy/v3/config"
+	"github.com/imgproxy/imgproxy/v3/metrics/stats"
 	"github.com/imgproxy/imgproxy/v3/reuseport"
 )
 
 var (
 	enabled = false
 
-	requestsTotal      prometheus.Counter
-	errorsTotal        *prometheus.CounterVec
-	requestDuration    prometheus.Histogram
-	downloadDuration   prometheus.Histogram
-	processingDuration prometheus.Histogram
-	bufferSize         *prometheus.HistogramVec
-	bufferDefaultSize  *prometheus.GaugeVec
-	bufferMaxSize      *prometheus.GaugeVec
+	requestsTotal prometheus.Counter
+	errorsTotal   *prometheus.CounterVec
+
+	requestDuration     prometheus.Histogram
+	requestSpanDuration *prometheus.HistogramVec
+	downloadDuration    prometheus.Histogram
+	processingDuration  prometheus.Histogram
+
+	bufferSize        *prometheus.HistogramVec
+	bufferDefaultSize *prometheus.GaugeVec
+	bufferMaxSize     *prometheus.GaugeVec
+
+	requestsInProgress prometheus.GaugeFunc
+	imagesInProgress   prometheus.GaugeFunc
 )
 
 func Init() {
@@ -50,6 +57,12 @@ func Init() {
 		Help:      "A histogram of the response latency.",
 	})
 
+	requestSpanDuration = prometheus.NewHistogramVec(prometheus.HistogramOpts{
+		Namespace: config.PrometheusNamespace,
+		Name:      "request_span_duration_seconds",
+		Help:      "A histogram of the queue latency.",
+	}, []string{"span"})
+
 	downloadDuration = prometheus.NewHistogram(prometheus.HistogramOpts{
 		Namespace: config.PrometheusNamespace,
 		Name:      "download_duration_seconds",
@@ -81,15 +94,30 @@ func Init() {
 		Help:      "A gauge of the buffer max size in bytes.",
 	}, []string{"type"})
 
+	requestsInProgress = prometheus.NewGaugeFunc(prometheus.GaugeOpts{
+		Namespace: config.PrometheusNamespace,
+		Name:      "requests_in_progress",
+		Help:      "A gauge of the number of requests currently being in progress.",
+	}, stats.RequestsInProgress)
+
+	imagesInProgress = prometheus.NewGaugeFunc(prometheus.GaugeOpts{
+		Namespace: config.PrometheusNamespace,
+		Name:      "images_in_progress",
+		Help:      "A gauge of the number of images currently being in progress.",
+	}, stats.ImagesInProgress)
+
 	prometheus.MustRegister(
 		requestsTotal,
 		errorsTotal,
 		requestDuration,
+		requestSpanDuration,
 		downloadDuration,
 		processingDuration,
 		bufferSize,
 		bufferDefaultSize,
 		bufferMaxSize,
+		requestsInProgress,
+		imagesInProgress,
 	)
 
 	enabled = true
@@ -131,19 +159,43 @@ func StartRequest() context.CancelFunc {
 	return startDuration(requestDuration)
 }
 
-func StartDownloadingSegment() context.CancelFunc {
-	return startDuration(downloadDuration)
+func StartQueueSegment() context.CancelFunc {
+	if !enabled {
+		return func() {}
+	}
+
+	return startDuration(requestSpanDuration.With(prometheus.Labels{"span": "queue"}))
 }
 
-func StartProcessingSegment() context.CancelFunc {
-	return startDuration(processingDuration)
+func StartDownloadingSegment() context.CancelFunc {
+	if !enabled {
+		return func() {}
+	}
+
+	cancel := startDuration(requestSpanDuration.With(prometheus.Labels{"span": "downloading"}))
+	cancelLegacy := startDuration(downloadDuration)
+
+	return func() {
+		cancel()
+		cancelLegacy()
+	}
 }
 
-func startDuration(m prometheus.Histogram) context.CancelFunc {
+func StartProcessingSegment() context.CancelFunc {
 	if !enabled {
 		return func() {}
 	}
 
+	cancel := startDuration(requestSpanDuration.With(prometheus.Labels{"span": "processing"}))
+	cancelLegacy := startDuration(processingDuration)
+
+	return func() {
+		cancel()
+		cancelLegacy()
+	}
+}
+
+func startDuration(m prometheus.Observer) context.CancelFunc {
 	t := time.Now()
 	return func() {
 		m.Observe(time.Since(t).Seconds())

+ 32 - 0
metrics/stats/stats.go

@@ -0,0 +1,32 @@
+package stats
+
+import "sync/atomic"
+
+var (
+	requestsInProgress int64
+	imagesInProgress   int64
+)
+
+func RequestsInProgress() float64 {
+	return float64(atomic.LoadInt64(&requestsInProgress))
+}
+
+func IncRequestsInProgress() {
+	atomic.AddInt64(&requestsInProgress, 1)
+}
+
+func DecRequestsInProgress() {
+	atomic.AddInt64(&requestsInProgress, -1)
+}
+
+func ImagesInProgress() float64 {
+	return float64(atomic.LoadInt64(&imagesInProgress))
+}
+
+func IncImagesInProgress() {
+	atomic.AddInt64(&imagesInProgress, 1)
+}
+
+func DecImagesInProgress() {
+	atomic.AddInt64(&imagesInProgress, -1)
+}

+ 20 - 7
processing_handler.go

@@ -19,6 +19,7 @@ import (
 	"github.com/imgproxy/imgproxy/v3/imagedata"
 	"github.com/imgproxy/imgproxy/v3/imagetype"
 	"github.com/imgproxy/imgproxy/v3/metrics"
+	"github.com/imgproxy/imgproxy/v3/metrics/stats"
 	"github.com/imgproxy/imgproxy/v3/options"
 	"github.com/imgproxy/imgproxy/v3/processing"
 	"github.com/imgproxy/imgproxy/v3/router"
@@ -179,6 +180,9 @@ func checkErr(ctx context.Context, errType string, err error) {
 }
 
 func handleProcessing(reqID string, rw http.ResponseWriter, r *http.Request) {
+	stats.IncRequestsInProgress()
+	defer stats.DecRequestsInProgress()
+
 	ctx := r.Context()
 
 	if queueSem != nil {
@@ -249,15 +253,24 @@ func handleProcessing(reqID string, rw http.ResponseWriter, r *http.Request) {
 	}
 
 	// The heavy part start here, so we need to restrict concurrency
-	processingSemToken, aquired := processingSem.Aquire(ctx)
-	if !aquired {
-		// We don't actually need to check timeout here,
-		// but it's an easy way to check if this is an actual timeout
-		// or the request was cancelled
-		checkErr(ctx, "queue", router.CheckTimeout(ctx))
-	}
+	var processingSemToken *semaphore.Token
+	func() {
+		defer metrics.StartQueueSegment(ctx)()
+
+		var aquired bool
+		processingSemToken, aquired = processingSem.Aquire(ctx)
+		if !aquired {
+			// We don't actually need to check timeout here,
+			// but it's an easy way to check if this is an actual timeout
+			// or the request was cancelled
+			checkErr(ctx, "queue", router.CheckTimeout(ctx))
+		}
+	}()
 	defer processingSemToken.Release()
 
+	stats.IncImagesInProgress()
+	defer stats.DecImagesInProgress()
+
 	statusCode := http.StatusOK
 
 	originData, err := func() (*imagedata.ImageData, error) {