app/vlinsert: follow-up for d570763c91

- Switch from summary to histogram for vl_http_request_duration_seconds metric. This allows calculating request duration quantiles across multiple hosts via histogram_quantile(0.99, sum(vl_http_request_duration_seconds_bucket) by (vmrange)). - Take into account only successfully processed data ingestion requests when updating vl_http_request_duration_seconds histogram. Failed requests are ignored, since they may significantly skew measurements. - Clarify the description of the change at docs/VictoriaLogs/CHANGELOG.md. Updates https://github.com/VictoriaMetrics/VictoriaMetrics/pull/4934
2024-12-15 00:13:30 +01:00 · 2023-09-18 23:58:32 +02:00 · 2023-09-18 23:58:32 +02:00 · cc8f2bee0d
commit cc8f2bee0d
parent de2b3ff9b0
6 changed files with 50 additions and 31 deletions
--- a/app/vlinsert/elasticsearch/elasticsearch.go
+++ b/app/vlinsert/elasticsearch/elasticsearch.go
@ -86,7 +86,6 @@ func RequestHandler(path string, w http.ResponseWriter, r *http.Request) bool {
 		return true
 	case "/_bulk":
 		startTime := time.Now()
 		defer bulkRequestDuration.UpdateDuration(startTime)
 		bulkRequestsTotal.Inc()
 		cp, err := insertutils.GetCommonParams(r)
@ -110,6 +109,12 @@ func RequestHandler(path string, w http.ResponseWriter, r *http.Request) bool {
 		defer bufferedwriter.Put(bw)
 		WriteBulkResponse(bw, n, tookMs)
 		_ = bw.Flush()
 		// update bulkRequestDuration only for successfully parsed requests
 		// There is no need in updating bulkRequestDuration for request errors,
 		// since their timings are usually much smaller than the timing for successful request parsing.
 		bulkRequestDuration.UpdateDuration(startTime)
 		return true
 	default:
 		return false
@ -118,7 +123,8 @@ func RequestHandler(path string, w http.ResponseWriter, r *http.Request) bool {
 var (
 	bulkRequestsTotal   = metrics.NewCounter(`vl_http_requests_total{path="/insert/elasticsearch/_bulk"}`)
-	bulkRequestDuration = metrics.NewSummary(`vl_http_request_duration_seconds{path="/insert/elasticsearch/_bulk"}`)
+	rowsIngestedTotal   = metrics.NewCounter(`vl_rows_ingested_total{type="elasticsearch_bulk"}`)
 	bulkRequestDuration = metrics.NewHistogram(`vl_http_request_duration_seconds{path="/insert/elasticsearch/_bulk"}`)
 )
 func readBulkRequest(r io.Reader, isGzip bool, timeField, msgField string,
@ -164,8 +170,6 @@ func readBulkRequest(r io.Reader, isGzip bool, timeField, msgField string,
 var lineBufferPool bytesutil.ByteBufferPool
 var rowsIngestedTotal = metrics.NewCounter(`vl_rows_ingested_total{type="elasticsearch_bulk"}`)
 func readBulkLine(sc *bufio.Scanner, timeField, msgField string,
 	processLogMessage func(timestamp int64, fields []logstorage.Field),
 ) (bool, error) {
--- a/app/vlinsert/jsonline/jsonline.go
+++ b/app/vlinsert/jsonline/jsonline.go
@ -19,11 +19,9 @@ import (
 	"github.com/VictoriaMetrics/metrics"
 )
 var jsonlineRequestDuration = metrics.NewSummary(`vl_http_request_duration_seconds{path="/insert/jsonline"}`)
 // RequestHandler processes jsonline insert requests
 func RequestHandler(w http.ResponseWriter, r *http.Request) bool {
-	defer jsonlineRequestDuration.UpdateDuration(time.Now())
+	startTime := time.Now()
 	w.Header().Add("Content-Type", "application/json")
 	if r.Method != "POST" {
@ -80,6 +78,11 @@ func RequestHandler(w http.ResponseWriter, r *http.Request) bool {
 	vlstorage.MustAddRows(lr)
 	logstorage.PutLogRows(lr)
 	// update jsonlineRequestDuration only for successfully parsed requests.
 	// There is no need in updating jsonlineRequestDuration for request errors,
 	// since their timings are usually much smaller than the timing for successful request parsing.
 	jsonlineRequestDuration.UpdateDuration(startTime)
 	return true
 }
@ -147,6 +150,7 @@ func parseISO8601Timestamp(s string) (int64, error) {
 var lineBufferPool bytesutil.ByteBufferPool
 var (
-	requestsTotal     = metrics.NewCounter(`vl_http_requests_total{path="/insert/jsonline"}`)
+	requestsTotal           = metrics.NewCounter(`vl_http_requests_total{path="/insert/jsonline"}`)
-	rowsIngestedTotal = metrics.NewCounter(`vl_rows_ingested_total{type="jsonline"}`)
+	rowsIngestedTotal       = metrics.NewCounter(`vl_rows_ingested_total{type="jsonline"}`)
 	jsonlineRequestDuration = metrics.NewHistogram(`vl_http_request_duration_seconds{path="/insert/jsonline"}`)
 )
--- a/app/vlinsert/loki/loki.go
+++ b/app/vlinsert/loki/loki.go
@ -2,21 +2,11 @@ package loki
 import (
 	"net/http"
 	"time"
 	"github.com/VictoriaMetrics/metrics"
 	"github.com/VictoriaMetrics/VictoriaMetrics/app/vlinsert/insertutils"
 	"github.com/VictoriaMetrics/VictoriaMetrics/lib/logstorage"
 )
 var (
 	lokiRequestsJSONTotal       = metrics.NewCounter(`vl_http_requests_total{path="/insert/loki/api/v1/push",format="json"}`)
 	lokiRequestsProtobufTotal   = metrics.NewCounter(`vl_http_requests_total{path="/insert/loki/api/v1/push",format="protobuf"}`)
 	lokiRequestJSONDuration     = metrics.NewSummary(`vl_http_request_duration_seconds{path="/insert/loki/api/v1/push",format="json"}`)
 	lokiRequestProtobufDuration = metrics.NewSummary(`vl_http_request_duration_seconds{path="/insert/loki/api/v1/push",format="protobuf"}`)
 )
 // RequestHandler processes Loki insert requests
 func RequestHandler(path string, w http.ResponseWriter, r *http.Request) bool {
 	switch path {
@ -37,13 +27,9 @@ func handleInsert(r *http.Request, w http.ResponseWriter) bool {
 	contentType := r.Header.Get("Content-Type")
 	switch contentType {
 	case "application/json":
 		defer lokiRequestJSONDuration.UpdateDuration(time.Now())
 		lokiRequestsJSONTotal.Inc()
 		return handleJSON(r, w)
 	default:
 		// Protobuf request body should be handled by default according to https://grafana.com/docs/loki/latest/api/#push-log-entries-to-loki
 		defer lokiRequestProtobufDuration.UpdateDuration(time.Now())
 		lokiRequestsProtobufTotal.Inc()
 		return handleProtobuf(r, w)
 	}
 }
--- a/app/vlinsert/loki/loki_json.go
+++ b/app/vlinsert/loki/loki_json.go
@ -18,12 +18,11 @@ import (
 	"github.com/valyala/fastjson"
 )
-var (
+var parserPool fastjson.ParserPool
 	rowsIngestedJSONTotal = metrics.NewCounter(`vl_rows_ingested_total{type="loki",format="json"}`)
 	parserPool            fastjson.ParserPool
 )
 func handleJSON(r *http.Request, w http.ResponseWriter) bool {
 	startTime := time.Now()
 	lokiRequestsJSONTotal.Inc()
 	reader := r.Body
 	if r.Header.Get("Content-Encoding") == "gzip" {
 		zr, err := common.GetGzipReader(reader)
@ -58,9 +57,21 @@ func handleJSON(r *http.Request, w http.ResponseWriter) bool {
 		return true
 	}
 	rowsIngestedJSONTotal.Add(n)
 	// update lokiRequestJSONDuration only for successfully parsed requests
 	// There is no need in updating lokiRequestJSONDuration for request errors,
 	// since their timings are usually much smaller than the timing for successful request parsing.
 	lokiRequestJSONDuration.UpdateDuration(startTime)
 	return true
 }
 var (
 	lokiRequestsJSONTotal   = metrics.NewCounter(`vl_http_requests_total{path="/insert/loki/api/v1/push",format="json"}`)
 	rowsIngestedJSONTotal   = metrics.NewCounter(`vl_rows_ingested_total{type="loki",format="json"}`)
 	lokiRequestJSONDuration = metrics.NewHistogram(`vl_http_request_duration_seconds{path="/insert/loki/api/v1/push",format="json"}`)
 )
 func parseJSONRequest(data []byte, processLogMessage func(timestamp int64, fields []logstorage.Field)) (int, error) {
 	p := parserPool.Get()
 	defer parserPool.Put(p)
--- a/app/vlinsert/loki/loki_protobuf.go
+++ b/app/vlinsert/loki/loki_protobuf.go
@ -19,12 +19,13 @@ import (
 )
 var (
-	rowsIngestedProtobufTotal = metrics.NewCounter(`vl_rows_ingested_total{type="loki",format="protobuf"}`)
+	bytesBufPool bytesutil.ByteBufferPool
-	bytesBufPool              bytesutil.ByteBufferPool
+	pushReqsPool sync.Pool
 	pushReqsPool              sync.Pool
 )
 func handleProtobuf(r *http.Request, w http.ResponseWriter) bool {
 	startTime := time.Now()
 	lokiRequestsProtobufTotal.Inc()
 	wcr := writeconcurrencylimiter.GetReader(r.Body)
 	data, err := io.ReadAll(wcr)
 	writeconcurrencylimiter.PutReader(wcr)
@ -47,10 +48,23 @@ func handleProtobuf(r *http.Request, w http.ResponseWriter) bool {
 		httpserver.Errorf(w, r, "cannot parse loki request: %s", err)
 		return true
 	}
 	rowsIngestedProtobufTotal.Add(n)
 	// update lokiRequestProtobufDuration only for successfully parsed requests
 	// There is no need in updating lokiRequestProtobufDuration for request errors,
 	// since their timings are usually much smaller than the timing for successful request parsing.
 	lokiRequestProtobufDuration.UpdateDuration(startTime)
 	return true
 }
 var (
 	lokiRequestsProtobufTotal   = metrics.NewCounter(`vl_http_requests_total{path="/insert/loki/api/v1/push",format="protobuf"}`)
 	rowsIngestedProtobufTotal   = metrics.NewCounter(`vl_rows_ingested_total{type="loki",format="protobuf"}`)
 	lokiRequestProtobufDuration = metrics.NewHistogram(`vl_http_request_duration_seconds{path="/insert/loki/api/v1/push",format="protobuf"}`)
 )
 func parseProtobufRequest(data []byte, processLogMessage func(timestamp int64, fields []logstorage.Field)) (int, error) {
 	bb := bytesBufPool.Get()
 	defer bytesBufPool.Put(bb)
--- a/docs/VictoriaLogs/CHANGELOG.md
+++ b/docs/VictoriaLogs/CHANGELOG.md
@ -10,7 +10,7 @@ according to [these docs](https://docs.victoriametrics.com/VictoriaLogs/QuickSta
  * `vl_data_size_bytes{type="storage"}` - on-disk size for data excluding [log stream](https://docs.victoriametrics.com/VictoriaLogs/keyConcepts.html#stream-fields) indexes.
  * `vl_data_size_bytes{type="indexdb"}` - on-disk size for [log stream](https://docs.victoriametrics.com/VictoriaLogs/keyConcepts.html#stream-fields) indexes.
 * FEATURE: add `-insert.maxFieldsPerLine` command-line flag, which can be used for limiting the number of fields per line in logs sent to VictoriaLogs via ingestion protocols. This helps to avoid issues like [this](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4762).
-* FEATURE: expose `vl_http_request_duration_seconds` metric at the [/metrics](monitoring). See this [PR](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/4934) for details.
+* FEATURE: expose `vl_http_request_duration_seconds` histogram at the [/metrics](https://docs.victoriametrics.com/VictoriaLogs/#monitoring) page. Thanks to @crossoverJie for [this pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/4934).
 * BUGFIX: fix possible panic when no data is written to VictoriaLogs for a long time. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4895). Thanks to @crossoverJie for filing and fixing the issue.
 * BUGFIX: add `/insert/loky/ready` endpoint, which is used by Promtail for healthchecks. This should remove `unsupported path requested: /insert/loki/ready` warning logs. See [this comment](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4762#issuecomment-1690966722).