2020-02-23 12:35:47 +01:00
package promscrape
import (
2023-01-24 06:52:57 +01:00
"bytes"
2020-04-16 22:41:16 +02:00
"flag"
2020-04-16 22:34:37 +02:00
"fmt"
2020-08-10 11:31:59 +02:00
"math"
2020-08-16 21:27:26 +02:00
"math/bits"
2021-09-12 11:49:19 +02:00
"strings"
2020-08-13 22:12:22 +02:00
"sync"
2020-02-23 12:35:47 +01:00
"time"
2023-01-07 07:59:15 +01:00
"github.com/VictoriaMetrics/VictoriaMetrics/lib/auth"
2021-09-01 13:14:37 +02:00
"github.com/VictoriaMetrics/VictoriaMetrics/lib/bloomfilter"
2020-02-23 12:35:47 +01:00
"github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil"
2023-01-07 07:59:15 +01:00
"github.com/VictoriaMetrics/VictoriaMetrics/lib/cgroup"
2021-08-13 11:10:00 +02:00
"github.com/VictoriaMetrics/VictoriaMetrics/lib/decimal"
2021-10-16 11:58:34 +02:00
"github.com/VictoriaMetrics/VictoriaMetrics/lib/encoding"
2021-10-14 11:29:12 +02:00
"github.com/VictoriaMetrics/VictoriaMetrics/lib/flagutil"
2020-08-13 22:12:22 +02:00
"github.com/VictoriaMetrics/VictoriaMetrics/lib/leveledbytebufferpool"
2020-02-23 12:35:47 +01:00
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
2020-04-13 11:59:05 +02:00
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promauth"
2020-02-23 12:35:47 +01:00
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promrelabel"
2022-11-30 06:22:12 +01:00
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promutils"
2020-02-23 12:35:47 +01:00
parser "github.com/VictoriaMetrics/VictoriaMetrics/lib/protoparser/prometheus"
2023-02-13 18:26:07 +01:00
"github.com/VictoriaMetrics/VictoriaMetrics/lib/protoparser/prometheus/stream"
2020-12-24 09:56:10 +01:00
"github.com/VictoriaMetrics/VictoriaMetrics/lib/proxy"
2021-01-26 23:23:10 +01:00
"github.com/VictoriaMetrics/VictoriaMetrics/lib/timerpool"
2020-02-23 12:35:47 +01:00
"github.com/VictoriaMetrics/metrics"
2022-05-26 16:24:01 +02:00
"github.com/cespare/xxhash/v2"
2020-02-23 12:35:47 +01:00
)
2020-04-16 22:41:16 +02:00
// Command-line flags controlling scrape-error logging and the automatic
// switch to stream parsing mode for big target responses.
var (
	suppressScrapeErrors = flag.Bool("promscrape.suppressScrapeErrors", false, "Whether to suppress scrape errors logging. "+
		"The last error for each target is always available at '/targets' page even if scrape errors logging is suppressed. "+
		"See also -promscrape.suppressScrapeErrorsDelay")
	suppressScrapeErrorsDelay = flag.Duration("promscrape.suppressScrapeErrorsDelay", 0, "The delay for suppressing repeated scrape errors logging per each scrape targets. "+
		"This may be used for reducing the number of log lines related to scrape errors. See also -promscrape.suppressScrapeErrors")
	minResponseSizeForStreamParse = flagutil.NewBytes("promscrape.minResponseSizeForStreamParse", 1e6, "The minimum target response size for automatic switching to stream parsing mode, which can reduce memory usage. See https://docs.victoriametrics.com/vmagent/#stream-parsing-mode")
)
2020-02-23 12:35:47 +01:00
// ScrapeWork represents a unit of work for scraping Prometheus metrics.
//
// It must be immutable during its lifetime, since it is read from concurrently running goroutines.
type ScrapeWork struct {
	// Full URL (including query args) for the scrape.
	ScrapeURL string

	// Interval for scraping the ScrapeURL.
	ScrapeInterval time.Duration

	// Timeout for scraping the ScrapeURL.
	ScrapeTimeout time.Duration

	// MaxScrapeSize sets max amount of data, that can be scraped by a job
	MaxScrapeSize int64

	// How to deal with conflicting labels.
	// See https://prometheus.io/docs/prometheus/latest/configuration/configuration/#scrape_config
	HonorLabels bool

	// How to deal with scraped timestamps.
	// See https://prometheus.io/docs/prometheus/latest/configuration/configuration/#scrape_config
	HonorTimestamps bool

	// Whether to deny redirects during requests to scrape config.
	DenyRedirects bool

	// Do not support enable_http2 option because of the following reasons:
	//
	// - http2 is used very rarely comparing to http for Prometheus metrics exposition and service discovery
	// - http2 is much harder to debug than http
	// - http2 has very bad security record because of its complexity - see https://portswigger.net/research/http2
	//
	// EnableHTTP2 bool

	// OriginalLabels contains original labels before relabeling.
	//
	// These labels are needed for relabeling troubleshooting at /targets page.
	//
	// OriginalLabels are sorted by name.
	OriginalLabels *promutils.Labels

	// Labels to add to the scraped metrics.
	//
	// The list contains at least the following labels according to https://www.robustperception.io/life-of-a-label/
	//
	// * job
	// * instance
	// * user-defined labels set via `relabel_configs` section in `scrape_config`
	//
	// See also https://prometheus.io/docs/concepts/jobs_instances/
	//
	// Labels are sorted by name.
	Labels *promutils.Labels

	// ExternalLabels contains labels from global->external_labels section of -promscrape.config
	//
	// These labels are added to scraped metrics after the relabeling.
	// See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3137
	//
	// ExternalLabels are sorted by name.
	ExternalLabels *promutils.Labels

	// ProxyURL HTTP proxy url
	ProxyURL *proxy.URL

	// Auth config for ProxyURL
	ProxyAuthConfig *promauth.Config

	// Auth config
	AuthConfig *promauth.Config

	// Optional `relabel_configs`.
	RelabelConfigs *promrelabel.ParsedConfigs

	// Optional `metric_relabel_configs`.
	MetricRelabelConfigs *promrelabel.ParsedConfigs

	// The maximum number of metrics to scrape after relabeling.
	SampleLimit int

	// Whether to disable response compression when querying ScrapeURL.
	DisableCompression bool

	// Whether to disable HTTP keep-alive when querying ScrapeURL.
	DisableKeepAlive bool

	// Whether to parse target responses in a streaming manner.
	StreamParse bool

	// The interval for aligning the first scrape.
	ScrapeAlignInterval time.Duration

	// The offset for the first scrape.
	ScrapeOffset time.Duration

	// Optional limit on the number of unique series the scrape target can expose.
	SeriesLimit int

	// Whether to process stale markers for the given target.
	// See https://docs.victoriametrics.com/vmagent/#prometheus-staleness-markers
	NoStaleMarkers bool

	// The Tenant Info
	AuthToken *auth.Token

	// The original 'job_name'
	jobNameOriginal string
}
2021-10-15 14:26:22 +02:00
func ( sw * ScrapeWork ) canSwitchToStreamParseMode ( ) bool {
// Deny switching to stream parse mode if `sample_limit` or `series_limit` options are set,
// since these limits cannot be applied in stream parsing mode.
2021-10-14 11:29:12 +02:00
return sw . SampleLimit <= 0 && sw . SeriesLimit <= 0
}
2020-05-03 11:41:13 +02:00
// key returns unique identifier for the given sw.
//
2021-12-20 17:38:03 +01:00
// It can be used for comparing for equality for two ScrapeWork objects.
2020-05-03 11:41:13 +02:00
func ( sw * ScrapeWork ) key ( ) string {
2021-12-20 17:38:03 +01:00
// Do not take into account OriginalLabels, since they can be changed with relabeling.
2022-12-10 11:09:21 +01:00
// Do not take into account RelabelConfigs, since it is already applied to Labels.
2021-12-20 17:38:03 +01:00
// Take into account JobNameOriginal in order to capture the case when the original job_name is changed via relabeling.
2024-10-18 11:35:23 +02:00
key := fmt . Sprintf ( "JobNameOriginal=%s, ScrapeURL=%s, ScrapeInterval=%s, ScrapeTimeout=%s, HonorLabels=%v, " +
"HonorTimestamps=%v, DenyRedirects=%v, Labels=%s, ExternalLabels=%s, MaxScrapeSize=%d, " +
2022-12-10 11:09:21 +01:00
"ProxyURL=%s, ProxyAuthConfig=%s, AuthConfig=%s, MetricRelabelConfigs=%q, " +
"SampleLimit=%d, DisableCompression=%v, DisableKeepAlive=%v, StreamParse=%v, " +
2022-10-07 22:36:11 +02:00
"ScrapeAlignInterval=%s, ScrapeOffset=%s, SeriesLimit=%d, NoStaleMarkers=%v" ,
2024-10-18 11:35:23 +02:00
sw . jobNameOriginal , sw . ScrapeURL , sw . ScrapeInterval , sw . ScrapeTimeout , sw . HonorLabels ,
sw . HonorTimestamps , sw . DenyRedirects , sw . Labels . String ( ) , sw . ExternalLabels . String ( ) , sw . MaxScrapeSize ,
2022-12-10 11:09:21 +01:00
sw . ProxyURL . String ( ) , sw . ProxyAuthConfig . String ( ) , sw . AuthConfig . String ( ) , sw . MetricRelabelConfigs . String ( ) ,
sw . SampleLimit , sw . DisableCompression , sw . DisableKeepAlive , sw . StreamParse ,
2022-10-07 22:36:11 +02:00
sw . ScrapeAlignInterval , sw . ScrapeOffset , sw . SeriesLimit , sw . NoStaleMarkers )
2020-05-03 11:41:13 +02:00
return key
}
2020-04-14 12:32:55 +02:00
// Job returns job for the ScrapeWork
func ( sw * ScrapeWork ) Job ( ) string {
2022-11-30 06:22:12 +01:00
return sw . Labels . Get ( "job" )
2020-04-14 12:32:55 +02:00
}
2020-02-23 12:35:47 +01:00
// scrapeWork holds the mutable per-target scrape state together with the immutable
// ScrapeWork config. A single goroutine owns each scrapeWork instance.
type scrapeWork struct {
	// Config for the scrape.
	Config *ScrapeWork

	// ReadData is called for reading the scrape response data into dst.
	ReadData func(dst *bytesutil.ByteBuffer) error

	// PushData is called for pushing collected data.
	PushData func(at *auth.Token, wr *prompbmarshal.WriteRequest)

	// ScrapeGroup is name of ScrapeGroup that
	// scrapeWork belongs to
	ScrapeGroup string

	// tmpRow is a scratch row reused between scrapes in order to reduce allocations.
	tmpRow parser.Row

	// This flag is set to true if series_limit is exceeded.
	seriesLimitExceeded bool

	// labelsHashBuf is used for calculating the hash on series labels
	labelsHashBuf []byte

	// Optional limiter on the number of unique series per scrape target.
	seriesLimiter *bloomfilter.Limiter

	// prevBodyLen contains the previous response body length for the given scrape work.
	// It is used as a hint in order to reduce memory usage for body buffers.
	prevBodyLen int

	// prevLabelsLen contains the number labels scraped during the previous scrape.
	// It is used as a hint in order to reduce memory usage when parsing scrape responses.
	prevLabelsLen int

	// lastScrape holds the last response from scrape target.
	// It is used for staleness tracking and for populating scrape_series_added metric.
	// The lastScrape isn't populated if -promscrape.noStaleMarkers is set. This reduces memory usage.
	lastScrape []byte

	// lastScrapeCompressed is used for storing the compressed lastScrape between scrapes
	// in stream parsing mode in order to reduce memory usage when the lastScrape size
	// equals to or exceeds -promscrape.minResponseSizeForStreamParse
	lastScrapeCompressed []byte

	// nextErrorLogTime is the timestamp in millisecond when the next scrape error should be logged.
	nextErrorLogTime int64

	// failureRequestsCount is the number of suppressed scrape errors during the last suppressScrapeErrorsDelay
	failureRequestsCount int

	// successRequestsCount is the number of success requests during the last suppressScrapeErrorsDelay
	successRequestsCount int
}
2021-10-22 12:10:26 +02:00
func ( sw * scrapeWork ) loadLastScrape ( ) string {
if len ( sw . lastScrapeCompressed ) > 0 {
b , err := encoding . DecompressZSTD ( sw . lastScrape [ : 0 ] , sw . lastScrapeCompressed )
if err != nil {
logger . Panicf ( "BUG: cannot unpack compressed previous response: %s" , err )
}
sw . lastScrape = b
2021-10-16 11:58:34 +02:00
}
2021-10-22 12:10:26 +02:00
return bytesutil . ToUnsafeString ( sw . lastScrape )
2021-10-16 11:58:34 +02:00
}
func ( sw * scrapeWork ) storeLastScrape ( lastScrape [ ] byte ) {
2022-12-15 04:26:24 +01:00
mustCompress := minResponseSizeForStreamParse . N > 0 && len ( lastScrape ) >= minResponseSizeForStreamParse . IntN ( )
2021-10-16 11:58:34 +02:00
if mustCompress {
sw . lastScrapeCompressed = encoding . CompressZSTDLevel ( sw . lastScrapeCompressed [ : 0 ] , lastScrape , 1 )
sw . lastScrape = nil
} else {
sw . lastScrape = append ( sw . lastScrape [ : 0 ] , lastScrape ... )
sw . lastScrapeCompressed = nil
}
}
func ( sw * scrapeWork ) finalizeLastScrape ( ) {
if len ( sw . lastScrapeCompressed ) > 0 {
// The compressed lastScrape is available in sw.lastScrapeCompressed.
// Release the memory occupied by sw.lastScrape, so it won't be occupied between scrapes.
sw . lastScrape = nil
}
if len ( sw . lastScrape ) > 0 {
// Release the memory occupied by sw.lastScrapeCompressed, so it won't be occupied between scrapes.
sw . lastScrapeCompressed = nil
}
2020-02-23 12:35:47 +01:00
}
2022-01-07 00:17:55 +01:00
// run executes the scrape loop for the target until stopCh is closed.
//
// The first scrape is delayed by a per-target offset so load is spread across
// targets; subsequent scrapes fire on a ticker at Config.ScrapeInterval.
// When stopCh closes, staleness markers are sent for the last scraped series
// unless globalStopCh is also closed (graceful shutdown).
func (sw *scrapeWork) run(stopCh <-chan struct{}, globalStopCh <-chan struct{}) {
	var randSleep uint64
	scrapeInterval := sw.Config.ScrapeInterval
	scrapeAlignInterval := sw.Config.ScrapeAlignInterval
	scrapeOffset := sw.Config.ScrapeOffset
	if scrapeOffset > 0 {
		// A scrape offset implies alignment to the scrape interval itself.
		scrapeAlignInterval = scrapeInterval
	}
	if scrapeAlignInterval <= 0 {
		// Calculate start time for the first scrape from ScrapeURL and labels.
		// This should spread load when scraping many targets with different
		// scrape urls and labels.
		// This also makes consistent scrape times across restarts
		// for a target with the same ScrapeURL and labels.
		//
		// Include clusterName to the key in order to guarantee that the same
		// scrape target is scraped at different offsets per each cluster.
		// This guarantees that the deduplication consistently leaves samples received from the same vmagent.
		// See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/2679
		//
		// Include clusterMemberID to the key in order to guarantee that each member in vmagent cluster
		// scrapes replicated targets at different time offsets. This guarantees that the deduplication consistently leaves samples
		// received from the same vmagent replica.
		// See https://docs.victoriametrics.com/vmagent/#scraping-big-number-of-targets
		key := fmt.Sprintf("clusterName=%s, clusterMemberID=%d, ScrapeURL=%s, Labels=%s", *clusterName, clusterMemberID, sw.Config.ScrapeURL, sw.Config.Labels.String())
		h := xxhash.Sum64(bytesutil.ToUnsafeBytes(key))
		// Map the hash to a pseudo-random fraction of the scrape interval.
		randSleep = uint64(float64(scrapeInterval) * (float64(h) / (1 << 64)))
		sleepOffset := uint64(time.Now().UnixNano()) % uint64(scrapeInterval)
		if randSleep < sleepOffset {
			randSleep += uint64(scrapeInterval)
		}
		randSleep -= sleepOffset
	} else {
		// Align the first scrape to the configured alignment interval,
		// optionally shifted by the configured scrape offset.
		d := uint64(scrapeAlignInterval)
		randSleep = d - uint64(time.Now().UnixNano())%d
		if scrapeOffset > 0 {
			randSleep += uint64(scrapeOffset)
		}
		randSleep %= uint64(scrapeInterval)
	}
	timer := timerpool.Get(time.Duration(randSleep))
	var timestamp int64
	var ticker *time.Ticker
	select {
	case <-stopCh:
		timerpool.Put(timer)
		return
	case <-timer.C:
		timerpool.Put(timer)
		ticker = time.NewTicker(scrapeInterval)
		timestamp = time.Now().UnixNano() / 1e6
		// The first scrape uses the same value for scrape and real timestamps.
		sw.scrapeAndLogError(timestamp, timestamp)
	}
	defer ticker.Stop()
	for {
		// Advance the ideal (jitter-free) scrape timestamp by one interval.
		timestamp += scrapeInterval.Milliseconds()
		select {
		case <-stopCh:
			t := time.Now().UnixNano() / 1e6
			lastScrape := sw.loadLastScrape()
			select {
			case <-globalStopCh:
				// Do not send staleness markers on graceful shutdown as Prometheus does.
				// See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/2013#issuecomment-1006994079
			default:
				// Send staleness markers to all the metrics scraped last time from the target
				// when the given target disappears as Prometheus does.
				// Use the current real timestamp for staleness markers, so queries
				// stop returning data just after the time the target disappears.
				sw.sendStaleSeries(lastScrape, "", t, true)
			}
			if sw.seriesLimiter != nil {
				sw.seriesLimiter.MustStop()
				sw.seriesLimiter = nil
			}
			return
		case tt := <-ticker.C:
			t := tt.UnixNano() / 1e6
			// Keep the ideal timestamp unless the ticker drifted by more
			// than 10% of the scrape interval.
			if d := math.Abs(float64(t - timestamp)); d > 0 && d/float64(scrapeInterval.Milliseconds()) > 0.1 {
				// Too big jitter. Adjust timestamp
				timestamp = t
			}
			sw.scrapeAndLogError(timestamp, t)
		}
	}
}
func ( sw * scrapeWork ) logError ( s string ) {
2020-04-16 22:41:16 +02:00
if ! * suppressScrapeErrors {
2020-12-06 22:26:34 +01:00
logger . ErrorfSkipframes ( 1 , "error when scraping %q from job %q with labels %s: %s; " +
"scrape errors can be disabled by -promscrape.suppressScrapeErrors command-line flag" ,
2022-12-10 11:09:21 +01:00
sw . Config . ScrapeURL , sw . Config . Job ( ) , sw . Config . Labels . String ( ) , s )
2020-04-16 22:41:16 +02:00
}
2020-02-23 12:35:47 +01:00
}
2020-08-10 11:31:59 +02:00
func ( sw * scrapeWork ) scrapeAndLogError ( scrapeTimestamp , realTimestamp int64 ) {
2022-05-25 21:58:30 +02:00
err := sw . scrapeInternal ( scrapeTimestamp , realTimestamp )
2023-01-12 10:09:26 +01:00
if * suppressScrapeErrors {
2022-05-25 21:58:30 +02:00
return
}
2023-01-12 10:09:26 +01:00
if err == nil {
sw . successRequestsCount ++
2022-05-25 21:58:30 +02:00
return
}
2023-01-12 10:09:26 +01:00
sw . failureRequestsCount ++
if sw . nextErrorLogTime == 0 {
sw . nextErrorLogTime = realTimestamp + suppressScrapeErrorsDelay . Milliseconds ( )
}
if realTimestamp < sw . nextErrorLogTime {
return
2020-02-23 12:35:47 +01:00
}
2023-01-12 10:09:26 +01:00
totalRequests := sw . failureRequestsCount + sw . successRequestsCount
logger . Warnf ( "cannot scrape target %q (%s) %d out of %d times during -promscrape.suppressScrapeErrorsDelay=%s; the last error: %s" ,
sw . Config . ScrapeURL , sw . Config . Labels . String ( ) , sw . failureRequestsCount , totalRequests , * suppressScrapeErrorsDelay , err )
sw . nextErrorLogTime = realTimestamp + suppressScrapeErrorsDelay . Milliseconds ( )
sw . failureRequestsCount = 0
sw . successRequestsCount = 0
2020-02-23 12:35:47 +01:00
}
// Scrape-level metrics exposed by vmagent/VictoriaMetrics itself.
var (
	scrapeDuration              = metrics.NewHistogram("vm_promscrape_scrape_duration_seconds")
	scrapeResponseSize          = metrics.NewHistogram("vm_promscrape_scrape_response_size_bytes")
	scrapedSamples              = metrics.NewHistogram("vm_promscrape_scraped_samples")
	scrapesSkippedBySampleLimit = metrics.NewCounter("vm_promscrape_scrapes_skipped_by_sample_limit_total")
	scrapesFailed               = metrics.NewCounter("vm_promscrape_scrapes_failed_total")
	pushDataDuration            = metrics.NewHistogram("vm_promscrape_push_data_duration_seconds")
)
2024-01-30 16:51:44 +01:00
func ( sw * scrapeWork ) needStreamParseMode ( responseSize int ) bool {
if * streamParse || sw . Config . StreamParse {
return true
}
2021-10-15 14:26:22 +02:00
if minResponseSizeForStreamParse . N <= 0 {
return false
}
2022-12-15 04:26:24 +01:00
return sw . Config . canSwitchToStreamParseMode ( ) && responseSize >= minResponseSizeForStreamParse . IntN ( )
2021-10-15 14:26:22 +02:00
}
2022-02-03 17:57:36 +01:00
// getTargetResponse() fetches response from sw target in the same way as when scraping the target.
func ( sw * scrapeWork ) getTargetResponse ( ) ( [ ] byte , error ) {
2024-01-30 16:51:44 +01:00
var bb bytesutil . ByteBuffer
if err := sw . ReadData ( & bb ) ; err != nil {
return nil , err
2022-02-03 17:57:36 +01:00
}
2024-01-30 16:51:44 +01:00
return bb . B , nil
2022-02-03 17:57:36 +01:00
}
2020-08-10 11:31:59 +02:00
// scrapeInternal reads the target response and processes it either in one-shot
// or stream parsing mode, recording scrape duration and response size metrics.
//
// The processing phase is gated by processScrapedDataConcurrencyLimitCh in order
// to bound memory usage under high load.
func (sw *scrapeWork) scrapeInternal(scrapeTimestamp, realTimestamp int64) error {
	// Obtain a pooled buffer sized by the previous response as a hint.
	body := leveledbytebufferpool.Get(sw.prevBodyLen)

	// Read the scrape response into body.
	// It is OK to do this for stream parsing mode, since most of the RAM
	// is occupied during parsing of the read response body below.
	// This also allows measuring the real scrape duration, which doesn't include
	// the time needed for processing of the read response.
	err := sw.ReadData(body)

	// Measure scrape duration.
	endTimestamp := time.Now().UnixNano() / 1e6
	scrapeDurationSeconds := float64(endTimestamp-realTimestamp) / 1e3
	scrapeDuration.Update(scrapeDurationSeconds)
	scrapeResponseSize.Update(float64(len(body.B)))

	// The code below is CPU-bound, while it may allocate big amounts of memory.
	// That's why it is a good idea to limit the number of concurrent goroutines,
	// which may execute this code, in order to limit memory usage under high load
	// without sacrificing the performance.
	processScrapedDataConcurrencyLimitCh <- struct{}{}

	if err == nil && sw.needStreamParseMode(len(body.B)) {
		// Process response body from scrape target in streaming manner.
		// This case is optimized for targets exposing more than ten thousand of metrics per target,
		// such as kube-state-metrics.
		err = sw.processDataInStreamMode(scrapeTimestamp, realTimestamp, body, scrapeDurationSeconds)
	} else {
		// Process response body from scrape target at once.
		// This case should work more optimally than stream parse for common case when scrape target exposes
		// up to a few thousand metrics.
		err = sw.processDataOneShot(scrapeTimestamp, realTimestamp, body.B, scrapeDurationSeconds, err)
	}

	<-processScrapedDataConcurrencyLimitCh
	// body may be returned to the pool only after processing is complete,
	// since the processing code may reference body.B.
	leveledbytebufferpool.Put(body)

	return err
}
2023-01-18 08:09:34 +01:00
// processScrapedDataConcurrencyLimitCh limits the number of goroutines that may
// concurrently process scraped data to the number of available CPU cores.
var processScrapedDataConcurrencyLimitCh = make(chan struct{}, cgroup.AvailableCPUs())
2023-01-07 07:59:15 +01:00
2024-01-30 16:51:44 +01:00
// processDataOneShot parses the whole response body at once, applies relabeling
// and limits, pushes the resulting series downstream and updates target state.
//
// err is the error returned by reading the response body; a non-nil err marks
// the scrape as failed (up=0) while still emitting auto-generated metrics.
// The returned error is either the read error or a sample_limit violation.
func (sw *scrapeWork) processDataOneShot(scrapeTimestamp, realTimestamp int64, body []byte, scrapeDurationSeconds float64, err error) error {
	up := 1
	// wc is a pooled parse/write context sized by the previous scrape as a hint.
	wc := writeRequestCtxPool.Get(sw.prevLabelsLen)
	lastScrape := sw.loadLastScrape()
	bodyString := bytesutil.ToUnsafeString(body)
	areIdenticalSeries := sw.areIdenticalSeries(lastScrape, bodyString)
	if err != nil {
		up = 0
		scrapesFailed.Inc()
	} else {
		wc.rows.UnmarshalWithErrLogger(bodyString, sw.logError)
	}
	srcRows := wc.rows.Rows
	samplesScraped := len(srcRows)
	scrapedSamples.Update(float64(samplesScraped))
	for i := range srcRows {
		sw.addRowToTimeseries(wc, &srcRows[i], scrapeTimestamp, true)
	}
	samplesPostRelabeling := len(wc.writeRequest.Timeseries)
	if sw.Config.SampleLimit > 0 && samplesPostRelabeling > sw.Config.SampleLimit {
		// Drop all the collected samples and mark the scrape as failed.
		wc.resetNoRows()
		up = 0
		scrapesSkippedBySampleLimit.Inc()
		err = fmt.Errorf("the response from %q exceeds sample_limit=%d; "+
			"either reduce the sample count for the target or increase sample_limit", sw.Config.ScrapeURL, sw.Config.SampleLimit)
	}
	if up == 0 {
		// Treat a failed scrape as an empty response for staleness tracking.
		bodyString = ""
	}
	seriesAdded := 0
	if !areIdenticalSeries {
		// The returned value for seriesAdded may be bigger than the real number of added series
		// if some series were removed during relabeling.
		// This is a trade-off between performance and accuracy.
		seriesAdded = sw.getSeriesAdded(lastScrape, bodyString)
	}
	samplesDropped := 0
	if sw.seriesLimitExceeded || !areIdenticalSeries {
		samplesDropped = sw.applySeriesLimit(wc)
	}
	responseSize := len(bodyString)
	// Append the auto-generated metrics (up, scrape_duration_seconds, ...) and push everything.
	am := &autoMetrics{
		up:                        up,
		scrapeDurationSeconds:     scrapeDurationSeconds,
		scrapeResponseSize:        responseSize,
		samplesScraped:            samplesScraped,
		samplesPostRelabeling:     samplesPostRelabeling,
		seriesAdded:               seriesAdded,
		seriesLimitSamplesDropped: samplesDropped,
	}
	sw.addAutoMetrics(am, wc, scrapeTimestamp)
	sw.pushData(sw.Config.AuthToken, &wc.writeRequest)
	sw.prevLabelsLen = len(wc.labels)
	sw.prevBodyLen = responseSize
	wc.reset()
	writeRequestCtxPool.Put(wc)
	// body must be released only after wc is released, since wc refers to body.
	if !areIdenticalSeries {
		// Send stale markers for disappeared metrics with the real scrape timestamp
		// in order to guarantee that query doesn't return data after this time for the disappeared metrics.
		sw.sendStaleSeries(lastScrape, bodyString, realTimestamp, false)
		sw.storeLastScrape(body)
	}
	sw.finalizeLastScrape()
	tsmGlobal.Update(sw, up == 1, realTimestamp, int64(scrapeDurationSeconds*1000), responseSize, samplesScraped, err)
	return err
}
2024-01-30 16:51:44 +01:00
// processDataInStreamMode parses the response body in chunks via stream.Parse,
// pushing each batch of rows downstream as soon as it is processed. This keeps
// memory usage bounded for targets exposing very large numbers of metrics.
//
// The returned error is either a parse/read error or a sample_limit violation;
// in both cases the scrape is marked as failed even if some samples were already pushed.
func (sw *scrapeWork) processDataInStreamMode(scrapeTimestamp, realTimestamp int64, body *bytesutil.ByteBuffer, scrapeDurationSeconds float64) error {
	samplesScraped := 0
	samplesPostRelabeling := 0
	// wc is a pooled parse/write context sized by the previous scrape as a hint.
	wc := writeRequestCtxPool.Get(sw.prevLabelsLen)

	lastScrape := sw.loadLastScrape()
	bodyString := bytesutil.ToUnsafeString(body.B)
	areIdenticalSeries := sw.areIdenticalSeries(lastScrape, bodyString)

	samplesDropped := 0
	r := body.NewReader()
	// mu serializes the callback invocations, since stream.Parse may call it concurrently.
	var mu sync.Mutex
	err := stream.Parse(r, scrapeTimestamp, false, false, func(rows []parser.Row) error {
		mu.Lock()
		defer mu.Unlock()
		samplesScraped += len(rows)
		for i := range rows {
			sw.addRowToTimeseries(wc, &rows[i], scrapeTimestamp, true)
		}
		samplesPostRelabeling += len(wc.writeRequest.Timeseries)
		if sw.Config.SampleLimit > 0 && samplesPostRelabeling > sw.Config.SampleLimit {
			wc.resetNoRows()
			scrapesSkippedBySampleLimit.Inc()
			return fmt.Errorf("the response from %q exceeds sample_limit=%d; "+
				"either reduce the sample count for the target or increase sample_limit", sw.Config.ScrapeURL, sw.Config.SampleLimit)
		}
		if sw.seriesLimitExceeded || !areIdenticalSeries {
			samplesDropped += sw.applySeriesLimit(wc)
		}
		// Push the collected rows to sw before returning from the callback, since they cannot be held
		// after returning from the callback - this will result in data race.
		// See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/825#issuecomment-723198247
		sw.pushData(sw.Config.AuthToken, &wc.writeRequest)
		wc.resetNoRows()
		return nil
	}, sw.logError)

	scrapedSamples.Update(float64(samplesScraped))
	up := 1
	if err != nil {
		// Mark the scrape as failed even if it already read and pushed some samples
		// to remote storage. This makes the logic compatible with Prometheus.
		up = 0
		scrapesFailed.Inc()
	}
	seriesAdded := 0
	if !areIdenticalSeries {
		// The returned value for seriesAdded may be bigger than the real number of added series
		// if some series were removed during relabeling.
		// This is a trade-off between performance and accuracy.
		seriesAdded = sw.getSeriesAdded(lastScrape, bodyString)
	}
	responseSize := len(bodyString)
	// Append the auto-generated metrics (up, scrape_duration_seconds, ...) and push them.
	am := &autoMetrics{
		up:                        up,
		scrapeDurationSeconds:     scrapeDurationSeconds,
		scrapeResponseSize:        responseSize,
		samplesScraped:            samplesScraped,
		samplesPostRelabeling:     samplesPostRelabeling,
		seriesAdded:               seriesAdded,
		seriesLimitSamplesDropped: samplesDropped,
	}
	sw.addAutoMetrics(am, wc, scrapeTimestamp)
	sw.pushData(sw.Config.AuthToken, &wc.writeRequest)
	sw.prevLabelsLen = len(wc.labels)
	sw.prevBodyLen = responseSize
	wc.reset()
	writeRequestCtxPool.Put(wc)
	if !areIdenticalSeries {
		// Send stale markers for disappeared metrics with the real scrape timestamp
		// in order to guarantee that query doesn't return data after this time for the disappeared metrics.
		sw.sendStaleSeries(lastScrape, bodyString, realTimestamp, false)
		sw.storeLastScrape(body.B)
	}
	sw.finalizeLastScrape()
	tsmGlobal.Update(sw, up == 1, realTimestamp, int64(scrapeDurationSeconds*1000), responseSize, samplesScraped, err)
	// Do not track active series in streaming mode, since this may need too big amounts of memory
	// when the target exports too big number of metrics.
	return err
}
2024-01-30 16:51:44 +01:00
// pushData forwards wr to the configured PushData callback and tracks
// the time spent in the call via the pushDataDuration metric.
func (sw *scrapeWork) pushData(at *auth.Token, wr *prompbmarshal.WriteRequest) {
	begin := time.Now()
	sw.PushData(at, wr)
	pushDataDuration.UpdateDuration(begin)
}
2023-01-17 19:14:46 +01:00
// areIdenticalSeries reports whether prevData and currData contain the same set of series.
//
// When series-change tracking isn't needed, it short-circuits to true without comparing the bodies.
func (sw *scrapeWork) areIdenticalSeries(prevData, currData string) bool {
	// Tracking changes in series is needed either for stale markers or for series_limit enforcement.
	// The series_limit part is needed for https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3660
	trackingNeeded := !sw.Config.NoStaleMarkers || sw.Config.SeriesLimit > 0
	if !trackingNeeded {
		// Do not spend CPU time on tracking the changes in series.
		return true
	}
	return parser.AreIdenticalSeriesFast(prevData, currData)
}
2020-09-01 09:55:21 +02:00
// leveledWriteRequestCtxPool allows reducing memory usage when writeRequesCtx
// structs contain mixed number of labels.
//
// Its logic has been copied from leveledbytebufferpool.
type leveledWriteRequestCtxPool struct {
	// pools[i] holds writeRequestCtx objects whose labels capacity falls into
	// the i-th power-of-two bucket; see getPoolIDAndCapacity for the bucketing.
	pools [13]sync.Pool
}
2021-03-14 21:56:23 +01:00
func ( lwp * leveledWriteRequestCtxPool ) Get ( labelsCapacity int ) * writeRequestCtx {
id , capacityNeeded := lwp . getPoolIDAndCapacity ( labelsCapacity )
2020-08-16 21:27:26 +02:00
for i := 0 ; i < 2 ; i ++ {
if id < 0 || id >= len ( lwp . pools ) {
break
}
if v := lwp . pools [ id ] . Get ( ) ; v != nil {
return v . ( * writeRequestCtx )
}
id ++
}
return & writeRequestCtx {
labels : make ( [ ] prompbmarshal . Label , 0 , capacityNeeded ) ,
}
}
func ( lwp * leveledWriteRequestCtxPool ) Put ( wc * writeRequestCtx ) {
2021-03-14 21:56:23 +01:00
capacity := cap ( wc . labels )
id , poolCapacity := lwp . getPoolIDAndCapacity ( capacity )
if capacity <= poolCapacity {
wc . reset ( )
lwp . pools [ id ] . Put ( wc )
}
2020-08-16 21:27:26 +02:00
}
2020-08-28 08:44:08 +02:00
func ( lwp * leveledWriteRequestCtxPool ) getPoolIDAndCapacity ( size int ) ( int , int ) {
2020-08-16 21:27:26 +02:00
size --
if size < 0 {
size = 0
}
size >>= 3
id := bits . Len ( uint ( size ) )
2021-03-14 21:56:23 +01:00
if id >= len ( lwp . pools ) {
2020-08-16 21:27:26 +02:00
id = len ( lwp . pools ) - 1
}
return id , ( 1 << ( id + 3 ) )
}
2020-08-13 22:12:22 +02:00
// writeRequestCtx holds per-scrape scratch state: the parsed rows plus the labels,
// samples and the WriteRequest assembled from them.
type writeRequestCtx struct {
	rows         parser.Rows
	writeRequest prompbmarshal.WriteRequest
	// labels and samples are the backing storage referenced by writeRequest.Timeseries entries.
	labels  []prompbmarshal.Label
	samples []prompbmarshal.Sample
}
// reset clears all the state in wc, including the parsed rows, so wc can be reused.
func (wc *writeRequestCtx) reset() {
	wc.rows.Reset()
	wc.resetNoRows()
}
func ( wc * writeRequestCtx ) resetNoRows ( ) {
2024-01-14 22:04:45 +01:00
wc . writeRequest . Reset ( )
2022-10-28 20:34:07 +02:00
2024-04-20 21:00:00 +02:00
clear ( wc . labels )
wc . labels = wc . labels [ : 0 ]
2022-10-28 20:34:07 +02:00
2020-08-13 22:12:22 +02:00
wc . samples = wc . samples [ : 0 ]
}
2020-08-16 21:27:26 +02:00
var writeRequestCtxPool leveledWriteRequestCtxPool
2020-08-13 22:12:22 +02:00
2021-10-22 12:10:26 +02:00
func ( sw * scrapeWork ) getSeriesAdded ( lastScrape , currScrape string ) int {
2021-09-12 11:49:19 +02:00
if currScrape == "" {
return 0
2020-09-11 22:36:24 +02:00
}
2021-09-12 11:49:19 +02:00
bodyString := parser . GetRowsDiff ( currScrape , lastScrape )
return strings . Count ( bodyString , "\n" )
}
2022-08-17 12:18:47 +02:00
func ( sw * scrapeWork ) applySeriesLimit ( wc * writeRequestCtx ) int {
2023-01-17 19:14:46 +01:00
if sw . Config . SeriesLimit <= 0 {
return 0
2021-09-01 13:14:37 +02:00
}
2023-01-17 19:14:46 +01:00
if sw . seriesLimiter == nil {
sw . seriesLimiter = bloomfilter . NewLimiter ( sw . Config . SeriesLimit , 24 * time . Hour )
2022-08-17 12:18:47 +02:00
}
sl := sw . seriesLimiter
2021-09-01 13:14:37 +02:00
dstSeries := wc . writeRequest . Timeseries [ : 0 ]
2022-08-17 12:18:47 +02:00
samplesDropped := 0
2020-08-13 22:12:22 +02:00
for _ , ts := range wc . writeRequest . Timeseries {
2020-08-10 18:47:43 +02:00
h := sw . getLabelsHash ( ts . Labels )
2022-08-17 12:18:47 +02:00
if ! sl . Add ( h ) {
samplesDropped ++
2021-09-01 13:14:37 +02:00
continue
}
dstSeries = append ( dstSeries , ts )
2020-08-09 11:44:49 +02:00
}
2024-03-04 23:45:22 +01:00
clear ( wc . writeRequest . Timeseries [ len ( dstSeries ) : ] )
2021-09-01 13:14:37 +02:00
wc . writeRequest . Timeseries = dstSeries
2023-01-17 19:14:46 +01:00
if samplesDropped > 0 && ! sw . seriesLimitExceeded {
sw . seriesLimitExceeded = true
}
2022-08-17 12:18:47 +02:00
return samplesDropped
2020-09-11 22:36:24 +02:00
}
2020-08-09 11:44:49 +02:00
2023-01-18 08:09:34 +01:00
var sendStaleSeriesConcurrencyLimitCh = make ( chan struct { } , cgroup . AvailableCPUs ( ) )
2021-10-22 12:10:26 +02:00
func ( sw * scrapeWork ) sendStaleSeries ( lastScrape , currScrape string , timestamp int64 , addAutoSeries bool ) {
2023-01-18 08:09:34 +01:00
// This function is CPU-bound, while it may allocate big amounts of memory.
// That's why it is a good idea to limit the number of concurrent calls to this function
// in order to limit memory usage under high load without sacrificing the performance.
// See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3668
sendStaleSeriesConcurrencyLimitCh <- struct { } { }
defer func ( ) {
<- sendStaleSeriesConcurrencyLimitCh
} ( )
2024-01-21 23:38:21 +01:00
if sw . Config . NoStaleMarkers {
return
}
2021-09-11 09:51:04 +02:00
bodyString := lastScrape
if currScrape != "" {
2021-09-12 11:49:19 +02:00
bodyString = parser . GetRowsDiff ( lastScrape , currScrape )
2021-09-11 09:51:04 +02:00
}
2023-01-18 08:09:34 +01:00
wc := writeRequestCtxPool . Get ( sw . prevLabelsLen )
defer func ( ) {
wc . reset ( )
writeRequestCtxPool . Put ( wc )
} ( )
2021-09-11 09:51:04 +02:00
if bodyString != "" {
2023-01-24 06:52:57 +01:00
// Send stale markers in streaming mode in order to reduce memory usage
// when stale markers for targets exposing big number of metrics must be generated.
// See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3668
// and https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3675
var mu sync . Mutex
br := bytes . NewBufferString ( bodyString )
2023-10-02 21:32:11 +02:00
err := stream . Parse ( br , timestamp , false , false , func ( rows [ ] parser . Row ) error {
2023-01-24 06:52:57 +01:00
mu . Lock ( )
defer mu . Unlock ( )
for i := range rows {
sw . addRowToTimeseries ( wc , & rows [ i ] , timestamp , true )
}
// Apply series limit to stale markers in order to prevent sending stale markers for newly created series.
// See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3660
if sw . seriesLimitExceeded {
sw . applySeriesLimit ( wc )
}
// Push the collected rows to sw before returning from the callback, since they cannot be held
// after returning from the callback - this will result in data race.
// See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/825#issuecomment-723198247
setStaleMarkersForRows ( wc . writeRequest . Timeseries )
sw . pushData ( sw . Config . AuthToken , & wc . writeRequest )
wc . resetNoRows ( )
return nil
} , sw . logError )
if err != nil {
2023-10-25 21:24:01 +02:00
sw . logError ( fmt . Errorf ( "cannot send stale markers: %w" , err ) . Error ( ) )
2023-01-24 06:15:59 +01:00
}
2023-01-24 06:52:57 +01:00
}
if addAutoSeries {
am := & autoMetrics { }
sw . addAutoMetrics ( am , wc , timestamp )
2024-01-21 04:18:08 +01:00
setStaleMarkersForRows ( wc . writeRequest . Timeseries )
sw . pushData ( sw . Config . AuthToken , & wc . writeRequest )
2023-01-24 06:52:57 +01:00
}
}
2023-01-24 06:15:59 +01:00
2023-01-24 06:52:57 +01:00
func setStaleMarkersForRows ( series [ ] prompbmarshal . TimeSeries ) {
for _ , tss := range series {
samples := tss . Samples
for i := range samples {
samples [ i ] . Value = decimal . StaleNaN
2021-08-21 20:16:50 +02:00
}
2023-01-24 06:52:57 +01:00
staleSamplesCreated . Add ( len ( samples ) )
2021-08-13 11:10:00 +02:00
}
2020-08-09 11:44:49 +02:00
}
2022-05-06 14:33:13 +02:00
var staleSamplesCreated = metrics . NewCounter ( ` vm_promscrape_stale_samples_created_total ` )
2022-01-13 23:57:44 +01:00
2020-08-10 18:47:43 +02:00
func ( sw * scrapeWork ) getLabelsHash ( labels [ ] prompbmarshal . Label ) uint64 {
2020-08-09 11:44:49 +02:00
// It is OK if there will be hash collisions for distinct sets of labels,
// since the accuracy for `scrape_series_added` metric may be lower than 100%.
2020-08-10 18:47:43 +02:00
b := sw . labelsHashBuf [ : 0 ]
2020-08-09 11:44:49 +02:00
for _ , label := range labels {
2020-08-10 18:47:43 +02:00
b = append ( b , label . Name ... )
b = append ( b , label . Value ... )
2020-08-09 11:44:49 +02:00
}
2020-08-10 18:47:43 +02:00
sw . labelsHashBuf = b
return xxhash . Sum64 ( b )
2020-08-09 11:44:49 +02:00
}
2022-08-17 12:18:47 +02:00
// autoMetrics holds the values for the automatically generated per-target metrics
// (up, scrape_duration_seconds, etc.) emitted by addAutoMetrics.
type autoMetrics struct {
	up                        int
	scrapeDurationSeconds     float64
	scrapeResponseSize        int
	samplesScraped            int
	samplesPostRelabeling     int
	seriesAdded               int
	seriesLimitSamplesDropped int
}
2022-11-29 03:37:06 +01:00
// isAutoMetric reports whether s is the name of an automatically generated
// per-target metric such as `up` or `scrape_duration_seconds`.
func isAutoMetric(s string) bool {
	if s == "up" {
		return true
	}
	const prefix = "scrape_"
	// Fast path: every other auto metric starts with `scrape_`.
	if !strings.HasPrefix(s, prefix) {
		return false
	}
	switch s[len(prefix):] {
	case "duration_seconds",
		"response_size_bytes",
		"samples_limit",
		"samples_post_metric_relabeling",
		"samples_scraped",
		"series_added",
		"series_current",
		"series_limit",
		"series_limit_samples_dropped",
		"timeout_seconds":
		return true
	}
	return false
}
2022-08-17 12:18:47 +02:00
func ( sw * scrapeWork ) addAutoMetrics ( am * autoMetrics , wc * writeRequestCtx , timestamp int64 ) {
sw . addAutoTimeseries ( wc , "scrape_duration_seconds" , am . scrapeDurationSeconds , timestamp )
2024-07-16 12:24:14 +02:00
sw . addAutoTimeseries ( wc , "scrape_response_size_bytes" , float64 ( am . scrapeResponseSize ) , timestamp )
2022-08-17 12:18:47 +02:00
if sampleLimit := sw . Config . SampleLimit ; sampleLimit > 0 {
2023-12-06 08:44:39 +01:00
// Expose scrape_samples_limit metric if sample_limit config is set for the target.
2022-08-17 12:18:47 +02:00
// See https://github.com/VictoriaMetrics/operator/issues/497
sw . addAutoTimeseries ( wc , "scrape_samples_limit" , float64 ( sampleLimit ) , timestamp )
}
2024-07-16 12:24:14 +02:00
sw . addAutoTimeseries ( wc , "scrape_samples_post_metric_relabeling" , float64 ( am . samplesPostRelabeling ) , timestamp )
sw . addAutoTimeseries ( wc , "scrape_samples_scraped" , float64 ( am . samplesScraped ) , timestamp )
sw . addAutoTimeseries ( wc , "scrape_series_added" , float64 ( am . seriesAdded ) , timestamp )
2022-08-17 12:18:47 +02:00
if sl := sw . seriesLimiter ; sl != nil {
2024-07-16 12:24:14 +02:00
sw . addAutoTimeseries ( wc , "scrape_series_current" , float64 ( sl . CurrentItems ( ) ) , timestamp )
2022-08-17 12:18:47 +02:00
sw . addAutoTimeseries ( wc , "scrape_series_limit_samples_dropped" , float64 ( am . seriesLimitSamplesDropped ) , timestamp )
sw . addAutoTimeseries ( wc , "scrape_series_limit" , float64 ( sl . MaxItems ( ) ) , timestamp )
}
2024-07-16 12:24:14 +02:00
sw . addAutoTimeseries ( wc , "scrape_timeout_seconds" , sw . Config . ScrapeTimeout . Seconds ( ) , timestamp )
sw . addAutoTimeseries ( wc , "up" , float64 ( am . up ) , timestamp )
2022-08-17 12:18:47 +02:00
}
2020-02-23 12:35:47 +01:00
// addAutoTimeseries adds automatically generated time series with the given name, value and timestamp.
//
// See https://prometheus.io/docs/concepts/jobs_instances/#automatically-generated-labels-and-time-series
func (sw *scrapeWork) addAutoTimeseries(wc *writeRequestCtx, name string, value float64, timestamp int64) {
	// Reuse sw.tmpRow to avoid a per-call allocation; all fields read downstream are overwritten here.
	sw.tmpRow.Metric = name
	sw.tmpRow.Tags = nil
	sw.tmpRow.Value = value
	sw.tmpRow.Timestamp = timestamp
	// needRelabel=false: auto metrics bypass metric_relabel_configs.
	sw.addRowToTimeseries(wc, &sw.tmpRow, timestamp, false)
}
2020-08-13 22:12:22 +02:00
// addRowToTimeseries converts the parsed row r into a time series and appends it to wc.
//
// When needRelabel is true, metric_relabel_configs are applied and clashes with
// automatically generated metric names are resolved via the `exported_` prefix.
func (sw *scrapeWork) addRowToTimeseries(wc *writeRequestCtx, r *parser.Row, timestamp int64, needRelabel bool) {
	metric := r.Metric
	// Add `exported_` prefix to metrics, which clash with the automatically generated
	// metric names only if the following conditions are met:
	//
	// - The `honor_labels` option isn't set to true in the scrape_config.
	//   If `honor_labels: true`, then the scraped metric name must remain unchanged
	//   because the user explicitly asked about it in the config.
	// - The metric has no labels (tags). If it has labels, then the metric value
	//   will be written into a separate time series comparing to automatically generated time series.
	//
	// See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3557
	// and https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3406
	if needRelabel && !sw.Config.HonorLabels && len(r.Tags) == 0 && isAutoMetric(metric) {
		bb := bbPool.Get()
		bb.B = append(bb.B, "exported_"...)
		bb.B = append(bb.B, metric...)
		metric = bytesutil.InternBytes(bb.B)
		bbPool.Put(bb)
	}
	// Remember where this row's labels start in wc.labels; everything appended
	// below labelsLen belongs to this single time series.
	labelsLen := len(wc.labels)
	targetLabels := sw.Config.Labels.GetLabels()
	wc.labels = appendLabels(wc.labels, metric, r.Tags, targetLabels, sw.Config.HonorLabels)
	if needRelabel {
		wc.labels = sw.Config.MetricRelabelConfigs.Apply(wc.labels, labelsLen)
	}
	wc.labels = promrelabel.FinalizeLabels(wc.labels[:labelsLen], wc.labels[labelsLen:])
	if len(wc.labels) == labelsLen {
		// Skip row without labels - relabeling dropped the series.
		return
	}
	// Add labels from `global->external_labels` section after the relabeling like Prometheus does.
	// See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3137
	externalLabels := sw.Config.ExternalLabels.GetLabels()
	wc.labels = appendExtraLabels(wc.labels, externalLabels, labelsLen, sw.Config.HonorLabels)
	// Use the sample's own timestamp only when honor_timestamps is set and the row carries one.
	sampleTimestamp := r.Timestamp
	if !sw.Config.HonorTimestamps || sampleTimestamp == 0 {
		sampleTimestamp = timestamp
	}
	wc.samples = append(wc.samples, prompbmarshal.Sample{
		Value:     r.Value,
		Timestamp: sampleTimestamp,
	})
	wr := &wc.writeRequest
	// The appended TimeSeries references slices of wc.labels / wc.samples rather than copies.
	wr.Timeseries = append(wr.Timeseries, prompbmarshal.TimeSeries{
		Labels:  wc.labels[labelsLen:],
		Samples: wc.samples[len(wc.samples)-1:],
	})
}
2022-11-29 03:37:06 +01:00
var bbPool bytesutil . ByteBufferPool
2020-02-23 12:35:47 +01:00
func appendLabels ( dst [ ] prompbmarshal . Label , metric string , src [ ] parser . Tag , extraLabels [ ] prompbmarshal . Label , honorLabels bool ) [ ] prompbmarshal . Label {
dstLen := len ( dst )
dst = append ( dst , prompbmarshal . Label {
Name : "__name__" ,
Value : metric ,
} )
for i := range src {
tag := & src [ i ]
dst = append ( dst , prompbmarshal . Label {
Name : tag . Key ,
Value : tag . Value ,
} )
}
2022-10-01 15:13:17 +02:00
return appendExtraLabels ( dst , extraLabels , dstLen , honorLabels )
}
// appendExtraLabels appends extraLabels to dst and returns the result,
// resolving name clashes against the labels already present in dst[offset:]
// the same way Prometheus does: a clashing existing label is renamed with the
// `exported_` prefix, unless honorLabels is set, in which case the extra label is skipped.
func appendExtraLabels(dst, extraLabels []prompbmarshal.Label, offset int, honorLabels bool) []prompbmarshal.Label {
	// Add extraLabels to labels.
	// Handle duplicates in the same way as Prometheus does.
	if len(dst) == offset {
		// Fast path - add extraLabels to dst without the need to de-duplicate.
		dst = append(dst, extraLabels...)
		return dst
	}
	// Freeze the search window: labels appended inside the loop below
	// must not themselves be considered for de-duplication.
	offsetEnd := len(dst)
	for _, label := range extraLabels {
		// Re-slice each iteration, since append may have reallocated dst.
		labels := dst[offset:offsetEnd]
		prevLabel := promrelabel.GetLabelByName(labels, label.Name)
		if prevLabel == nil {
			// Fast path - the label doesn't exist in labels, so just add it to dst.
			dst = append(dst, label)
			continue
		}
		if honorLabels {
			// Skip the extra label with the same name.
			continue
		}
		// Rename the prevLabel to "exported_" + label.Name
		// See https://prometheus.io/docs/prometheus/latest/configuration/configuration/#scrape_config
		exportedName := "exported_" + label.Name
		exportedLabel := promrelabel.GetLabelByName(labels, exportedName)
		if exportedLabel != nil {
			// The label with the name exported_<label.Name> already exists.
			// Add yet another 'exported_' prefix to it.
			exportedLabel.Name = "exported_" + exportedName
		}
		// prevLabel/exportedLabel point into dst, so these writes mutate dst in place.
		prevLabel.Name = exportedName
		dst = append(dst, label)
	}
	return dst
}