2023-10-13 13:54:33 +02:00
package remotewrite
import (
"bytes"
"context"
2023-11-08 07:53:07 +01:00
"errors"
2023-10-13 13:54:33 +02:00
"flag"
"fmt"
"io"
"net/http"
"path"
"strings"
"sync"
"time"
"github.com/golang/snappy"
2024-10-22 14:43:55 +02:00
"github.com/VictoriaMetrics/VictoriaMetrics/lib/cgroup"
2023-10-13 13:54:33 +02:00
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
2024-05-30 17:54:42 +02:00
"github.com/VictoriaMetrics/VictoriaMetrics/lib/netutil"
2023-10-13 13:54:33 +02:00
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promauth"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
"github.com/VictoriaMetrics/metrics"
)
2024-10-22 14:43:55 +02:00
// defaultConcurrency is the number of flushing workers NewClient starts
// when Config.Concurrency is not set to a positive value: twice the value
// reported by cgroup.AvailableCPUs.
var defaultConcurrency = 2 * cgroup.AvailableCPUs()
2023-10-13 13:54:33 +02:00
// Fallback values used by the remote write client.
const (
	// defaultMaxBatchSize is the fallback for Config.MaxBatchSize.
	defaultMaxBatchSize = 1e4

	// defaultMaxQueueSize is the fallback for Config.MaxQueueSize.
	defaultMaxQueueSize = 1e5

	// defaultFlushInterval is the fallback for Config.FlushInterval.
	defaultFlushInterval = 2 * time.Second

	// defaultWriteTimeout bounds the final flush performed while
	// a worker is shutting down.
	defaultWriteTimeout = 30 * time.Second
)
// Command-line flags tuning how requests are sent to -remoteWrite.url.
var (
	// disablePathAppend turns off appending '/api/v1/write' to the request URL path.
	disablePathAppend = flag.Bool(
		"remoteWrite.disablePathAppend",
		false,
		"Whether to disable automatic appending of '/api/v1/write' path to the configured -remoteWrite.url.",
	)

	// sendTimeout is the timeout of the underlying http.Client.
	sendTimeout = flag.Duration(
		"remoteWrite.sendTimeout",
		30*time.Second,
		"Timeout for sending data to the configured -remoteWrite.url.",
	)

	// retryMinInterval is the initial backoff between retries; it doubles after every attempt.
	retryMinInterval = flag.Duration(
		"remoteWrite.retryMinInterval",
		time.Second,
		"The minimum delay between retry attempts. Every next retry attempt will double the delay to prevent hammering of remote database. See also -remoteWrite.retryMaxTime",
	)

	// retryMaxTime is the total time budget for retrying a single failed request.
	retryMaxTime = flag.Duration(
		"remoteWrite.retryMaxTime",
		30*time.Second,
		"The max time spent on retry attempts for the failed remote-write request. Change this value if it is expected for remoteWrite.url to be unreachable for more than -remoteWrite.retryMaxTime. See also -remoteWrite.retryMinInterval",
	)
)
// Client pushes timeseries to a remote storage over the remote write
// protocol. Writes are asynchronous: Push only enqueues data, while
// background workers batch it and send it over HTTP.
type Client struct {
	// addr is the remote storage address with the trailing slash removed.
	addr string
	// c is the HTTP client used for sending requests.
	c *http.Client
	// authCfg, when not nil, sets auth headers on every request.
	authCfg *promauth.Config

	// input is the queue filled by Push and drained by the workers.
	input chan prompbmarshal.TimeSeries
	// flushInterval is the period of time-based flushes.
	flushInterval time.Duration
	// maxBatchSize is the number of queued series that triggers a flush.
	maxBatchSize int
	// maxQueueSize is the capacity of input.
	maxQueueSize int

	// wg tracks running worker goroutines.
	wg sync.WaitGroup
	// doneCh is closed by Close to signal the workers to shut down.
	doneCh chan struct{}
}
// Config holds the settings accepted by NewClient.
type Config struct {
	// Addr is the address of the remote storage. It must not be empty.
	Addr string
	// AuthCfg, when set, is used for setting auth headers on requests.
	AuthCfg *promauth.Config

	// Concurrency is the number of workers which concurrently
	// read from the queue and flush data.
	Concurrency int
	// MaxBatchSize is the max number of timeseries flushed at once.
	MaxBatchSize int
	// MaxQueueSize is the max length of the input queue populated
	// by the Push method. Push is rejected once the queue is full.
	MaxQueueSize int
	// FlushInterval is the time interval between batch flushes.
	FlushInterval time.Duration

	// Transport is used by the underlying http.Client.
	Transport *http.Transport
}
// NewClient returns asynchronous client for
// writing timeseries via remotewrite protocol.
//
// Non-positive MaxBatchSize, MaxQueueSize, FlushInterval and Concurrency
// values in cfg are treated as unset and replaced with package defaults.
// A nil Transport is replaced with a clone of http.DefaultTransport.
// An error is returned only when cfg.Addr is empty.
//
// Worker goroutines are started before NewClient returns. They begin
// shutting down when ctx is cancelled or when Close is called.
func NewClient(ctx context.Context, cfg Config) (*Client, error) {
	if cfg.Addr == "" {
		return nil, fmt.Errorf("config.Addr can't be empty")
	}

	// Negative values would panic later on: make(chan, n) rejects a negative
	// size and time.NewTicker rejects a non-positive interval. Treat them
	// the same way as zero, i.e. as "not configured".
	if cfg.MaxBatchSize <= 0 {
		cfg.MaxBatchSize = defaultMaxBatchSize
	}
	if cfg.MaxQueueSize <= 0 {
		cfg.MaxQueueSize = defaultMaxQueueSize
	}
	if cfg.FlushInterval <= 0 {
		cfg.FlushInterval = defaultFlushInterval
	}
	if cfg.Transport == nil {
		cfg.Transport = http.DefaultTransport.(*http.Transport).Clone()
	}
	cc := defaultConcurrency
	if cfg.Concurrency > 0 {
		cc = cfg.Concurrency
	}

	c := &Client{
		c: &http.Client{
			Timeout:   *sendTimeout,
			Transport: cfg.Transport,
		},
		addr:          strings.TrimSuffix(cfg.Addr, "/"),
		authCfg:       cfg.AuthCfg,
		flushInterval: cfg.FlushInterval,
		maxBatchSize:  cfg.MaxBatchSize,
		maxQueueSize:  cfg.MaxQueueSize,
		doneCh:        make(chan struct{}),
		input:         make(chan prompbmarshal.TimeSeries, cfg.MaxQueueSize),
	}

	// Every run call starts one background worker.
	for i := 0; i < cc; i++ {
		c.run(ctx)
	}
	return c, nil
}
// Push adds timeseries into queue for writing into remote storage.
// Push returns and error if client is stopped or if queue is full.
func ( c * Client ) Push ( s prompbmarshal . TimeSeries ) error {
2023-11-08 07:53:07 +01:00
rwTotal . Inc ( )
2023-10-13 13:54:33 +02:00
select {
case <- c . doneCh :
2023-11-08 07:53:07 +01:00
rwErrors . Inc ( )
droppedRows . Add ( len ( s . Samples ) )
2023-10-13 13:54:33 +02:00
return fmt . Errorf ( "client is closed" )
case c . input <- s :
return nil
default :
2023-11-08 07:53:07 +01:00
rwErrors . Inc ( )
droppedRows . Add ( len ( s . Samples ) )
2023-10-13 13:54:33 +02:00
return fmt . Errorf ( "failed to push timeseries - queue is full (%d entries). " +
"Queue size is controlled by -remoteWrite.maxQueueSize flag" ,
c . maxQueueSize )
}
}
// Close stops the client and waits for all goroutines
// to exit.
//
// Close returns an error if the client has been already closed or has
// never been initialized via NewClient. It isn't safe to call Close
// from multiple goroutines simultaneously.
func (c *Client) Close() error {
	if c.doneCh == nil {
		return fmt.Errorf("client is already closed")
	}
	// doneCh is never reset to nil, so a repeated Close must be detected
	// by checking whether the channel is already closed. Otherwise the
	// close calls below panic with "close of closed channel".
	select {
	case <-c.doneCh:
		return fmt.Errorf("client is already closed")
	default:
	}

	// Closing input lets workers finish draining the queue;
	// closing doneCh tells them to start shutting down.
	close(c.input)
	close(c.doneCh)
	c.wg.Wait()
	return nil
}
func ( c * Client ) run ( ctx context . Context ) {
ticker := time . NewTicker ( c . flushInterval )
wr := & prompbmarshal . WriteRequest { }
shutdown := func ( ) {
2024-03-29 14:27:50 +01:00
lastCtx , cancel := context . WithTimeout ( context . Background ( ) , defaultWriteTimeout )
logger . Infof ( "shutting down remote write client and flushing remained series" )
shutdownFlushCnt := 0
2023-10-13 13:54:33 +02:00
for ts := range c . input {
wr . Timeseries = append ( wr . Timeseries , ts )
2024-03-29 14:27:50 +01:00
if len ( wr . Timeseries ) >= c . maxBatchSize {
shutdownFlushCnt += len ( wr . Timeseries )
c . flush ( lastCtx , wr )
}
2023-10-13 13:54:33 +02:00
}
2024-03-29 14:27:50 +01:00
// flush the last batch. `flush` will re-check and avoid flushing empty batch.
shutdownFlushCnt += len ( wr . Timeseries )
2023-10-13 13:54:33 +02:00
c . flush ( lastCtx , wr )
2024-03-29 14:27:50 +01:00
logger . Infof ( "shutting down remote write client flushed %d series" , shutdownFlushCnt )
2023-10-13 13:54:33 +02:00
cancel ( )
}
c . wg . Add ( 1 )
go func ( ) {
defer c . wg . Done ( )
defer ticker . Stop ( )
for {
select {
case <- c . doneCh :
shutdown ( )
return
case <- ctx . Done ( ) :
shutdown ( )
return
case <- ticker . C :
c . flush ( ctx , wr )
case ts , ok := <- c . input :
if ! ok {
continue
}
wr . Timeseries = append ( wr . Timeseries , ts )
if len ( wr . Timeseries ) >= c . maxBatchSize {
c . flush ( ctx , wr )
}
}
}
} ( )
}
var (
2023-11-08 07:53:07 +01:00
rwErrors = metrics . NewCounter ( ` vmalert_remotewrite_errors_total ` )
rwTotal = metrics . NewCounter ( ` vmalert_remotewrite_total ` )
2023-10-13 13:54:33 +02:00
sentRows = metrics . NewCounter ( ` vmalert_remotewrite_sent_rows_total ` )
sentBytes = metrics . NewCounter ( ` vmalert_remotewrite_sent_bytes_total ` )
droppedRows = metrics . NewCounter ( ` vmalert_remotewrite_dropped_rows_total ` )
2023-11-08 07:53:07 +01:00
sendDuration = metrics . NewFloatCounter ( ` vmalert_remotewrite_send_duration_seconds_total ` )
2023-10-13 13:54:33 +02:00
bufferFlushDuration = metrics . NewHistogram ( ` vmalert_remotewrite_flush_duration_seconds ` )
_ = metrics . NewGauge ( ` vmalert_remotewrite_concurrency ` , func ( ) float64 {
return float64 ( * concurrency )
} )
)
2024-06-20 15:12:53 +02:00
// GetDroppedRows returns the current value of the droppedRows metric
// converted to int64.
func GetDroppedRows() int64 {
	dropped := droppedRows.Get()
	return int64(dropped)
}
2023-10-13 13:54:33 +02:00
// flush is a blocking function that marshals WriteRequest and sends
// it to remote-write endpoint. Flush performs limited amount of retries
// if request fails.
func ( c * Client ) flush ( ctx context . Context , wr * prompbmarshal . WriteRequest ) {
if len ( wr . Timeseries ) < 1 {
return
}
2024-01-14 22:04:45 +01:00
defer wr . Reset ( )
2023-10-13 13:54:33 +02:00
defer bufferFlushDuration . UpdateDuration ( time . Now ( ) )
2024-01-14 22:04:45 +01:00
data := wr . MarshalProtobuf ( nil )
2023-10-13 13:54:33 +02:00
b := snappy . Encode ( nil , data )
retryInterval , maxRetryInterval := * retryMinInterval , * retryMaxTime
if retryInterval > maxRetryInterval {
retryInterval = maxRetryInterval
}
timeStart := time . Now ( )
defer func ( ) {
sendDuration . Add ( time . Since ( timeStart ) . Seconds ( ) )
} ( )
L :
for attempts := 0 ; ; attempts ++ {
err := c . send ( ctx , b )
2024-05-30 17:54:42 +02:00
if err != nil && ( errors . Is ( err , io . EOF ) || netutil . IsTrivialNetworkError ( err ) ) {
2023-11-08 07:53:07 +01:00
// Something in the middle between client and destination might be closing
// the connection. So we do a one more attempt in hope request will succeed.
err = c . send ( ctx , b )
}
2023-10-13 13:54:33 +02:00
if err == nil {
sentRows . Add ( len ( wr . Timeseries ) )
sentBytes . Add ( len ( b ) )
return
}
_ , isNotRetriable := err . ( * nonRetriableError )
logger . Warnf ( "attempt %d to send request failed: %s (retriable: %v)" , attempts + 1 , err , ! isNotRetriable )
if isNotRetriable {
// exit fast if error isn't retriable
break
}
// check if request has been cancelled before backoff
select {
case <- ctx . Done ( ) :
logger . Errorf ( "interrupting retry attempt %d: context cancelled" , attempts + 1 )
break L
default :
}
timeLeftForRetries := maxRetryInterval - time . Since ( timeStart )
if timeLeftForRetries < 0 {
// the max retry time has passed, so we give up
break
}
if retryInterval > timeLeftForRetries {
retryInterval = timeLeftForRetries
}
// sleeping to prevent remote db hammering
time . Sleep ( retryInterval )
retryInterval *= 2
}
2023-11-08 07:53:07 +01:00
rwErrors . Inc ( )
2024-01-14 21:52:47 +01:00
rows := 0
for _ , ts := range wr . Timeseries {
rows += len ( ts . Samples )
}
droppedRows . Add ( rows )
2023-10-13 13:54:33 +02:00
logger . Errorf ( "attempts to send remote-write request failed - dropping %d time series" ,
len ( wr . Timeseries ) )
}
func ( c * Client ) send ( ctx context . Context , data [ ] byte ) error {
r := bytes . NewReader ( data )
2024-02-29 14:25:36 +01:00
req , err := http . NewRequestWithContext ( ctx , http . MethodPost , c . addr , r )
2023-10-13 13:54:33 +02:00
if err != nil {
return fmt . Errorf ( "failed to create new HTTP request: %w" , err )
}
// RFC standard compliant headers
req . Header . Set ( "Content-Encoding" , "snappy" )
req . Header . Set ( "Content-Type" , "application/x-protobuf" )
// Prometheus compliant headers
req . Header . Set ( "X-Prometheus-Remote-Write-Version" , "0.1.0" )
if c . authCfg != nil {
2023-10-17 11:58:19 +02:00
err = c . authCfg . SetHeaders ( req , true )
if err != nil {
2023-10-25 23:19:33 +02:00
return & nonRetriableError {
err : err ,
}
2023-10-17 11:58:19 +02:00
}
2023-10-13 13:54:33 +02:00
}
if ! * disablePathAppend {
req . URL . Path = path . Join ( req . URL . Path , "/api/v1/write" )
}
2024-02-29 14:25:36 +01:00
resp , err := c . c . Do ( req )
2023-10-13 13:54:33 +02:00
if err != nil {
return fmt . Errorf ( "error while sending request to %s: %w; Data len %d(%d)" ,
req . URL . Redacted ( ) , err , len ( data ) , r . Size ( ) )
}
defer func ( ) { _ = resp . Body . Close ( ) } ( )
body , _ := io . ReadAll ( resp . Body )
// according to https://prometheus.io/docs/concepts/remote_write_spec/
// Prometheus remote Write compatible receivers MUST
switch resp . StatusCode / 100 {
case 2 :
2023-11-08 07:53:07 +01:00
// respond with HTTP 2xx status code when write is successful.
2023-10-13 13:54:33 +02:00
return nil
case 4 :
if resp . StatusCode != http . StatusTooManyRequests {
// MUST NOT retry write requests on HTTP 4xx responses other than 429
2023-10-25 23:19:33 +02:00
return & nonRetriableError {
err : fmt . Errorf ( "unexpected response code %d for %s. Response body %q" , resp . StatusCode , req . URL . Redacted ( ) , body ) ,
}
2023-10-13 13:54:33 +02:00
}
fallthrough
default :
return fmt . Errorf ( "unexpected response code %d for %s. Response body %q" ,
resp . StatusCode , req . URL . Redacted ( ) , body )
}
}
// nonRetriableError marks an error after which the failed request
// must not be sent again.
type nonRetriableError struct {
	// err is the underlying cause.
	err error
}

// nonRetriableError must satisfy the error interface.
var _ error = (*nonRetriableError)(nil)

// Error returns the message of the underlying error.
func (nre *nonRetriableError) Error() string {
	cause := nre.err
	return cause.Error()
}