2020-04-27 23:19:27 +02:00
package remotewrite
import (
"bytes"
"context"
2022-05-13 15:19:32 +02:00
"flag"
2020-04-27 23:19:27 +02:00
"fmt"
2022-08-21 23:13:44 +02:00
"io"
2020-04-27 23:19:27 +02:00
"net/http"
2021-09-16 13:00:16 +02:00
"path"
2020-04-27 23:19:27 +02:00
"strings"
"sync"
"time"
2021-09-14 13:32:06 +02:00
"github.com/golang/snappy"
2020-04-27 23:19:27 +02:00
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
2021-09-14 13:32:06 +02:00
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promauth"
2020-04-27 23:19:27 +02:00
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
2020-07-05 17:46:52 +02:00
"github.com/VictoriaMetrics/metrics"
2020-04-27 23:19:27 +02:00
)
2022-05-13 15:19:32 +02:00
// Command-line flags that tune the remote-write client.
var (
	// disablePathAppend switches off the automatic '/api/v1/write' suffix
	// that send joins to the request URL path.
	disablePathAppend = flag.Bool("remoteWrite.disablePathAppend", false, "Whether to disable automatic appending of '/api/v1/write' path to the configured -remoteWrite.url.")
	// sendTimeout is used as the timeout of the underlying http.Client.
	sendTimeout = flag.Duration("remoteWrite.sendTimeout", 30*time.Second, "Timeout for sending data to the configured -remoteWrite.url.")
	// retryMinInterval is the initial backoff between retries in flush;
	// it is doubled after every failed attempt.
	retryMinInterval = flag.Duration("remoteWrite.retryMinInterval", time.Second, "The minimum delay between retry attempts. Every next retry attempt will double the delay to prevent hammering of remote database. See also -remoteWrite.retryMaxTime")
	// retryMaxTime bounds the total time flush spends retrying one request.
	retryMaxTime = flag.Duration("remoteWrite.retryMaxTime", time.Second*30, "The max time spent on retry attempts for the failed remote-write request. Change this value if it is expected for remoteWrite.url to be unreachable for more than -remoteWrite.retryMaxTime. See also -remoteWrite.retryMinInterval")
)
2020-04-27 23:19:27 +02:00
// Client is an asynchronous HTTP client for writing
// timeseries via remote write protocol.
type Client struct {
2022-05-13 15:19:32 +02:00
addr string
c * http . Client
authCfg * promauth . Config
input chan prompbmarshal . TimeSeries
flushInterval time . Duration
maxBatchSize int
maxQueueSize int
2020-04-27 23:19:27 +02:00
wg sync . WaitGroup
doneCh chan struct { }
}
2020-04-28 10:19:37 +02:00
// Config is config for remote write.
2020-04-27 23:19:27 +02:00
type Config struct {
// Addr of remote storage
2021-09-14 13:32:06 +02:00
Addr string
AuthCfg * promauth . Config
2020-04-27 23:19:27 +02:00
2020-06-01 12:46:37 +02:00
// Concurrency defines number of readers that
// concurrently read from the queue and flush data
Concurrency int
2020-04-27 23:19:27 +02:00
// MaxBatchSize defines max number of timeseries
// to be flushed at once
MaxBatchSize int
// MaxQueueSize defines max length of input queue
2020-06-01 12:46:37 +02:00
// populated by Push method.
// Push will be rejected once queue is full.
2020-04-27 23:19:27 +02:00
MaxQueueSize int
// FlushInterval defines time interval for flushing batches
FlushInterval time . Duration
2020-06-23 21:45:45 +02:00
// Transport will be used by the underlying http.Client
Transport * http . Transport
2020-04-27 23:19:27 +02:00
}
// Defaults applied by NewClient for Config fields that are left unset.
const (
	// defaultConcurrency is the number of worker goroutines.
	defaultConcurrency = 4
	// defaultMaxBatchSize is the number of series per flush.
	defaultMaxBatchSize = 1e3
	// defaultMaxQueueSize is the capacity of the input queue.
	defaultMaxQueueSize = 1e5
	// defaultFlushInterval is the period between time-based flushes.
	defaultFlushInterval = 5 * time.Second
	// defaultWriteTimeout bounds the final flush performed on shutdown.
	defaultWriteTimeout = 30 * time.Second
)
// NewClient returns asynchronous client for
// writing timeseries via remotewrite protocol.
func NewClient ( ctx context . Context , cfg Config ) ( * Client , error ) {
if cfg . Addr == "" {
return nil , fmt . Errorf ( "config.Addr can't be empty" )
}
if cfg . MaxBatchSize == 0 {
cfg . MaxBatchSize = defaultMaxBatchSize
}
if cfg . MaxQueueSize == 0 {
cfg . MaxQueueSize = defaultMaxQueueSize
}
if cfg . FlushInterval == 0 {
cfg . FlushInterval = defaultFlushInterval
}
2020-07-05 17:46:52 +02:00
if cfg . Transport == nil {
cfg . Transport = http . DefaultTransport . ( * http . Transport ) . Clone ( )
}
2021-09-16 13:00:16 +02:00
cc := defaultConcurrency
if cfg . Concurrency > 0 {
cc = cfg . Concurrency
}
2020-04-27 23:19:27 +02:00
c := & Client {
c : & http . Client {
2022-12-01 09:57:19 +01:00
Timeout : * sendTimeout ,
2020-06-23 21:45:45 +02:00
Transport : cfg . Transport ,
2020-04-27 23:19:27 +02:00
} ,
2022-05-13 15:19:32 +02:00
addr : strings . TrimSuffix ( cfg . Addr , "/" ) ,
authCfg : cfg . AuthCfg ,
flushInterval : cfg . FlushInterval ,
maxBatchSize : cfg . MaxBatchSize ,
maxQueueSize : cfg . MaxQueueSize ,
doneCh : make ( chan struct { } ) ,
input : make ( chan prompbmarshal . TimeSeries , cfg . MaxQueueSize ) ,
2020-04-27 23:19:27 +02:00
}
2021-09-16 13:00:16 +02:00
2020-06-01 12:46:37 +02:00
for i := 0 ; i < cc ; i ++ {
c . run ( ctx )
}
2020-04-27 23:19:27 +02:00
return c , nil
}
// Push adds timeseries into queue for writing into remote storage.
// Push returns and error if client is stopped or if queue is full.
func ( c * Client ) Push ( s prompbmarshal . TimeSeries ) error {
select {
case <- c . doneCh :
return fmt . Errorf ( "client is closed" )
case c . input <- s :
return nil
default :
2020-05-13 20:32:21 +02:00
return fmt . Errorf ( "failed to push timeseries - queue is full (%d entries). " +
"Queue size is controlled by -remoteWrite.maxQueueSize flag" ,
2020-04-27 23:19:27 +02:00
c . maxQueueSize )
}
}
// Close stops the client and waits for all goroutines
// to exit.
//
// It returns an error if the client was never initialized or has already
// been closed. Close is not safe for concurrent use with itself.
func (c *Client) Close() error {
	if c.doneCh == nil {
		return fmt.Errorf("client is already closed")
	}
	// doneCh is never reset to nil, so a previous Close is detected by
	// checking whether the channel is already closed. Without this check
	// a second Close would panic on closing a closed channel.
	select {
	case <-c.doneCh:
		return fmt.Errorf("client is already closed")
	default:
	}
	close(c.input)
	close(c.doneCh)
	c.wg.Wait()
	return nil
}
func ( c * Client ) run ( ctx context . Context ) {
ticker := time . NewTicker ( c . flushInterval )
2020-07-05 17:46:52 +02:00
wr := & prompbmarshal . WriteRequest { }
2020-04-27 23:19:27 +02:00
shutdown := func ( ) {
for ts := range c . input {
wr . Timeseries = append ( wr . Timeseries , ts )
}
2020-06-01 12:46:37 +02:00
lastCtx , cancel := context . WithTimeout ( context . Background ( ) , defaultWriteTimeout )
2023-06-22 15:07:32 +02:00
logger . Infof ( "shutting down remote write client and flushing remained %d series" , len ( wr . Timeseries ) )
2020-04-27 23:19:27 +02:00
c . flush ( lastCtx , wr )
cancel ( )
}
c . wg . Add ( 1 )
go func ( ) {
defer c . wg . Done ( )
defer ticker . Stop ( )
for {
select {
case <- c . doneCh :
shutdown ( )
return
case <- ctx . Done ( ) :
shutdown ( )
return
case <- ticker . C :
c . flush ( ctx , wr )
2020-07-05 17:46:52 +02:00
case ts , ok := <- c . input :
if ! ok {
continue
}
2020-04-27 23:19:27 +02:00
wr . Timeseries = append ( wr . Timeseries , ts )
if len ( wr . Timeseries ) >= c . maxBatchSize {
c . flush ( ctx , wr )
}
}
}
} ( )
}
2020-07-05 17:46:52 +02:00
var (
2021-09-16 13:00:16 +02:00
sentRows = metrics . NewCounter ( ` vmalert_remotewrite_sent_rows_total ` )
sentBytes = metrics . NewCounter ( ` vmalert_remotewrite_sent_bytes_total ` )
droppedRows = metrics . NewCounter ( ` vmalert_remotewrite_dropped_rows_total ` )
droppedBytes = metrics . NewCounter ( ` vmalert_remotewrite_dropped_bytes_total ` )
bufferFlushDuration = metrics . NewHistogram ( ` vmalert_remotewrite_flush_duration_seconds ` )
2020-07-05 17:46:52 +02:00
)
// flush is a blocking function that marshals WriteRequest and sends
2023-05-16 18:51:38 +02:00
// it to remote-write endpoint. Flush performs limited amount of retries
2020-07-05 17:46:52 +02:00
// if request fails.
func ( c * Client ) flush ( ctx context . Context , wr * prompbmarshal . WriteRequest ) {
2020-04-27 23:19:27 +02:00
if len ( wr . Timeseries ) < 1 {
return
}
2020-07-05 17:46:52 +02:00
defer prompbmarshal . ResetWriteRequest ( wr )
2021-09-16 13:00:16 +02:00
defer bufferFlushDuration . UpdateDuration ( time . Now ( ) )
2020-07-05 17:46:52 +02:00
2020-04-27 23:19:27 +02:00
data , err := wr . Marshal ( )
if err != nil {
logger . Errorf ( "failed to marshal WriteRequest: %s" , err )
return
}
2020-07-05 17:46:52 +02:00
b := snappy . Encode ( nil , data )
2023-06-22 15:14:23 +02:00
retryInterval , maxRetryInterval := * retryMinInterval , * retryMaxTime
if retryInterval > maxRetryInterval {
retryInterval = maxRetryInterval
}
timeStart := time . Now ( )
L :
for attempts := 0 ; ; attempts ++ {
2020-07-05 17:46:52 +02:00
err := c . send ( ctx , b )
if err == nil {
sentRows . Add ( len ( wr . Timeseries ) )
sentBytes . Add ( len ( b ) )
return
}
2023-06-20 13:24:45 +02:00
_ , isNotRetriable := err . ( * nonRetriableError )
logger . Warnf ( "attempt %d to send request failed: %s (retriable: %v)" , attempts + 1 , err , ! isNotRetriable )
2023-05-16 18:51:38 +02:00
2023-06-20 13:24:45 +02:00
if isNotRetriable {
2023-05-16 18:51:38 +02:00
// exit fast if error isn't retriable
break
}
// check if request has been cancelled before backoff
select {
case <- ctx . Done ( ) :
2023-06-22 15:07:32 +02:00
logger . Errorf ( "interrupting retry attempt %d: context cancelled" , attempts + 1 )
break L
2023-05-16 18:51:38 +02:00
default :
2023-05-16 16:30:03 +02:00
}
2023-05-16 18:51:38 +02:00
2023-06-22 15:14:23 +02:00
timeLeftForRetries := maxRetryInterval - time . Since ( timeStart )
if timeLeftForRetries < 0 {
// the max retry time has passed, so we give up
break
}
if retryInterval > timeLeftForRetries {
retryInterval = timeLeftForRetries
}
// sleeping to prevent remote db hammering
time . Sleep ( retryInterval )
retryInterval *= 2
2020-07-05 17:46:52 +02:00
}
droppedRows . Add ( len ( wr . Timeseries ) )
droppedBytes . Add ( len ( b ) )
2023-05-16 18:51:38 +02:00
logger . Errorf ( "attempts to send remote-write request failed - dropping %d time series" ,
len ( wr . Timeseries ) )
2020-07-05 17:46:52 +02:00
}
func ( c * Client ) send ( ctx context . Context , data [ ] byte ) error {
r := bytes . NewReader ( data )
2023-02-23 03:53:05 +01:00
req , err := http . NewRequest ( http . MethodPost , c . addr , r )
2020-04-27 23:19:27 +02:00
if err != nil {
2020-07-15 12:54:45 +02:00
return fmt . Errorf ( "failed to create new HTTP request: %w" , err )
2020-04-27 23:19:27 +02:00
}
2022-06-07 14:33:21 +02:00
2022-06-13 08:59:03 +02:00
// RFC standard compliant headers
2022-06-07 14:33:21 +02:00
req . Header . Set ( "Content-Encoding" , "snappy" )
2022-06-13 08:59:03 +02:00
req . Header . Set ( "Content-Type" , "application/x-protobuf" )
// Prometheus compliant headers
req . Header . Set ( "X-Prometheus-Remote-Write-Version" , "0.1.0" )
2022-06-07 14:33:21 +02:00
2021-09-14 13:32:06 +02:00
if c . authCfg != nil {
2022-06-22 19:38:43 +02:00
c . authCfg . SetHeaders ( req , true )
2020-04-27 23:19:27 +02:00
}
2022-05-13 15:19:32 +02:00
if ! * disablePathAppend {
req . URL . Path = path . Join ( req . URL . Path , "/api/v1/write" )
2021-08-16 13:20:57 +02:00
}
2020-04-27 23:19:27 +02:00
resp , err := c . c . Do ( req . WithContext ( ctx ) )
if err != nil {
2020-07-15 12:54:45 +02:00
return fmt . Errorf ( "error while sending request to %s: %w; Data len %d(%d)" ,
2021-10-18 09:20:26 +02:00
req . URL . Redacted ( ) , err , len ( data ) , r . Size ( ) )
2020-04-27 23:19:27 +02:00
}
defer func ( ) { _ = resp . Body . Close ( ) } ( )
2023-05-16 18:51:38 +02:00
2023-05-16 16:30:03 +02:00
body , _ := io . ReadAll ( resp . Body )
2023-05-16 18:51:38 +02:00
// according to https://prometheus.io/docs/concepts/remote_write_spec/
// Prometheus remote Write compatible receivers MUST
2023-05-16 16:30:03 +02:00
switch resp . StatusCode / 100 {
case 2 :
2023-05-16 18:51:38 +02:00
// respond with a HTTP 2xx status code when the write is successful.
2023-05-16 16:30:03 +02:00
return nil
2023-06-20 13:24:45 +02:00
case 4 :
if resp . StatusCode != http . StatusTooManyRequests {
// MUST NOT retry write requests on HTTP 4xx responses other than 429
return & nonRetriableError { fmt . Errorf ( "unexpected response code %d for %s. Response body %q" ,
resp . StatusCode , req . URL . Redacted ( ) , body ) }
}
fallthrough
2023-05-16 16:30:03 +02:00
default :
2020-07-05 17:46:52 +02:00
return fmt . Errorf ( "unexpected response code %d for %s. Response body %q" ,
2021-10-18 09:20:26 +02:00
resp . StatusCode , req . URL . Redacted ( ) , body )
2020-04-27 23:19:27 +02:00
}
2023-05-16 16:30:03 +02:00
}
2023-06-20 13:24:45 +02:00
// nonRetriableError marks an error after which the request
// must not be retried.
type nonRetriableError struct {
	// err is the underlying cause.
	err error
}

// Error returns the message of the underlying error.
func (e *nonRetriableError) Error() string {
	return e.err.Error()
}

// Unwrap returns the underlying error so that errors.Is and errors.As
// can inspect the cause through the wrapper.
func (e *nonRetriableError) Unwrap() error {
	return e.err
}