VictoriaMetrics/app/vmstorage/main.go

778 lines
31 KiB
Go
Raw Normal View History

2019-05-22 23:23:23 +02:00
package main
2019-05-22 23:16:55 +02:00
import (
"flag"
"fmt"
"net/http"
"os"
2019-05-22 23:16:55 +02:00
"strings"
"sync"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmstorage/servers"
2019-05-22 23:23:23 +02:00
"github.com/VictoriaMetrics/VictoriaMetrics/lib/buildinfo"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/envflag"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/flagutil"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/fs"
2019-05-22 23:16:55 +02:00
"github.com/VictoriaMetrics/VictoriaMetrics/lib/httpserver"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/mergeset"
2019-05-22 23:23:23 +02:00
"github.com/VictoriaMetrics/VictoriaMetrics/lib/procutil"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/protoparser/common"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/pushmetrics"
2019-05-22 23:16:55 +02:00
"github.com/VictoriaMetrics/VictoriaMetrics/lib/storage"
"github.com/VictoriaMetrics/metrics"
)
var (
retentionPeriod = flagutil.NewDuration("retentionPeriod", "1", "Data with timestamps outside the retentionPeriod is automatically deleted")
httpListenAddr = flag.String("httpListenAddr", ":8482", "Address to listen for http connections")
storageDataPath = flag.String("storageDataPath", "vmstorage-data", "Path to storage data")
vminsertAddr = flag.String("vminsertAddr", ":8400", "TCP address to accept connections from vminsert services")
vmselectAddr = flag.String("vmselectAddr", ":8401", "TCP address to accept connections from vmselect services")
snapshotAuthKey = flag.String("snapshotAuthKey", "", "authKey, which must be passed in query string to /snapshot* pages")
forceMergeAuthKey = flag.String("forceMergeAuthKey", "", "authKey, which must be passed in query string to /internal/force_merge pages")
forceFlushAuthKey = flag.String("forceFlushAuthKey", "", "authKey, which must be passed in query string to /internal/force_flush pages")
snapshotsMaxAge = flagutil.NewDuration("snapshotsMaxAge", "0", "Automatically delete snapshots older than -snapshotsMaxAge if it is set to non-zero duration. Make sure that backup process has enough time to finish the backup before the corresponding snapshot is automatically deleted")
finalMergeDelay = flag.Duration("finalMergeDelay", 0, "The delay before starting final merge for per-month partition after no new data is ingested into it. "+
"Final merge may require additional disk IO and CPU resources. Final merge may increase query speed and reduce disk space usage in some cases. "+
"Zero value disables final merge")
bigMergeConcurrency = flag.Int("bigMergeConcurrency", 0, "The maximum number of CPU cores to use for big merges. Default value is used if set to 0")
smallMergeConcurrency = flag.Int("smallMergeConcurrency", 0, "The maximum number of CPU cores to use for small merges. Default value is used if set to 0")
retentionTimezoneOffset = flag.Duration("retentionTimezoneOffset", 0, "The offset for performing indexdb rotation. "+
"If set to 0, then the indexdb rotation is performed at 4am UTC time per each -retentionPeriod. "+
"If set to 2h, then the indexdb rotation is performed at 4am EET time (the timezone with +2h offset)")
minScrapeInterval = flag.Duration("dedup.minScrapeInterval", 0, "Leave only the last sample in every time series per each discrete interval "+
"equal to -dedup.minScrapeInterval > 0. See https://docs.victoriametrics.com/#deduplication for details")
logNewSeries = flag.Bool("logNewSeries", false, "Whether to log new series. This option is for debug purposes only. It can lead to performance issues "+
"when big number of new series are ingested into VictoriaMetrics")
maxHourlySeries = flag.Int("storage.maxHourlySeries", 0, "The maximum number of unique series can be added to the storage during the last hour. "+
"Excess series are logged and dropped. This can be useful for limiting series cardinality. See also -storage.maxDailySeries")
maxDailySeries = flag.Int("storage.maxDailySeries", 0, "The maximum number of unique series can be added to the storage during the last 24 hours. "+
"Excess series are logged and dropped. This can be useful for limiting series churn rate. See also -storage.maxHourlySeries")
minFreeDiskSpaceBytes = flagutil.NewBytes("storage.minFreeDiskSpaceBytes", 10e6, "The minimum free disk space at -storageDataPath after which the storage stops accepting new data")
cacheSizeStorageTSID = flagutil.NewBytes("storage.cacheSizeStorageTSID", 0, "Overrides max size for storage/tsid cache. See https://docs.victoriametrics.com/Single-server-VictoriaMetrics.html#cache-tuning")
cacheSizeIndexDBIndexBlocks = flagutil.NewBytes("storage.cacheSizeIndexDBIndexBlocks", 0, "Overrides max size for indexdb/indexBlocks cache. See https://docs.victoriametrics.com/Single-server-VictoriaMetrics.html#cache-tuning")
cacheSizeIndexDBDataBlocks = flagutil.NewBytes("storage.cacheSizeIndexDBDataBlocks", 0, "Overrides max size for indexdb/dataBlocks cache. See https://docs.victoriametrics.com/Single-server-VictoriaMetrics.html#cache-tuning")
cacheSizeIndexDBTagFilters = flagutil.NewBytes("storage.cacheSizeIndexDBTagFilters", 0, "Overrides max size for indexdb/tagFilters cache. See https://docs.victoriametrics.com/Single-server-VictoriaMetrics.html#cache-tuning")
2019-05-22 23:16:55 +02:00
)
2019-05-22 23:23:23 +02:00
func main() {
// Write flags and help message to stdout, since it is easier to grep or pipe.
flag.CommandLine.SetOutput(os.Stdout)
flag.Usage = usage
envflag.Parse()
2019-05-22 23:23:23 +02:00
buildinfo.Init()
logger.Init()
pushmetrics.Init()
2019-05-22 23:23:23 +02:00
storage.SetDedupInterval(*minScrapeInterval)
storage.SetLogNewSeries(*logNewSeries)
storage.SetFinalMergeDelay(*finalMergeDelay)
storage.SetBigMergeWorkersCount(*bigMergeConcurrency)
storage.SetSmallMergeWorkersCount(*smallMergeConcurrency)
storage.SetRetentionTimezoneOffset(*retentionTimezoneOffset)
storage.SetFreeDiskSpaceLimit(minFreeDiskSpaceBytes.N)
storage.SetTSIDCacheSize(cacheSizeStorageTSID.N)
storage.SetTagFilterCacheSize(cacheSizeIndexDBTagFilters.N)
mergeset.SetIndexBlocksCacheSize(cacheSizeIndexDBIndexBlocks.N)
mergeset.SetDataBlocksCacheSize(cacheSizeIndexDBDataBlocks.N)
if retentionPeriod.Msecs < 24*3600*1000 {
logger.Fatalf("-retentionPeriod cannot be smaller than a day; got %s", retentionPeriod)
}
logger.Infof("opening storage at %q with -retentionPeriod=%s", *storageDataPath, retentionPeriod)
2019-05-22 23:16:55 +02:00
startTime := time.Now()
strg, err := storage.OpenStorage(*storageDataPath, retentionPeriod.Msecs, *maxHourlySeries, *maxDailySeries)
2019-05-22 23:16:55 +02:00
if err != nil {
logger.Fatalf("cannot open a storage at %s with -retentionPeriod=%s: %s", *storageDataPath, retentionPeriod, err)
2019-05-22 23:16:55 +02:00
}
initStaleSnapshotsRemover(strg)
2019-05-22 23:16:55 +02:00
var m storage.Metrics
2019-05-22 23:23:23 +02:00
strg.UpdateMetrics(&m)
2019-05-22 23:16:55 +02:00
tm := &m.TableMetrics
partsCount := tm.SmallPartsCount + tm.BigPartsCount
blocksCount := tm.SmallBlocksCount + tm.BigBlocksCount
rowsCount := tm.SmallRowsCount + tm.BigRowsCount
sizeBytes := tm.SmallSizeBytes + tm.BigSizeBytes
logger.Infof("successfully opened storage %q in %.3f seconds; partsCount: %d; blocksCount: %d; rowsCount: %d; sizeBytes: %d",
*storageDataPath, time.Since(startTime).Seconds(), partsCount, blocksCount, rowsCount, sizeBytes)
2019-05-22 23:16:55 +02:00
2019-05-22 23:23:23 +02:00
registerStorageMetrics(strg)
2019-05-22 23:16:55 +02:00
common.StartUnmarshalWorkers()
vminsertSrv, err := servers.NewVMInsertServer(*vminsertAddr, strg)
2019-05-22 23:23:23 +02:00
if err != nil {
logger.Fatalf("cannot create a server with -vminsertAddr=%s: %s", *vminsertAddr, err)
}
vmselectSrv, err := servers.NewVMSelectServer(*vmselectAddr, strg)
if err != nil {
logger.Fatalf("cannot create a server with -vmselectAddr=%s: %s", *vmselectAddr, err)
2019-05-22 23:23:23 +02:00
}
2019-05-22 23:16:55 +02:00
2019-05-22 23:23:23 +02:00
requestHandler := newRequestHandler(strg)
go func() {
httpserver.Serve(*httpListenAddr, requestHandler)
}()
2019-05-22 23:16:55 +02:00
2019-05-22 23:23:23 +02:00
sig := procutil.WaitForSigterm()
logger.Infof("service received signal %s", sig)
2019-05-22 23:16:55 +02:00
app/vmstorage: add missing shutdown for http server on graceful shutdown This could result in the following panic during graceful shutdown when `/metrics` page is requested: http: panic serving 10.101.66.5:57366: runtime error: invalid memory address or nil pointer dereference goroutine 2050 [running]: net/http.(*conn).serve.func1(0xc00ef22000) net/http/server.go:1772 +0x139 panic(0xa0fc00, 0xe91d80) runtime/panic.go:973 +0x3e3 github.com/VictoriaMetrics/VictoriaMetrics/lib/workingsetcache.(*Cache).UpdateStats(0x0, 0xc0000516c8) github.com/VictoriaMetrics/VictoriaMetrics/lib/workingsetcache/cache.go:224 +0x37 github.com/VictoriaMetrics/VictoriaMetrics/lib/storage.(*indexDB).UpdateMetrics(0xc00b931d00, 0xc02c41acf8) github.com/VictoriaMetrics/VictoriaMetrics/lib/storage/index_db.go:258 +0x9f github.com/VictoriaMetrics/VictoriaMetrics/lib/storage.(*Storage).UpdateMetrics(0xc0000bc7e0, 0xc02c41ac00) github.com/VictoriaMetrics/VictoriaMetrics/lib/storage/storage.go:413 +0x4c5 main.registerStorageMetrics.func1(0x0) github.com/VictoriaMetrics/VictoriaMetrics/app/vmstorage/main.go:186 +0xd9 main.registerStorageMetrics.func3(0xc00008c380) github.com/VictoriaMetrics/VictoriaMetrics/app/vmstorage/main.go:196 +0x26 main.registerStorageMetrics.func7(0xc) github.com/VictoriaMetrics/VictoriaMetrics/app/vmstorage/main.go:211 +0x26 github.com/VictoriaMetrics/metrics.(*Gauge).marshalTo(0xc000010148, 0xaa407d, 0x20, 0xb50d60, 0xc005319890) github.com/VictoriaMetrics/metrics@v1.11.2/gauge.go:38 +0x3f github.com/VictoriaMetrics/metrics.(*Set).WritePrometheus(0xc000084300, 0x7fd56809c940, 0xc005319860) github.com/VictoriaMetrics/metrics@v1.11.2/set.go:51 +0x1e1 github.com/VictoriaMetrics/metrics.WritePrometheus(0x7fd56809c940, 0xc005319860, 0xa16f01) github.com/VictoriaMetrics/metrics@v1.11.2/metrics.go:42 +0x41 github.com/VictoriaMetrics/VictoriaMetrics/lib/httpserver.writePrometheusMetrics(0x7fd56809c940, 0xc005319860) github.com/VictoriaMetrics/VictoriaMetrics/lib/httpserver/metrics.go:16 +0x44 github.com/VictoriaMetrics/VictoriaMetrics/lib/httpserver.handlerWrapper(0xb5a120, 0xc005319860, 0xc005018f00, 0xc00002cc90) github.com/VictoriaMetrics/VictoriaMetrics/lib/httpserver/httpserver.go:154 +0x58d github.com/VictoriaMetrics/VictoriaMetrics/lib/httpserver.gzipHandler.func1(0xb5a120, 0xc005319860, 0xc005018f00) github.com/VictoriaMetrics/VictoriaMetrics/lib/httpserver/httpserver.go:119 +0x8e net/http.HandlerFunc.ServeHTTP(0xc00002d110, 0xb5a660, 0xc0044141c0, 0xc005018f00) net/http/server.go:2012 +0x44 net/http.serverHandler.ServeHTTP(0xc004414000, 0xb5a660, 0xc0044141c0, 0xc005018f00) net/http/server.go:2807 +0xa3 net/http.(*conn).serve(0xc00ef22000, 0xb5bf60, 0xc010532080) net/http/server.go:1895 +0x86c created by net/http.(*Server).Serve net/http/server.go:2933 +0x35c
2020-04-02 20:07:59 +02:00
logger.Infof("gracefully shutting down http service at %q", *httpListenAddr)
startTime = time.Now()
if err := httpserver.Stop(*httpListenAddr); err != nil {
logger.Fatalf("cannot stop http service: %s", err)
}
logger.Infof("successfully shut down http service in %.3f seconds", time.Since(startTime).Seconds())
2019-05-22 23:23:23 +02:00
logger.Infof("gracefully shutting down the service")
startTime = time.Now()
stopStaleSnapshotsRemover()
vmselectSrv.MustStop()
vminsertSrv.MustStop()
common.StopUnmarshalWorkers()
logger.Infof("successfully shut down the service in %.3f seconds", time.Since(startTime).Seconds())
2019-05-22 23:16:55 +02:00
2019-05-22 23:23:23 +02:00
logger.Infof("gracefully closing the storage at %s", *storageDataPath)
startTime = time.Now()
strg.MustClose()
logger.Infof("successfully closed the storage in %.3f seconds", time.Since(startTime).Seconds())
2019-05-22 23:16:55 +02:00
fs.MustStopDirRemover()
2019-05-22 23:23:23 +02:00
logger.Infof("the vmstorage has been stopped")
2019-05-22 23:16:55 +02:00
}
2019-05-22 23:23:23 +02:00
func newRequestHandler(strg *storage.Storage) httpserver.RequestHandler {
return func(w http.ResponseWriter, r *http.Request) bool {
if r.URL.Path == "/" {
if r.Method != "GET" {
return false
}
fmt.Fprintf(w, "vmstorage - a component of VictoriaMetrics cluster. See docs at https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html")
return true
}
2019-05-22 23:23:23 +02:00
return requestHandler(w, r, strg)
2019-05-22 23:16:55 +02:00
}
2019-05-22 23:23:23 +02:00
}
func requestHandler(w http.ResponseWriter, r *http.Request, strg *storage.Storage) bool {
path := r.URL.Path
if path == "/internal/force_merge" {
authKey := r.FormValue("authKey")
if authKey != *forceMergeAuthKey {
httpserver.Errorf(w, r, "invalid authKey %q. It must match the value from -forceMergeAuthKey command line flag", authKey)
return true
}
// Run force merge in background
partitionNamePrefix := r.FormValue("partition_prefix")
go func() {
activeForceMerges.Inc()
defer activeForceMerges.Dec()
logger.Infof("forced merge for partition_prefix=%q has been started", partitionNamePrefix)
startTime := time.Now()
if err := strg.ForceMergePartitions(partitionNamePrefix); err != nil {
logger.Errorf("error in forced merge for partition_prefix=%q: %s", partitionNamePrefix, err)
return
}
logger.Infof("forced merge for partition_prefix=%q has been successfully finished in %.3f seconds", partitionNamePrefix, time.Since(startTime).Seconds())
}()
return true
}
if path == "/internal/force_flush" {
authKey := r.FormValue("authKey")
if authKey != *forceFlushAuthKey {
httpserver.Errorf(w, r, "invalid authKey %q. It must match the value from -forceFlushAuthKey command line flag", authKey)
return true
}
logger.Infof("flushing storage to make pending data available for reading")
strg.DebugFlush()
return true
}
2019-05-22 23:16:55 +02:00
if !strings.HasPrefix(path, "/snapshot") {
return false
}
authKey := r.FormValue("authKey")
if authKey != *snapshotAuthKey {
httpserver.Errorf(w, r, "invalid authKey %q. It must match the value from -snapshotAuthKey command line flag", authKey)
2019-05-22 23:16:55 +02:00
return true
}
path = path[len("/snapshot"):]
switch path {
case "/create":
w.Header().Set("Content-Type", "application/json")
2019-05-22 23:23:23 +02:00
snapshotPath, err := strg.CreateSnapshot()
2019-05-22 23:16:55 +02:00
if err != nil {
err = fmt.Errorf("cannot create snapshot: %w", err)
jsonResponseError(w, err)
2019-05-22 23:16:55 +02:00
return true
}
2019-05-22 23:23:23 +02:00
fmt.Fprintf(w, `{"status":"ok","snapshot":%q}`, snapshotPath)
2019-05-22 23:16:55 +02:00
return true
case "/list":
w.Header().Set("Content-Type", "application/json")
2019-05-22 23:23:23 +02:00
snapshots, err := strg.ListSnapshots()
2019-05-22 23:16:55 +02:00
if err != nil {
err = fmt.Errorf("cannot list snapshots: %w", err)
jsonResponseError(w, err)
2019-05-22 23:16:55 +02:00
return true
}
fmt.Fprintf(w, `{"status":"ok","snapshots":[`)
if len(snapshots) > 0 {
for _, snapshot := range snapshots[:len(snapshots)-1] {
fmt.Fprintf(w, "\n%q,", snapshot)
}
fmt.Fprintf(w, "\n%q\n", snapshots[len(snapshots)-1])
}
fmt.Fprintf(w, `]}`)
return true
case "/delete":
w.Header().Set("Content-Type", "application/json")
2019-05-22 23:16:55 +02:00
snapshotName := r.FormValue("snapshot")
2019-05-22 23:23:23 +02:00
if err := strg.DeleteSnapshot(snapshotName); err != nil {
err = fmt.Errorf("cannot delete snapshot %q: %w", snapshotName, err)
jsonResponseError(w, err)
2019-05-22 23:16:55 +02:00
return true
}
fmt.Fprintf(w, `{"status":"ok"}`)
return true
case "/delete_all":
w.Header().Set("Content-Type", "application/json")
2019-05-22 23:23:23 +02:00
snapshots, err := strg.ListSnapshots()
2019-05-22 23:16:55 +02:00
if err != nil {
err = fmt.Errorf("cannot list snapshots: %w", err)
jsonResponseError(w, err)
2019-05-22 23:16:55 +02:00
return true
}
for _, snapshotName := range snapshots {
2019-05-22 23:23:23 +02:00
if err := strg.DeleteSnapshot(snapshotName); err != nil {
err = fmt.Errorf("cannot delete snapshot %q: %w", snapshotName, err)
jsonResponseError(w, err)
2019-05-22 23:16:55 +02:00
return true
}
}
fmt.Fprintf(w, `{"status":"ok"}`)
return true
default:
return false
}
}
func initStaleSnapshotsRemover(strg *storage.Storage) {
staleSnapshotsRemoverCh = make(chan struct{})
if snapshotsMaxAge.Msecs <= 0 {
return
}
snapshotsMaxAgeDur := time.Duration(snapshotsMaxAge.Msecs) * time.Millisecond
staleSnapshotsRemoverWG.Add(1)
go func() {
defer staleSnapshotsRemoverWG.Done()
t := time.NewTicker(11 * time.Second)
defer t.Stop()
for {
select {
case <-staleSnapshotsRemoverCh:
return
case <-t.C:
}
if err := strg.DeleteStaleSnapshots(snapshotsMaxAgeDur); err != nil {
// Use logger.Errorf instead of logger.Fatalf in the hope the error is temporary.
logger.Errorf("cannot delete stale snapshots: %s", err)
}
}
}()
}
func stopStaleSnapshotsRemover() {
close(staleSnapshotsRemoverCh)
staleSnapshotsRemoverWG.Wait()
}
var (
staleSnapshotsRemoverCh chan struct{}
staleSnapshotsRemoverWG sync.WaitGroup
)
var activeForceMerges = metrics.NewCounter("vm_active_force_merges")
2019-05-22 23:16:55 +02:00
func registerStorageMetrics(strg *storage.Storage) {
mCache := &storage.Metrics{}
var mCacheLock sync.Mutex
var lastUpdateTime time.Time
m := func() *storage.Metrics {
mCacheLock.Lock()
defer mCacheLock.Unlock()
if time.Since(lastUpdateTime) < time.Second {
return mCache
}
var mc storage.Metrics
strg.UpdateMetrics(&mc)
mCache = &mc
lastUpdateTime = time.Now()
return mCache
}
tm := func() *storage.TableMetrics {
sm := m()
return &sm.TableMetrics
}
idbm := func() *storage.IndexDBMetrics {
sm := m()
return &sm.IndexDBMetrics
}
2020-04-01 22:43:09 +02:00
metrics.NewGauge(fmt.Sprintf(`vm_free_disk_space_bytes{path=%q}`, *storageDataPath), func() float64 {
return float64(fs.MustGetFreeSpace(*storageDataPath))
})
metrics.NewGauge(fmt.Sprintf(`vm_free_disk_space_limit_bytes{path=%q}`, *storageDataPath), func() float64 {
return float64(minFreeDiskSpaceBytes.N)
})
metrics.NewGauge(fmt.Sprintf(`vm_storage_is_read_only{path=%q}`, *storageDataPath), func() float64 {
if strg.IsReadOnly() {
return 1
}
return 0
})
2019-05-22 23:16:55 +02:00
metrics.NewGauge(`vm_active_merges{type="storage/big"}`, func() float64 {
return float64(tm().ActiveBigMerges)
})
metrics.NewGauge(`vm_active_merges{type="storage/small"}`, func() float64 {
return float64(tm().ActiveSmallMerges)
})
metrics.NewGauge(`vm_active_merges{type="indexdb"}`, func() float64 {
return float64(idbm().ActiveMerges)
})
metrics.NewGauge(`vm_merges_total{type="storage/big"}`, func() float64 {
return float64(tm().BigMergesCount)
})
metrics.NewGauge(`vm_merges_total{type="storage/small"}`, func() float64 {
return float64(tm().SmallMergesCount)
})
metrics.NewGauge(`vm_merges_total{type="indexdb"}`, func() float64 {
return float64(idbm().MergesCount)
})
metrics.NewGauge(`vm_rows_merged_total{type="storage/big"}`, func() float64 {
return float64(tm().BigRowsMerged)
})
metrics.NewGauge(`vm_rows_merged_total{type="storage/small"}`, func() float64 {
return float64(tm().SmallRowsMerged)
})
metrics.NewGauge(`vm_rows_merged_total{type="indexdb"}`, func() float64 {
return float64(idbm().ItemsMerged)
})
metrics.NewGauge(`vm_rows_deleted_total{type="storage/big"}`, func() float64 {
return float64(tm().BigRowsDeleted)
})
metrics.NewGauge(`vm_rows_deleted_total{type="storage/small"}`, func() float64 {
return float64(tm().SmallRowsDeleted)
})
metrics.NewGauge(`vm_references{type="storage/big", name="parts"}`, func() float64 {
return float64(tm().BigPartsRefCount)
})
metrics.NewGauge(`vm_references{type="storage/small", name="parts"}`, func() float64 {
return float64(tm().SmallPartsRefCount)
})
metrics.NewGauge(`vm_references{type="storage", name="partitions"}`, func() float64 {
return float64(tm().PartitionsRefCount)
})
metrics.NewGauge(`vm_references{type="indexdb", name="objects"}`, func() float64 {
return float64(idbm().IndexDBRefCount)
})
metrics.NewGauge(`vm_references{type="indexdb", name="parts"}`, func() float64 {
return float64(idbm().PartsRefCount)
})
metrics.NewGauge(`vm_new_timeseries_created_total`, func() float64 {
return float64(idbm().NewTimeseriesCreated)
})
lib/index: reduce read/write load after indexDB rotation (#2177) * lib/index: reduce read/write load after indexDB rotation IndexDB in VM is responsible for storing TSID - ID's used for identifying time series. The index is stored on disk and used by both ingestion and read path. IndexDB is stored separately to data parts and is global for all stored data. It can't be deleted partially as VM deletes data parts. Instead, indexDB is rotated once in `retention` interval. The rotation procedure means that `current` indexDB becomes `previous`, and new freshly created indexDB struct becomes `current`. So in any time, VM holds indexDB for current and previous retention periods. When time series is ingested or queried, VM checks if its TSID is present in `current` indexDB. If it is missing, it checks the `previous` indexDB. If TSID was found, it gets copied to the `current` indexDB. In this way `current` indexDB stores only series which were active during the retention period. To improve indexDB lookups, VM uses a cache layer called `tsidCache`. Both write and read path consult `tsidCache` and on miss the relad lookup happens. When rotation happens, VM resets the `tsidCache`. This is needed for ingestion path to trigger `current` indexDB re-population. Since index re-population requires additional resources, every index rotation event may cause some extra load on CPU and disk. While it may be unnoticeable for most of the cases, for systems with very high number of unique series each rotation may lead to performance degradation for some period of time. This PR makes an attempt to smooth out resource usage after the rotation. The changes are following: 1. `tsidCache` is no longer reset after the rotation; 2. Instead, each entry in `tsidCache` gains a notion of indexDB to which they belong; 3. On ingestion path after the rotation we check if requested TSID was found in `tsidCache`. Then we have 3 branches: 3.1 Fast path. It was found, and belongs to the `current` indexDB. Return TSID. 3.2 Slow path. It wasn't found, so we generate it from scratch, add to `current` indexDB, add it to `tsidCache`. 3.3 Smooth path. It was found but does not belong to the `current` indexDB. In this case, we add it to the `current` indexDB with some probability. The probability is based on time passed since the last rotation with some threshold. The more time has passed since rotation the higher is chance to re-populate `current` indexDB. The default re-population interval in this PR is set to `1h`, during which entries from `previous` index supposed to slowly re-populate `current` index. The new metric `vm_timeseries_repopulated_total` was added to identify how many TSIDs were moved from `previous` indexDB to the `current` indexDB. This metric supposed to grow only during the first `1h` after the last rotation. https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1401 Signed-off-by: hagen1778 <roman@victoriametrics.com> * wip * wip Co-authored-by: Aliaksandr Valialkin <valyala@victoriametrics.com>
2022-02-11 23:30:08 +01:00
metrics.NewGauge(`vm_timeseries_repopulated_total`, func() float64 {
return float64(idbm().TimeseriesRepopulated)
})
2019-05-22 23:16:55 +02:00
metrics.NewGauge(`vm_missing_tsids_for_metric_id_total`, func() float64 {
return float64(idbm().MissingTSIDsForMetricID)
})
metrics.NewGauge(`vm_index_blocks_with_metric_ids_processed_total`, func() float64 {
return float64(idbm().IndexBlocksWithMetricIDsProcessed)
})
metrics.NewGauge(`vm_index_blocks_with_metric_ids_incorrect_order_total`, func() float64 {
return float64(idbm().IndexBlocksWithMetricIDsIncorrectOrder)
})
metrics.NewGauge(`vm_composite_index_min_timestamp`, func() float64 {
return float64(idbm().MinTimestampForCompositeIndex) / 1e3
})
metrics.NewGauge(`vm_composite_filter_success_conversions_total`, func() float64 {
return float64(idbm().CompositeFilterSuccessConversions)
})
metrics.NewGauge(`vm_composite_filter_missing_conversions_total`, func() float64 {
return float64(idbm().CompositeFilterMissingConversions)
})
2019-05-22 23:16:55 +02:00
metrics.NewGauge(`vm_assisted_merges_total{type="storage/small"}`, func() float64 {
return float64(tm().SmallAssistedMerges)
})
metrics.NewGauge(`vm_assisted_merges_total{type="indexdb"}`, func() float64 {
return float64(idbm().AssistedMerges)
})
metrics.NewGauge(`vm_indexdb_items_added_total`, func() float64 {
return float64(idbm().ItemsAdded)
})
metrics.NewGauge(`vm_indexdb_items_added_size_bytes_total`, func() float64 {
return float64(idbm().ItemsAddedSizeBytes)
})
// See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/686
metrics.NewGauge(`vm_merge_need_free_disk_space{type="storage/small"}`, func() float64 {
return float64(tm().SmallMergeNeedFreeDiskSpace)
})
metrics.NewGauge(`vm_merge_need_free_disk_space{type="storage/big"}`, func() float64 {
return float64(tm().BigMergeNeedFreeDiskSpace)
})
2019-05-22 23:16:55 +02:00
metrics.NewGauge(`vm_pending_rows{type="storage"}`, func() float64 {
return float64(tm().PendingRows)
})
metrics.NewGauge(`vm_pending_rows{type="indexdb"}`, func() float64 {
return float64(idbm().PendingItems)
})
metrics.NewGauge(`vm_parts{type="storage/big"}`, func() float64 {
return float64(tm().BigPartsCount)
})
metrics.NewGauge(`vm_parts{type="storage/small"}`, func() float64 {
return float64(tm().SmallPartsCount)
})
metrics.NewGauge(`vm_parts{type="indexdb"}`, func() float64 {
return float64(idbm().PartsCount)
})
metrics.NewGauge(`vm_blocks{type="storage/big"}`, func() float64 {
return float64(tm().BigBlocksCount)
})
metrics.NewGauge(`vm_blocks{type="storage/small"}`, func() float64 {
return float64(tm().SmallBlocksCount)
})
metrics.NewGauge(`vm_blocks{type="indexdb"}`, func() float64 {
return float64(idbm().BlocksCount)
})
metrics.NewGauge(`vm_data_size_bytes{type="storage/big"}`, func() float64 {
return float64(tm().BigSizeBytes)
})
metrics.NewGauge(`vm_data_size_bytes{type="storage/small"}`, func() float64 {
return float64(tm().SmallSizeBytes)
})
metrics.NewGauge(`vm_data_size_bytes{type="indexdb"}`, func() float64 {
return float64(idbm().SizeBytes)
})
metrics.NewGauge(`vm_rows_added_to_storage_total`, func() float64 {
return float64(m().RowsAddedTotal)
})
metrics.NewGauge(`vm_deduplicated_samples_total{type="merge"}`, func() float64 {
return float64(m().DedupsDuringMerge)
})
metrics.NewGauge(`vm_rows_ignored_total{reason="big_timestamp"}`, func() float64 {
return float64(m().TooBigTimestampRows)
})
metrics.NewGauge(`vm_rows_ignored_total{reason="small_timestamp"}`, func() float64 {
return float64(m().TooSmallTimestampRows)
})
metrics.NewGauge(`vm_concurrent_addrows_limit_reached_total`, func() float64 {
return float64(m().AddRowsConcurrencyLimitReached)
})
metrics.NewGauge(`vm_concurrent_addrows_limit_timeout_total`, func() float64 {
return float64(m().AddRowsConcurrencyLimitTimeout)
})
metrics.NewGauge(`vm_concurrent_addrows_dropped_rows_total`, func() float64 {
return float64(m().AddRowsConcurrencyDroppedRows)
})
metrics.NewGauge(`vm_concurrent_addrows_capacity`, func() float64 {
return float64(m().AddRowsConcurrencyCapacity)
})
metrics.NewGauge(`vm_concurrent_addrows_current`, func() float64 {
return float64(m().AddRowsConcurrencyCurrent)
})
metrics.NewGauge(`vm_concurrent_search_tsids_limit_reached_total`, func() float64 {
return float64(m().SearchTSIDsConcurrencyLimitReached)
})
metrics.NewGauge(`vm_concurrent_search_tsids_limit_timeout_total`, func() float64 {
return float64(m().SearchTSIDsConcurrencyLimitTimeout)
})
metrics.NewGauge(`vm_concurrent_search_tsids_capacity`, func() float64 {
return float64(m().SearchTSIDsConcurrencyCapacity)
})
metrics.NewGauge(`vm_concurrent_search_tsids_current`, func() float64 {
return float64(m().SearchTSIDsConcurrencyCurrent)
})
metrics.NewGauge(`vm_search_delays_total`, func() float64 {
return float64(m().SearchDelays)
})
metrics.NewGauge(`vm_slow_row_inserts_total`, func() float64 {
return float64(m().SlowRowInserts)
})
metrics.NewGauge(`vm_slow_per_day_index_inserts_total`, func() float64 {
return float64(m().SlowPerDayIndexInserts)
})
metrics.NewGauge(`vm_slow_metric_name_loads_total`, func() float64 {
return float64(m().SlowMetricNameLoads)
})
metrics.NewGauge(`vm_hourly_series_limit_rows_dropped_total`, func() float64 {
return float64(m().HourlySeriesLimitRowsDropped)
})
metrics.NewGauge(`vm_daily_series_limit_rows_dropped_total`, func() float64 {
return float64(m().DailySeriesLimitRowsDropped)
})
metrics.NewGauge(`vm_timestamps_blocks_merged_total`, func() float64 {
return float64(m().TimestampsBlocksMerged)
})
metrics.NewGauge(`vm_timestamps_bytes_saved_total`, func() float64 {
return float64(m().TimestampsBytesSaved)
})
2019-05-22 23:16:55 +02:00
metrics.NewGauge(`vm_rows{type="storage/big"}`, func() float64 {
return float64(tm().BigRowsCount)
})
metrics.NewGauge(`vm_rows{type="storage/small"}`, func() float64 {
return float64(tm().SmallRowsCount)
})
metrics.NewGauge(`vm_rows{type="indexdb"}`, func() float64 {
return float64(idbm().ItemsCount)
})
metrics.NewGauge(`vm_date_range_search_calls_total`, func() float64 {
return float64(idbm().DateRangeSearchCalls)
})
metrics.NewGauge(`vm_date_range_hits_total`, func() float64 {
return float64(idbm().DateRangeSearchHits)
})
metrics.NewGauge(`vm_global_search_calls_total`, func() float64 {
return float64(idbm().GlobalSearchCalls)
})
metrics.NewGauge(`vm_missing_metric_names_for_metric_id_total`, func() float64 {
return float64(idbm().MissingMetricNamesForMetricID)
})
metrics.NewGauge(`vm_date_metric_id_cache_syncs_total`, func() float64 {
return float64(m().DateMetricIDCacheSyncsCount)
})
metrics.NewGauge(`vm_date_metric_id_cache_resets_total`, func() float64 {
return float64(m().DateMetricIDCacheResetsCount)
})
2019-05-22 23:16:55 +02:00
metrics.NewGauge(`vm_cache_entries{type="storage/tsid"}`, func() float64 {
return float64(m().TSIDCacheSize)
})
metrics.NewGauge(`vm_cache_entries{type="storage/metricIDs"}`, func() float64 {
return float64(m().MetricIDCacheSize)
})
metrics.NewGauge(`vm_cache_entries{type="storage/metricName"}`, func() float64 {
return float64(m().MetricNameCacheSize)
})
metrics.NewGauge(`vm_cache_entries{type="storage/date_metricID"}`, func() float64 {
return float64(m().DateMetricIDCacheSize)
})
metrics.NewGauge(`vm_cache_entries{type="storage/hour_metric_ids"}`, func() float64 {
return float64(m().HourMetricIDCacheSize)
})
metrics.NewGauge(`vm_cache_entries{type="storage/next_day_metric_ids"}`, func() float64 {
return float64(m().NextDayMetricIDCacheSize)
})
metrics.NewGauge(`vm_cache_entries{type="storage/indexBlocks"}`, func() float64 {
return float64(tm().IndexBlocksCacheSize)
2019-05-22 23:16:55 +02:00
})
metrics.NewGauge(`vm_cache_entries{type="indexdb/dataBlocks"}`, func() float64 {
return float64(idbm().DataBlocksCacheSize)
})
metrics.NewGauge(`vm_cache_entries{type="indexdb/indexBlocks"}`, func() float64 {
return float64(idbm().IndexBlocksCacheSize)
})
metrics.NewGauge(`vm_cache_entries{type="indexdb/tagFilters"}`, func() float64 {
return float64(idbm().TagFiltersCacheSize)
2019-05-22 23:16:55 +02:00
})
metrics.NewGauge(`vm_cache_entries{type="storage/regexps"}`, func() float64 {
return float64(storage.RegexpCacheSize())
})
metrics.NewGauge(`vm_cache_entries{type="storage/regexpPrefixes"}`, func() float64 {
return float64(storage.RegexpPrefixesCacheSize())
})
metrics.NewGauge(`vm_cache_entries{type="storage/prefetchedMetricIDs"}`, func() float64 {
return float64(m().PrefetchedMetricIDsSize)
})
2019-05-22 23:16:55 +02:00
metrics.NewGauge(`vm_cache_size_bytes{type="storage/tsid"}`, func() float64 {
return float64(m().TSIDCacheSizeBytes)
2019-05-22 23:16:55 +02:00
})
metrics.NewGauge(`vm_cache_size_bytes{type="storage/metricIDs"}`, func() float64 {
return float64(m().MetricIDCacheSizeBytes)
2019-05-22 23:16:55 +02:00
})
metrics.NewGauge(`vm_cache_size_bytes{type="storage/metricName"}`, func() float64 {
return float64(m().MetricNameCacheSizeBytes)
2019-05-22 23:16:55 +02:00
})
metrics.NewGauge(`vm_cache_size_bytes{type="storage/indexBlocks"}`, func() float64 {
return float64(tm().IndexBlocksCacheSizeBytes)
})
metrics.NewGauge(`vm_cache_size_bytes{type="indexdb/dataBlocks"}`, func() float64 {
return float64(idbm().DataBlocksCacheSizeBytes)
})
metrics.NewGauge(`vm_cache_size_bytes{type="indexdb/indexBlocks"}`, func() float64 {
return float64(idbm().IndexBlocksCacheSizeBytes)
})
metrics.NewGauge(`vm_cache_size_bytes{type="storage/date_metricID"}`, func() float64 {
return float64(m().DateMetricIDCacheSizeBytes)
})
metrics.NewGauge(`vm_cache_size_bytes{type="storage/hour_metric_ids"}`, func() float64 {
return float64(m().HourMetricIDCacheSizeBytes)
})
metrics.NewGauge(`vm_cache_size_bytes{type="storage/next_day_metric_ids"}`, func() float64 {
return float64(m().NextDayMetricIDCacheSizeBytes)
})
2019-05-22 23:16:55 +02:00
metrics.NewGauge(`vm_cache_size_bytes{type="indexdb/tagFilters"}`, func() float64 {
return float64(idbm().TagFiltersCacheSizeBytes)
2019-05-22 23:16:55 +02:00
})
metrics.NewGauge(`vm_cache_size_bytes{type="storage/regexps"}`, func() float64 {
return float64(storage.RegexpCacheSizeBytes())
})
metrics.NewGauge(`vm_cache_size_bytes{type="storage/regexpPrefixes"}`, func() float64 {
return float64(storage.RegexpPrefixesCacheSizeBytes())
})
metrics.NewGauge(`vm_cache_size_bytes{type="storage/prefetchedMetricIDs"}`, func() float64 {
return float64(m().PrefetchedMetricIDsSizeBytes)
})
2019-05-22 23:16:55 +02:00
metrics.NewGauge(`vm_cache_size_max_bytes{type="storage/tsid"}`, func() float64 {
return float64(m().TSIDCacheSizeMaxBytes)
})
metrics.NewGauge(`vm_cache_size_max_bytes{type="storage/metricIDs"}`, func() float64 {
return float64(m().MetricIDCacheSizeMaxBytes)
})
metrics.NewGauge(`vm_cache_size_max_bytes{type="storage/metricName"}`, func() float64 {
return float64(m().MetricNameCacheSizeMaxBytes)
})
metrics.NewGauge(`vm_cache_size_max_bytes{type="storage/indexBlocks"}`, func() float64 {
return float64(tm().IndexBlocksCacheSizeMaxBytes)
})
metrics.NewGauge(`vm_cache_size_max_bytes{type="indexdb/dataBlocks"}`, func() float64 {
return float64(idbm().DataBlocksCacheSizeMaxBytes)
})
metrics.NewGauge(`vm_cache_size_max_bytes{type="indexdb/indexBlocks"}`, func() float64 {
return float64(idbm().IndexBlocksCacheSizeMaxBytes)
})
metrics.NewGauge(`vm_cache_size_max_bytes{type="indexdb/tagFilters"}`, func() float64 {
return float64(idbm().TagFiltersCacheSizeMaxBytes)
})
metrics.NewGauge(`vm_cache_size_max_bytes{type="storage/regexps"}`, func() float64 {
return float64(storage.RegexpCacheMaxSizeBytes())
})
metrics.NewGauge(`vm_cache_size_max_bytes{type="storage/regexpPrefixes"}`, func() float64 {
return float64(storage.RegexpPrefixesCacheMaxSizeBytes())
})
2019-05-22 23:16:55 +02:00
metrics.NewGauge(`vm_cache_requests_total{type="storage/tsid"}`, func() float64 {
return float64(m().TSIDCacheRequests)
})
metrics.NewGauge(`vm_cache_requests_total{type="storage/metricIDs"}`, func() float64 {
return float64(m().MetricIDCacheRequests)
})
metrics.NewGauge(`vm_cache_requests_total{type="storage/metricName"}`, func() float64 {
return float64(m().MetricNameCacheRequests)
})
metrics.NewGauge(`vm_cache_requests_total{type="storage/indexBlocks"}`, func() float64 {
return float64(tm().IndexBlocksCacheRequests)
2019-05-22 23:16:55 +02:00
})
metrics.NewGauge(`vm_cache_requests_total{type="indexdb/dataBlocks"}`, func() float64 {
return float64(idbm().DataBlocksCacheRequests)
})
metrics.NewGauge(`vm_cache_requests_total{type="indexdb/indexBlocks"}`, func() float64 {
return float64(idbm().IndexBlocksCacheRequests)
})
metrics.NewGauge(`vm_cache_requests_total{type="indexdb/tagFilters"}`, func() float64 {
return float64(idbm().TagFiltersCacheRequests)
2019-05-22 23:16:55 +02:00
})
metrics.NewGauge(`vm_cache_requests_total{type="storage/regexps"}`, func() float64 {
return float64(storage.RegexpCacheRequests())
})
metrics.NewGauge(`vm_cache_requests_total{type="storage/regexpPrefixes"}`, func() float64 {
return float64(storage.RegexpPrefixesCacheRequests())
})
2019-05-22 23:16:55 +02:00
metrics.NewGauge(`vm_cache_misses_total{type="storage/tsid"}`, func() float64 {
return float64(m().TSIDCacheMisses)
})
metrics.NewGauge(`vm_cache_misses_total{type="storage/metricIDs"}`, func() float64 {
return float64(m().MetricIDCacheMisses)
})
metrics.NewGauge(`vm_cache_misses_total{type="storage/metricName"}`, func() float64 {
return float64(m().MetricNameCacheMisses)
})
metrics.NewGauge(`vm_cache_misses_total{type="storage/indexBlocks"}`, func() float64 {
return float64(tm().IndexBlocksCacheMisses)
2019-05-22 23:16:55 +02:00
})
metrics.NewGauge(`vm_cache_misses_total{type="indexdb/dataBlocks"}`, func() float64 {
return float64(idbm().DataBlocksCacheMisses)
})
metrics.NewGauge(`vm_cache_misses_total{type="indexdb/indexBlocks"}`, func() float64 {
return float64(idbm().IndexBlocksCacheMisses)
})
metrics.NewGauge(`vm_cache_misses_total{type="indexdb/tagFilters"}`, func() float64 {
return float64(idbm().TagFiltersCacheMisses)
2019-05-22 23:16:55 +02:00
})
metrics.NewGauge(`vm_cache_misses_total{type="storage/regexps"}`, func() float64 {
return float64(storage.RegexpCacheMisses())
})
metrics.NewGauge(`vm_cache_misses_total{type="storage/regexpPrefixes"}`, func() float64 {
return float64(storage.RegexpPrefixesCacheMisses())
})
2019-05-22 23:16:55 +02:00
metrics.NewGauge(`vm_deleted_metrics_total{type="indexdb"}`, func() float64 {
return float64(idbm().DeletedMetricsCount)
})
metrics.NewGauge(`vm_cache_collisions_total{type="storage/tsid"}`, func() float64 {
return float64(m().TSIDCacheCollisions)
})
metrics.NewGauge(`vm_cache_collisions_total{type="storage/metricName"}`, func() float64 {
return float64(m().MetricNameCacheCollisions)
})
metrics.NewGauge(`vm_next_retention_seconds`, func() float64 {
return float64(m().NextRetentionSeconds)
})
2019-05-22 23:16:55 +02:00
}
func jsonResponseError(w http.ResponseWriter, err error) {
logger.Errorf("%s", err)
w.WriteHeader(http.StatusInternalServerError)
fmt.Fprintf(w, `{"status":"error","msg":%q}`, err)
}
func usage() {
const s = `
vmstorage stores time series data obtained from vminsert and returns the requested data to vmselect.
See the docs at https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html .
`
flagutil.Usage(s)
}