From 0d3f31f60ef2b81fc132c37cc03a90d5baa75aad Mon Sep 17 00:00:00 2001 From: Aliaksandr Valialkin Date: Mon, 27 Feb 2023 12:57:22 -0800 Subject: [PATCH] lib/storage: follow-up for 39cdc546ddf794a56a306134ff814e2e3a0ea39a - Use flag.Duration instead of flagutil.Duration for -snapshotCreateTimeout, since the flagutil.Duration is intended mostly for big durations, e.g. days, months and years, while the -snapshotCreateTimeout is usually smaller than one hour. - Add links to https://docs.victoriametrics.com/#how-to-work-with-snapshots in docs/CHANGELOG.md, so readers could easily find the corresponding docs when reading the changelog. - Properly remove all the created directories on unsuccessful attempt to create snapshot in Storage.CreateSnapshot(). Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3551 --- README.md | 2 ++ app/vmstorage/main.go | 11 +++---- docs/CHANGELOG.md | 6 ++-- docs/Cluster-VictoriaMetrics.md | 5 ++-- docs/README.md | 2 ++ docs/Single-server-VictoriaMetrics.md | 5 ++-- lib/mergeset/table.go | 11 ++++--- lib/storage/partition.go | 3 ++ lib/storage/storage.go | 42 +++++++++++++++------------ lib/storage/table.go | 13 +++++---- 10 files changed, 57 insertions(+), 43 deletions(-) diff --git a/README.md b/README.md index 23cdc1e39..e357108c5 100644 --- a/README.md +++ b/README.md @@ -2481,6 +2481,8 @@ Pass `-help` to VictoriaMetrics in order to see the list of supported command-li The maximum number of CPU cores to use for small merges. Default value is used if set to 0 -snapshotAuthKey string authKey, which must be passed in query string to /snapshot* pages + -snapshotCreateTimeout duration + The timeout for creating new snapshot. If set, make sure that timeout is lower than backup period -snapshotsMaxAge value Automatically delete snapshots older than -snapshotsMaxAge if it is set to non-zero duration. Make sure that backup process has enough time to finish the backup before the corresponding snapshot is automatically deleted The following optional suffixes are supported: h (hour), d (day), w (week), y (year). If suffix isn't set, then the duration is counted in months (default 0) diff --git a/app/vmstorage/main.go b/app/vmstorage/main.go index 9a470b1c1..948ce2c86 100644 --- a/app/vmstorage/main.go +++ b/app/vmstorage/main.go @@ -28,7 +28,7 @@ var ( forceMergeAuthKey = flag.String("forceMergeAuthKey", "", "authKey, which must be passed in query string to /internal/force_merge pages") forceFlushAuthKey = flag.String("forceFlushAuthKey", "", "authKey, which must be passed in query string to /internal/force_flush pages") snapshotsMaxAge = flagutil.NewDuration("snapshotsMaxAge", "0", "Automatically delete snapshots older than -snapshotsMaxAge if it is set to non-zero duration. Make sure that backup process has enough time to finish the backup before the corresponding snapshot is automatically deleted") - snapshotCreateTimeout = flagutil.NewDuration("snapshotCreateTimeout", "0", "Defines timeout value for process of creating new snapshot if it is set to non-zero duration. If set, make sure that timeout is lower than backup period.") + snapshotCreateTimeout = flag.Duration("snapshotCreateTimeout", 0, "The timeout for creating new snapshot. If set, make sure that timeout is lower than backup period") precisionBits = flag.Int("precisionBits", 64, "The number of precision bits to store per each value. Lower precision bits improves data compression at the cost of precision loss") @@ -293,8 +293,8 @@ func RequestHandler(w http.ResponseWriter, r *http.Request) bool { snapshotsCreateTotal.Inc() w.Header().Set("Content-Type", "application/json") deadline := uint64(0) - if snapshotCreateTimeout.Msecs > 0 { - deadline = fasttime.UnixTimestamp() + uint64(snapshotCreateTimeout.Msecs/1e3) + if *snapshotCreateTimeout > 0 { + deadline = fasttime.UnixTimestamp() + uint64(snapshotCreateTimeout.Seconds()) } snapshotPath, err := Storage.CreateSnapshot(deadline) if err != nil { @@ -353,7 +353,7 @@ func RequestHandler(w http.ResponseWriter, r *http.Request) bool { } } - err = fmt.Errorf("cannot find snapshot %q: %w", snapshotName, err) + err = fmt.Errorf("cannot find snapshot %q", snapshotName) jsonResponseError(w, err) return true case "/delete_all": @@ -417,7 +417,8 @@ var ( ) var ( - activeForceMerges = metrics.NewCounter("vm_active_force_merges") + activeForceMerges = metrics.NewCounter("vm_active_force_merges") + snapshotsCreateTotal = metrics.NewCounter(`vm_http_requests_total{path="/snapshot/create"}`) snapshotsCreateErrorsTotal = metrics.NewCounter(`vm_http_request_errors_total{path="/snapshot/create"}`) diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md index 433cb0fdf..479f75ed0 100644 --- a/docs/CHANGELOG.md +++ b/docs/CHANGELOG.md @@ -15,15 +15,15 @@ The following tip changes can be tested by building VictoriaMetrics components f ## tip -* FEATURE: add `-snapshotCreateTimeout` flag to allow configuring timeout for snapshot process. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3551). -* FEATURE: expose `vm_http_requests_total` and `vm_http_request_errors_total` metrics for `snapshot/*` paths at [VictoriaMetrics cluster](https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html) `vmstorage` and [VictoriaMetrics Single](https://docs.victoriametrics.com/Single-server-VictoriaMetrics.html). See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3551). +* FEATURE: add `-snapshotCreateTimeout` flag to allow configuring timeout for [snapshot process](https://docs.victoriametrics.com/#how-to-work-with-snapshots). See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3551). +* FEATURE: expose `vm_http_requests_total` and `vm_http_request_errors_total` metrics for `snapshot/*` [paths](https://docs.victoriametrics.com/#how-to-work-with-snapshots) at [VictoriaMetrics cluster](https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html) `vmstorage` and [VictoriaMetrics Single](https://docs.victoriametrics.com/Single-server-VictoriaMetrics.html). See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3551). * FEATURE: [vmgateway](https://docs.victoriametrics.com/vmgateway.html): add the ability to discover keys for JWT verification via [OpenID discovery endpoint](https://openid.net/specs/openid-connect-discovery-1_0.html). See [these docs](https://docs.victoriametrics.com/vmgateway.html#using-openid-discovery-endpoint-for-jwt-signature-verification). * BUGFIX: [MetricsQL](https://docs.victoriametrics.com/MetricsQL.html): fix panic when executing the query `aggr_func(rollup*(some_value))`. The panic has been introduced in [v1.88.0](https://docs.victoriametrics.com/CHANGELOG.html#v1880). * BUGFIX: [vmagent](https://docs.victoriametrics.com/vmagent.html): use the provided `-remoteWrite.*` auth options when determining whether the remote storage supports [VictoriaMetrics remote write protocol](https://docs.victoriametrics.com/vmagent.html#victoriametrics-remote-write-protocol). Previously the auth options were ignored. This was preventing from automatic switch to VictoriaMetrics remote write protocol. * BUGFIX: [vmagent](https://docs.victoriametrics.com/vmagent.html): do not register `vm_promscrape_config_*` metrics if `-promscrape.config` flag is not used. Previously those metrics were registered and never updated, which was confusing and could trigger false-positive alerts. * BUGFIX: [vmctl](https://docs.victoriametrics.com/vmctl.html): skip measurements with no fields when migrating data from influxdb. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3837). -* BUGFIX: delete failed snapshot contents from disk when creating snapshot fails. Previously failed snapshot contents could remain on disk in incomplete state. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3858) +* BUGFIX: delete failed snapshot contents from disk on failed attempt to [create snapshot](https://docs.victoriametrics.com/#how-to-work-with-snapshots). Previously failed snapshot contents could remain on disk in incomplete state. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3858) ## [v1.88.0](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.88.0) diff --git a/docs/Cluster-VictoriaMetrics.md b/docs/Cluster-VictoriaMetrics.md index 30c4abcf2..29ca1819e 100644 --- a/docs/Cluster-VictoriaMetrics.md +++ b/docs/Cluster-VictoriaMetrics.md @@ -1361,9 +1361,8 @@ Below is the output for `/path/to/vmstorage -help`: The maximum number of CPU cores to use for small merges. Default value is used if set to 0 -snapshotAuthKey string authKey, which must be passed in query string to /snapshot* pages - -snapshotCreateTimeout value - Defines timeout value for process of creating new snapshot if it is set to non-zero duration. If set, make sure that timeout is lower than backup period. - The following optional suffixes are supported: h (hour), d (day), w (week), y (year). If suffix isn't set, then the duration is counted in months (default 0) + -snapshotCreateTimeout duration + The timeout for creating new snapshot. If set, make sure that timeout is lower than backup period -snapshotsMaxAge value Automatically delete snapshots older than -snapshotsMaxAge if it is set to non-zero duration. Make sure that backup process has enough time to finish the backup before the corresponding snapshot is automatically deleted The following optional suffixes are supported: h (hour), d (day), w (week), y (year). If suffix isn't set, then the duration is counted in months (default 0) diff --git a/docs/README.md b/docs/README.md index c86ab500e..7e17ca5a5 100644 --- a/docs/README.md +++ b/docs/README.md @@ -2482,6 +2482,8 @@ Pass `-help` to VictoriaMetrics in order to see the list of supported command-li The maximum number of CPU cores to use for small merges. Default value is used if set to 0 -snapshotAuthKey string authKey, which must be passed in query string to /snapshot* pages + -snapshotCreateTimeout duration + The timeout for creating new snapshot. If set, make sure that timeout is lower than backup period -snapshotsMaxAge value Automatically delete snapshots older than -snapshotsMaxAge if it is set to non-zero duration. Make sure that backup process has enough time to finish the backup before the corresponding snapshot is automatically deleted The following optional suffixes are supported: h (hour), d (day), w (week), y (year). If suffix isn't set, then the duration is counted in months (default 0) diff --git a/docs/Single-server-VictoriaMetrics.md b/docs/Single-server-VictoriaMetrics.md index 565e22cb4..143f6d151 100644 --- a/docs/Single-server-VictoriaMetrics.md +++ b/docs/Single-server-VictoriaMetrics.md @@ -2485,9 +2485,8 @@ Pass `-help` to VictoriaMetrics in order to see the list of supported command-li The maximum number of CPU cores to use for small merges. Default value is used if set to 0 -snapshotAuthKey string authKey, which must be passed in query string to /snapshot* pages - -snapshotCreateTimeout value - Defines timeout value for process of creating new snapshot if it is set to non-zero duration. If set, make sure that timeout is lower than backup period. - The following optional suffixes are supported: h (hour), d (day), w (week), y (year). If suffix isn't set, then the duration is counted in months (default 0) + -snapshotCreateTimeout duration + The timeout for creating new snapshot. If set, make sure that timeout is lower than backup period -snapshotsMaxAge value Automatically delete snapshots older than -snapshotsMaxAge if it is set to non-zero duration. Make sure that backup process has enough time to finish the backup before the corresponding snapshot is automatically deleted The following optional suffixes are supported: h (hour), d (day), w (week), y (year). If suffix isn't set, then the duration is counted in months (default 0) diff --git a/lib/mergeset/table.go b/lib/mergeset/table.go index 23158caa4..9fa62bc9d 100644 --- a/lib/mergeset/table.go +++ b/lib/mergeset/table.go @@ -1505,7 +1505,8 @@ func mustCloseParts(pws []*partWrapper) { // very quickly. // // If deadline is reached before snapshot is created error is returned. -// If any error occurs during snapshot created data is not removed. +// +// The caller is responsible for data removal at dstDir on unsuccessful snapshot creation. func (tb *Table) CreateSnapshotAt(dstDir string, deadline uint64) error { logger.Infof("creating Table snapshot of %q...", tb.path) startTime := time.Now() @@ -1547,11 +1548,9 @@ func (tb *Table) CreateSnapshotAt(dstDir string, deadline uint64) error { return fmt.Errorf("cannot read directory: %w", err) } - for i, fi := range fis { - if deadline > 0 && i%5 == 0 { - if fasttime.UnixTimestamp() > deadline { - return fmt.Errorf("cannot create snapshot for %q in time: timeout exceeded", tb.path) - } + for _, fi := range fis { + if deadline > 0 && fasttime.UnixTimestamp() > deadline { + return fmt.Errorf("cannot create snapshot for %q: timeout exceeded", tb.path) } fn := fi.Name() diff --git a/lib/storage/partition.go b/lib/storage/partition.go index 0b71e8bdb..793fa6ddf 100644 --- a/lib/storage/partition.go +++ b/lib/storage/partition.go @@ -1990,6 +1990,9 @@ func (pt *partition) CreateSnapshotAt(smallPath, bigPath string) error { return nil } +// createSnapshot creates a snapshot from srcDir to dstDir. +// +// The caller is responsible for deleting dstDir if createSnapshot() returns error. func (pt *partition) createSnapshot(srcDir, dstDir string) error { if err := fs.MkdirAllFailIfExist(dstDir); err != nil { return fmt.Errorf("cannot create snapshot dir %q: %w", dstDir, err) diff --git a/lib/storage/storage.go b/lib/storage/storage.go index 476a545c2..8685e2b93 100644 --- a/lib/storage/storage.go +++ b/lib/storage/storage.go @@ -313,65 +313,71 @@ func (s *Storage) CreateSnapshot(deadline uint64) (string, error) { s.snapshotLock.Lock() defer s.snapshotLock.Unlock() + var dirsToRemoveOnError []string + defer func() { + for _, dir := range dirsToRemoveOnError { + fs.MustRemoveAll(dir) + } + }() + snapshotName := snapshot.NewName() srcDir := s.path dstDir := fmt.Sprintf("%s/snapshots/%s", srcDir, snapshotName) if err := fs.MkdirAllFailIfExist(dstDir); err != nil { return "", fmt.Errorf("cannot create dir %q: %w", dstDir, err) } + dirsToRemoveOnError = append(dirsToRemoveOnError, dstDir) + + smallDir, bigDir, err := s.tb.CreateSnapshot(snapshotName, deadline) + if err != nil { + return "", fmt.Errorf("cannot create table snapshot: %w", err) + } + dirsToRemoveOnError = append(dirsToRemoveOnError, smallDir, bigDir) + dstDataDir := dstDir + "/data" if err := fs.MkdirAllFailIfExist(dstDataDir); err != nil { return "", fmt.Errorf("cannot create dir %q: %w", dstDataDir, err) } - - smallDir, bigDir, err := s.tb.CreateSnapshot(snapshotName, deadline) - if err != nil { - fs.MustRemoveAll(dstDir) - return "", fmt.Errorf("cannot create table snapshot: %w", err) - } dstSmallDir := dstDataDir + "/small" if err := fs.SymlinkRelative(smallDir, dstSmallDir); err != nil { - fs.MustRemoveAll(dstDir) return "", fmt.Errorf("cannot create symlink from %q to %q: %w", smallDir, dstSmallDir, err) } dstBigDir := dstDataDir + "/big" if err := fs.SymlinkRelative(bigDir, dstBigDir); err != nil { - fs.MustRemoveAll(dstDir) return "", fmt.Errorf("cannot create symlink from %q to %q: %w", bigDir, dstBigDir, err) } fs.MustSyncPath(dstDataDir) + srcMetadataDir := srcDir + "/metadata" + dstMetadataDir := dstDir + "/metadata" + if err := fs.CopyDirectory(srcMetadataDir, dstMetadataDir); err != nil { + return "", fmt.Errorf("cannot copy metadata: %w", err) + } + idbSnapshot := fmt.Sprintf("%s/indexdb/snapshots/%s", srcDir, snapshotName) idb := s.idb() currSnapshot := idbSnapshot + "/" + idb.name if err := idb.tb.CreateSnapshotAt(currSnapshot, deadline); err != nil { - fs.MustRemoveAll(dstDir) return "", fmt.Errorf("cannot create curr indexDB snapshot: %w", err) } + dirsToRemoveOnError = append(dirsToRemoveOnError, idbSnapshot) + ok := idb.doExtDB(func(extDB *indexDB) { prevSnapshot := idbSnapshot + "/" + extDB.name err = extDB.tb.CreateSnapshotAt(prevSnapshot, deadline) }) if ok && err != nil { - fs.MustRemoveAll(dstDir) return "", fmt.Errorf("cannot create prev indexDB snapshot: %w", err) } dstIdbDir := dstDir + "/indexdb" if err := fs.SymlinkRelative(idbSnapshot, dstIdbDir); err != nil { - fs.MustRemoveAll(dstDir) return "", fmt.Errorf("cannot create symlink from %q to %q: %w", idbSnapshot, dstIdbDir, err) } - srcMetadataDir := srcDir + "/metadata" - dstMetadataDir := dstDir + "/metadata" - if err := fs.CopyDirectory(srcMetadataDir, dstMetadataDir); err != nil { - fs.MustRemoveAll(dstDir) - return "", fmt.Errorf("cannot copy metadata: %w", err) - } - fs.MustSyncPath(dstDir) logger.Infof("created Storage snapshot for %q at %q in %.3f seconds", srcDir, dstDir, time.Since(startTime).Seconds()) + dirsToRemoveOnError = nil return snapshotName, nil } diff --git a/lib/storage/table.go b/lib/storage/table.go index 508644bae..32bb03373 100644 --- a/lib/storage/table.go +++ b/lib/storage/table.go @@ -157,19 +157,22 @@ func (tb *table) CreateSnapshot(snapshotName string, deadline uint64) (string, s } dstBigDir := fmt.Sprintf("%s/big/snapshots/%s", tb.path, snapshotName) if err := fs.MkdirAllFailIfExist(dstBigDir); err != nil { + fs.MustRemoveAll(dstSmallDir) return "", "", fmt.Errorf("cannot create dir %q: %w", dstBigDir, err) } - for i, ptw := range ptws { - if deadline > 0 && i%5 == 0 { - if fasttime.UnixTimestamp() > deadline { - return "", "", fmt.Errorf("cannot create snapshot for %q in %q in time: timeout exceeded", tb.path, snapshotName) - } + for _, ptw := range ptws { + if deadline > 0 && fasttime.UnixTimestamp() > deadline { + fs.MustRemoveAll(dstSmallDir) + fs.MustRemoveAll(dstBigDir) + return "", "", fmt.Errorf("cannot create snapshot for %q: timeout exceeded", tb.path) } smallPath := dstSmallDir + "/" + ptw.pt.name bigPath := dstBigDir + "/" + ptw.pt.name if err := ptw.pt.CreateSnapshotAt(smallPath, bigPath); err != nil { + fs.MustRemoveAll(dstSmallDir) + fs.MustRemoveAll(dstBigDir) return "", "", fmt.Errorf("cannot create snapshot for partition %q in %q: %w", ptw.pt.name, tb.path, err) } }