mirror of
https://github.com/VictoriaMetrics/VictoriaMetrics.git
synced 2024-12-15 16:30:55 +01:00
lib/fs: try harder with directory removal on NFS in the event of temporary lock
Do not give up after 11 attempts of directory removal on laggy NFS. Add `vm_nfs_dir_remove_failed_attempts_total` metric for counting the number of failed attempts on directory removal. Log failed attempts on directory removal after long sleep times. Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/162
This commit is contained in:
parent
7cde25bac4
commit
82bfe818d0
46
lib/fs/fs.go
46
lib/fs/fs.go
@ -248,12 +248,16 @@ func mustSyncParentDirIfExists(path string) {
|
||||
//
|
||||
// It properly handles NFS issue https://github.com/VictoriaMetrics/VictoriaMetrics/issues/61 .
|
||||
func MustRemoveAll(path string) {
|
||||
_ = mustRemoveAll(path)
|
||||
}
|
||||
|
||||
func mustRemoveAll(path string) bool {
|
||||
err := os.RemoveAll(path)
|
||||
if err == nil {
|
||||
// Make sure the parent directory doesn't contain references
|
||||
// to the current directory.
|
||||
mustSyncParentDirIfExists(path)
|
||||
return
|
||||
return true
|
||||
}
|
||||
if !isTemporaryNFSError(err) {
|
||||
logger.Panicf("FATAL: cannot remove %q: %s", path, err)
|
||||
@ -261,38 +265,40 @@ func MustRemoveAll(path string) {
|
||||
// NFS prevents from removing directories with open files.
|
||||
// See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/61 .
|
||||
// Schedule for later directory removal.
|
||||
nfsDirRemoveFailedAttempts.Inc()
|
||||
select {
|
||||
case removeDirCh <- path:
|
||||
default:
|
||||
logger.Panicf("FATAL: cannot schedule %s for removal, since the removal queue is full (%d entries)", path, cap(removeDirCh))
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
var nfsDirRemoveFailedAttempts = metrics.NewCounter(`vm_nfs_dir_remove_failed_attempts_total`)
|
||||
|
||||
var removeDirCh = make(chan string, 1024)
|
||||
|
||||
func dirRemover() {
|
||||
const minSleepTime = 100 * time.Millisecond
|
||||
const maxSleepTime = time.Second
|
||||
sleepTime := minSleepTime
|
||||
for path := range removeDirCh {
|
||||
attempts := 0
|
||||
for {
|
||||
err := os.RemoveAll(path)
|
||||
if err == nil {
|
||||
break
|
||||
if mustRemoveAll(path) {
|
||||
sleepTime = minSleepTime
|
||||
continue
|
||||
}
|
||||
if !isTemporaryNFSError(err) {
|
||||
logger.Panicf("FATAL: cannot remove %q: %s", path, err)
|
||||
|
||||
// Couldn't remove the directory at the path because of NFS lock.
|
||||
// Sleep for a while and try again.
|
||||
// Do not limit the amount of time required for deleting the directory,
|
||||
// since this may break on laggy NFS.
|
||||
// See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/162 .
|
||||
time.Sleep(sleepTime)
|
||||
if sleepTime < maxSleepTime {
|
||||
sleepTime *= 2
|
||||
} else {
|
||||
logger.Errorf("failed to remove directory %q due to NFS lock; retrying later", path)
|
||||
}
|
||||
// NFS prevents from removing directories with open files.
|
||||
// Sleep for a while and try again in the hope open files will be closed.
|
||||
// See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/61 .
|
||||
attempts++
|
||||
if attempts > 10 {
|
||||
logger.Panicf("FATAL: cannot remove %q in %d attempts: %s", path, attempts, err)
|
||||
}
|
||||
time.Sleep(100 * time.Millisecond)
|
||||
}
|
||||
// Make sure the parent directory doesn't contain references
|
||||
// to the current directory.
|
||||
mustSyncParentDirIfExists(path)
|
||||
}
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user