From 8ddf089deb367ce7d433b7684e21ccebba0696c9 Mon Sep 17 00:00:00 2001 From: faceair Date: Fri, 16 Oct 2020 16:46:55 -0500 Subject: [PATCH] evaluate the execution cost of all tag filters (#824) * evaluate the execution cost of all tag filters * fix suffixes typo --- lib/storage/index_db.go | 8 +- lib/storage/tag_filters.go | 56 ++++--- lib/storage/tag_filters_timing_test.go | 210 +++++++++++++++++++++++++ 3 files changed, 251 insertions(+), 23 deletions(-) diff --git a/lib/storage/index_db.go b/lib/storage/index_db.go index 4c0b3da05b..b66f29bcee 100644 --- a/lib/storage/index_db.go +++ b/lib/storage/index_db.go @@ -2186,7 +2186,7 @@ func (is *indexSearch) getMetricIDsForTagFilter(tf *tagFilter, filter *uint64set } metricIDs := &uint64set.Set{} if len(tf.orSuffixes) > 0 { - // Fast path for orSuffixes - seek for rows for each value from orSuffxies. + // Fast path for orSuffixes - seek for rows for each value from orSuffixes. if err := is.updateMetricIDsForOrSuffixesNoFilter(tf, maxMetrics, metricIDs); err != nil { if err == errFallbackToMetricNameMatch { return nil, err @@ -2596,6 +2596,7 @@ func (is *indexSearch) getMetricIDsForDateAndFilters(date uint64, tfs *TagFilter // This way we limit the amount of work below by applying more specific filters at first. 
type tagFilterWithCount struct { tf *tagFilter + cost uint64 count uint64 } tfsWithCount := make([]tagFilterWithCount, len(tfs.tfs)) @@ -2611,13 +2612,14 @@ func (is *indexSearch) getMetricIDsForDateAndFilters(date uint64, tfs *TagFilter } tfsWithCount[i] = tagFilterWithCount{ tf: tf, + cost: count * tf.matchCost, count: count, } } sort.Slice(tfsWithCount, func(i, j int) bool { a, b := &tfsWithCount[i], &tfsWithCount[j] - if a.count != b.count { - return a.count < b.count + if a.cost != b.cost { + return a.cost < b.cost } return a.tf.Less(b.tf) }) diff --git a/lib/storage/tag_filters.go b/lib/storage/tag_filters.go index aa7f0b03e7..9bc67d1d36 100644 --- a/lib/storage/tag_filters.go +++ b/lib/storage/tag_filters.go @@ -165,6 +165,7 @@ type tagFilter struct { value []byte isNegative bool isRegexp bool + matchCost uint64 // Prefix always contains {nsPrefixTagToMetricIDs, AccountID, ProjectID, key}. // Additionally it contains: @@ -285,6 +286,7 @@ func (tf *tagFilter) Init(commonPrefix, key, value []byte, isNegative, isRegexp // during the search for matching metricIDs. tf.orSuffixes = append(tf.orSuffixes[:0], "") tf.isEmptyMatch = len(prefix) == 0 + tf.matchCost = defaultCost return nil } rcv, err := getRegexpFromCache(expr) @@ -293,6 +295,7 @@ func (tf *tagFilter) Init(commonPrefix, key, value []byte, isNegative, isRegexp } tf.orSuffixes = append(tf.orSuffixes[:0], rcv.orValues...) tf.reSuffixMatch = rcv.reMatch + tf.matchCost = rcv.reCost tf.isEmptyMatch = len(prefix) == 0 && tf.reSuffixMatch(nil) if !tf.isNegative && len(key) == 0 && strings.IndexByte(rcv.literalSuffix, '.') >= 0 { // Reverse suffix is needed only for non-negative regexp filters on __name__ that contains dots. 
@@ -357,6 +360,7 @@ func getRegexpFromCache(expr []byte) (regexpCacheValue, error) { sExpr := string(expr) orValues := getOrValues(sExpr) var reMatch func(b []byte) bool + var reCost uint64 var literalSuffix string if len(orValues) > 0 { if len(orValues) == 1 { @@ -364,6 +368,7 @@ func getRegexpFromCache(expr []byte) (regexpCacheValue, error) { reMatch = func(b []byte) bool { return string(b) == v } + reCost = defaultLiteralCost } else { reMatch = func(b []byte) bool { for _, v := range orValues { @@ -373,14 +378,16 @@ func getRegexpFromCache(expr []byte) (regexpCacheValue, error) { } return false } + reCost = uint64(len(orValues)) * defaultLiteralCost } } else { - reMatch, literalSuffix = getOptimizedReMatchFunc(re.Match, sExpr) + reMatch, literalSuffix, reCost = getOptimizedReMatchFunc(re.Match, sExpr) } // Put the reMatch in the cache. rcv.orValues = orValues rcv.reMatch = reMatch + rcv.reCost = reCost rcv.literalSuffix = literalSuffix regexpCacheLock.Lock() @@ -415,32 +422,40 @@ func getRegexpFromCache(expr []byte) (regexpCacheValue, error) { // It returns reMatch if it cannot find optimized function. // // It also returns literal suffix from the expr. -func getOptimizedReMatchFunc(reMatch func(b []byte) bool, expr string) (func(b []byte) bool, string) { +func getOptimizedReMatchFunc(reMatch func(b []byte) bool, expr string) (func(b []byte) bool, string, uint64) { sre, err := syntax.Parse(expr, syntax.Perl) if err != nil { logger.Panicf("BUG: unexpected error when parsing verified expr=%q: %s", expr, err) } - if matchFunc, literalSuffix := getOptimizedReMatchFuncExt(reMatch, sre); matchFunc != nil { + if matchFunc, literalSuffix, reCost := getOptimizedReMatchFuncExt(reMatch, sre); matchFunc != nil { // Found optimized function for matching the expr. suffixUnescaped := tagCharsReverseRegexpEscaper.Replace(literalSuffix) - return matchFunc, suffixUnescaped + return matchFunc, suffixUnescaped, reCost } // Fall back to un-optimized reMatch. 
- return reMatch, "" + return reMatch, "", defaultReCost } -func getOptimizedReMatchFuncExt(reMatch func(b []byte) bool, sre *syntax.Regexp) (func(b []byte) bool, string) { +// The following default cost values are derived from the results of BenchmarkOptimizedReMatchCost. + +var ( + defaultCost uint64 = 1 + defaultLiteralCost uint64 = 3 + defaultReCost uint64 = 140 +) + +func getOptimizedReMatchFuncExt(reMatch func(b []byte) bool, sre *syntax.Regexp) (func(b []byte) bool, string, uint64) { if isDotStar(sre) { // '.*' return func(b []byte) bool { return true - }, "" + }, "", 1 } if isDotPlus(sre) { // '.+' return func(b []byte) bool { return len(b) > 0 - }, "" + }, "", 1 } switch sre.Op { case syntax.OpCapture: @@ -448,13 +463,13 @@ func getOptimizedReMatchFuncExt(reMatch func(b []byte) bool, sre *syntax.Regexp) return getOptimizedReMatchFuncExt(reMatch, sre.Sub[0]) case syntax.OpLiteral: if !isLiteral(sre) { - return nil, "" + return nil, "", 0 } s := string(sre.Rune) // Literal match return func(b []byte) bool { return string(b) == s - }, s + }, s, defaultLiteralCost case syntax.OpConcat: if len(sre.Sub) == 2 { if isLiteral(sre.Sub[0]) { @@ -463,13 +478,13 @@ func getOptimizedReMatchFuncExt(reMatch func(b []byte) bool, sre *syntax.Regexp) // 'prefix.*' return func(b []byte) bool { return bytes.HasPrefix(b, prefix) - }, "" + }, "", 2 } if isDotPlus(sre.Sub[1]) { // 'prefix.+' return func(b []byte) bool { return len(b) > len(prefix) && bytes.HasPrefix(b, prefix) - }, "" + }, "", 2 } } if isLiteral(sre.Sub[1]) { @@ -478,13 +493,13 @@ func getOptimizedReMatchFuncExt(reMatch func(b []byte) bool, sre *syntax.Regexp) // '.*suffix' return func(b []byte) bool { return bytes.HasSuffix(b, suffix) - }, string(suffix) + }, string(suffix), 3 } if isDotPlus(sre.Sub[0]) { // '.+suffix' return func(b []byte) bool { return len(b) > len(suffix) && bytes.HasSuffix(b[1:], suffix) - }, string(suffix) + }, string(suffix), 3 } } } @@ -495,13 +510,13 @@ func getOptimizedReMatchFuncExt(reMatch func(b 
[]byte) bool, sre *syntax.Regexp) // '.*middle.*' return func(b []byte) bool { return bytes.Contains(b, middle) - }, "" + }, "", 5 } if isDotPlus(sre.Sub[2]) { // '.*middle.+' return func(b []byte) bool { return len(b) > len(middle) && bytes.Contains(b[:len(b)-1], middle) - }, "" + }, "", 5 } } if isDotPlus(sre.Sub[0]) { @@ -509,13 +524,13 @@ func getOptimizedReMatchFuncExt(reMatch func(b []byte) bool, sre *syntax.Regexp) // '.+middle.*' return func(b []byte) bool { return len(b) > len(middle) && bytes.Contains(b[1:], middle) - }, "" + }, "", 5 } if isDotPlus(sre.Sub[2]) { // '.+middle.+' return func(b []byte) bool { return len(b) > len(middle)+1 && bytes.Contains(b[1:len(b)-1], middle) - }, "" + }, "", 5 } } } @@ -549,9 +564,9 @@ func getOptimizedReMatchFuncExt(reMatch func(b []byte) bool, sre *syntax.Regexp) } // Fall back to slow path. return reMatch(bOrig) - }, string(suffix) + }, string(suffix), defaultReCost default: - return nil, "" + return nil, "", 0 } } @@ -738,6 +753,7 @@ var ( type regexpCacheValue struct { orValues []string reMatch func(b []byte) bool + reCost uint64 literalSuffix string } diff --git a/lib/storage/tag_filters_timing_test.go b/lib/storage/tag_filters_timing_test.go index df0956f57a..f68c765c70 100644 --- a/lib/storage/tag_filters_timing_test.go +++ b/lib/storage/tag_filters_timing_test.go @@ -1,6 +1,8 @@ package storage import ( + "bytes" + "regexp" "testing" "github.com/VictoriaMetrics/VictoriaMetrics/lib/logger" @@ -307,3 +309,211 @@ func BenchmarkTagFilterMatchSuffix(b *testing.B) { }) }) } + +// Run the following command to get the execution cost of all matches: +// +// go test -run=none -bench=BenchmarkOptimizedReMatchCost -count 20 | tee cost.txt +// benchstat ./cost.txt +// +// Then express the overhead of each match as a multiple of the "default" sub-benchmark (the trivial len(b) == 0 check). 
+
+func BenchmarkOptimizedReMatchCost(b *testing.B) {
+	b.Run("default", func(b *testing.B) { // trivial length check
+		reMatch := func(b []byte) bool {
+			return len(b) == 0
+		}
+		suffix := []byte("foo1.bar.baz.sss.ddd")
+		b.ReportAllocs()
+		b.SetBytes(int64(1))
+		b.RunParallel(func(pb *testing.PB) {
+			for pb.Next() {
+				reMatch(suffix)
+			}
+		})
+	})
+	b.Run("literal match", func(b *testing.B) { // single literal compared with ==
+		s := "foo1.bar.baz.sss.ddd"
+		reMatch := func(b []byte) bool {
+			return string(b) == s
+		}
+		suffix := []byte(s)
+		b.ReportAllocs()
+		b.SetBytes(int64(1))
+		b.RunParallel(func(pb *testing.PB) {
+			for pb.Next() {
+				reMatch(suffix)
+			}
+		})
+	})
+	b.Run("foo|bar|baz", func(b *testing.B) { // input "ddd" matches none, so every alternative is compared
+		s := []string{"foo", "bar", "baz"}
+		reMatch := func(b []byte) bool {
+			for _, v := range s {
+				if string(b) == v {
+					return true
+				}
+			}
+			return false
+		}
+		suffix := []byte("ddd")
+		b.ReportAllocs()
+		b.SetBytes(int64(1))
+		b.RunParallel(func(pb *testing.PB) {
+			for pb.Next() {
+				reMatch(suffix)
+			}
+		})
+	})
+	b.Run(".*", func(b *testing.B) {
+		reMatch := func(b []byte) bool {
+			return true
+		}
+		suffix := []byte("foo1.bar.baz.sss.ddd")
+		b.ReportAllocs()
+		b.SetBytes(int64(1))
+		b.RunParallel(func(pb *testing.PB) {
+			for pb.Next() {
+				reMatch(suffix)
+			}
+		})
+	})
+	b.Run(".+", func(b *testing.B) {
+		reMatch := func(b []byte) bool {
+			return len(b) > 0
+		}
+		suffix := []byte("foo1.bar.baz.sss.ddd")
+		b.ReportAllocs()
+		b.SetBytes(int64(1))
+		b.RunParallel(func(pb *testing.PB) {
+			for pb.Next() {
+				reMatch(suffix)
+			}
+		})
+	})
+	b.Run("prefix.*", func(b *testing.B) {
+		s := []byte("foo1.bar")
+		reMatch := func(b []byte) bool {
+			return bytes.HasPrefix(b, s)
+		}
+		suffix := []byte("foo1.bar.baz.sss.ddd")
+		b.ReportAllocs()
+		b.SetBytes(int64(1))
+		b.RunParallel(func(pb *testing.PB) {
+			for pb.Next() {
+				reMatch(suffix)
+			}
+		})
+	})
+	b.Run("prefix.+", func(b *testing.B) {
+		s := []byte("foo1.bar")
+		reMatch := func(b []byte) bool {
+			return len(b) > len(s) && bytes.HasPrefix(b, s)
+		}
+		suffix := []byte("foo1.bar.baz.sss.ddd")
+		b.ReportAllocs()
+		b.SetBytes(int64(1))
+		b.RunParallel(func(pb *testing.PB) {
+			for pb.Next() {
+				reMatch(suffix)
+			}
+		})
+	})
+	b.Run(".*suffix", func(b *testing.B) {
+		s := []byte("sss.ddd")
+		reMatch := func(b []byte) bool {
+			return bytes.HasSuffix(b, s)
+		}
+		suffix := []byte("foo1.bar.baz.sss.ddd")
+		b.ReportAllocs()
+		b.SetBytes(int64(1))
+		b.RunParallel(func(pb *testing.PB) {
+			for pb.Next() {
+				reMatch(suffix)
+			}
+		})
+	})
+	b.Run(".+suffix", func(b *testing.B) {
+		s := []byte("sss.ddd")
+		reMatch := func(b []byte) bool {
+			return len(b) > len(s) && bytes.HasSuffix(b[1:], s)
+		}
+		suffix := []byte("foo1.bar.baz.sss.ddd")
+		b.ReportAllocs()
+		b.SetBytes(int64(1))
+		b.RunParallel(func(pb *testing.PB) {
+			for pb.Next() {
+				reMatch(suffix)
+			}
+		})
+	})
+	b.Run(".*middle.*", func(b *testing.B) {
+		s := []byte("bar.baz")
+		reMatch := func(b []byte) bool {
+			return bytes.Contains(b, s)
+		}
+		suffix := []byte("foo1.bar.baz.sss.ddd")
+		b.ReportAllocs()
+		b.SetBytes(int64(1))
+		b.RunParallel(func(pb *testing.PB) {
+			for pb.Next() {
+				reMatch(suffix)
+			}
+		})
+	})
+	b.Run(".*middle.+", func(b *testing.B) {
+		s := []byte("bar.baz")
+		reMatch := func(b []byte) bool {
+			return len(b) > len(s) && bytes.Contains(b[:len(b)-1], s)
+		}
+		suffix := []byte("foo1.bar.baz.sss.ddd")
+		b.ReportAllocs()
+		b.SetBytes(int64(1))
+		b.RunParallel(func(pb *testing.PB) {
+			for pb.Next() {
+				reMatch(suffix)
+			}
+		})
+	})
+	b.Run(".+middle.*", func(b *testing.B) {
+		s := []byte("bar.baz")
+		reMatch := func(b []byte) bool {
+			return len(b) > len(s) && bytes.Contains(b[1:], s)
+		}
+		suffix := []byte("foo1.bar.baz.sss.ddd")
+		b.ReportAllocs()
+		b.SetBytes(int64(1))
+		b.RunParallel(func(pb *testing.PB) {
+			for pb.Next() {
+				reMatch(suffix)
+			}
+		})
+	})
+	b.Run(".+middle.+", func(b *testing.B) {
+		s := []byte("bar.baz")
+		reMatch := func(b []byte) bool {
+			return len(b) > len(s)+1 && bytes.Contains(b[1:len(b)-1], s)
+		}
+		suffix := []byte("foo1.bar.baz.sss.ddd")
+		b.ReportAllocs()
+		b.SetBytes(int64(1))
+		b.RunParallel(func(pb *testing.PB) {
+			for pb.Next() {
+				reMatch(suffix)
+			}
+		})
+	})
+	b.Run("regexp", func(b *testing.B) { // full regexp match, i.e. the un-optimized fallback
+		re := regexp.MustCompile(`foo[^.]*?\.bar\.baz\.[^.]*?\.ddd`)
+		reMatch := func(b []byte) bool {
+			return re.Match(b)
+		}
+		suffix := []byte("foo1.bar.baz.sss.ddd")
+		b.ReportAllocs()
+		b.SetBytes(int64(1))
+		b.RunParallel(func(pb *testing.PB) {
+			for pb.Next() {
+				reMatch(suffix)
+			}
+		})
+	})
+}