From 7afe8450fc094ab2be748ffdb4b902f4a6441b84 Mon Sep 17 00:00:00 2001 From: Aliaksandr Valialkin Date: Fri, 26 Aug 2022 14:53:02 +0300 Subject: [PATCH] lib/promrelabel: optimize matching for commonly used regex patterns in `if` option The following regex patterns are optimized: - literal string match, e.g. "foo" - prefix match, e.g. "foo.*" and "foo.+" - substring match, e.g. ".*foo.*" and ".+foo.+" - alternate values match, e.g. "foo|bar|baz" --- docs/CHANGELOG.md | 2 +- lib/promrelabel/if_expression.go | 15 ++-- lib/regexutil/promregex.go | 119 +++++++++++++++++++++++++ lib/regexutil/promregex_test.go | 90 +++++++++++++++++++ lib/regexutil/promregex_timing_test.go | 102 +++++++++++++++++++++ 5 files changed, 318 insertions(+), 10 deletions(-) create mode 100644 lib/regexutil/promregex.go create mode 100644 lib/regexutil/promregex_test.go create mode 100644 lib/regexutil/promregex_timing_test.go diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md index 0bb4376b9..110c30acc 100644 --- a/docs/CHANGELOG.md +++ b/docs/CHANGELOG.md @@ -23,7 +23,7 @@ The following tip changes can be tested by building VictoriaMetrics components f * FEATURE: return shorter error messages to Grafana and to other clients requesting [/api/v1/query](https://docs.victoriametrics.com/keyConcepts.html#instant-query) and [/api/v1/query_range](https://docs.victoriametrics.com/keyConcepts.html#range-query) endpoints. This should simplify reading these errors by humans. The long error message with full context is still written to logs. * FEATURE: add the ability to fine-tune the number of points, which can be generated per each matching time series during [subquery](https://docs.victoriametrics.com/MetricsQL.html#subqueries) evaluation. This can be done with the `-search.maxPointsSubqueryPerTimeseries` command-line flag. See [this feature request](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/2922). 
-* FEATURE: improve the performance for `action: keep`, `action: drop`, `action: labelkeep` and `action: labeldrop` relabeling rules for `regex` containing the list of matching values. For example, `regex: "foo|bar|baz"`. +* FEATURE: improve the performance for relabeling rules with commonly used regular expressions in `regex` and `if` fields such as `some_string`, `prefix.*`, `prefix.+`, `foo|bar|baz`, `.*foo.*` and `.+foo.+`. * FEATURE: [vmagent](https://docs.victoriametrics.com/vmagent.html): add ability to accept [multitenant](https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html#multitenancy) data via OpenTSDB `/api/put` protocol at `/insert//opentsdb/api/put` http endpoint if [multitenant support](https://docs.victoriametrics.com/vmagent.html#multitenancy) is enabled at `vmagent`. Thanks to @chengjianyun for [the pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/3015). * FEATURE: [monitoring](https://docs.victoriametrics.com/#monitoring): expose `vm_hourly_series_limit_max_series`, `vm_hourly_series_limit_current_series`, `vm_daily_series_limit_max_series` and `vm_daily_series_limit_current_series` metrics when `-search.maxHourlySeries` or `-search.maxDailySeries` limits are set. This allows alerting when the number of unique series reaches the configured limits. See [these docs](https://docs.victoriametrics.com/#cardinality-limiter) for details. * FEATURE: [VictoriaMetrics cluster](https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html): reduce the amounts of logging at `vmstorage` when `vmselect` connects/disconnects to `vmstorage`. 
diff --git a/lib/promrelabel/if_expression.go b/lib/promrelabel/if_expression.go index 32805cab5..24571b865 100644 --- a/lib/promrelabel/if_expression.go +++ b/lib/promrelabel/if_expression.go @@ -3,10 +3,10 @@ package promrelabel import ( "encoding/json" "fmt" - "regexp" "github.com/VictoriaMetrics/VictoriaMetrics/lib/logger" "github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal" + "github.com/VictoriaMetrics/VictoriaMetrics/lib/regexutil" "github.com/VictoriaMetrics/metricsql" ) @@ -105,7 +105,7 @@ type labelFilter struct { value string // re contains compiled regexp for `=~` and `!~` op. - re *regexp.Regexp + re *regexutil.PromRegex } func newLabelFilter(mlf *metricsql.LabelFilter) (*labelFilter, error) { @@ -115,10 +115,7 @@ func newLabelFilter(mlf *metricsql.LabelFilter) (*labelFilter, error) { value: mlf.Value, } if lf.op == "=~" || lf.op == "!~" { - // PromQL regexps are anchored by default. - // See https://prometheus.io/docs/prometheus/latest/querying/basics/#time-series-selectors - reString := "^(?:" + lf.value + ")$" - re, err := regexp.Compile(reString) + re, err := regexutil.NewPromRegex(lf.value) if err != nil { return nil, fmt.Errorf("cannot parse regexp for %s: %w", mlf.AppendString(nil), err) } @@ -134,9 +131,9 @@ func (lf *labelFilter) match(labels []prompbmarshal.Label) bool { case "!=": return !lf.equalValue(labels) case "=~": - return lf.equalRegexp(labels) + return lf.matchRegexp(labels) case "!~": - return !lf.equalRegexp(labels) + return !lf.matchRegexp(labels) default: logger.Panicf("BUG: unexpected operation for label filter: %s", lf.op) } @@ -161,7 +158,7 @@ func (lf *labelFilter) equalValue(labels []prompbmarshal.Label) bool { return false } -func (lf *labelFilter) equalRegexp(labels []prompbmarshal.Label) bool { +func (lf *labelFilter) matchRegexp(labels []prompbmarshal.Label) bool { labelNameMatches := 0 for _, label := range labels { if toCanonicalLabelName(label.Name) != lf.label { diff --git a/lib/regexutil/promregex.go 
b/lib/regexutil/promregex.go
new file mode 100644
index 000000000..bcf5cc247
--- /dev/null
+++ b/lib/regexutil/promregex.go
@@ -0,0 +1,119 @@
+package regexutil
+
+import (
+	"regexp"
+	"strings"
+)
+
+// PromRegex implements an optimized string matching for Prometheus-like regex.
+//
+// The following regexes are optimized:
+//
+// - plain string such as "foobar"
+// - alternate strings such as "foo|bar|baz"
+// - prefix match such as "foo.*" or "foo.+"
+// - substring match such as ".*foo.*" or ".+bar.+"
+type PromRegex struct {
+	// prefix contains literal prefix for regex.
+	// For example, prefix="foo" for regex="foo(a|b)"
+	prefix string
+
+	// suffix contains regex suffix left after removing the prefix.
+	// For example, suffix="a|b" for regex="foo(a|b)"
+	suffix string
+
+	// substrDotStar contains literal string for regex suffix=".*string.*"
+	substrDotStar string
+
+	// substrDotPlus contains literal string for regex suffix=".+string.+"
+	substrDotPlus string
+
+	// orValues contains or values for the suffix regex.
+	// For example, orValues contain ["foo","bar","baz"] for regex suffix="foo|bar|baz"
+	orValues []string
+
+	// reSuffix contains an anchored regexp built from suffix:
+	// "^(?:suffix)$"
+	reSuffix *regexp.Regexp
+}
+
+// NewPromRegex returns PromRegex for the given expr, or an error if expr isn't a valid regexp.
+func NewPromRegex(expr string) (*PromRegex, error) {
+	if _, err := regexp.Compile(expr); err != nil {
+		return nil, err
+	}
+	prefix, suffix := Simplify(expr)
+	orValues := GetOrValues(suffix)
+	substrDotStar := getSubstringLiteral(suffix, ".*")
+	substrDotPlus := getSubstringLiteral(suffix, ".+")
+	// It is expected that Simplify returns valid regexp in suffix, so use MustCompile here.
+	// Anchor suffix to the beginning and the end of the matching string.
+	suffixExpr := "^(?:" + suffix + ")$"
+	reSuffix := regexp.MustCompile(suffixExpr)
+	pr := &PromRegex{
+		prefix:        prefix,
+		suffix:        suffix,
+		substrDotStar: substrDotStar,
+		substrDotPlus: substrDotPlus,
+		orValues:      orValues,
+		reSuffix:      reSuffix,
+	}
+	return pr, nil
+}
+
+// MatchString returns true if s matches pr.
+//
+// The pr is automatically anchored to the beginning and to the end
+// of the matching string with '^' and '$'.
+func (pr *PromRegex) MatchString(s string) bool {
+	if !strings.HasPrefix(s, pr.prefix) {
+		// Fast path - s has another prefix than pr.
+		return false
+	}
+	s = s[len(pr.prefix):]
+	if len(pr.orValues) > 0 {
+		// Fast path - pr contains only alternate strings such as 'foo|bar|baz'
+		for _, v := range pr.orValues {
+			if s == v {
+				return true
+			}
+		}
+		return false
+	}
+	if pr.substrDotStar != "" {
+		// Fast path - pr contains ".*someText.*"
+		return strings.Contains(s, pr.substrDotStar)
+	}
+	if pr.substrDotPlus != "" {
+		// Fast path - pr contains ".+someText.+", so someText needs at least one char on each side.
+		// Search the interior of s: checking only the first occurrence rejects "foofoobar" for ".+foo.+".
+		return len(s) >= 2 && strings.Contains(s[1:len(s)-1], pr.substrDotPlus)
+	}
+	switch pr.suffix {
+	case ".*":
+		// Fast path - the pr contains "prefix.*"
+		return true
+	case ".+":
+		// Fast path - the pr contains "prefix.+"
+		return len(s) > 0
+	}
+	// Fall back to slow path by matching the anchored suffix regexp.
+	return pr.reSuffix.MatchString(s)
+}
+
+// getSubstringLiteral returns Simplify's prefix for expr stripped of leading and trailing prefixSuffix, or "" on mismatch.
+func getSubstringLiteral(expr, prefixSuffix string) string {
+	if !strings.HasPrefix(expr, prefixSuffix) {
+		return ""
+	}
+	expr = expr[len(prefixSuffix):]
+	if !strings.HasSuffix(expr, prefixSuffix) {
+		return ""
+	}
+	expr = expr[:len(expr)-len(prefixSuffix)]
+	prefix, suffix := Simplify(expr)
+	if suffix != "" {
+		return ""
+	}
+	return prefix
+}
diff --git a/lib/regexutil/promregex_test.go b/lib/regexutil/promregex_test.go
new file mode 100644
index 000000000..2a2a86351
--- /dev/null
+++ b/lib/regexutil/promregex_test.go
@@ -0,0 +1,90 @@
+package regexutil
+
+import (
+	"regexp"
+	"testing"
+)
+
+func TestPromRegexParseFailure(t *testing.T) {
+	f := func(expr string) {
+		t.Helper()
+		pr, err := NewPromRegex(expr)
+		if err == nil {
+			t.Fatalf("expecting non-nil error for expr=%s", expr)
+		}
+		if pr != nil {
+			t.Fatalf("expecting nil pr for expr=%s", expr)
+		}
+	}
+	f("fo[bar")
+	f("foo(bar")
+}
+
+func TestPromRegex(t *testing.T) {
+	f := func(expr, s string, resultExpected bool) {
+		t.Helper()
+		pr, err := NewPromRegex(expr)
+		if err != nil {
+			t.Fatalf("unexpected error: %s", err)
+		}
+		result := pr.MatchString(s)
+		if result != resultExpected {
+			t.Fatalf("unexpected result when matching %s against %s; got %v; want %v", expr, s, result, resultExpected)
+		}
+
+		// Make sure the result is the same for regular regexp
+		exprAnchored := "^(?:" + expr + ")$"
+		re := regexp.MustCompile(exprAnchored)
+		result = re.MatchString(s)
+		if result != resultExpected {
+			t.Fatalf("unexpected result when matching %s against %s during sanity check; got %v; want %v", exprAnchored, s, result, resultExpected)
+		}
+	}
+	f("", "", true)
+	f("", "foo", false)
+	f("foo", "", false)
+	f(".*", "", true)
+	f(".*", "foo", true)
+	f(".+", "", false)
+	f(".+", "foo", true)
+	f("foo.*", "bar", false)
+	f("foo.*", "foo", true)
+	f("foo.*", "foobar", true)
+	f("foo.+", "bar", false)
+	f("foo.+", "foo", false)
+	f("foo.+", "foobar", true)
+	f("foo|bar", "", false)
+	f("foo|bar", "a", false)
+	f("foo|bar", "foo", true)
+	f("foo|bar", "bar", true)
+	f("foo|bar", "foobar", false)
+	f("foo(bar|baz)", "a", false)
+	f("foo(bar|baz)", "foobar", true)
+	f("foo(bar|baz)", "foobaz", true)
+	f("foo(bar|baz)", "foobaza", false)
+	f("foo(bar|baz)", "foobal", false)
+	f("^foo|b(ar)$", "foo", true)
+	f("^foo|b(ar)$", "bar", true)
+	f("^foo|b(ar)$", "ar", false)
+	f(".*foo.*", "foo", true)
+	f(".*foo.*", "afoobar", true)
+	f(".*foo.*", "abc", false)
+	f("foo.*bar.*", "foobar", true)
+	f("foo.*bar.*", "foo_bar_", true)
+	f("foo.*bar.*", "foobaz", false)
+	f(".+foo.+", "foo", false)
+	f(".+foo.+", "afoobar", true)
+	f(".+foo.+", "afoo", false)
+	f(".+foo.+", "abc", false)
+	f("foo.+bar.+", "foobar", false)
+	f("foo.+bar.+", "foo_bar_", true)
+	f("foo.+bar.+", "foobaz", false)
+	f(".+foo.*", "foo", false)
+	f(".+foo.*", "afoo", true)
+	f(".+foo.*", "afoobar", true)
+	f(".*(a|b).*", "a", true)
+	f(".*(a|b).*", "ax", true)
+	f(".*(a|b).*", "xa", true)
+	f(".*(a|b).*", "xay", true)
+	f(".*(a|b).*", "xzy", false)
+}
diff --git a/lib/regexutil/promregex_timing_test.go b/lib/regexutil/promregex_timing_test.go
new file mode 100644
index 000000000..abb63be91
--- /dev/null
+++ b/lib/regexutil/promregex_timing_test.go
@@ -0,0 +1,102 @@
+package regexutil
+
+import (
+	"fmt"
+	"regexp"
+	"testing"
+)
+
+func BenchmarkPromRegexMatchString(b *testing.B) {
+	b.Run("unpotimized-noprefix-match", func(b *testing.B) {
+		benchmarkPromRegexMatchString(b, "xbar.*|baz", "xbarz", true)
+	})
+	b.Run("unpotimized-noprefix-mismatch", func(b *testing.B) {
+		benchmarkPromRegexMatchString(b, "xbar.*|baz", "zfoobarz", false)
+	})
+	b.Run("unpotimized-prefix-match", func(b *testing.B) {
+		benchmarkPromRegexMatchString(b, "foo(bar.*|baz)", "foobarz", true)
+	})
+	b.Run("unpotimized-prefix-mismatch", func(b *testing.B) {
+		benchmarkPromRegexMatchString(b, "foo(bar.*|baz)", "zfoobarz", false)
+	})
+	b.Run("literal-match", func(b *testing.B) {
+		benchmarkPromRegexMatchString(b, "foo", "foo", true)
+	})
+	b.Run("literal-mismatch", func(b *testing.B) {
+		benchmarkPromRegexMatchString(b, "foo", "bar", false)
+	})
+	b.Run("prefix-dot-star-match", func(b *testing.B) {
+		benchmarkPromRegexMatchString(b, "foo.*", "foobar", true)
+	})
+	b.Run("prefix-dot-star-mismatch", func(b *testing.B) {
+		benchmarkPromRegexMatchString(b, "foo.*", "afoobar", false)
+	})
+	b.Run("prefix-dot-plus-match", func(b *testing.B) {
+		benchmarkPromRegexMatchString(b, "foo.+", "foobar", true)
+	})
+	b.Run("prefix-dot-plus-mismatch", func(b *testing.B) {
+		benchmarkPromRegexMatchString(b, "foo.+", "afoobar", false)
+	})
+	b.Run("or-values-match", func(b *testing.B) {
+		benchmarkPromRegexMatchString(b, "foo|bar|baz", "baz", true)
+	})
+	b.Run("or-values-mismatch", func(b *testing.B) {
+		benchmarkPromRegexMatchString(b, "foo|bar|baz", "abaz", false)
+	})
+	b.Run("prefix-or-values-match", func(b *testing.B) {
+		benchmarkPromRegexMatchString(b, "x(foo|bar|baz)", "xbaz", true)
+	})
+	b.Run("prefix-or-values-mismatch", func(b *testing.B) {
+		benchmarkPromRegexMatchString(b, "x(foo|bar|baz)", "abaz", false)
+	})
+	b.Run("substring-dot-star-match", func(b *testing.B) {
+		benchmarkPromRegexMatchString(b, ".*foo.*", "afoobar", true)
+	})
+	b.Run("substring-dot-star-mismatch", func(b *testing.B) {
+		benchmarkPromRegexMatchString(b, ".*foo.*", "abarbaz", false)
+	})
+	b.Run("substring-dot-plus-match", func(b *testing.B) {
+		benchmarkPromRegexMatchString(b, ".+foo.+", "afoobar", true)
+	})
+	b.Run("substring-dot-plus-mismatch", func(b *testing.B) {
+		benchmarkPromRegexMatchString(b, ".+foo.+", "abarbaz", false)
+	})
+	b.Run("prefix-substring-dot-star-match", func(b *testing.B) {
+		benchmarkPromRegexMatchString(b, "a.*foo.*", "afoobar", true)
+	})
+	b.Run("prefix-substring-dot-star-mismatch", func(b *testing.B) {
+		benchmarkPromRegexMatchString(b, "a.*foo.*", "abarbaz", false)
+	})
+	b.Run("prefix-substring-dot-plus-match", func(b *testing.B) {
+		benchmarkPromRegexMatchString(b, "a.+foo.+", "abfoobar", true)
+	})
+	b.Run("prefix-substring-dot-plus-mismatch", func(b *testing.B) {
+		benchmarkPromRegexMatchString(b, "a.+foo.+", "abarbaz", false)
+	})
+}
+
+func benchmarkPromRegexMatchString(b *testing.B, expr, s string, resultExpected bool) {
+	pr, err := NewPromRegex(expr)
+	if err != nil {
+		panic(fmt.Errorf("unexpected error: %s", err))
+	}
+	re := regexp.MustCompile("^(?:" + expr + ")$")
+	f := func(b *testing.B, matchString func(s string) bool) {
+		b.SetBytes(1)
+		b.ReportAllocs()
+		b.RunParallel(func(pb *testing.PB) {
+			for pb.Next() {
+				result := matchString(s)
+				if result != resultExpected {
+					panic(fmt.Errorf("unexpected result when matching %s against %s; got %v; want %v", s, expr, result, resultExpected))
+				}
+			}
+		})
+	}
+	b.Run("PromRegex", func(b *testing.B) {
+		f(b, pr.MatchString)
+	})
+	b.Run("StandardRegex", func(b *testing.B) {
+		f(b, re.MatchString)
+	})
+}