From 7afe8450fc094ab2be748ffdb4b902f4a6441b84 Mon Sep 17 00:00:00 2001
From: Aliaksandr Valialkin <valyala@victoriametrics.com>
Date: Fri, 26 Aug 2022 14:53:02 +0300
Subject: [PATCH] lib/promrelabel: optimize matching for commonly used regex
 patterns in `if` option

The following regex patterns are optimized:

- literal string match, e.g. "foo"
- prefix match, e.g. "foo.*" and "foo.+"
- substring match, e.g. ".*foo.*" and ".+foo.+"
- alternate values match, e.g. "foo|bar|baz"
---
 docs/CHANGELOG.md                      |   2 +-
 lib/promrelabel/if_expression.go       |  15 ++--
 lib/regexutil/promregex.go             | 119 +++++++++++++++++++++++++
 lib/regexutil/promregex_test.go        |  90 +++++++++++++++++++
 lib/regexutil/promregex_timing_test.go | 102 +++++++++++++++++++++
 5 files changed, 318 insertions(+), 10 deletions(-)
 create mode 100644 lib/regexutil/promregex.go
 create mode 100644 lib/regexutil/promregex_test.go
 create mode 100644 lib/regexutil/promregex_timing_test.go
diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md
index 0bb4376b9..110c30acc 100644
--- a/docs/CHANGELOG.md
+++ b/docs/CHANGELOG.md
@@ -23,7 +23,7 @@ The following tip changes can be tested by building VictoriaMetrics components f
 
 * FEATURE: return shorter error messages to Grafana and to other clients requesting [/api/v1/query](https://docs.victoriametrics.com/keyConcepts.html#instant-query) and [/api/v1/query_range](https://docs.victoriametrics.com/keyConcepts.html#range-query) endpoints. This should simplify reading these errors by humans. The long error message with full context is still written to logs.
 * FEATURE: add the ability to fine-tune the number of points, which can be generated per each matching time series during [subquery](https://docs.victoriametrics.com/MetricsQL.html#subqueries) evaluation. This can be done with the `-search.maxPointsSubqueryPerTimeseries` command-line flag. See [this feature request](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/2922).
-* FEATURE: improve the performance for `action: keep`, `action: drop`, `action: labelkeep` and `action: labeldrop` relabeling rules for `regex` containing the list of matching values. For example, `regex: "foo|bar|baz"`.
+* FEATURE: improve the performance for relabeling rules with commonly used regular expressions in `regex` and `if` fields such as `some_string`, `prefix.*`, `prefix.+`, `foo|bar|baz`, `.*foo.*` and `.+foo.+`.
 * FEATURE: [vmagent](https://docs.victoriametrics.com/vmagent.html): add ability to accept [multitenant](https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html#multitenancy) data via OpenTSDB `/api/put` protocol at `/insert/<tenantID>/opentsdb/api/put` http endpoint if [multitenant support](https://docs.victoriametrics.com/vmagent.html#multitenancy) is enabled at `vmagent`. Thanks to @chengjianyun for [the pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/3015).
 * FEATURE: [monitoring](https://docs.victoriametrics.com/#monitoring): expose `vm_hourly_series_limit_max_series`, `vm_hourly_series_limit_current_series`, `vm_daily_series_limit_max_series` and `vm_daily_series_limit_current_series` metrics when `-search.maxHourlySeries` or `-search.maxDailySeries` limits are set. This allows alerting when the number of unique series reaches the configured limits. See [these docs](https://docs.victoriametrics.com/#cardinality-limiter) for details.
 * FEATURE: [VictoriaMetrics cluster](https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html): reduce the amounts of logging at `vmstorage` when `vmselect` connects/disconnects to `vmstorage`.
diff --git a/lib/promrelabel/if_expression.go b/lib/promrelabel/if_expression.go
index 32805cab5..24571b865 100644
--- a/lib/promrelabel/if_expression.go
+++ b/lib/promrelabel/if_expression.go
@@ -3,10 +3,10 @@ package promrelabel
 import (
 	"encoding/json"
 	"fmt"
-	"regexp"
 
 	"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
 	"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
+	"github.com/VictoriaMetrics/VictoriaMetrics/lib/regexutil"
 	"github.com/VictoriaMetrics/metricsql"
 )
 
@@ -105,7 +105,7 @@ type labelFilter struct {
 	value string
 
 	// re contains compiled regexp for `=~` and `!~` op.
-	re *regexp.Regexp
+	re *regexutil.PromRegex
 }
 
 func newLabelFilter(mlf *metricsql.LabelFilter) (*labelFilter, error) {
@@ -115,10 +115,7 @@ func newLabelFilter(mlf *metricsql.LabelFilter) (*labelFilter, error) {
 		value: mlf.Value,
 	}
 	if lf.op == "=~" || lf.op == "!~" {
-		// PromQL regexps are anchored by default.
-		// See https://prometheus.io/docs/prometheus/latest/querying/basics/#time-series-selectors
-		reString := "^(?:" + lf.value + ")$"
-		re, err := regexp.Compile(reString)
+		re, err := regexutil.NewPromRegex(lf.value)
 		if err != nil {
 			return nil, fmt.Errorf("cannot parse regexp for %s: %w", mlf.AppendString(nil), err)
 		}
@@ -134,9 +131,9 @@ func (lf *labelFilter) match(labels []prompbmarshal.Label) bool {
 	case "!=":
 		return !lf.equalValue(labels)
 	case "=~":
-		return lf.equalRegexp(labels)
+		return lf.matchRegexp(labels)
 	case "!~":
-		return !lf.equalRegexp(labels)
+		return !lf.matchRegexp(labels)
 	default:
 		logger.Panicf("BUG: unexpected operation for label filter: %s", lf.op)
 	}
@@ -161,7 +158,7 @@ func (lf *labelFilter) equalValue(labels []prompbmarshal.Label) bool {
 	return false
 }
 
-func (lf *labelFilter) equalRegexp(labels []prompbmarshal.Label) bool {
+func (lf *labelFilter) matchRegexp(labels []prompbmarshal.Label) bool {
 	labelNameMatches := 0
 	for _, label := range labels {
 		if toCanonicalLabelName(label.Name) != lf.label {
diff --git a/lib/regexutil/promregex.go b/lib/regexutil/promregex.go
new file mode 100644
index 000000000..bcf5cc247
--- /dev/null
+++ b/lib/regexutil/promregex.go
@@ -0,0 +1,119 @@
+package regexutil
+
+import (
+	"regexp"
+	"strings"
+)
+
+// PromRegex implements an optimized string matching for Prometheus-like regex.
+//
+// The following regexes are optimized:
+//
+// - plain string such as "foobar"
+// - alternate strings such as "foo|bar|baz"
+// - prefix match such as "foo.*" or "foo.+"
+// - substring match such as ".*foo.*" or ".+bar.+"
+type PromRegex struct {
+	// prefix contains literal prefix for regex.
+	// For example, prefix="foo" for regex="foo(a|b)"
+	prefix string
+
+	// suffix contains regex suffix left after removing the prefix.
+	// For example, suffix="a|b" for regex="foo(a|b)"
+	suffix string
+
+	// substrDotStar contains literal string for regex suffix=".*string.*"
+	substrDotStar string
+
+	// substrDotPlus contains literal string for regex suffix=".+string.+"
+	substrDotPlus string
+
+	// orValues contains or values for the suffix regex.
+	// For example, orValues contain ["foo","bar","baz"] for regex suffix="foo|bar|baz"
+	orValues []string
+
+	// reSuffix contains an anchored regexp built from suffix:
+	// "^(?:suffix)$"
+	reSuffix *regexp.Regexp
+}
+
+// NewPromRegex returns PromRegex for the given expr.
+// An error is returned if expr cannot be compiled by the regexp package.
+func NewPromRegex(expr string) (*PromRegex, error) {
+	if _, err := regexp.Compile(expr); err != nil {
+		return nil, err
+	}
+	prefix, suffix := Simplify(expr)
+	orValues := GetOrValues(suffix)
+	substrDotStar := getSubstringLiteral(suffix, ".*")
+	substrDotPlus := getSubstringLiteral(suffix, ".+")
+	// It is expected that Simplify returns valid regexp in suffix, so use MustCompile here.
+	// Anchor suffix to the beginning and the end of the matching string.
+	reSuffix := regexp.MustCompile("^(?:" + suffix + ")$")
+	pr := &PromRegex{
+		prefix:        prefix,
+		suffix:        suffix,
+		substrDotStar: substrDotStar,
+		substrDotPlus: substrDotPlus,
+		orValues:      orValues,
+		reSuffix:      reSuffix,
+	}
+	return pr, nil
+}
+
+// MatchString returns true if s matches pr.
+//
+// The pr is automatically anchored to the beginning and to the end
+// of the matching string with '^' and '$'.
+func (pr *PromRegex) MatchString(s string) bool {
+	if !strings.HasPrefix(s, pr.prefix) {
+		// Fast path - s has another prefix than pr.
+		return false
+	}
+	s = s[len(pr.prefix):]
+	if len(pr.orValues) > 0 {
+		// Fast path - pr contains only alternate strings such as 'foo|bar|baz'
+		for _, v := range pr.orValues {
+			if s == v {
+				return true
+			}
+		}
+		return false
+	}
+	if pr.substrDotStar != "" {
+		// Fast path - pr contains ".*someText.*"
+		return strings.Contains(s, pr.substrDotStar)
+	}
+	if pr.substrDotPlus != "" {
+		// Fast path - pr contains ".+someText.+". Any occurrence of someText with at least one
+		// byte on both sides matches, so search s without its first and last bytes.
+		return len(s) >= 2 && strings.Contains(s[1:len(s)-1], pr.substrDotPlus)
+	}
+	switch pr.suffix {
+	case ".*":
+		// Fast path - the pr contains "prefix.*"
+		return true
+	case ".+":
+		// Fast path - the pr contains "prefix.+"
+		return len(s) > 0
+	}
+	// Fall back to slow path by matching the original regexp.
+	return pr.reSuffix.MatchString(s)
+}
+
+// getSubstringLiteral returns the literal prefix reported by Simplify for the part of expr enclosed
+// in prefixSuffix. It returns "" if expr isn't enclosed this way or if Simplify leaves a non-empty suffix.
+func getSubstringLiteral(expr, prefixSuffix string) string {
+	n := len(prefixSuffix)
+	// The trailing prefixSuffix is searched past the leading one, so they never overlap.
+	if !strings.HasPrefix(expr, prefixSuffix) || !strings.HasSuffix(expr[n:], prefixSuffix) {
+		return ""
+	}
+	inner := expr[n : len(expr)-n]
+	literal, rest := Simplify(inner)
+	if rest != "" {
+		return ""
+	}
+	return literal
+}
+
diff --git a/lib/regexutil/promregex_test.go b/lib/regexutil/promregex_test.go
new file mode 100644
index 000000000..2a2a86351
--- /dev/null
+++ b/lib/regexutil/promregex_test.go
@@ -0,0 +1,90 @@
+package regexutil
+
+import (
+	"regexp"
+	"testing"
+)
+
+// TestPromRegexParseFailure verifies that NewPromRegex returns an error and nil PromRegex for broken regexps.
+func TestPromRegexParseFailure(t *testing.T) {
+	mustFail := func(expr string) {
+		t.Helper()
+		switch pr, err := NewPromRegex(expr); {
+		case err == nil:
+			t.Fatalf("expecting non-nil error for expr=%s", expr)
+		case pr != nil:
+			t.Fatalf("expecting nil pr for expr=%s", expr)
+		}
+	}
+	mustFail("fo[bar")
+	mustFail("foo(bar")
+}
+
+// TestPromRegex checks PromRegex.MatchString results against the expected values and the anchored standard regexp.
+func TestPromRegex(t *testing.T) {
+	f := func(expr, s string, resultExpected bool) {
+		t.Helper()
+		pr, err := NewPromRegex(expr)
+		if err != nil {
+			t.Fatalf("unexpected error: %s", err)
+		}
+		result := pr.MatchString(s)
+		if result != resultExpected {
+			t.Fatalf("unexpected result when matching %s against %s; got %v; want %v", expr, s, result, resultExpected)
+		}
+		// Sanity check: the standard regexp anchored with "^(?:...)$" must give the same expected result.
+		exprAnchored := "^(?:" + expr + ")$"
+		re := regexp.MustCompile(exprAnchored)
+		result = re.MatchString(s)
+		if result != resultExpected {
+			t.Fatalf("unexpected result when matching %s against %s during sanity check; got %v; want %v", exprAnchored, s, result, resultExpected)
+		}
+	}
+	f("", "", true)
+	f("", "foo", false)
+	f("foo", "", false)
+	f(".*", "", true)
+	f(".*", "foo", true)
+	f(".+", "", false)
+	f(".+", "foo", true)
+	f("foo.*", "bar", false)
+	f("foo.*", "foo", true)
+	f("foo.*", "foobar", true)
+	f("foo.+", "bar", false)
+	f("foo.+", "foo", false)
+	f("foo.+", "foobar", true)
+	f("foo|bar", "", false)
+	f("foo|bar", "a", false)
+	f("foo|bar", "foo", true)
+	f("foo|bar", "bar", true)
+	f("foo|bar", "foobar", false)
+	f("foo(bar|baz)", "a", false)
+	f("foo(bar|baz)", "foobar", true)
+	f("foo(bar|baz)", "foobaz", true)
+	f("foo(bar|baz)", "foobaza", false)
+	f("foo(bar|baz)", "foobal", false)
+	f("^foo|b(ar)$", "foo", true)
+	f("^foo|b(ar)$", "bar", true)
+	f("^foo|b(ar)$", "ar", false)
+	f(".*foo.*", "foo", true)
+	f(".*foo.*", "afoobar", true)
+	f(".*foo.*", "abc", false)
+	f("foo.*bar.*", "foobar", true)
+	f("foo.*bar.*", "foo_bar_", true)
+	f("foo.*bar.*", "foobaz", false)
+	f(".+foo.+", "foo", false)
+	f(".+foo.+", "afoobar", true)
+	f(".+foo.+", "afoo", false)
+	f(".+foo.+", "abc", false)
+	f("foo.+bar.+", "foobar", false)
+	f("foo.+bar.+", "foo_bar_", true)
+	f("foo.+bar.+", "foobaz", false)
+	f(".+foo.*", "foo", false)
+	f(".+foo.*", "afoo", true)
+	f(".+foo.*", "afoobar", true)
+	f(".*(a|b).*", "a", true)
+	f(".*(a|b).*", "ax", true)
+	f(".*(a|b).*", "xa", true)
+	f(".*(a|b).*", "xay", true)
+	f(".*(a|b).*", "xzy", false)
+}
diff --git a/lib/regexutil/promregex_timing_test.go b/lib/regexutil/promregex_timing_test.go
new file mode 100644
index 000000000..abb63be91
--- /dev/null
+++ b/lib/regexutil/promregex_timing_test.go
@@ -0,0 +1,102 @@
+package regexutil
+
+import (
+	"fmt"
+	"regexp"
+	"testing"
+)
+
+func BenchmarkPromRegexMatchString(b *testing.B) {
+	b.Run("unoptimized-noprefix-match", func(b *testing.B) {
+		benchmarkPromRegexMatchString(b, "xbar.*|baz", "xbarz", true)
+	})
+	b.Run("unoptimized-noprefix-mismatch", func(b *testing.B) {
+		benchmarkPromRegexMatchString(b, "xbar.*|baz", "zfoobarz", false)
+	})
+	b.Run("unoptimized-prefix-match", func(b *testing.B) {
+		benchmarkPromRegexMatchString(b, "foo(bar.*|baz)", "foobarz", true)
+	})
+	b.Run("unoptimized-prefix-mismatch", func(b *testing.B) {
+		benchmarkPromRegexMatchString(b, "foo(bar.*|baz)", "zfoobarz", false)
+	})
+	b.Run("literal-match", func(b *testing.B) {
+		benchmarkPromRegexMatchString(b, "foo", "foo", true)
+	})
+	b.Run("literal-mismatch", func(b *testing.B) {
+		benchmarkPromRegexMatchString(b, "foo", "bar", false)
+	})
+	b.Run("prefix-dot-star-match", func(b *testing.B) {
+		benchmarkPromRegexMatchString(b, "foo.*", "foobar", true)
+	})
+	b.Run("prefix-dot-star-mismatch", func(b *testing.B) {
+		benchmarkPromRegexMatchString(b, "foo.*", "afoobar", false)
+	})
+	b.Run("prefix-dot-plus-match", func(b *testing.B) {
+		benchmarkPromRegexMatchString(b, "foo.+", "foobar", true)
+	})
+	b.Run("prefix-dot-plus-mismatch", func(b *testing.B) {
+		benchmarkPromRegexMatchString(b, "foo.+", "afoobar", false)
+	})
+	b.Run("or-values-match", func(b *testing.B) {
+		benchmarkPromRegexMatchString(b, "foo|bar|baz", "baz", true)
+	})
+	b.Run("or-values-mismatch", func(b *testing.B) {
+		benchmarkPromRegexMatchString(b, "foo|bar|baz", "abaz", false)
+	})
+	b.Run("prefix-or-values-match", func(b *testing.B) {
+		benchmarkPromRegexMatchString(b, "x(foo|bar|baz)", "xbaz", true)
+	})
+	b.Run("prefix-or-values-mismatch", func(b *testing.B) {
+		benchmarkPromRegexMatchString(b, "x(foo|bar|baz)", "abaz", false)
+	})
+	b.Run("substring-dot-star-match", func(b *testing.B) {
+		benchmarkPromRegexMatchString(b, ".*foo.*", "afoobar", true)
+	})
+	b.Run("substring-dot-star-mismatch", func(b *testing.B) {
+		benchmarkPromRegexMatchString(b, ".*foo.*", "abarbaz", false)
+	})
+	b.Run("substring-dot-plus-match", func(b *testing.B) {
+		benchmarkPromRegexMatchString(b, ".+foo.+", "afoobar", true)
+	})
+	b.Run("substring-dot-plus-mismatch", func(b *testing.B) {
+		benchmarkPromRegexMatchString(b, ".+foo.+", "abarbaz", false)
+	})
+	b.Run("prefix-substring-dot-star-match", func(b *testing.B) {
+		benchmarkPromRegexMatchString(b, "a.*foo.*", "afoobar", true)
+	})
+	b.Run("prefix-substring-dot-star-mismatch", func(b *testing.B) {
+		benchmarkPromRegexMatchString(b, "a.*foo.*", "abarbaz", false)
+	})
+	b.Run("prefix-substring-dot-plus-match", func(b *testing.B) {
+		benchmarkPromRegexMatchString(b, "a.+foo.+", "abfoobar", true)
+	})
+	b.Run("prefix-substring-dot-plus-mismatch", func(b *testing.B) {
+		benchmarkPromRegexMatchString(b, "a.+foo.+", "abarbaz", false)
+	})
+}
+
+// benchmarkPromRegexMatchString benchmarks matching s against expr with PromRegex and with the anchored standard regexp.
+func benchmarkPromRegexMatchString(b *testing.B, expr, s string, resultExpected bool) {
+	pr, err := NewPromRegex(expr)
+	if err != nil {
+		panic(fmt.Errorf("unexpected error: %s", err))
+	}
+	re := regexp.MustCompile("^(?:" + expr + ")$")
+	matchers := []struct {
+		name        string
+		matchString func(s string) bool
+	}{{"PromRegex", pr.MatchString}, {"StandardRegex", re.MatchString}}
+	for _, m := range matchers {
+		b.Run(m.name, func(b *testing.B) {
+			b.SetBytes(1)
+			b.ReportAllocs()
+			b.RunParallel(func(pb *testing.PB) {
+				for pb.Next() {
+					if result := m.matchString(s); result != resultExpected {
+						panic(fmt.Errorf("unexpected result when matching %s against %s; got %v; want %v", s, expr, result, resultExpected))
+					}
+				}
+			})
+		})
+	}
+}