From 297301e8c0a9ba64f6a0c1ef44af1290c6d0ec50 Mon Sep 17 00:00:00 2001 From: Aliaksandr Valialkin Date: Sun, 8 Sep 2024 12:23:45 +0200 Subject: [PATCH] lib/logstorage: preserve the order of tokens to check against bloom filters in AND filters Previously tokens from AND filters were extracted in random order. This could slow down checking them agains bloom filters if the most specific tokens go at the beginning of the AND filters. Preserve the original order of tokens when matching them against bloom filters, so the user could control the performance of the query by putting the most specific AND filters at the beginning of the query. While at it, add tests for getCommonTokensForAndFilters() and getCommonTokensForOrFilters(). Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6554 Updates https://github.com/VictoriaMetrics/VictoriaMetrics/pull/6556 --- lib/logstorage/filter_and.go | 18 +++--- lib/logstorage/filter_and_test.go | 90 ++++++++++++++++++++++++++++ lib/logstorage/filter_or.go | 10 +--- lib/logstorage/filter_or_test.go | 98 +++++++++++++++++++++++++++++++ 4 files changed, 199 insertions(+), 17 deletions(-) diff --git a/lib/logstorage/filter_and.go b/lib/logstorage/filter_and.go index 131050ed36..cf0fa8db28 100644 --- a/lib/logstorage/filter_and.go +++ b/lib/logstorage/filter_and.go @@ -118,7 +118,7 @@ func (fa *filterAnd) initByFieldTokens() { } func getCommonTokensForAndFilters(filters []filter) []fieldTokens { - m := make(map[string]map[string]struct{}) + m := make(map[string][]string) var fieldNames []string mergeFieldTokens := func(fieldName string, tokens []string) { @@ -127,15 +127,10 @@ func getCommonTokensForAndFilters(filters []filter) []fieldTokens { } fieldName = getCanonicalColumnName(fieldName) - mTokens, ok := m[fieldName] - if !ok { + if _, ok := m[fieldName]; !ok { fieldNames = append(fieldNames, fieldName) - mTokens = make(map[string]struct{}) - m[fieldName] = mTokens - } - for _, token := range tokens { - mTokens[token] = struct{}{} } + m[fieldName] = append(m[fieldName], tokens...) } for _, f := range filters { @@ -169,8 +164,13 @@ func getCommonTokensForAndFilters(filters []filter) []fieldTokens { var byFieldTokens []fieldTokens for _, fieldName := range fieldNames { mTokens := m[fieldName] + seenTokens := make(map[string]struct{}) tokens := make([]string, 0, len(mTokens)) - for token := range mTokens { + for _, token := range mTokens { + if _, ok := seenTokens[token]; ok { + continue + } + seenTokens[token] = struct{}{} tokens = append(tokens, token) } diff --git a/lib/logstorage/filter_and_test.go b/lib/logstorage/filter_and_test.go index 41dd2d19fa..223fffb7ac 100644 --- a/lib/logstorage/filter_and_test.go +++ b/lib/logstorage/filter_and_test.go @@ -1,6 +1,7 @@ package logstorage import ( + "reflect" "testing" ) @@ -75,3 +76,92 @@ func TestFilterAnd(t *testing.T) { f(`foo:foo* AND !foo:~bar`, []int{0}) f(`foo:foo* AND foo:!~bar`, []int{0}) } + +func TestGetCommonTokensForAndFilters(t *testing.T) { + f := func(qStr string, tokensExpected []fieldTokens) { + t.Helper() + + q, err := ParseQuery(qStr) + if err != nil { + t.Fatalf("unexpected error in ParseQuery: %s", err) + } + fa, ok := q.f.(*filterAnd) + if !ok { + t.Fatalf("unexpected filter type: %T; want *filterAnd", q.f) + } + tokens := getCommonTokensForAndFilters(fa.filters) + + if len(tokens) != len(tokensExpected) { + t.Fatalf("unexpected len(tokens); got %d; want %d\ntokens\n%#v\ntokensExpected\n%#v", len(tokens), len(tokensExpected), tokens, tokensExpected) + } + for i, ft := range tokens { + ftExpected := tokensExpected[i] + if ft.field != ftExpected.field { + t.Fatalf("unexpected field; got %q; want %q\ntokens\n%q\ntokensExpected\n%q", ft.field, ftExpected.field, ft.tokens, ftExpected.tokens) + } + if !reflect.DeepEqual(ft.tokens, ftExpected.tokens) { + t.Fatalf("unexpected tokens for field %q; got %q; want %q", ft.field, ft.tokens, ftExpected.tokens) + } + } + } + + f(`foo AND bar`, []fieldTokens{ + { + field: "_msg", + tokens: []string{"foo", "bar"}, + }, + }) + + f(`="foo bar" AND ="a foo"* AND "bar foo" AND "foo bar"* AND ~"foo qwe bar.+" AND seq(x, bar, "foo qwe")`, []fieldTokens{ + { + field: "_msg", + tokens: []string{"foo", "bar", "a", "qwe", "x"}, + }, + }) + + // extract common tokens from OR filters + f(`foo AND (bar OR ~"x bar baz")`, []fieldTokens{ + { + field: "_msg", + tokens: []string{"foo", "bar"}, + }, + }) + + // star matches any non-empty token, so it is skipped + f(`foo bar *`, []fieldTokens{ + { + field: "_msg", + tokens: []string{"foo", "bar"}, + }, + }) + f(`* *`, nil) + + // empty filter must be skipped + f(`foo "" bar`, []fieldTokens{ + { + field: "_msg", + tokens: []string{"foo", "bar"}, + }, + }) + f(`"" ""`, nil) + + // unknown filters must be skipped + f(`_time:5m !foo "bar baz" x`, []fieldTokens{ + { + field: "_msg", + tokens: []string{"bar", "baz", "x"}, + }, + }) + + // distinct field names + f(`foo:x bar:"a bc" (foo:y OR (bar:qwe AND foo:"z y a"))`, []fieldTokens{ + { + field: "foo", + tokens: []string{"x", "y"}, + }, + { + field: "bar", + tokens: []string{"a", "bc"}, + }, + }) +} diff --git a/lib/logstorage/filter_or.go b/lib/logstorage/filter_or.go index ab268f7454..39e49a0d66 100644 --- a/lib/logstorage/filter_or.go +++ b/lib/logstorage/filter_or.go @@ -182,17 +182,11 @@ func getCommonTokensForOrFilters(filters []filter) []fieldTokens { if len(tokenss) != len(filters) { // The filter for the given fieldName is missing in some OR filters, // so it is impossible to extract common tokens from these filters. - // Give up extracting common tokens from the remaining filters, - // since they may not cover log entries matching fieldName filters. - // This fixes https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6554 - return nil + continue } commonTokens := getCommonTokens(tokenss) if len(commonTokens) == 0 { - // Give up extracting common tokens from the remaining filters, - // since they may not cover log entries matching fieldName filters. - // This fixes https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6554 - return nil + continue } byFieldTokens = append(byFieldTokens, fieldTokens{ field: fieldName, diff --git a/lib/logstorage/filter_or_test.go b/lib/logstorage/filter_or_test.go index c7bc860eee..371941a5ca 100644 --- a/lib/logstorage/filter_or_test.go +++ b/lib/logstorage/filter_or_test.go @@ -1,6 +1,7 @@ package logstorage import ( + "reflect" "testing" ) @@ -70,3 +71,100 @@ func TestFilterOr(t *testing.T) { f(`foo:baz or !foo:~foo`, []int{2, 3, 5, 7, 8, 9}) f(`foo:baz or foo:!~foo`, []int{2, 3, 5, 7, 8, 9}) } + +func TestGetCommonTokensForOrFilters(t *testing.T) { + f := func(qStr string, tokensExpected []fieldTokens) { + t.Helper() + + q, err := ParseQuery(qStr) + if err != nil { + t.Fatalf("unexpected error in ParseQuery: %s", err) + } + fo, ok := q.f.(*filterOr) + if !ok { + t.Fatalf("unexpected filter type: %T; want *filterOr", q.f) + } + tokens := getCommonTokensForOrFilters(fo.filters) + + if len(tokens) != len(tokensExpected) { + t.Fatalf("unexpected len(tokens); got %d; want %d\ntokens\n%#v\ntokensExpected\n%#v", len(tokens), len(tokensExpected), tokens, tokensExpected) + } + for i, ft := range tokens { + ftExpected := tokensExpected[i] + if ft.field != ftExpected.field { + t.Fatalf("unexpected field; got %q; want %q\ntokens\n%q\ntokensExpected\n%q", ft.field, ftExpected.field, ft.tokens, ftExpected.tokens) + } + if !reflect.DeepEqual(ft.tokens, ftExpected.tokens) { + t.Fatalf("unexpected tokens for field %q; got %q; want %q", ft.field, ft.tokens, ftExpected.tokens) + } + } + } + + // no common tokens + f(`foo OR bar`, nil) + + // star filter matches non-empty value; it is skipped, since one of OR filters may contain an empty filter + f(`* OR foo`, nil) + f(`foo or *`, nil) + f(`* or *`, nil) + f(`"" or * or foo`, nil) + + // empty filter suppresses all the common tokens. + f(`"" or foo or "foo bar"`, nil) + f(`foo or ""`, nil) + f(`"" or ""`, nil) + + // common foo token + f(`foo OR "foo bar" OR ="a foo" OR ="foo bar"* OR "bar foo "* OR seq("bar foo", "baz") OR ~"a.+ foo bar"`, []fieldTokens{ + { + field: "_msg", + tokens: []string{"foo"}, + }, + }) + + // regexp ending on foo token doesn't contain foo token, since it may match foobar. + f(`foo OR "foo bar" OR ="a foo" OR ="foo bar"* OR "bar foo "* OR seq("bar foo", "baz") OR ~"a.+ foo"`, nil) + + // regexp starting from foo token doesn't contain foo token, since it may match barfoo. + f(`foo OR "foo bar" OR ="a foo" OR ="foo bar"* OR "bar foo "* OR seq("bar foo", "baz") OR ~"foo bar"`, nil) + + // prefix filter ending on foo doesn't contain foo token, since it may match foobar. + f(`foo OR "foo bar" OR ="a foo" OR ="foo bar"* OR "bar foo"* OR seq("bar foo", "baz") OR ~"a.+ foo bar"`, nil) + + // bar and foo are common tokens + f(`"bar foo baz" OR (foo AND "bar x" AND foobar)`, []fieldTokens{ + { + field: "_msg", + tokens: []string{"bar", "foo"}, + }, + }) + + // bar and foo are common tokens, x:foobar should be ignored, since it doesn't present in every OR filter + f(`"bar foo baz" OR (foo AND "bar x" AND x:foobar)`, []fieldTokens{ + { + field: "_msg", + tokens: []string{"bar", "foo"}, + }, + }) + + // common tokens for distinct fields + f(`(foo AND x:a) OR (x:"b a c" AND ~"aaa foo ")`, []fieldTokens{ + { + field: "_msg", + tokens: []string{"foo"}, + }, + { + field: "x", + tokens: []string{"a"}, + }, + }) + + // zero common tokens for distinct fields + f(`(foo AND x:a) OR (x:"b c" AND ~"aaa foo" AND y:z)`, nil) + + // negative filter removes all the matching + f(`foo OR !"foo bar"`, nil) + + // time filter removes all the matching + f(`_time:5m or foo`, nil) +}