// VictoriaMetrics/lib/regexutil/regexutil.go
//
// 260 lines, 6.3 KiB, Go

package regexutil
import (
"fmt"
"regexp/syntax"
"sort"
"strings"
)
// RemoveStartEndAnchors removes '^' at the start of expr and '$' at the end of the expr.
//
// A trailing '$' is treated as an anchor only if it isn't escaped, e.g. if it is
// preceded by an even number of backslashes. For example, the '$' in `foo\$` is
// a literal dollar sign, so it is kept, while the '$' in `foo\\$` follows
// an escaped backslash, so it is an anchor and it is removed.
func RemoveStartEndAnchors(expr string) string {
	expr = strings.TrimLeft(expr, "^")
	for strings.HasSuffix(expr, "$") {
		dollarIdx := len(expr) - 1
		// Count the backslashes, which immediately precede the trailing '$'.
		backslashes := 0
		for i := dollarIdx - 1; i >= 0 && expr[i] == '\\'; i-- {
			backslashes++
		}
		if backslashes%2 != 0 {
			// Odd number of backslashes - the last one escapes the '$'.
			break
		}
		expr = expr[:dollarIdx]
	}
	return expr
}
// GetOrValues returns "or" values from the given regexp expr.
//
// It ignores start and end anchors ('^') and ('$') at the start and the end of expr.
// It returns ["foo", "bar"] for "foo|bar" regexp.
// It returns ["foo"] for "foo" regexp.
// It returns [""] for "" regexp.
// It returns an empty list if it is impossible to extract "or" values from the regexp.
func GetOrValues(expr string) []string {
expr = RemoveStartEndAnchors(expr)
prefix, tailExpr := Simplify(expr)
if tailExpr == "" {
return []string{prefix}
}
sre, err := syntax.Parse(tailExpr, syntax.Perl)
if err != nil {
panic(fmt.Errorf("BUG: unexpected error when parsing verified tailExpr=%q: %w", tailExpr, err))
}
orValues := getOrValuesExt(sre)
// Sort orValues for faster index seek later
sort.Strings(orValues)
if len(prefix) > 0 {
// Add prefix to orValues
for i, orValue := range orValues {
orValues[i] = prefix + orValue
}
}
return orValues
}
// getOrValuesExt returns all the strings matched by sre.
//
// It returns nil if sre contains anything other than capture groups, literals
// accepted by isLiteral, empty matches, alternations, char classes and
// concatenations of these, or if the number of matching strings exceeds maxOrValues.
//
// The returned values aren't sorted. sre isn't modified.
func getOrValuesExt(sre *syntax.Regexp) []string {
	switch sre.Op {
	case syntax.OpCapture:
		return getOrValuesExt(sre.Sub[0])
	case syntax.OpLiteral:
		if !isLiteral(sre) {
			return nil
		}
		return []string{string(sre.Rune)}
	case syntax.OpEmptyMatch:
		return []string{""}
	case syntax.OpAlternate:
		a := make([]string, 0, len(sre.Sub))
		for _, reSub := range sre.Sub {
			ca := getOrValuesExt(reSub)
			if len(ca) == 0 {
				return nil
			}
			a = append(a, ca...)
			if len(a) > maxOrValues {
				// It is cheaper to use regexp here.
				return nil
			}
		}
		return a
	case syntax.OpCharClass:
		// sre.Rune is walked as a list of inclusive (start, end) rune pairs.
		a := make([]string, 0, len(sre.Rune)/2)
		for i := 0; i < len(sre.Rune); i += 2 {
			start := sre.Rune[i]
			end := sre.Rune[i+1]
			for start <= end {
				a = append(a, string(start))
				start++
				if len(a) > maxOrValues {
					// It is cheaper to use regexp here.
					return nil
				}
			}
		}
		return a
	case syntax.OpConcat:
		return getConcatOrValues(sre.Sub)
	default:
		return nil
	}
}

// getConcatOrValues returns all the strings matched by the concatenation of subs.
//
// It walks subs via sub-slicing instead of truncating the Sub field of the parent
// regexp, so the regexp tree passed to getOrValuesExt is left intact.
func getConcatOrValues(subs []*syntax.Regexp) []string {
	if len(subs) == 0 {
		return []string{""}
	}
	prefixes := getOrValuesExt(subs[0])
	if len(prefixes) == 0 {
		return nil
	}
	if len(subs) == 1 {
		return prefixes
	}
	suffixes := getConcatOrValues(subs[1:])
	if len(suffixes) == 0 {
		return nil
	}
	if len(prefixes)*len(suffixes) > maxOrValues {
		// It is cheaper to use regexp here.
		return nil
	}
	a := make([]string, 0, len(prefixes)*len(suffixes))
	for _, prefix := range prefixes {
		for _, suffix := range suffixes {
			a = append(a, prefix+suffix)
		}
	}
	return a
}
// isLiteral reports whether sre is a case-sensitive literal,
// possibly wrapped into any number of capturing groups.
func isLiteral(sre *syntax.Regexp) bool {
	// Unwrap nested capturing groups.
	for sre.Op == syntax.OpCapture {
		sre = sre.Sub[0]
	}
	if sre.Op != syntax.OpLiteral {
		return false
	}
	// Literals with the FoldCase flag aren't treated as literals.
	return sre.Flags&syntax.FoldCase == 0
}
// maxOrValues is the maximum number of "or" values getOrValuesExt returns for a regexp.
//
// getOrValuesExt returns nil for regexps expanding to more values.
const maxOrValues = 100
// Simplify simplifies the given expr.
//
// It returns two values: the plaintext prefix and the remaining regular expression.
// The '^' and '$' anchors at the beginning and the end of the regular expression
// are dropped.
//
// If expr cannot be parsed or the simplified regexp cannot be compiled,
// then the whole expr is returned as the prefix with an empty remaining regexp.
//
// Capturing parens are removed from the expr, so the function cannot be used
// when capturing parens are necessary.
func Simplify(expr string) (string, string) {
	parsed, err := syntax.Parse(expr, syntax.Perl)
	if err != nil {
		// The regexp cannot be parsed, so return it all as prefix.
		return expr, ""
	}

	parsed = simplifyRegexp(parsed, false)
	switch {
	case parsed == emptyRegexp:
		return "", ""
	case isLiteral(parsed):
		return string(parsed.Rune), ""
	}

	prefix := ""
	if parsed.Op == syntax.OpConcat && isLiteral(parsed.Sub[0]) {
		// Cut the leading literal and simplify the rest,
		// taking into account that it now has a prefix.
		prefix = string(parsed.Sub[0].Rune)
		parsed.Sub = parsed.Sub[1:]
		if len(parsed.Sub) == 0 {
			return prefix, ""
		}
		parsed = simplifyRegexp(parsed, true)
	}

	if _, err := syntax.Compile(parsed); err != nil {
		// The regexp cannot be compiled, so return it all as prefix.
		return expr, ""
	}

	// Strip the verbose forms from the string representation of the regexp.
	// The replacements are applied one after another in the listed order.
	tail := parsed.String()
	replacements := [...][2]string{
		{"(?:)", ""},
		{"(?-s:.)", "."},
		{"(?-m:$)", "$"},
	}
	for _, r := range replacements {
		tail = strings.ReplaceAll(tail, r[0], r[1])
	}
	return prefix, tail
}
// simplifyRegexp repeatedly simplifies sre until its string representation stops changing.
//
// hasPrefix must be set to true if sre is preceded by something, which has been
// already cut from the regexp by the caller (e.g. a literal prefix).
//
// A regexp consisting solely of the end anchor is replaced with emptyRegexp.
// A regexp consisting solely of the begin anchor is replaced with emptyRegexp
// only if hasPrefix is false. Otherwise the anchor is kept, since it follows
// the already cut prefix. For example, the '^' remaining after cutting the "foo"
// prefix from "foo^" can never match, so it must not be replaced with
// a regexp matching the empty string.
func simplifyRegexp(sre *syntax.Regexp, hasPrefix bool) *syntax.Regexp {
	s := sre.String()
	for {
		sre = simplifyRegexpExt(sre, hasPrefix, false)
		sre = sre.Simplify()
		if (sre.Op == syntax.OpBeginText && !hasPrefix) || sre.Op == syntax.OpEndText {
			sre = emptyRegexp
		}
		sNew := sre.String()
		if sNew == s {
			// Nothing has changed during the last iteration.
			return sre
		}
		// Re-parse the simplified regexp, since this may open
		// additional simplification possibilities.
		var err error
		sre, err = syntax.Parse(sNew, syntax.Perl)
		if err != nil {
			panic(fmt.Errorf("BUG: cannot parse simplified regexp %q: %w", sNew, err))
		}
		s = sNew
	}
}
// simplifyRegexpExt performs a single simplification pass over sre.
//
// It substitutes capturing groups with non-capturing ones, drops empty parts
// from concatenations and removes the begin / end anchors at the beginning / end
// of the regexp.
//
// hasPrefix must be set to true if sre may be preceded by another part of the regexp.
// In this case the leading begin anchors are kept.
// hasSuffix must be set to true if sre may be followed by another part of the regexp.
// In this case the trailing end anchors are kept.
//
// sre is modified in place. emptyRegexp is returned if sre is reduced to nothing.
func simplifyRegexpExt(sre *syntax.Regexp, hasPrefix, hasSuffix bool) *syntax.Regexp {
	switch sre.Op {
	case syntax.OpCapture:
		// Substitute all the capture regexps with non-capture regexps.
		sre.Op = syntax.OpAlternate
		sre.Sub[0] = simplifyRegexpExt(sre.Sub[0], hasPrefix, hasSuffix)
		if sre.Sub[0] == emptyRegexp {
			return emptyRegexp
		}
		return sre
	case syntax.OpStar, syntax.OpPlus, syntax.OpQuest, syntax.OpRepeat:
		if sre.Op != syntax.OpQuest {
			// The sub-regexp may match multiple times in a row, so every iteration
			// may be preceded and followed by another iteration. That's why anchors
			// inside the sub-regexp must be kept. Otherwise "(^a)+" would be
			// simplified to "a+", which additionally matches "aa".
			hasPrefix = true
			hasSuffix = true
		}
		sre.Sub[0] = simplifyRegexpExt(sre.Sub[0], hasPrefix, hasSuffix)
		if sre.Sub[0] == emptyRegexp {
			return emptyRegexp
		}
		return sre
	case syntax.OpAlternate:
		// Do not remove empty captures from OpAlternate, since this may break regexp.
		for i, sub := range sre.Sub {
			sre.Sub[i] = simplifyRegexpExt(sub, hasPrefix, hasSuffix)
		}
		return sre
	case syntax.OpConcat:
		// Filter out empty parts in place.
		subs := sre.Sub[:0]
		for i, sub := range sre.Sub {
			sub = simplifyRegexpExt(sub, hasPrefix || len(subs) > 0, hasSuffix || i+1 < len(sre.Sub))
			if sub != emptyRegexp {
				subs = append(subs, sub)
			}
		}
		sre.Sub = subs
		// Remove anchors from the beginning and the end of regexp, since they
		// will be added later.
		if !hasPrefix {
			for len(sre.Sub) > 0 && sre.Sub[0].Op == syntax.OpBeginText {
				sre.Sub = sre.Sub[1:]
			}
		}
		if !hasSuffix {
			for len(sre.Sub) > 0 && sre.Sub[len(sre.Sub)-1].Op == syntax.OpEndText {
				sre.Sub = sre.Sub[:len(sre.Sub)-1]
			}
		}
		if len(sre.Sub) == 0 {
			return emptyRegexp
		}
		if len(sre.Sub) == 1 {
			// A concatenation of a single part is the part itself.
			return sre.Sub[0]
		}
		return sre
	case syntax.OpEmptyMatch:
		return emptyRegexp
	default:
		return sre
	}
}
// emptyRegexp is the sentinel for a regexp reduced to the empty match.
//
// The simplification functions in this file return this exact pointer
// and compare against it by pointer identity, so it must not be copied or modified.
var emptyRegexp = &syntax.Regexp{
Op: syntax.OpEmptyMatch,
}