2022-08-24 16:54:26 +02:00
|
|
|
package regexutil
|
|
|
|
|
|
|
|
import (
|
2022-08-26 10:57:12 +02:00
|
|
|
"fmt"
|
2022-08-24 16:54:26 +02:00
|
|
|
"regexp/syntax"
|
|
|
|
"sort"
|
2022-08-25 22:22:03 +02:00
|
|
|
"strings"
|
2022-08-24 16:54:26 +02:00
|
|
|
)
|
|
|
|
|
2022-08-25 22:22:03 +02:00
|
|
|
// RemoveStartEndAnchors removes '^' at the start of expr and '$' at the end of the expr.
|
|
|
|
func RemoveStartEndAnchors(expr string) string {
|
|
|
|
for strings.HasPrefix(expr, "^") {
|
|
|
|
expr = expr[1:]
|
|
|
|
}
|
2022-09-30 07:13:56 +02:00
|
|
|
for strings.HasSuffix(expr, "$") && !strings.HasSuffix(expr, "\\$") {
|
2022-08-25 22:22:03 +02:00
|
|
|
expr = expr[:len(expr)-1]
|
|
|
|
}
|
|
|
|
return expr
|
|
|
|
}
|
|
|
|
|
2024-05-24 03:06:55 +02:00
|
|
|
// GetOrValuesRegex returns "or" values from the given regexp expr.
|
|
|
|
//
|
|
|
|
// It returns ["foo", "bar"] for "foo|bar" regexp.
|
|
|
|
// It returns ["foo"] for "foo" regexp.
|
|
|
|
// It returns [""] for "" regexp.
|
|
|
|
// It returns an empty list if it is impossible to extract "or" values from the regexp.
|
|
|
|
func GetOrValuesRegex(expr string) []string {
|
|
|
|
return getOrValuesRegex(expr, true)
|
|
|
|
}
|
|
|
|
|
|
|
|
// GetOrValuesPromRegex returns "or" values from the given Prometheus-like regexp expr.
|
2022-08-24 16:54:26 +02:00
|
|
|
//
|
2022-08-25 22:22:03 +02:00
|
|
|
// It ignores start and end anchors ('^') and ('$') at the start and the end of expr.
|
|
|
|
// It returns ["foo", "bar"] for "foo|bar" regexp.
|
|
|
|
// It returns ["foo"] for "foo" regexp.
|
2022-08-24 16:54:26 +02:00
|
|
|
// It returns [""] for "" regexp.
|
2022-08-25 22:22:03 +02:00
|
|
|
// It returns an empty list if it is impossible to extract "or" values from the regexp.
|
2024-05-24 03:06:55 +02:00
|
|
|
func GetOrValuesPromRegex(expr string) []string {
|
2022-08-25 22:22:03 +02:00
|
|
|
expr = RemoveStartEndAnchors(expr)
|
2024-05-24 03:06:55 +02:00
|
|
|
return getOrValuesRegex(expr, false)
|
|
|
|
}
|
|
|
|
|
|
|
|
func getOrValuesRegex(expr string, keepAnchors bool) []string {
|
|
|
|
prefix, tailExpr := simplifyRegex(expr, keepAnchors)
|
2022-08-26 10:57:12 +02:00
|
|
|
if tailExpr == "" {
|
|
|
|
return []string{prefix}
|
|
|
|
}
|
2024-05-24 03:06:55 +02:00
|
|
|
sre, err := parseRegexp(tailExpr)
|
2022-08-24 16:54:26 +02:00
|
|
|
if err != nil {
|
2024-05-24 03:06:55 +02:00
|
|
|
return nil
|
2022-08-24 16:54:26 +02:00
|
|
|
}
|
2024-05-24 03:06:55 +02:00
|
|
|
orValues := getOrValues(sre)
|
2022-08-24 16:54:26 +02:00
|
|
|
|
|
|
|
// Sort orValues for faster index seek later
|
|
|
|
sort.Strings(orValues)
|
|
|
|
|
2022-08-26 10:57:12 +02:00
|
|
|
if len(prefix) > 0 {
|
|
|
|
// Add prefix to orValues
|
|
|
|
for i, orValue := range orValues {
|
|
|
|
orValues[i] = prefix + orValue
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2022-08-24 16:54:26 +02:00
|
|
|
return orValues
|
|
|
|
}
|
|
|
|
|
2024-05-24 03:06:55 +02:00
|
|
|
func getOrValues(sre *syntax.Regexp) []string {
|
2022-08-24 16:54:26 +02:00
|
|
|
switch sre.Op {
|
|
|
|
case syntax.OpCapture:
|
2024-05-24 03:06:55 +02:00
|
|
|
return getOrValues(sre.Sub[0])
|
2022-08-24 16:54:26 +02:00
|
|
|
case syntax.OpLiteral:
|
2024-05-24 03:06:55 +02:00
|
|
|
v, ok := getLiteral(sre)
|
|
|
|
if !ok {
|
2022-08-24 16:54:26 +02:00
|
|
|
return nil
|
|
|
|
}
|
2024-05-24 03:06:55 +02:00
|
|
|
return []string{v}
|
2022-08-24 16:54:26 +02:00
|
|
|
case syntax.OpEmptyMatch:
|
|
|
|
return []string{""}
|
|
|
|
case syntax.OpAlternate:
|
|
|
|
a := make([]string, 0, len(sre.Sub))
|
|
|
|
for _, reSub := range sre.Sub {
|
2024-05-24 03:06:55 +02:00
|
|
|
ca := getOrValues(reSub)
|
2022-08-24 16:54:26 +02:00
|
|
|
if len(ca) == 0 {
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
a = append(a, ca...)
|
|
|
|
if len(a) > maxOrValues {
|
|
|
|
// It is cheaper to use regexp here.
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return a
|
|
|
|
case syntax.OpCharClass:
|
|
|
|
a := make([]string, 0, len(sre.Rune)/2)
|
|
|
|
for i := 0; i < len(sre.Rune); i += 2 {
|
|
|
|
start := sre.Rune[i]
|
|
|
|
end := sre.Rune[i+1]
|
|
|
|
for start <= end {
|
|
|
|
a = append(a, string(start))
|
|
|
|
start++
|
|
|
|
if len(a) > maxOrValues {
|
|
|
|
// It is cheaper to use regexp here.
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return a
|
|
|
|
case syntax.OpConcat:
|
|
|
|
if len(sre.Sub) < 1 {
|
|
|
|
return []string{""}
|
|
|
|
}
|
2024-05-24 03:06:55 +02:00
|
|
|
prefixes := getOrValues(sre.Sub[0])
|
2022-08-24 16:54:26 +02:00
|
|
|
if len(prefixes) == 0 {
|
|
|
|
return nil
|
|
|
|
}
|
2022-08-26 10:57:12 +02:00
|
|
|
if len(sre.Sub) == 1 {
|
|
|
|
return prefixes
|
|
|
|
}
|
2022-08-24 16:54:26 +02:00
|
|
|
sre.Sub = sre.Sub[1:]
|
2024-05-24 03:06:55 +02:00
|
|
|
suffixes := getOrValues(sre)
|
2022-08-24 16:54:26 +02:00
|
|
|
if len(suffixes) == 0 {
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
if len(prefixes)*len(suffixes) > maxOrValues {
|
|
|
|
// It is cheaper to use regexp here.
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
a := make([]string, 0, len(prefixes)*len(suffixes))
|
|
|
|
for _, prefix := range prefixes {
|
|
|
|
for _, suffix := range suffixes {
|
|
|
|
s := prefix + suffix
|
|
|
|
a = append(a, s)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return a
|
|
|
|
default:
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2024-05-24 03:06:55 +02:00
|
|
|
func getLiteral(sre *syntax.Regexp) (string, bool) {
|
2022-08-24 16:54:26 +02:00
|
|
|
if sre.Op == syntax.OpCapture {
|
2024-05-24 03:06:55 +02:00
|
|
|
return getLiteral(sre.Sub[0])
|
|
|
|
}
|
|
|
|
if sre.Op == syntax.OpLiteral && sre.Flags&syntax.FoldCase == 0 {
|
|
|
|
return string(sre.Rune), true
|
2022-08-24 16:54:26 +02:00
|
|
|
}
|
2024-05-24 03:06:55 +02:00
|
|
|
return "", false
|
2022-08-24 16:54:26 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
const maxOrValues = 100
|
2022-08-26 10:57:12 +02:00
|
|
|
|
2024-05-24 03:06:55 +02:00
|
|
|
// SimplifyRegex simplifies the given regexp expr.
|
|
|
|
//
|
|
|
|
// It returns plaintext pefix and the remaining regular expression
|
|
|
|
// without capturing parens.
|
|
|
|
func SimplifyRegex(expr string) (string, string) {
|
|
|
|
prefix, suffix := simplifyRegex(expr, true)
|
|
|
|
sre := mustParseRegexp(suffix)
|
|
|
|
|
|
|
|
if isDotOp(sre, syntax.OpStar) {
|
|
|
|
return prefix, ""
|
|
|
|
}
|
|
|
|
if sre.Op == syntax.OpConcat {
|
|
|
|
subs := sre.Sub
|
|
|
|
if prefix == "" {
|
|
|
|
// Drop .* at the start
|
|
|
|
for len(subs) > 0 && isDotOp(subs[0], syntax.OpStar) {
|
|
|
|
subs = subs[1:]
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Drop .* at the end.
|
|
|
|
for len(subs) > 0 && isDotOp(subs[len(subs)-1], syntax.OpStar) {
|
|
|
|
subs = subs[:len(subs)-1]
|
|
|
|
}
|
|
|
|
|
|
|
|
sre.Sub = subs
|
|
|
|
if len(subs) == 0 {
|
|
|
|
return prefix, ""
|
|
|
|
}
|
|
|
|
suffix = sre.String()
|
|
|
|
}
|
|
|
|
return prefix, suffix
|
|
|
|
}
|
|
|
|
|
|
|
|
// SimplifyPromRegex simplifies the given Prometheus-like expr.
|
2022-08-26 10:57:12 +02:00
|
|
|
//
|
|
|
|
// It returns plaintext prefix and the remaining regular expression
|
2024-05-24 03:06:55 +02:00
|
|
|
// with dropped '^' and '$' anchors at the beginning and at the end
|
2022-08-26 10:57:12 +02:00
|
|
|
// of the regular expression.
|
|
|
|
//
|
|
|
|
// The function removes capturing parens from the expr,
|
|
|
|
// so it cannot be used when capturing parens are necessary.
|
2024-05-24 03:06:55 +02:00
|
|
|
func SimplifyPromRegex(expr string) (string, string) {
|
|
|
|
return simplifyRegex(expr, false)
|
|
|
|
}
|
|
|
|
|
|
|
|
func simplifyRegex(expr string, keepAnchors bool) (string, string) {
|
|
|
|
sre, err := parseRegexp(expr)
|
2022-08-26 10:57:12 +02:00
|
|
|
if err != nil {
|
|
|
|
// Cannot parse the regexp. Return it all as prefix.
|
|
|
|
return expr, ""
|
|
|
|
}
|
2024-05-24 03:06:55 +02:00
|
|
|
sre = simplifyRegexp(sre, keepAnchors, keepAnchors)
|
2022-08-26 10:57:12 +02:00
|
|
|
if sre == emptyRegexp {
|
|
|
|
return "", ""
|
|
|
|
}
|
2024-05-24 03:06:55 +02:00
|
|
|
v, ok := getLiteral(sre)
|
|
|
|
if ok {
|
|
|
|
return v, ""
|
2022-08-26 10:57:12 +02:00
|
|
|
}
|
|
|
|
var prefix string
|
|
|
|
if sre.Op == syntax.OpConcat {
|
2024-05-24 03:06:55 +02:00
|
|
|
prefix, ok = getLiteral(sre.Sub[0])
|
|
|
|
if ok {
|
2022-08-26 10:57:12 +02:00
|
|
|
sre.Sub = sre.Sub[1:]
|
|
|
|
if len(sre.Sub) == 0 {
|
|
|
|
return prefix, ""
|
|
|
|
}
|
2024-05-24 03:06:55 +02:00
|
|
|
sre = simplifyRegexp(sre, true, keepAnchors)
|
2022-08-26 10:57:12 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
if _, err := syntax.Compile(sre); err != nil {
|
|
|
|
// Cannot compile the regexp. Return it all as prefix.
|
|
|
|
return expr, ""
|
|
|
|
}
|
|
|
|
s := sre.String()
|
|
|
|
s = strings.ReplaceAll(s, "(?:)", "")
|
2024-05-24 03:06:55 +02:00
|
|
|
s = strings.ReplaceAll(s, "(?s:.)", ".")
|
|
|
|
s = strings.ReplaceAll(s, "(?m:$)", "$")
|
2022-08-26 10:57:12 +02:00
|
|
|
return prefix, s
|
|
|
|
}
|
|
|
|
|
2024-05-24 03:06:55 +02:00
|
|
|
func simplifyRegexp(sre *syntax.Regexp, keepBeginOp, keepEndOp bool) *syntax.Regexp {
|
2022-08-26 10:57:12 +02:00
|
|
|
s := sre.String()
|
|
|
|
for {
|
2024-05-24 03:06:55 +02:00
|
|
|
sre = simplifyRegexpExt(sre, keepBeginOp, keepEndOp)
|
2022-08-26 10:57:12 +02:00
|
|
|
sre = sre.Simplify()
|
2024-05-24 03:06:55 +02:00
|
|
|
if !keepBeginOp && sre.Op == syntax.OpBeginText {
|
|
|
|
sre = emptyRegexp
|
|
|
|
} else if !keepEndOp && sre.Op == syntax.OpEndText {
|
2022-08-26 10:57:12 +02:00
|
|
|
sre = emptyRegexp
|
|
|
|
}
|
|
|
|
sNew := sre.String()
|
|
|
|
if sNew == s {
|
|
|
|
return sre
|
|
|
|
}
|
2024-05-24 03:06:55 +02:00
|
|
|
sre = mustParseRegexp(sNew)
|
2022-08-26 10:57:12 +02:00
|
|
|
s = sNew
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2024-05-24 03:06:55 +02:00
|
|
|
func simplifyRegexpExt(sre *syntax.Regexp, keepBeginOp, keepEndOp bool) *syntax.Regexp {
|
2022-08-26 10:57:12 +02:00
|
|
|
switch sre.Op {
|
|
|
|
case syntax.OpCapture:
|
|
|
|
// Substitute all the capture regexps with non-capture regexps.
|
|
|
|
sre.Op = syntax.OpAlternate
|
2024-05-24 03:06:55 +02:00
|
|
|
sre.Sub[0] = simplifyRegexpExt(sre.Sub[0], keepBeginOp, keepEndOp)
|
2022-08-26 10:57:12 +02:00
|
|
|
if sre.Sub[0] == emptyRegexp {
|
|
|
|
return emptyRegexp
|
|
|
|
}
|
|
|
|
return sre
|
|
|
|
case syntax.OpStar, syntax.OpPlus, syntax.OpQuest, syntax.OpRepeat:
|
2024-05-24 03:06:55 +02:00
|
|
|
sre.Sub[0] = simplifyRegexpExt(sre.Sub[0], keepBeginOp, keepEndOp)
|
2022-08-26 10:57:12 +02:00
|
|
|
if sre.Sub[0] == emptyRegexp {
|
|
|
|
return emptyRegexp
|
|
|
|
}
|
|
|
|
return sre
|
|
|
|
case syntax.OpAlternate:
|
|
|
|
// Do not remove empty captures from OpAlternate, since this may break regexp.
|
|
|
|
for i, sub := range sre.Sub {
|
2024-05-24 03:06:55 +02:00
|
|
|
sre.Sub[i] = simplifyRegexpExt(sub, keepBeginOp, keepEndOp)
|
2022-08-26 10:57:12 +02:00
|
|
|
}
|
|
|
|
return sre
|
|
|
|
case syntax.OpConcat:
|
|
|
|
subs := sre.Sub[:0]
|
|
|
|
for i, sub := range sre.Sub {
|
2024-05-24 03:06:55 +02:00
|
|
|
sub = simplifyRegexpExt(sub, keepBeginOp || len(subs) > 0, keepEndOp || i+1 < len(sre.Sub))
|
2022-08-26 10:57:12 +02:00
|
|
|
if sub != emptyRegexp {
|
|
|
|
subs = append(subs, sub)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
sre.Sub = subs
|
|
|
|
// Remove anchros from the beginning and the end of regexp, since they
|
|
|
|
// will be added later.
|
2024-05-24 03:06:55 +02:00
|
|
|
if !keepBeginOp {
|
2022-08-26 10:57:12 +02:00
|
|
|
for len(sre.Sub) > 0 && sre.Sub[0].Op == syntax.OpBeginText {
|
|
|
|
sre.Sub = sre.Sub[1:]
|
|
|
|
}
|
|
|
|
}
|
2024-05-24 03:06:55 +02:00
|
|
|
if !keepEndOp {
|
2022-08-26 10:57:12 +02:00
|
|
|
for len(sre.Sub) > 0 && sre.Sub[len(sre.Sub)-1].Op == syntax.OpEndText {
|
|
|
|
sre.Sub = sre.Sub[:len(sre.Sub)-1]
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if len(sre.Sub) == 0 {
|
|
|
|
return emptyRegexp
|
|
|
|
}
|
|
|
|
if len(sre.Sub) == 1 {
|
|
|
|
return sre.Sub[0]
|
|
|
|
}
|
|
|
|
return sre
|
|
|
|
case syntax.OpEmptyMatch:
|
|
|
|
return emptyRegexp
|
|
|
|
default:
|
|
|
|
return sre
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2024-05-24 03:06:55 +02:00
|
|
|
// getSubstringLiteral returns regex part from sre surrounded by .+ or .* depending on the prefixSuffixOp.
|
|
|
|
//
|
|
|
|
// For example, if sre=".+foo.+" and prefixSuffix=syntax.OpPlus, then the function returns "foo".
|
|
|
|
//
|
|
|
|
// An empty string is returned if sre doesn't contain the given prefixSuffixOp prefix and suffix.
|
|
|
|
func getSubstringLiteral(sre *syntax.Regexp, prefixSuffixOp syntax.Op) string {
|
|
|
|
if sre.Op != syntax.OpConcat || len(sre.Sub) != 3 {
|
|
|
|
return ""
|
|
|
|
}
|
|
|
|
if !isDotOp(sre.Sub[0], prefixSuffixOp) || !isDotOp(sre.Sub[2], prefixSuffixOp) {
|
|
|
|
return ""
|
|
|
|
}
|
|
|
|
v, ok := getLiteral(sre.Sub[1])
|
|
|
|
if !ok {
|
|
|
|
return ""
|
|
|
|
}
|
|
|
|
return v
|
|
|
|
}
|
|
|
|
|
|
|
|
func isDotOp(sre *syntax.Regexp, op syntax.Op) bool {
|
|
|
|
if sre.Op != op {
|
|
|
|
return false
|
|
|
|
}
|
|
|
|
return sre.Sub[0].Op == syntax.OpAnyChar
|
|
|
|
}
|
|
|
|
|
2022-08-26 10:57:12 +02:00
|
|
|
var emptyRegexp = &syntax.Regexp{
|
|
|
|
Op: syntax.OpEmptyMatch,
|
|
|
|
}
|
2024-05-24 03:06:55 +02:00
|
|
|
|
|
|
|
func parseRegexp(expr string) (*syntax.Regexp, error) {
|
|
|
|
return syntax.Parse(expr, syntax.Perl|syntax.DotNL)
|
|
|
|
}
|
|
|
|
|
|
|
|
func mustParseRegexp(expr string) *syntax.Regexp {
|
|
|
|
sre, err := parseRegexp(expr)
|
|
|
|
if err != nil {
|
|
|
|
panic(fmt.Errorf("BUG: cannot parse already verified regexp %q: %w", expr, err))
|
|
|
|
}
|
|
|
|
return sre
|
|
|
|
}
|