VictoriaMetrics/lib/protoparser/prometheus/parser.go
Aliaksandr Valialkin 92d01db85a
lib/protoparser/prometheus: optimize GetRowsDiff() function
This should help https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1745 ,
since the provided profile shows that the majority of CPU and memory is spent in this function
during `streamParse` when `-promscrape.noStaleMarkers` wasn't set.
2021-10-27 18:55:25 +03:00

684 lines
15 KiB
Go

package prometheus
import (
"bytes"
"fmt"
"sort"
"strings"
"sync"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/metrics"
"github.com/valyala/fastjson/fastfloat"
)
// Rows contains parsed Prometheus rows.
type Rows struct {
Rows []Row
tagsPool []Tag
}
// Reset resets rs.
func (rs *Rows) Reset() {
// Reset items, so they can be GC'ed
for i := range rs.Rows {
rs.Rows[i].reset()
}
rs.Rows = rs.Rows[:0]
for i := range rs.tagsPool {
rs.tagsPool[i].reset()
}
rs.tagsPool = rs.tagsPool[:0]
}
// Unmarshal unmarshals Prometheus exposition text rows from s.
//
// See https://github.com/prometheus/docs/blob/master/content/docs/instrumenting/exposition_formats.md#text-format-details
//
// s shouldn't be modified while rs is in use.
func (rs *Rows) Unmarshal(s string) {
rs.UnmarshalWithErrLogger(s, stdErrLogger)
}
func stdErrLogger(s string) {
logger.ErrorfSkipframes(1, "%s", s)
}
// UnmarshalWithErrLogger unmarshal Prometheus exposition text rows from s.
//
// It calls errLogger for logging parsing errors.
//
// s shouldn't be modified while rs is in use.
func (rs *Rows) UnmarshalWithErrLogger(s string, errLogger func(s string)) {
noEscapes := strings.IndexByte(s, '\\') < 0
rs.Rows, rs.tagsPool = unmarshalRows(rs.Rows[:0], s, rs.tagsPool[:0], noEscapes, errLogger)
}
// Row is a single Prometheus row.
type Row struct {
Metric string
Tags []Tag
Value float64
Timestamp int64
}
func (r *Row) reset() {
r.Metric = ""
r.Tags = nil
r.Value = 0
r.Timestamp = 0
}
func skipTrailingComment(s string) string {
n := strings.IndexByte(s, '#')
if n < 0 {
return s
}
return s[:n]
}
func skipLeadingWhitespace(s string) string {
// Prometheus treats ' ' and '\t' as whitespace
// according to https://github.com/prometheus/docs/blob/master/content/docs/instrumenting/exposition_formats.md#text-format-details
for len(s) > 0 && (s[0] == ' ' || s[0] == '\t') {
s = s[1:]
}
return s
}
func skipTrailingWhitespace(s string) string {
// Prometheus treats ' ' and '\t' as whitespace
// according to https://github.com/prometheus/docs/blob/master/content/docs/instrumenting/exposition_formats.md#text-format-details
for len(s) > 0 && (s[len(s)-1] == ' ' || s[len(s)-1] == '\t') {
s = s[:len(s)-1]
}
return s
}
func nextWhitespace(s string) int {
n := strings.IndexByte(s, ' ')
if n < 0 {
return strings.IndexByte(s, '\t')
}
n1 := strings.IndexByte(s, '\t')
if n1 < 0 || n1 > n {
return n
}
return n1
}
func (r *Row) unmarshal(s string, tagsPool []Tag, noEscapes bool) ([]Tag, error) {
r.reset()
s = skipLeadingWhitespace(s)
n := strings.IndexByte(s, '{')
if n >= 0 {
// Tags found. Parse them.
r.Metric = skipTrailingWhitespace(s[:n])
s = s[n+1:]
tagsStart := len(tagsPool)
var err error
s, tagsPool, err = unmarshalTags(tagsPool, s, noEscapes)
if err != nil {
return tagsPool, fmt.Errorf("cannot unmarshal tags: %w", err)
}
if len(s) > 0 && s[0] == ' ' {
// Fast path - skip whitespace.
s = s[1:]
}
tags := tagsPool[tagsStart:]
r.Tags = tags[:len(tags):len(tags)]
} else {
// Tags weren't found. Search for value after whitespace
n = nextWhitespace(s)
if n < 0 {
return tagsPool, fmt.Errorf("missing value")
}
r.Metric = s[:n]
s = s[n+1:]
}
if len(r.Metric) == 0 {
return tagsPool, fmt.Errorf("metric cannot be empty")
}
s = skipLeadingWhitespace(s)
s = skipTrailingComment(s)
if len(s) == 0 {
return tagsPool, fmt.Errorf("value cannot be empty")
}
n = nextWhitespace(s)
if n < 0 {
// There is no timestamp.
v, err := fastfloat.Parse(s)
if err != nil {
return tagsPool, fmt.Errorf("cannot parse value %q: %w", s, err)
}
r.Value = v
return tagsPool, nil
}
// There is a timestamp.
v, err := fastfloat.Parse(s[:n])
if err != nil {
return tagsPool, fmt.Errorf("cannot parse value %q: %w", s[:n], err)
}
r.Value = v
s = skipLeadingWhitespace(s[n+1:])
if len(s) == 0 {
// There is no timestamp - just a whitespace after the value.
return tagsPool, nil
}
// There are some whitespaces after timestamp
s = skipTrailingWhitespace(s)
ts, err := fastfloat.Parse(s)
if err != nil {
return tagsPool, fmt.Errorf("cannot parse timestamp %q: %w", s, err)
}
if ts >= -1<<31 && ts < 1<<31 {
// This looks like OpenMetrics timestamp in Unix seconds.
// Convert it to milliseconds.
//
// See https://github.com/OpenObservability/OpenMetrics/blob/master/specification/OpenMetrics.md#timestamps
ts *= 1000
}
r.Timestamp = int64(ts)
return tagsPool, nil
}
var rowsReadScrape = metrics.NewCounter(`vm_protoparser_rows_read_total{type="promscrape"}`)
func unmarshalRows(dst []Row, s string, tagsPool []Tag, noEscapes bool, errLogger func(s string)) ([]Row, []Tag) {
dstLen := len(dst)
for len(s) > 0 {
n := strings.IndexByte(s, '\n')
if n < 0 {
// The last line.
dst, tagsPool = unmarshalRow(dst, s, tagsPool, noEscapes, errLogger)
break
}
dst, tagsPool = unmarshalRow(dst, s[:n], tagsPool, noEscapes, errLogger)
s = s[n+1:]
}
rowsReadScrape.Add(len(dst) - dstLen)
return dst, tagsPool
}
func unmarshalRow(dst []Row, s string, tagsPool []Tag, noEscapes bool, errLogger func(s string)) ([]Row, []Tag) {
if len(s) > 0 && s[len(s)-1] == '\r' {
s = s[:len(s)-1]
}
s = skipLeadingWhitespace(s)
if len(s) == 0 {
// Skip empty line
return dst, tagsPool
}
if s[0] == '#' {
// Skip comment
return dst, tagsPool
}
if cap(dst) > len(dst) {
dst = dst[:len(dst)+1]
} else {
dst = append(dst, Row{})
}
r := &dst[len(dst)-1]
var err error
tagsPool, err = r.unmarshal(s, tagsPool, noEscapes)
if err != nil {
dst = dst[:len(dst)-1]
msg := fmt.Sprintf("cannot unmarshal Prometheus line %q: %s", s, err)
errLogger(msg)
invalidLines.Inc()
}
return dst, tagsPool
}
var invalidLines = metrics.NewCounter(`vm_rows_invalid_total{type="prometheus"}`)
func unmarshalTags(dst []Tag, s string, noEscapes bool) (string, []Tag, error) {
for {
s = skipLeadingWhitespace(s)
if len(s) > 0 && s[0] == '}' {
// End of tags found.
return s[1:], dst, nil
}
n := strings.IndexByte(s, '=')
if n < 0 {
return s, dst, fmt.Errorf("missing value for tag %q", s)
}
key := skipTrailingWhitespace(s[:n])
s = skipLeadingWhitespace(s[n+1:])
if len(s) == 0 || s[0] != '"' {
return s, dst, fmt.Errorf("expecting quoted value for tag %q; got %q", key, s)
}
value := s[1:]
if noEscapes {
// Fast path - the line has no escape chars
n = strings.IndexByte(value, '"')
if n < 0 {
return s, dst, fmt.Errorf("missing closing quote for tag value %q", s)
}
s = value[n+1:]
value = value[:n]
} else {
// Slow path - the line contains escape chars
n = findClosingQuote(s)
if n < 0 {
return s, dst, fmt.Errorf("missing closing quote for tag value %q", s)
}
value = unescapeValue(s[1:n])
s = s[n+1:]
}
if len(key) > 0 {
// Allow empty values (len(value)==0) - see https://github.com/VictoriaMetrics/VictoriaMetrics/issues/453
if cap(dst) > len(dst) {
dst = dst[:len(dst)+1]
} else {
dst = append(dst, Tag{})
}
tag := &dst[len(dst)-1]
tag.Key = key
tag.Value = value
}
s = skipLeadingWhitespace(s)
if len(s) > 0 && s[0] == '}' {
// End of tags found.
return s[1:], dst, nil
}
if len(s) == 0 || s[0] != ',' {
return s, dst, fmt.Errorf("missing comma after tag %s=%q", key, value)
}
s = s[1:]
}
}
// Tag is a Prometheus tag.
type Tag struct {
Key string
Value string
}
func (t *Tag) reset() {
t.Key = ""
t.Value = ""
}
func findClosingQuote(s string) int {
if len(s) == 0 || s[0] != '"' {
return -1
}
off := 1
s = s[1:]
for {
n := strings.IndexByte(s, '"')
if n < 0 {
return -1
}
if prevBackslashesCount(s[:n])%2 == 0 {
return off + n
}
off += n + 1
s = s[n+1:]
}
}
func unescapeValue(s string) string {
n := strings.IndexByte(s, '\\')
if n < 0 {
// Fast path - nothing to unescape
return s
}
b := make([]byte, 0, len(s))
for {
b = append(b, s[:n]...)
s = s[n+1:]
if len(s) == 0 {
b = append(b, '\\')
break
}
// label_value can be any sequence of UTF-8 characters, but the backslash (\), double-quote ("),
// and line feed (\n) characters have to be escaped as \\, \", and \n, respectively.
// See https://github.com/prometheus/docs/blob/master/content/docs/instrumenting/exposition_formats.md
switch s[0] {
case '\\':
b = append(b, '\\')
case '"':
b = append(b, '"')
case 'n':
b = append(b, '\n')
default:
b = append(b, '\\', s[0])
}
s = s[1:]
n = strings.IndexByte(s, '\\')
if n < 0 {
b = append(b, s...)
break
}
}
return string(b)
}
func appendEscapedValue(dst []byte, s string) []byte {
// label_value can be any sequence of UTF-8 characters, but the backslash (\), double-quote ("),
// and line feed (\n) characters have to be escaped as \\, \", and \n, respectively.
// See https://github.com/prometheus/docs/blob/master/content/docs/instrumenting/exposition_formats.md
for {
n := strings.IndexAny(s, "\\\"\n")
if n < 0 {
return append(dst, s...)
}
dst = append(dst, s[:n]...)
switch s[n] {
case '\\':
dst = append(dst, "\\\\"...)
case '"':
dst = append(dst, "\\\""...)
case '\n':
dst = append(dst, "\\n"...)
}
s = s[n+1:]
}
}
func prevBackslashesCount(s string) int {
n := 0
for len(s) > 0 && s[len(s)-1] == '\\' {
n++
s = s[:len(s)-1]
}
return n
}
// GetRowsDiff returns rows from s1, which are missing in s2.
//
// The returned rows have default value 0 and have no timestamps.
func GetRowsDiff(s1, s2 string) string {
li1 := getLinesIterator()
li2 := getLinesIterator()
defer func() {
putLinesIterator(li1)
putLinesIterator(li2)
}()
li1.Init(s1)
li2.Init(s2)
if !li1.NextKey() {
return ""
}
var diff []byte
if !li2.NextKey() {
diff = appendKeys(diff, li1)
return string(diff)
}
for {
switch bytes.Compare(li1.Key, li2.Key) {
case -1:
diff = appendKey(diff, li1.Key)
if !li1.NextKey() {
return string(diff)
}
case 0:
if !li1.NextKey() {
return string(diff)
}
if !li2.NextKey() {
diff = appendKeys(diff, li1)
return string(diff)
}
case 1:
if !li2.NextKey() {
diff = appendKeys(diff, li1)
return string(diff)
}
}
}
}
type linesIterator struct {
rows []Row
a []string
tagsPool []Tag
// Key contains the next key after NextKey call
Key []byte
}
var linesIteratorPool sync.Pool
func getLinesIterator() *linesIterator {
v := linesIteratorPool.Get()
if v == nil {
return &linesIterator{}
}
return v.(*linesIterator)
}
func putLinesIterator(li *linesIterator) {
li.a = nil
linesIteratorPool.Put(li)
}
func (li *linesIterator) Init(s string) {
a := strings.Split(s, "\n")
sort.Strings(a)
li.a = a
}
// NextKey advances to the next key in li.
//
// It returns true if the next key is found and Key is successcully updated.
func (li *linesIterator) NextKey() bool {
for {
if len(li.a) == 0 {
return false
}
li.rows, li.tagsPool = unmarshalRow(li.rows[:0], li.a[0], li.tagsPool[:0], false, stdErrLogger)
li.a = li.a[1:]
if len(li.rows) > 0 {
li.Key = marshalMetricNameWithTags(li.Key[:0], &li.rows[0])
return true
}
}
}
func appendKey(dst, key []byte) []byte {
dst = append(dst, key...)
dst = append(dst, " 0\n"...)
return dst
}
func appendKeys(dst []byte, li *linesIterator) []byte {
for {
dst = appendKey(dst, li.Key)
if !li.NextKey() {
return dst
}
}
}
func marshalMetricNameWithTags(dst []byte, r *Row) []byte {
dst = append(dst, r.Metric...)
if len(r.Tags) == 0 {
return dst
}
dst = append(dst, '{')
for i, t := range r.Tags {
dst = append(dst, t.Key...)
dst = append(dst, `="`...)
dst = appendEscapedValue(dst, t.Value)
dst = append(dst, '"')
if i+1 < len(r.Tags) {
dst = append(dst, ',')
}
}
dst = append(dst, '}')
return dst
}
// AreIdenticalSeriesFast returns true if s1 and s2 contains identical Prometheus series with possible different values.
//
// This function is optimized for speed.
func AreIdenticalSeriesFast(s1, s2 string) bool {
for {
if len(s1) == 0 {
// The last byte on the line reached.
return len(s2) == 0
}
if len(s2) == 0 {
// The last byte on s2 reached, while s1 has non-empty contents.
return false
}
// Extract the next pair of lines from s1 and s2.
var x1, x2 string
n1 := strings.IndexByte(s1, '\n')
if n1 < 0 {
x1 = s1
s1 = ""
} else {
x1 = s1[:n1]
s1 = s1[n1+1:]
}
if n := strings.IndexByte(x1, '#'); n >= 0 {
// Drop comment.
x1 = x1[:n]
}
n2 := strings.IndexByte(s2, '\n')
if n2 < 0 {
if n1 >= 0 {
return false
}
x2 = s2
s2 = ""
} else {
if n1 < 0 {
return false
}
x2 = s2[:n2]
s2 = s2[n2+1:]
}
if n := strings.IndexByte(x2, '#'); n >= 0 {
// Drop comment.
x2 = x2[:n]
}
// Skip whitespaces in front of lines
for len(x1) > 0 && x1[0] == ' ' {
if len(x2) == 0 || x2[0] != ' ' {
return false
}
x1 = x1[1:]
x2 = x2[1:]
}
if len(x1) == 0 {
// The last byte on x1 reached.
if len(x2) != 0 {
return false
}
continue
}
if len(x2) == 0 {
// The last byte on x2 reached, while x1 has non-empty contents.
return false
}
// Compare metric names
n := strings.IndexByte(x1, ' ')
if n < 0 {
// Invalid Prometheus line - it must contain at least a single space between metric name and value
// Compare it in full with x2.
n = len(x1) - 1
}
n++
if n > len(x2) || x1[:n] != x2[:n] {
// Metric names mismatch
return false
}
x1 = x1[n:]
x2 = x2[n:]
// The space could belong to metric name in the following cases:
// foo {bar="baz"} 1
// foo{ bar="baz"} 2
// foo{bar="baz", aa="b"} 3
// foo{bar="b az"} 4
// foo 5
// Continue comparing the remaining parts until space or newline.
for {
n1 := strings.IndexByte(x1, ' ')
if n1 < 0 {
// Fast path.
// Treat x1 as a value.
// Skip values at x1 and x2.
n2 := strings.IndexByte(x2, ' ')
if n2 >= 0 {
// x2 contains additional parts.
return false
}
break
}
n1++
// Slow path.
// The x1[:n1] can be either a part of metric name or a value if timestamp is present:
// foo 12 34
if isNumeric(x1[:n1-1]) {
// Skip numeric part (most likely a value before timestamp) in x1 and x2
n2 := strings.IndexByte(x2, ' ')
if n2 < 0 {
// x2 contains less parts than x1
return false
}
n2++
if !isNumeric(x2[:n2-1]) {
// x1 contains numeric part, while x2 contains non-numeric part
return false
}
x1 = x1[n1:]
x2 = x2[n2:]
} else {
// The non-numeric part from x1 must match the corresponding part from x2.
if n1 > len(x2) || x1[:n1] != x2[:n1] {
// Parts mismatch
return false
}
x1 = x1[n1:]
x2 = x2[n1:]
}
}
}
}
func isNumeric(s string) bool {
for i := 0; i < len(s); i++ {
if numericChars[s[i]] {
continue
}
if i == 0 && s == "NaN" || s == "nan" || s == "Inf" || s == "inf" {
return true
}
if i == 1 && (s[0] == '-' || s[0] == '+') && (s[1:] == "Inf" || s[1:] == "inf") {
return true
}
return false
}
return true
}
var numericChars = [256]bool{
'0': true,
'1': true,
'2': true,
'3': true,
'4': true,
'5': true,
'6': true,
'7': true,
'8': true,
'9': true,
'-': true,
'+': true,
'e': true,
'E': true,
'.': true,
}