VictoriaMetrics/lib/logstorage/pipe_extract.go

package logstorage

import (
	"fmt"
	"html"
	"strconv"
	"strings"
	"unsafe"

	"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
)

// pipeExtract processes '| extract from <field> <pattern>' pipe.
//
// See https://docs.victoriametrics.com/victorialogs/logsql/#extract-pipe
type pipeExtract struct {
	fromField string
	steps     []extractFormatStep

	pattern string
}

func (pe *pipeExtract) String() string {
	s := "extract"
	if !isMsgFieldName(pe.fromField) {
		s += " from " + quoteTokenIfNeeded(pe.fromField)
	}
	s += " " + quoteTokenIfNeeded(pe.pattern)
	return s
}

func (pe *pipeExtract) updateNeededFields(neededFields, unneededFields fieldsSet) {
	if neededFields.contains("*") {
		unneededFieldsOrig := unneededFields.clone()
		needFromField := false
		for _, step := range pe.steps {
			if step.field != "" {
				if !unneededFieldsOrig.contains(step.field) {
					needFromField = true
				}
				unneededFields.add(step.field)
			}
		}
		if needFromField {
			unneededFields.remove(pe.fromField)
		} else {
			unneededFields.add(pe.fromField)
		}
	} else {
		needFromField := false
		for _, step := range pe.steps {
			if step.field != "" && neededFields.contains(step.field) {
				needFromField = true
				neededFields.remove(step.field)
			}
		}
		if needFromField {
			neededFields.add(pe.fromField)
		}
	}
}

func (pe *pipeExtract) newPipeProcessor(workersCount int, _ <-chan struct{}, _ func(), ppBase pipeProcessor) pipeProcessor {
	shards := make([]pipeExtractProcessorShard, workersCount)
	for i := range shards {
		ef := newExtractFormat(pe.steps)
		rcs := make([]resultColumn, len(ef.fields))
		for j := range rcs {
			rcs[j].name = ef.fields[j].name
		}
		shards[i] = pipeExtractProcessorShard{
			pipeExtractProcessorShardNopad: pipeExtractProcessorShardNopad{
				ef:  ef,
				rcs: rcs,
			},
		}
	}

	pep := &pipeExtractProcessor{
		pe:     pe,
		ppBase: ppBase,

		shards: shards,
	}
	return pep
}

type pipeExtractProcessor struct {
	pe     *pipeExtract
	ppBase pipeProcessor

	shards []pipeExtractProcessorShard
}

type pipeExtractProcessorShard struct {
	pipeExtractProcessorShardNopad

	// The padding prevents false sharing on widespread platforms with 128 mod (cache line size) = 0 .
	_ [128 - unsafe.Sizeof(pipeExtractProcessorShardNopad{})%128]byte
}

type pipeExtractProcessorShardNopad struct {
	ef *extractFormat

	rcs []resultColumn
}

func (pep *pipeExtractProcessor) writeBlock(workerID uint, br *blockResult) {
	if len(br.timestamps) == 0 {
		return
	}

	shard := &pep.shards[workerID]
	ef := shard.ef
	rcs := shard.rcs

	c := br.getColumnByName(pep.pe.fromField)
	if c.isConst {
		v := c.valuesEncoded[0]
		ef.apply(v)
		for i, f := range ef.fields {
			fieldValue := *f.value
			rc := &rcs[i]
			for range br.timestamps {
				rc.addValue(fieldValue)
			}
		}
	} else {
		values := c.getValues(br)
		for i, v := range values {
			if i == 0 || values[i-1] != v {
				ef.apply(v)
			}
			for j, f := range ef.fields {
				rcs[j].addValue(*f.value)
			}
		}
	}

	br.addResultColumns(rcs)
	pep.ppBase.writeBlock(workerID, br)

	for i := range rcs {
		rcs[i].resetValues()
	}
}

func (pep *pipeExtractProcessor) flush() error {
	return nil
}

func parsePipeExtract(lex *lexer) (*pipeExtract, error) {
	if !lex.isKeyword("extract") {
		return nil, fmt.Errorf("unexpected token: %q; want %q", lex.token, "extract")
	}
	lex.nextToken()

	fromField := "_msg"
	if lex.isKeyword("from") {
		lex.nextToken()
		f, err := parseFieldName(lex)
		if err != nil {
			return nil, fmt.Errorf("cannot parse 'from' field name: %w", err)
		}
		fromField = f
	}

	pattern, err := getCompoundToken(lex)
	if err != nil {
		return nil, fmt.Errorf("cannot read 'pattern': %w", err)
	}
	steps, err := parseExtractFormatSteps(pattern)
	if err != nil {
		return nil, fmt.Errorf("cannot parse 'pattern' %q: %w", pattern, err)
	}

	pe := &pipeExtract{
		fromField: fromField,
		steps:     steps,
		pattern:   pattern,
	}
	return pe, nil
}

type extractFormat struct {
	// steps contains steps for extracting fields from string
	steps []extractFormatStep

	// matches contains matches for every step in steps
	matches []string

	// fields contains matches for non-empty fields
	fields []extractField
}

type extractField struct {
	name  string
	value *string
}

type extractFormatStep struct {
	prefix string
	field  string
}

func newExtractFormat(steps []extractFormatStep) *extractFormat {
	if len(steps) == 0 {
		logger.Panicf("BUG: steps cannot be empty")
	}

	matches := make([]string, len(steps))

	var fields []extractField
	for i, step := range steps {
		if step.field != "" {
			fields = append(fields, extractField{
				name:  step.field,
				value: &matches[i],
			})
		}
	}
	if len(fields) == 0 {
		logger.Panicf("BUG: fields cannot be empty")
	}

	ef := &extractFormat{
		steps:   steps,
		matches: matches,
		fields:  fields,
	}
	return ef
}

func (ef *extractFormat) apply(s string) {
	clear(ef.matches)

	steps := ef.steps

	if prefix := steps[0].prefix; prefix != "" {
		n := strings.Index(s, prefix)
		if n < 0 {
			// Mismatch
			return
		}
		s = s[n+len(prefix):]
	}

	matches := ef.matches
	for i := range steps {
		nextPrefix := ""
		if i+1 < len(steps) {
			nextPrefix = steps[i+1].prefix
		}

		us, nOffset := tryUnquoteString(s)
		if nOffset >= 0 {
			// Matched quoted string
			matches[i] = us
			s = s[nOffset:]
			if !strings.HasPrefix(s, nextPrefix) {
				// Mismatch
				return
			}
			s = s[len(nextPrefix):]
		} else {
			// Match unquoted string until the nextPrefix
			if nextPrefix == "" {
				matches[i] = s
				return
			}
			n := strings.Index(s, nextPrefix)
			if n < 0 {
				// Mismatch
				return
			}
			matches[i] = s[:n]
			s = s[n+len(nextPrefix):]
		}
	}
}

func tryUnquoteString(s string) (string, int) {
	if len(s) == 0 {
		return s, -1
	}
	if s[0] != '"' && s[0] != '`' {
		return s, -1
	}
	qp, err := strconv.QuotedPrefix(s)
	if err != nil {
		return s, -1
	}
	us, err := strconv.Unquote(qp)
	if err != nil {
		return s, -1
	}
	return us, len(qp)
}

func parseExtractFormatSteps(s string) ([]extractFormatStep, error) {
	var steps []extractFormatStep

	hasNamedField := false

	n := strings.IndexByte(s, '<')
	if n < 0 {
		return nil, fmt.Errorf("missing <...> fields")
	}
	prefix := s[:n]
	s = s[n+1:]
	for {
		n := strings.IndexByte(s, '>')
		if n < 0 {
			return nil, fmt.Errorf("missing '>' for <%s", s)
		}
		field := s[:n]
		s = s[n+1:]

		if field == "_" || field == "*" {
			field = ""
		}
		steps = append(steps, extractFormatStep{
			prefix: prefix,
			field:  field,
		})
		if !hasNamedField && field != "" {
			hasNamedField = true
		}
		if len(s) == 0 {
			break
		}

		n = strings.IndexByte(s, '<')
		if n < 0 {
			steps = append(steps, extractFormatStep{
				prefix: s,
			})
			break
		}
		if n == 0 {
			return nil, fmt.Errorf("missing delimiter after <%s>", field)
		}
		prefix = s[:n]
		s = s[n+1:]
	}

	if !hasNamedField {
		return nil, fmt.Errorf("missing named fields like <name>")
	}

	for i := range steps {
		step := &steps[i]
		step.prefix = html.UnescapeString(step.prefix)
	}

	return steps, nil
}
lib/logstorage: work-in-progress 2024-05-20 04:08:30 +02:00			`package logstorage`

			`import (`
			`"fmt"`
			`"html"`
			`"strconv"`
			`"strings"`
			`"unsafe"`

			`"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"`
			`)`

			`// pipeExtract processes '\| extract from <field> <pattern>' pipe.`
			`//`
			`// See https://docs.victoriametrics.com/victorialogs/logsql/#extract-pipe`
			`type pipeExtract struct {`
			`fromField string`
			`steps []extractFormatStep`

			`pattern string`
			`}`

			`func (pe *pipeExtract) String() string {`
			`s := "extract"`
			`if !isMsgFieldName(pe.fromField) {`
			`s += " from " + quoteTokenIfNeeded(pe.fromField)`
			`}`
			`s += " " + quoteTokenIfNeeded(pe.pattern)`
			`return s`
			`}`

			`func (pe *pipeExtract) updateNeededFields(neededFields, unneededFields fieldsSet) {`
			`if neededFields.contains("*") {`
			`unneededFieldsOrig := unneededFields.clone()`
			`needFromField := false`
			`for _, step := range pe.steps {`
			`if step.field != "" {`
			`if !unneededFieldsOrig.contains(step.field) {`
			`needFromField = true`
			`}`
			`unneededFields.add(step.field)`
			`}`
			`}`
			`if needFromField {`
			`unneededFields.remove(pe.fromField)`
			`} else {`
			`unneededFields.add(pe.fromField)`
			`}`
			`} else {`
			`needFromField := false`
			`for _, step := range pe.steps {`
			`if step.field != "" && neededFields.contains(step.field) {`
			`needFromField = true`
			`neededFields.remove(step.field)`
			`}`
			`}`
			`if needFromField {`
			`neededFields.add(pe.fromField)`
			`}`
			`}`
			`}`

			`func (pe *pipeExtract) newPipeProcessor(workersCount int, _ <-chan struct{}, _ func(), ppBase pipeProcessor) pipeProcessor {`
			`shards := make([]pipeExtractProcessorShard, workersCount)`
			`for i := range shards {`
			`ef := newExtractFormat(pe.steps)`
			`rcs := make([]resultColumn, len(ef.fields))`
			`for j := range rcs {`
			`rcs[j].name = ef.fields[j].name`
			`}`
			`shards[i] = pipeExtractProcessorShard{`
			`pipeExtractProcessorShardNopad: pipeExtractProcessorShardNopad{`
			`ef: ef,`
			`rcs: rcs,`
			`},`
			`}`
			`}`

			`pep := &pipeExtractProcessor{`
			`pe: pe,`
			`ppBase: ppBase,`

			`shards: shards,`
			`}`
			`return pep`
			`}`

			`type pipeExtractProcessor struct {`
			`pe *pipeExtract`
			`ppBase pipeProcessor`

			`shards []pipeExtractProcessorShard`
			`}`

			`type pipeExtractProcessorShard struct {`
			`pipeExtractProcessorShardNopad`

			`// The padding prevents false sharing on widespread platforms with 128 mod (cache line size) = 0 .`
			`_ [128 - unsafe.Sizeof(pipeExtractProcessorShardNopad{})%128]byte`
			`}`

			`type pipeExtractProcessorShardNopad struct {`
			`ef *extractFormat`

			`rcs []resultColumn`
			`}`

			`func (pep pipeExtractProcessor) writeBlock(workerID uint, br blockResult) {`
			`if len(br.timestamps) == 0 {`
			`return`
			`}`

			`shard := &pep.shards[workerID]`
			`ef := shard.ef`
			`rcs := shard.rcs`

			`c := br.getColumnByName(pep.pe.fromField)`
			`if c.isConst {`
			`v := c.valuesEncoded[0]`
			`ef.apply(v)`
			`for i, f := range ef.fields {`
			`fieldValue := *f.value`
			`rc := &rcs[i]`
			`for range br.timestamps {`
			`rc.addValue(fieldValue)`
			`}`
			`}`
			`} else {`
			`values := c.getValues(br)`
			`for i, v := range values {`
			`if i == 0 \|\| values[i-1] != v {`
			`ef.apply(v)`
			`}`
			`for j, f := range ef.fields {`
			`rcs[j].addValue(*f.value)`
			`}`
			`}`
			`}`

			`br.addResultColumns(rcs)`
			`pep.ppBase.writeBlock(workerID, br)`

			`for i := range rcs {`
			`rcs[i].resetValues()`
			`}`
			`}`

			`func (pep *pipeExtractProcessor) flush() error {`
			`return nil`
			`}`

			`func parsePipeExtract(lex lexer) (pipeExtract, error) {`
			`if !lex.isKeyword("extract") {`
			`return nil, fmt.Errorf("unexpected token: %q; want %q", lex.token, "extract")`
			`}`
			`lex.nextToken()`

			`fromField := "_msg"`
			`if lex.isKeyword("from") {`
			`lex.nextToken()`
			`f, err := parseFieldName(lex)`
			`if err != nil {`
			`return nil, fmt.Errorf("cannot parse 'from' field name: %w", err)`
			`}`
			`fromField = f`
			`}`

			`pattern, err := getCompoundToken(lex)`
			`if err != nil {`
			`return nil, fmt.Errorf("cannot read 'pattern': %w", err)`
			`}`
			`steps, err := parseExtractFormatSteps(pattern)`
			`if err != nil {`
			`return nil, fmt.Errorf("cannot parse 'pattern' %q: %w", pattern, err)`
			`}`

			`pe := &pipeExtract{`
			`fromField: fromField,`
			`steps: steps,`
			`pattern: pattern,`
			`}`
			`return pe, nil`
			`}`

			`type extractFormat struct {`
			`// steps contains steps for extracting fields from string`
			`steps []extractFormatStep`

			`// matches contains matches for every step in steps`
			`matches []string`

			`// fields contains matches for non-empty fields`
			`fields []extractField`
			`}`

			`type extractField struct {`
			`name string`
			`value *string`
			`}`

			`type extractFormatStep struct {`
			`prefix string`
			`field string`
			`}`

			`func newExtractFormat(steps []extractFormatStep) *extractFormat {`
			`if len(steps) == 0 {`
			`logger.Panicf("BUG: steps cannot be empty")`
			`}`

			`matches := make([]string, len(steps))`

			`var fields []extractField`
			`for i, step := range steps {`
			`if step.field != "" {`
			`fields = append(fields, extractField{`
			`name: step.field,`
			`value: &matches[i],`
			`})`
			`}`
			`}`
			`if len(fields) == 0 {`
			`logger.Panicf("BUG: fields cannot be empty")`
			`}`

			`ef := &extractFormat{`
			`steps: steps,`
			`matches: matches,`
			`fields: fields,`
			`}`
			`return ef`
			`}`

			`func (ef *extractFormat) apply(s string) {`
			`clear(ef.matches)`

			`steps := ef.steps`

			`if prefix := steps[0].prefix; prefix != "" {`
			`n := strings.Index(s, prefix)`
			`if n < 0 {`
			`// Mismatch`
			`return`
			`}`
			`s = s[n+len(prefix):]`
			`}`

			`matches := ef.matches`
			`for i := range steps {`
			`nextPrefix := ""`
			`if i+1 < len(steps) {`
			`nextPrefix = steps[i+1].prefix`
			`}`

			`us, nOffset := tryUnquoteString(s)`
			`if nOffset >= 0 {`
			`// Matched quoted string`
			`matches[i] = us`
			`s = s[nOffset:]`
			`if !strings.HasPrefix(s, nextPrefix) {`
			`// Mismatch`
			`return`
			`}`
			`s = s[len(nextPrefix):]`
			`} else {`
			`// Match unquoted string until the nextPrefix`
			`if nextPrefix == "" {`
			`matches[i] = s`
			`return`
			`}`
			`n := strings.Index(s, nextPrefix)`
			`if n < 0 {`
			`// Mismatch`
			`return`
			`}`
			`matches[i] = s[:n]`
			`s = s[n+len(nextPrefix):]`
			`}`
			`}`
			`}`

			`func tryUnquoteString(s string) (string, int) {`
			`if len(s) == 0 {`
			`return s, -1`
			`}`
			if s[0] != '"' && s[0] != '`' {
			`return s, -1`
			`}`
			`qp, err := strconv.QuotedPrefix(s)`
			`if err != nil {`
			`return s, -1`
			`}`
			`us, err := strconv.Unquote(qp)`
			`if err != nil {`
			`return s, -1`
			`}`
			`return us, len(qp)`
			`}`

			`func parseExtractFormatSteps(s string) ([]extractFormatStep, error) {`
			`var steps []extractFormatStep`

			`hasNamedField := false`

			`n := strings.IndexByte(s, '<')`
			`if n < 0 {`
			`return nil, fmt.Errorf("missing <...> fields")`
			`}`
			`prefix := s[:n]`
			`s = s[n+1:]`
			`for {`
			`n := strings.IndexByte(s, '>')`
			`if n < 0 {`
			`return nil, fmt.Errorf("missing '>' for <%s", s)`
			`}`
			`field := s[:n]`
			`s = s[n+1:]`

			`if field == "_" \|\| field == "*" {`
			`field = ""`
			`}`
			`steps = append(steps, extractFormatStep{`
			`prefix: prefix,`
			`field: field,`
			`})`
			`if !hasNamedField && field != "" {`
			`hasNamedField = true`
			`}`
			`if len(s) == 0 {`
			`break`
			`}`

			`n = strings.IndexByte(s, '<')`
			`if n < 0 {`
			`steps = append(steps, extractFormatStep{`
			`prefix: s,`
			`})`
			`break`
			`}`
			`if n == 0 {`
			`return nil, fmt.Errorf("missing delimiter after <%s>", field)`
			`}`
			`prefix = s[:n]`
			`s = s[n+1:]`
			`}`

			`if !hasNamedField {`
			`return nil, fmt.Errorf("missing named fields like <name>")`
			`}`

			`for i := range steps {`
			`step := &steps[i]`
			`step.prefix = html.UnescapeString(step.prefix)`
			`}`

			`return steps, nil`
			`}`