VictoriaMetrics/lib/logstorage/stats_uniq_values.go

package logstorage

import (
	"fmt"
	"slices"
	"strconv"
	"strings"
	"unsafe"

	"github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil"
)

// statsUniqValues holds the parsed configuration of the `uniq_values(...)` stats function.
type statsUniqValues struct {
	// fields contains the field names passed to uniq_values().
	fields       []string
	// containsStar is set when fields contains "*"; in this case all the columns
	// of the processed block are inspected instead of the listed fields.
	containsStar bool
	// limit is the value of the optional `limit N` suffix. Zero means "no limit".
	limit        uint64
}

// String returns the query-like representation of su: `uniq_values(<fields>)`,
// followed by ` limit N` when a non-zero limit is configured.
func (su *statsUniqValues) String() string {
	result := "uniq_values(" + fieldNamesString(su.fields) + ")"
	if su.limit == 0 {
		// No limit is set, so there is nothing to append.
		return result
	}
	return result + fmt.Sprintf(" limit %d", su.limit)
}

// neededFields returns the field names uniq_values() has been configured with.
//
// The internal slice is returned as is, without copying.
func (su *statsUniqValues) neededFields() []string {
	return su.fields
}

// newStatsProcessor creates a processor with an empty set of unique values for su.
//
// It returns the processor together with the size of the processor struct in bytes.
func (su *statsUniqValues) newStatsProcessor() (statsProcessor, int) {
	sup := new(statsUniqValuesProcessor)
	sup.su = su
	sup.m = make(map[string]struct{})

	stateSize := int(unsafe.Sizeof(*sup))
	return sup, stateSize
}

// statsUniqValuesProcessor accumulates the set of unique non-empty values
// for a single uniq_values() function.
type statsUniqValuesProcessor struct {
	// su is the uniq_values() configuration this processor has been created from.
	su *statsUniqValues

	// m is the set of unique values collected so far.
	m map[string]struct{}
}

// updateStatsForAllRows registers unique values from all the rows in br.
//
// Every column of br is inspected when uniq_values() contains "*";
// otherwise only the columns for the configured fields are inspected.
// Nothing is done if the limit on the number of unique values is already reached.
//
// It returns the sum of the state size increases reported for the inspected columns.
func (sup *statsUniqValuesProcessor) updateStatsForAllRows(br *blockResult) int {
	if sup.limitReached() {
		// Enough unique values have been collected already.
		return 0
	}

	n := 0
	if !sup.su.containsStar {
		for _, name := range sup.su.fields {
			col := br.getColumnByName(name)
			n += sup.updateStatsForAllRowsColumn(col, br)
		}
		return n
	}

	for _, col := range br.getColumns() {
		n += sup.updateStatsForAllRowsColumn(col, br)
	}
	return n
}

// updateStatsForAllRowsColumn registers unique non-empty values from column c of br.
//
// Const columns contribute their single value, dict columns contribute every
// entry of c.dictValues, and all other columns are scanned value by value.
//
// It returns len(value) plus the size of a string header for every newly added value.
func (sup *statsUniqValuesProcessor) updateStatsForAllRowsColumn(c *blockResultColumn, br *blockResult) int {
	seen := sup.m
	added := 0

	switch {
	case c.isConst:
		// The column holds a single value shared by all the rows.
		v := c.encodedValues[0]
		if v != "" {
			if _, exists := seen[v]; !exists {
				key := strings.Clone(v)
				seen[key] = struct{}{}
				added += len(key) + int(unsafe.Sizeof(key))
			}
		}

	case c.valueType == valueTypeDict:
		// It is enough to look at the dictionary entries instead of individual rows.
		for _, v := range c.dictValues {
			if v == "" {
				continue
			}
			if _, exists := seen[v]; exists {
				continue
			}
			key := strings.Clone(v)
			seen[key] = struct{}{}
			added += len(key) + int(unsafe.Sizeof(key))
		}

	default:
		// Slow path: inspect the value at every row.
		values := c.getValues(br)
		for i, v := range values {
			// Empty values are ignored. A value equal to the previous one
			// has been handled on the previous iteration.
			if v == "" || (i > 0 && v == values[i-1]) {
				continue
			}
			if _, exists := seen[v]; exists {
				continue
			}
			key := strings.Clone(v)
			seen[key] = struct{}{}
			added += len(key) + int(unsafe.Sizeof(key))
		}
	}

	return added
}

// updateStatsForRow registers unique values from the row rowIdx in br.
//
// Every column of br is inspected when uniq_values() contains "*";
// otherwise only the columns for the configured fields are inspected.
// Nothing is done if the limit on the number of unique values is already reached.
//
// It returns the sum of the state size increases reported for the inspected columns.
func (sup *statsUniqValuesProcessor) updateStatsForRow(br *blockResult, rowIdx int) int {
	if sup.limitReached() {
		// Enough unique values have been collected already.
		return 0
	}

	n := 0
	if !sup.su.containsStar {
		for _, name := range sup.su.fields {
			col := br.getColumnByName(name)
			n += sup.updateStatsForRowColumn(col, br, rowIdx)
		}
		return n
	}

	for _, col := range br.getColumns() {
		n += sup.updateStatsForRowColumn(col, br, rowIdx)
	}
	return n
}

// updateStatsForRowColumn registers the value of column c at the row rowIdx of br
// if it is non-empty and has not been seen yet.
//
// It returns len(value) plus the size of a string header when the value is newly added,
// and 0 otherwise.
func (sup *statsUniqValuesProcessor) updateStatsForRowColumn(c *blockResultColumn, br *blockResult, rowIdx int) int {
	// Obtain the value at rowIdx according to the column kind.
	var v string
	switch {
	case c.isConst:
		// The column holds a single value shared by all the rows.
		v = c.encodedValues[0]
	case c.valueType == valueTypeDict:
		// The first byte of the encoded value is the index into c.dictValues.
		dictIdx := c.encodedValues[rowIdx][0]
		v = c.dictValues[dictIdx]
	default:
		v = c.getValueAtRow(br, rowIdx)
	}

	if v == "" {
		// Empty values are ignored.
		return 0
	}
	if _, exists := sup.m[v]; exists {
		return 0
	}

	key := strings.Clone(v)
	sup.m[key] = struct{}{}
	return len(key) + int(unsafe.Sizeof(key))
}

// mergeState adds all the unique values collected by sfp into sup.
//
// sfp must be a *statsUniqValuesProcessor. Nothing is merged if sup
// has already reached the limit on the number of unique values.
func (sup *statsUniqValuesProcessor) mergeState(sfp statsProcessor) {
	if sup.limitReached() {
		return
	}

	other := sfp.(*statsUniqValuesProcessor)
	dst := sup.m
	for v := range other.m {
		if _, exists := dst[v]; exists {
			continue
		}
		dst[v] = struct{}{}
	}
}

// finalizeStats returns the collected unique values as an array produced by marshalJSONArray.
//
// The values are sorted with compareValues and then truncated to the configured limit, if any.
// "[]" is returned when no values have been collected.
func (sup *statsUniqValuesProcessor) finalizeStats() string {
	n := len(sup.m)
	if n == 0 {
		return "[]"
	}

	// Map iteration order is random, so collect the values and sort them.
	values := make([]string, 0, n)
	for v := range sup.m {
		values = append(values, v)
	}
	slices.SortFunc(values, compareValues)

	// The set may hold more values than the limit allows; keep the first ones after sorting.
	limit := sup.su.limit
	if limit > 0 && limit < uint64(len(values)) {
		values = values[:limit]
	}

	return marshalJSONArray(values)
}

// limitReached reports whether a limit is configured and the number
// of collected unique values has reached it.
func (sup *statsUniqValuesProcessor) limitReached() bool {
	limit := sup.su.limit
	if limit == 0 {
		// Zero means that the number of unique values is unlimited.
		return false
	}
	return uint64(len(sup.m)) >= limit
}

// marshalJSONArray returns items serialized as `["item1","item2",...]`.
//
// Every item is quoted with strconv.AppendQuote. "[]" is returned for empty items.
//
// NOTE(review): strconv.AppendQuote emits Go string-literal syntax. It may produce
// escape sequences such as \x01 or \a, which are not defined by JSON - confirm that
// consumers of the result can handle them.
func marshalJSONArray(items []string) string {
	if len(items) == 0 {
		return "[]"
	}

	// Pre-allocate the buffer for serialized items: the opening bracket plus,
	// for every item, its bytes, two quotes and either a trailing comma or the closing bracket.
	// This is exact when no item needs escaping. Otherwise the buffer is grown by append.
	bufSize := 1
	for _, item := range items {
		bufSize += len(item) + 3
	}
	b := make([]byte, 0, bufSize)

	b = append(b, '[')
	for i, item := range items {
		if i > 0 {
			b = append(b, ',')
		}
		b = strconv.AppendQuote(b, item)
	}
	b = append(b, ']')

	// b isn't modified after this point, so it is safe to return it as a string without copying.
	return bytesutil.ToUnsafeString(b)
}

// compareValues orders a and b for sorting.
//
// Values accepted by tryParseFloat64 are compared numerically and are placed
// before all the other values. The remaining values are compared as strings.
// It returns -1, 0 or 1 in the same way as strings.Compare.
func compareValues(a, b string) int {
	numA, isNumA := tryParseFloat64(a)
	numB, isNumB := tryParseFloat64(b)

	switch {
	case isNumA && isNumB:
		switch {
		case numA == numB:
			return 0
		case numA < numB:
			return -1
		default:
			return 1
		}
	case isNumA:
		// Only a is numeric, so it goes first.
		return -1
	case isNumB:
		// Only b is numeric, so it goes first.
		return 1
	default:
		return strings.Compare(a, b)
	}
}

// parseStatsUniqValues parses `uniq_values(field1, ..., fieldN) [limit N]` from lex.
//
// The field names are read with parseFieldNamesForStatsFunc. If they are followed
// by the `limit` keyword, then the next token must be parseable by tryParseUint64
// and is stored as the limit.
//
// It returns an error if the field names or the limit value cannot be parsed.
func parseStatsUniqValues(lex *lexer) (*statsUniqValues, error) {
	fields, err := parseFieldNamesForStatsFunc(lex, "uniq_values")
	if err != nil {
		return nil, err
	}
	su := &statsUniqValues{
		fields:       fields,
		containsStar: slices.Contains(fields, "*"),
	}
	if lex.isKeyword("limit") {
		lex.nextToken()
		n, ok := tryParseUint64(lex.token)
		if !ok {
			// There is no underlying error to wrap here: err is always nil at this point,
			// so wrapping it with %w would only append "%!w(<nil>)" to the message.
			return nil, fmt.Errorf("cannot parse 'limit %s' for 'uniq_values'", lex.token)
		}
		lex.nextToken()
		su.limit = n
	}
	return su, nil
}
lib/logstorage: initial implementation of pipes in LogsQL See https://docs.victoriametrics.com/victorialogs/logsql/#pipes 2024-05-12 16:33:29 +02:00			`package logstorage`

			`import (`
			`"fmt"`
			`"slices"`
			`"strconv"`
			`"strings"`
			`"unsafe"`

			`"github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil"`
			`)`

			`type statsUniqValues struct {`
			`fields []string`
			`containsStar bool`
			`limit uint64`
			`}`

			`func (su *statsUniqValues) String() string {`
			`s := "uniq_values(" + fieldNamesString(su.fields) + ")"`
			`if su.limit > 0 {`
			`s += fmt.Sprintf(" limit %d", su.limit)`
			`}`
			`return s`
			`}`

			`func (su *statsUniqValues) neededFields() []string {`
			`return su.fields`
			`}`

			`func (su *statsUniqValues) newStatsProcessor() (statsProcessor, int) {`
			`sup := &statsUniqValuesProcessor{`
			`su: su,`

			`m: make(map[string]struct{}),`
			`}`
			`return sup, int(unsafe.Sizeof(*sup))`
			`}`

			`type statsUniqValuesProcessor struct {`
			`su *statsUniqValues`

			`m map[string]struct{}`
			`}`

			`func (sup statsUniqValuesProcessor) updateStatsForAllRows(br blockResult) int {`
			`if sup.limitReached() {`
			`// Limit on the number of unique values has been reached`
			`return 0`
			`}`

			`stateSizeIncrease := 0`
			`if sup.su.containsStar {`
			`for _, c := range br.getColumns() {`
			`stateSizeIncrease += sup.updateStatsForAllRowsColumn(c, br)`
			`}`
			`} else {`
			`for _, field := range sup.su.fields {`
			`c := br.getColumnByName(field)`
			`stateSizeIncrease += sup.updateStatsForAllRowsColumn(c, br)`
			`}`
			`}`
			`return stateSizeIncrease`
			`}`

			`func (sup statsUniqValuesProcessor) updateStatsForAllRowsColumn(c blockResultColumn, br *blockResult) int {`
			`m := sup.m`
			`stateSizeIncrease := 0`
			`if c.isConst {`
			`// collect unique const values`
			`v := c.encodedValues[0]`
			`if v == "" {`
			`// skip empty values`
			`return stateSizeIncrease`
			`}`
			`if _, ok := m[v]; !ok {`
			`vCopy := strings.Clone(v)`
			`m[vCopy] = struct{}{}`
			`stateSizeIncrease += len(vCopy) + int(unsafe.Sizeof(vCopy))`
			`}`
			`return stateSizeIncrease`
			`}`
			`if c.valueType == valueTypeDict {`
			`// collect unique non-zero c.dictValues`
			`for _, v := range c.dictValues {`
			`if v == "" {`
			`// skip empty values`
			`continue`
			`}`
			`if _, ok := m[v]; !ok {`
			`vCopy := strings.Clone(v)`
			`m[vCopy] = struct{}{}`
			`stateSizeIncrease += len(vCopy) + int(unsafe.Sizeof(vCopy))`
			`}`
			`}`
			`return stateSizeIncrease`
			`}`

			`// slow path - collect unique values across all rows`
			`values := c.getValues(br)`
			`for i, v := range values {`
			`if v == "" {`
			`// skip empty values`
			`continue`
			`}`
			`if i > 0 && values[i-1] == v {`
			`// This value has been already counted.`
			`continue`
			`}`
			`if _, ok := m[v]; !ok {`
			`vCopy := strings.Clone(v)`
			`m[vCopy] = struct{}{}`
			`stateSizeIncrease += len(vCopy) + int(unsafe.Sizeof(vCopy))`
			`}`
			`}`
			`return stateSizeIncrease`
			`}`

			`func (sup statsUniqValuesProcessor) updateStatsForRow(br blockResult, rowIdx int) int {`
			`if sup.limitReached() {`
			`// Limit on the number of unique values has been reached`
			`return 0`
			`}`

			`stateSizeIncrease := 0`
			`if sup.su.containsStar {`
			`for _, c := range br.getColumns() {`
			`stateSizeIncrease += sup.updateStatsForRowColumn(c, br, rowIdx)`
			`}`
			`} else {`
			`for _, field := range sup.su.fields {`
			`c := br.getColumnByName(field)`
			`stateSizeIncrease += sup.updateStatsForRowColumn(c, br, rowIdx)`
			`}`
			`}`
			`return stateSizeIncrease`
			`}`

			`func (sup statsUniqValuesProcessor) updateStatsForRowColumn(c blockResultColumn, br *blockResult, rowIdx int) int {`
			`m := sup.m`
			`stateSizeIncrease := 0`
			`if c.isConst {`
			`// collect unique const values`
			`v := c.encodedValues[0]`
			`if v == "" {`
			`// skip empty values`
			`return stateSizeIncrease`
			`}`
			`if _, ok := m[v]; !ok {`
			`vCopy := strings.Clone(v)`
			`m[vCopy] = struct{}{}`
			`stateSizeIncrease += len(vCopy) + int(unsafe.Sizeof(vCopy))`
			`}`
			`return stateSizeIncrease`
			`}`
			`if c.valueType == valueTypeDict {`
			`// collect unique non-zero c.dictValues`
			`dictIdx := c.encodedValues[rowIdx][0]`
			`v := c.dictValues[dictIdx]`
			`if v == "" {`
			`// skip empty values`
			`return stateSizeIncrease`
			`}`
			`if _, ok := m[v]; !ok {`
			`vCopy := strings.Clone(v)`
			`m[vCopy] = struct{}{}`
			`stateSizeIncrease += len(vCopy) + int(unsafe.Sizeof(vCopy))`
			`}`
			`return stateSizeIncrease`
			`}`

			`// collect unique values for the given rowIdx.`
			`v := c.getValueAtRow(br, rowIdx)`
			`if v == "" {`
			`// skip empty values`
			`return stateSizeIncrease`
			`}`
			`if _, ok := m[v]; !ok {`
			`vCopy := strings.Clone(v)`
			`m[vCopy] = struct{}{}`
			`stateSizeIncrease += len(vCopy) + int(unsafe.Sizeof(vCopy))`
			`}`
			`return stateSizeIncrease`
			`}`

			`func (sup *statsUniqValuesProcessor) mergeState(sfp statsProcessor) {`
			`if sup.limitReached() {`
			`return`
			`}`

			`src := sfp.(*statsUniqValuesProcessor)`
			`m := sup.m`
			`for k := range src.m {`
			`if _, ok := m[k]; !ok {`
			`m[k] = struct{}{}`
			`}`
			`}`
			`}`

			`func (sup *statsUniqValuesProcessor) finalizeStats() string {`
			`if len(sup.m) == 0 {`
			`return "[]"`
			`}`

			`// Sort unique items`
			`items := make([]string, 0, len(sup.m))`
			`for k := range sup.m {`
			`items = append(items, k)`
			`}`
			`slices.SortFunc(items, compareValues)`

			`if limit := sup.su.limit; limit > 0 && uint64(len(items)) > limit {`
			`items = items[:limit]`
			`}`

			`return marshalJSONArray(items)`
			`}`

			`func (sup *statsUniqValuesProcessor) limitReached() bool {`
			`limit := sup.su.limit`
			`return limit > 0 && uint64(len(sup.m)) >= limit`
			`}`

			`func marshalJSONArray(items []string) string {`
			`// Pre-allocate buffer for serialized items.`
			`// Assume that there is no need in quoting items. Otherwise additional reallocations`
			`// for the allocated buffer are possible.`
			`bufSize := len(items) + 1`
			`for _, item := range items {`
			`bufSize += len(item)`
			`}`
			`b := make([]byte, 0, bufSize)`

			`b = append(b, '[')`
			`b = strconv.AppendQuote(b, items[0])`
			`for _, item := range items[1:] {`
			`b = append(b, ',')`
			`b = strconv.AppendQuote(b, item)`
			`}`
			`b = append(b, ']')`

			`return bytesutil.ToUnsafeString(b)`
			`}`

			`func compareValues(a, b string) int {`
			`fA, okA := tryParseFloat64(a)`
			`fB, okB := tryParseFloat64(b)`
			`if okA && okB {`
			`if fA == fB {`
			`return 0`
			`}`
			`if fA < fB {`
			`return -1`
			`}`
			`return 1`
			`}`
			`if okA {`
			`return -1`
			`}`
			`if okB {`
			`return 1`
			`}`
			`return strings.Compare(a, b)`
			`}`

			`func parseStatsUniqValues(lex lexer) (statsUniqValues, error) {`
			`fields, err := parseFieldNamesForStatsFunc(lex, "uniq_values")`
			`if err != nil {`
			`return nil, err`
			`}`
			`su := &statsUniqValues{`
			`fields: fields,`
			`containsStar: slices.Contains(fields, "*"),`
			`}`
			`if lex.isKeyword("limit") {`
			`lex.nextToken()`
			`n, ok := tryParseUint64(lex.token)`
			`if !ok {`
			`return nil, fmt.Errorf("cannot parse 'limit %s' for 'uniq_values': %w", lex.token, err)`
			`}`
			`lex.nextToken()`
			`su.limit = n`
			`}`
			`return su, nil`
			`}`