VictoriaMetrics/lib/logstorage/pipe_uniq.go

package logstorage

import (
	"fmt"
	"slices"
	"sync/atomic"
	"unsafe"

	"github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil"
	"github.com/VictoriaMetrics/VictoriaMetrics/lib/encoding"
	"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
	"github.com/VictoriaMetrics/VictoriaMetrics/lib/memory"
)

// pipeUniq processes '| uniq ...' queries.
//
// See https://docs.victoriametrics.com/victorialogs/logsql/#uniq-pipe
type pipeUniq struct {
	// byFields contains field names for returning unique values.
	//
	// If it is empty, then unique rows are calculated over all the fields.
	byFields []string

	// limit is the maximum number of unique rows to return.
	//
	// Zero means that the number of returned rows isn't limited.
	limit uint64
}

// String returns the LogsQL text representation of pu.
//
// The 'by (...)' clause is emitted only for non-empty byFields,
// and the 'limit N' clause is emitted only for non-zero limit.
func (pu *pipeUniq) String() string {
	var byClause, limitClause string
	if len(pu.byFields) != 0 {
		byClause = " by (" + fieldNamesString(pu.byFields) + ")"
	}
	if pu.limit != 0 {
		limitClause = fmt.Sprintf(" limit %d", pu.limit)
	}
	return "uniq" + byClause + limitClause
}

// updateNeededFields resets both sets and then registers the fields read by pu in neededFields.
//
// All the fields ("*") are registered when byFields is empty;
// otherwise only the fields listed in byFields are registered.
func (pu *pipeUniq) updateNeededFields(neededFields, unneededFields fieldsSet) {
	neededFields.reset()
	unneededFields.reset()

	if len(pu.byFields) > 0 {
		neededFields.addAll(pu.byFields)
		return
	}
	neededFields.add("*")
}

// newPipeProcessor returns a pipeProcessor, which collects unique rows in workersCount shards.
//
// stopCh, cancel and ppBase are stored in the returned processor.
func (pu *pipeUniq) newPipeProcessor(workersCount int, stopCh <-chan struct{}, cancel func(), ppBase pipeProcessor) pipeProcessor {
	// The state is allowed to occupy up to 20% of the allowed memory.
	globalBudget := int64(float64(memory.Allowed()) * 0.2)

	shards := make([]pipeUniqProcessorShard, workersCount)
	for i := range shards {
		shards[i].pu = pu
		shards[i].m = make(map[string]struct{})

		// Every shard starts with a single chunk, which is subtracted from the global budget.
		shards[i].stateSizeBudget = stateSizeBudgetChunk
		globalBudget -= stateSizeBudgetChunk
	}

	pup := &pipeUniqProcessor{
		pu:           pu,
		stopCh:       stopCh,
		cancel:       cancel,
		ppBase:       ppBase,
		shards:       shards,
		maxStateSize: globalBudget,
	}
	pup.stateSizeBudget.Store(globalBudget)

	return pup
}

// pipeUniqProcessor is the pipeProcessor returned by pipeUniq.newPipeProcessor.
//
// It collects unique rows in per-worker shards inside writeBlock
// and writes the merged result to ppBase inside flush.
type pipeUniqProcessor struct {
	// pu points to the pipeUniq, which created the processor.
	pu     *pipeUniq
	// stopCh is checked in flush; flush returns early when it is closed.
	stopCh <-chan struct{}
	// cancel is called when the state size budget or the per-shard rows limit is exceeded.
	cancel func()
	// ppBase receives the resulting blocks with unique rows.
	ppBase pipeProcessor

	// shards holds per-worker state; it is indexed by workerID in writeBlock.
	shards []pipeUniqProcessorShard

	// maxStateSize is the initial value of stateSizeBudget. It is used in the error message at flush.
	maxStateSize    int64
	// stateSizeBudget is the remaining global budget for the state size.
	// Shards take it in stateSizeBudgetChunk steps.
	stateSizeBudget atomic.Int64
}

// pipeUniqProcessorShard is pipeUniqProcessorShardNopad padded to a multiple of 128 bytes.
type pipeUniqProcessorShard struct {
	pipeUniqProcessorShardNopad

	// The padding prevents false sharing on widespread platforms with 128 mod (cache line size) = 0.
	_ [128 - unsafe.Sizeof(pipeUniqProcessorShardNopad{})%128]byte
}

// pipeUniqProcessorShardNopad holds the per-worker state of pipeUniqProcessor without the trailing padding.
type pipeUniqProcessorShardNopad struct {
	// pu points to the parent pipeUniq.
	pu *pipeUniq

	// m holds unique rows.
	//
	// Every key is a sequence of marshaled values (preceded by field names when pu.byFields is empty).
	m map[string]struct{}

	// keyBuf is a temporary buffer for building keys for m.
	keyBuf []byte

	// columnValues is a temporary buffer for the processed column values.
	columnValues [][]string

	// stateSizeBudget is the remaining budget for the whole state size for the shard.
	// The per-shard budget is provided in chunks from the parent pipeUniqProcessor.
	stateSizeBudget int
}

// writeBlock registers unique rows from br in the shard state.
//
// It returns false without processing br if the shard already holds
// at least pu.limit unique rows (for non-zero pu.limit). Otherwise it returns true.
//
// The shard state size budget is decreased for every newly registered row.
func (shard *pipeUniqProcessorShard) writeBlock(br *blockResult) bool {
	pu := shard.pu
	if pu.limit > 0 && uint64(len(shard.m)) >= pu.limit {
		return false
	}

	uniqRows := shard.m
	buf := shard.keyBuf

	if len(pu.byFields) == 0 {
		// The key consists of (name, value) pairs for all the columns in br.
		columns := br.getColumns()
		for rowIdx := range br.timestamps {
			buf = buf[:0]
			for _, c := range columns {
				value := c.getValueAtRow(br, rowIdx)
				buf = encoding.MarshalBytes(buf, bytesutil.ToUnsafeBytes(c.name))
				buf = encoding.MarshalBytes(buf, bytesutil.ToUnsafeBytes(value))
			}
			if _, exists := uniqRows[string(buf)]; exists {
				continue
			}
			uniqRows[string(buf)] = struct{}{}
			shard.stateSizeBudget -= len(buf) + int(unsafe.Sizeof(""))
		}
		shard.keyBuf = buf
		return true
	}

	// The key consists of values for the columns listed in byFields.
	valuesPerColumn := shard.columnValues[:0]
	for _, fieldName := range pu.byFields {
		column := br.getColumnByName(fieldName)
		valuesPerColumn = append(valuesPerColumn, column.getValues(br))
	}
	shard.columnValues = valuesPerColumn

	for rowIdx := range br.timestamps {
		// A row, which repeats the previous row in all the selected columns,
		// produces the same key, so it is skipped without building the key.
		repeatsPrevRow := rowIdx > 0
		if repeatsPrevRow {
			for _, values := range valuesPerColumn {
				if values[rowIdx-1] != values[rowIdx] {
					repeatsPrevRow = false
					break
				}
			}
		}
		if repeatsPrevRow {
			continue
		}

		buf = buf[:0]
		for _, values := range valuesPerColumn {
			buf = encoding.MarshalBytes(buf, bytesutil.ToUnsafeBytes(values[rowIdx]))
		}
		if _, exists := uniqRows[string(buf)]; exists {
			continue
		}
		uniqRows[string(buf)] = struct{}{}
		shard.stateSizeBudget -= len(buf) + int(unsafe.Sizeof(""))
	}
	shard.keyBuf = buf

	return true
}

// writeBlock passes br to the shard with the given workerID.
//
// Empty blocks are ignored. If the shard budget is negative, it is refilled in chunks
// from the global budget first. The block is dropped when the global budget is exhausted.
// cancel is called when the global budget becomes exhausted or when the shard refuses the block.
func (pup *pipeUniqProcessor) writeBlock(workerID uint, br *blockResult) {
	if len(br.timestamps) == 0 {
		return
	}

	shard := &pup.shards[workerID]

	for shard.stateSizeBudget < 0 {
		// Try moving a chunk from the global budget to the shard.
		left := pup.stateSizeBudget.Add(-stateSizeBudgetChunk)
		if left >= 0 {
			shard.stateSizeBudget += stateSizeBudgetChunk
			continue
		}

		// The state size is too big. Drop the block in order to avoid OOM crash.
		if left+stateSizeBudgetChunk >= 0 {
			// The global budget was non-negative before this call, so this is the first caller
			// to exhaust it. Notify worker goroutines to stop calling writeBlock() in order to save CPU time.
			pup.cancel()
		}
		return
	}

	if shard.writeBlock(br) {
		return
	}
	pup.cancel()
}

// flush merges unique rows from all the shards and writes them to ppBase.
//
// It returns an error if the global state size budget is exhausted.
// It returns nil without finishing the work as soon as stopCh is closed.
func (pup *pipeUniqProcessor) flush() error {
	if pup.stateSizeBudget.Load() <= 0 {
		return fmt.Errorf("cannot calculate [%s], since it requires more than %dMB of memory", pup.pu.String(), pup.maxStateSize/(1<<20))
	}

	// isStopped reports whether stopCh is already closed, without blocking.
	isStopped := func() bool {
		select {
		case <-pup.stopCh:
			return true
		default:
			return false
		}
	}

	// Merge the state from the remaining shards into the first shard.
	merged := pup.shards[0].m
	for _, shard := range pup.shards[1:] {
		if isStopped() {
			return nil
		}
		for key := range shard.m {
			merged[key] = struct{}{}
		}
	}

	// Decode every key back into row fields and write the row.
	wctx := &pipeUniqWriteContext{
		pup: pup,
	}
	byFields := pup.pu.byFields
	keysContainNames := len(byFields) == 0
	var fields []Field

	for key := range merged {
		if isStopped() {
			return nil
		}

		fields = fields[:0]
		data := bytesutil.ToUnsafeBytes(key)
		for len(data) > 0 {
			if keysContainNames {
				// The key is a sequence of (name, value) pairs.
				afterName, name, err := encoding.UnmarshalBytes(data)
				if err != nil {
					logger.Panicf("BUG: cannot unmarshal field name: %s", err)
				}
				data = afterName

				afterValue, value, err := encoding.UnmarshalBytes(data)
				if err != nil {
					logger.Panicf("BUG: cannot unmarshal field value: %s", err)
				}
				data = afterValue

				fields = append(fields, Field{
					Name:  bytesutil.ToUnsafeString(name),
					Value: bytesutil.ToUnsafeString(value),
				})
				continue
			}

			// The key is a sequence of values in the order of byFields.
			afterValue, value, err := encoding.UnmarshalBytes(data)
			if err != nil {
				logger.Panicf("BUG: cannot unmarshal field value: %s", err)
			}
			data = afterValue

			fields = append(fields, Field{
				Name:  byFields[len(fields)],
				Value: bytesutil.ToUnsafeString(value),
			})
		}
		wctx.writeRow(fields)
	}

	wctx.flush()

	return nil
}

// pipeUniqWriteContext accumulates resulting rows into blocks and sends them to pup.ppBase.
type pipeUniqWriteContext struct {
	// pup points to the parent pipeUniqProcessor.
	pup *pipeUniqProcessor
	// rcs holds the columns of the block under construction.
	rcs []resultColumn
	// br is the block, which is filled from rcs and passed to pup.ppBase on flush.
	br  blockResult

	// rowsWritten is the number of rows accepted by writeRow. It is compared to pup.pu.limit.
	rowsWritten uint64

	// valuesLen is the total length of values added to rcs since the last flush.
	valuesLen int
}

// writeRow adds the row consisting of rowFields to the block under construction.
//
// The row is dropped if the configured non-zero limit of rows is already reached.
// The pending block is flushed when the set of column names changes
// or when the total length of the pending values reaches 1_000_000 bytes.
func (wctx *pipeUniqWriteContext) writeRow(rowFields []Field) {
	limit := wctx.pup.pu.limit
	if limit > 0 && wctx.rowsWritten >= limit {
		return
	}
	wctx.rowsWritten++

	columns := wctx.rcs

	// Check whether the pending columns have the same names as rowFields, in the same order.
	sameColumns := len(columns) == len(rowFields)
	for i := 0; sameColumns && i < len(rowFields); i++ {
		sameColumns = columns[i].name == rowFields[i].Name
	}
	if !sameColumns {
		// Send the current block to ppBase and construct a block with the new set of columns.
		wctx.flush()

		columns = wctx.rcs[:0]
		for i := range rowFields {
			columns = append(columns, resultColumn{
				name: rowFields[i].Name,
			})
		}
		wctx.rcs = columns
	}

	for i := range rowFields {
		value := rowFields[i].Value
		columns[i].addValue(value)
		wctx.valuesLen += len(value)
	}
	if wctx.valuesLen >= 1_000_000 {
		wctx.flush()
	}
}

// flush sends the pending columns to pup.ppBase as a single block
// and resets them while keeping their names.
//
// It only resets valuesLen when there are no pending columns.
func (wctx *pipeUniqWriteContext) flush() {
	wctx.valuesLen = 0

	columns := wctx.rcs
	if len(columns) == 0 {
		return
	}

	// Hand over the columns to ppBase via the reusable block.
	block := &wctx.br
	block.setResultColumns(columns)
	wctx.pup.ppBase.writeBlock(0, block)
	block.reset()

	for i := range columns {
		columns[i].resetKeepName()
	}
}

// parsePipeUniq parses 'uniq [by (...)] [limit N]' from lex.
//
// The 'by' list containing "*" is treated as an empty list.
func parsePipeUniq(lex *lexer) (*pipeUniq, error) {
	if !lex.isKeyword("uniq") {
		return nil, fmt.Errorf("expecting 'uniq'; got %q", lex.token)
	}
	lex.nextToken()

	pu := &pipeUniq{}

	if lex.isKeyword("by") {
		lex.nextToken()
		fields, err := parseFieldNamesInParens(lex)
		if err != nil {
			return nil, fmt.Errorf("cannot parse 'by' clause: %w", err)
		}
		// Leave byFields empty when "*" is present in the list.
		if !slices.Contains(fields, "*") {
			pu.byFields = fields
		}
	}

	if lex.isKeyword("limit") {
		lex.nextToken()
		limit, ok := tryParseUint64(lex.token)
		if !ok {
			return nil, fmt.Errorf("cannot parse 'limit %s'", lex.token)
		}
		lex.nextToken()
		pu.limit = limit
	}

	return pu, nil
}
lib/logstorage: initial implementation of pipes in LogsQL See https://docs.victoriametrics.com/victorialogs/logsql/#pipes 2024-05-12 16:33:29 +02:00			`package logstorage`

			`import (`
			`"fmt"`
			`"slices"`
			`"sync/atomic"`
			`"unsafe"`

			`"github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil"`
			`"github.com/VictoriaMetrics/VictoriaMetrics/lib/encoding"`
			`"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"`
			`"github.com/VictoriaMetrics/VictoriaMetrics/lib/memory"`
			`)`

			`// pipeUniq processes '\| uniq ...' queries.`
			`//`
			`// See https://docs.victoriametrics.com/victorialogs/logsql/#uniq-pipe`
			`type pipeUniq struct {`
			`// fields contains field names for returning unique values`
			`byFields []string`

			`limit uint64`
			`}`

			`func (pu *pipeUniq) String() string {`
			`s := "uniq"`
			`if len(pu.byFields) > 0 {`
			`s += " by (" + fieldNamesString(pu.byFields) + ")"`
			`}`
			`if pu.limit > 0 {`
			`s += fmt.Sprintf(" limit %d", pu.limit)`
			`}`
			`return s`
			`}`

			`func (pu *pipeUniq) updateNeededFields(neededFields, unneededFields fieldsSet) {`
			`neededFields.reset()`
			`unneededFields.reset()`

			`if len(pu.byFields) == 0 {`
			`neededFields.add("*")`
			`} else {`
			`neededFields.addAll(pu.byFields)`
			`}`
			`}`

			`func (pu *pipeUniq) newPipeProcessor(workersCount int, stopCh <-chan struct{}, cancel func(), ppBase pipeProcessor) pipeProcessor {`
			`maxStateSize := int64(float64(memory.Allowed()) * 0.2)`

			`shards := make([]pipeUniqProcessorShard, workersCount)`
			`for i := range shards {`
			`shard := &shards[i]`
			`shard.pu = pu`
			`shard.m = make(map[string]struct{})`
			`shard.stateSizeBudget = stateSizeBudgetChunk`
			`maxStateSize -= stateSizeBudgetChunk`
			`}`

			`pup := &pipeUniqProcessor{`
			`pu: pu,`
			`stopCh: stopCh,`
			`cancel: cancel,`
			`ppBase: ppBase,`

			`shards: shards,`

			`maxStateSize: maxStateSize,`
			`}`
			`pup.stateSizeBudget.Store(maxStateSize)`

			`return pup`
			`}`

			`type pipeUniqProcessor struct {`
			`pu *pipeUniq`
			`stopCh <-chan struct{}`
			`cancel func()`
			`ppBase pipeProcessor`

			`shards []pipeUniqProcessorShard`

			`maxStateSize int64`
			`stateSizeBudget atomic.Int64`
			`}`

			`type pipeUniqProcessorShard struct {`
			`pipeUniqProcessorShardNopad`

			`// The padding prevents false sharing on widespread platforms with 128 mod (cache line size) = 0 .`
			`_ [128 - unsafe.Sizeof(pipeUniqProcessorShardNopad{})%128]byte`
			`}`

			`type pipeUniqProcessorShardNopad struct {`
			`// pu points to the parent pipeUniq.`
			`pu *pipeUniq`

			`// m holds unique rows.`
			`m map[string]struct{}`

			`// keyBuf is a temporary buffer for building keys for m.`
			`keyBuf []byte`

			`// columnValues is a temporary buffer for the processed column values.`
			`columnValues [][]string`

			`// stateSizeBudget is the remaining budget for the whole state size for the shard.`
			`// The per-shard budget is provided in chunks from the parent pipeUniqProcessor.`
			`stateSizeBudget int`
			`}`

			`// writeBlock writes br to shard.`
			`//`
			`// It returns false if the block cannot be written because of the exceeded limit.`
			`func (shard pipeUniqProcessorShard) writeBlock(br blockResult) bool {`
			`if limit := shard.pu.limit; limit > 0 && uint64(len(shard.m)) >= limit {`
			`return false`
			`}`

			`m := shard.m`
			`byFields := shard.pu.byFields`
			`if len(byFields) == 0 {`
			`// Take into account all the columns in br.`
			`keyBuf := shard.keyBuf`
			`cs := br.getColumns()`
			`for i := range br.timestamps {`
			`keyBuf = keyBuf[:0]`
			`for _, c := range cs {`
			`v := c.getValueAtRow(br, i)`
			`keyBuf = encoding.MarshalBytes(keyBuf, bytesutil.ToUnsafeBytes(c.name))`
			`keyBuf = encoding.MarshalBytes(keyBuf, bytesutil.ToUnsafeBytes(v))`
			`}`
			`if _, ok := m[string(keyBuf)]; !ok {`
			`m[string(keyBuf)] = struct{}{}`
			`shard.stateSizeBudget -= len(keyBuf) + int(unsafe.Sizeof(""))`
			`}`
			`}`
			`shard.keyBuf = keyBuf`
			`return true`
			`}`

			`// Take into account only the selected columns.`
			`columnValues := shard.columnValues[:0]`
			`for _, f := range byFields {`
			`c := br.getColumnByName(f)`
			`columnValues = append(columnValues, c.getValues(br))`
			`}`
			`shard.columnValues = columnValues`

			`keyBuf := shard.keyBuf`
			`for i := range br.timestamps {`
			`seenValue := true`
			`for _, values := range columnValues {`
			`if i == 0 \|\| values[i-1] != values[i] {`
			`seenValue = false`
			`break`
			`}`
			`}`
			`if seenValue {`
			`continue`
			`}`

			`keyBuf = keyBuf[:0]`
			`for _, values := range columnValues {`
			`keyBuf = encoding.MarshalBytes(keyBuf, bytesutil.ToUnsafeBytes(values[i]))`
			`}`
			`if _, ok := m[string(keyBuf)]; !ok {`
			`m[string(keyBuf)] = struct{}{}`
			`shard.stateSizeBudget -= len(keyBuf) + int(unsafe.Sizeof(""))`
			`}`
			`}`
			`shard.keyBuf = keyBuf`

			`return true`
			`}`

			`func (pup pipeUniqProcessor) writeBlock(workerID uint, br blockResult) {`
			`if len(br.timestamps) == 0 {`
			`return`
			`}`

			`shard := &pup.shards[workerID]`

			`for shard.stateSizeBudget < 0 {`
			`// steal some budget for the state size from the global budget.`
			`remaining := pup.stateSizeBudget.Add(-stateSizeBudgetChunk)`
			`if remaining < 0 {`
			`// The state size is too big. Stop processing data in order to avoid OOM crash.`
			`if remaining+stateSizeBudgetChunk >= 0 {`
			`// Notify worker goroutines to stop calling writeBlock() in order to save CPU time.`
			`pup.cancel()`
			`}`
			`return`
			`}`
			`shard.stateSizeBudget += stateSizeBudgetChunk`
			`}`

			`if !shard.writeBlock(br) {`
			`pup.cancel()`
			`}`
			`}`

			`func (pup *pipeUniqProcessor) flush() error {`
			`if n := pup.stateSizeBudget.Load(); n <= 0 {`
			`return fmt.Errorf("cannot calculate [%s], since it requires more than %dMB of memory", pup.pu.String(), pup.maxStateSize/(1<<20))`
			`}`

			`// merge state across shards`
			`shards := pup.shards`
			`m := shards[0].m`
			`shards = shards[1:]`
			`for i := range shards {`
			`select {`
			`case <-pup.stopCh:`
			`return nil`
			`default:`
			`}`

			`for k := range shards[i].m {`
			`m[k] = struct{}{}`
			`}`
			`}`

			`// write result`
			`wctx := &pipeUniqWriteContext{`
			`pup: pup,`
			`}`
			`byFields := pup.pu.byFields`
			`var rowFields []Field`

			`if len(byFields) == 0 {`
			`for k := range m {`
			`select {`
			`case <-pup.stopCh:`
			`return nil`
			`default:`
			`}`

			`rowFields = rowFields[:0]`
			`keyBuf := bytesutil.ToUnsafeBytes(k)`
			`for len(keyBuf) > 0 {`
			`tail, name, err := encoding.UnmarshalBytes(keyBuf)`
			`if err != nil {`
			`logger.Panicf("BUG: cannot unmarshal field name: %s", err)`
			`}`
			`keyBuf = tail`

			`tail, value, err := encoding.UnmarshalBytes(keyBuf)`
			`if err != nil {`
			`logger.Panicf("BUG: cannot unmarshal field value: %s", err)`
			`}`
			`keyBuf = tail`

			`rowFields = append(rowFields, Field{`
			`Name: bytesutil.ToUnsafeString(name),`
			`Value: bytesutil.ToUnsafeString(value),`
			`})`
			`}`
			`wctx.writeRow(rowFields)`
			`}`
			`} else {`
			`for k := range m {`
			`select {`
			`case <-pup.stopCh:`
			`return nil`
			`default:`
			`}`

			`rowFields = rowFields[:0]`
			`keyBuf := bytesutil.ToUnsafeBytes(k)`
			`fieldIdx := 0`
			`for len(keyBuf) > 0 {`
			`tail, value, err := encoding.UnmarshalBytes(keyBuf)`
			`if err != nil {`
			`logger.Panicf("BUG: cannot unmarshal field value: %s", err)`
			`}`
			`keyBuf = tail`

			`rowFields = append(rowFields, Field{`
			`Name: byFields[fieldIdx],`
			`Value: bytesutil.ToUnsafeString(value),`
			`})`
			`fieldIdx++`
			`}`
			`wctx.writeRow(rowFields)`
			`}`
			`}`

			`wctx.flush()`

			`return nil`
			`}`

			`type pipeUniqWriteContext struct {`
			`pup *pipeUniqProcessor`
			`rcs []resultColumn`
			`br blockResult`

			`rowsWritten uint64`

			`valuesLen int`
			`}`

			`func (wctx *pipeUniqWriteContext) writeRow(rowFields []Field) {`
			`if limit := wctx.pup.pu.limit; limit > 0 && wctx.rowsWritten >= limit {`
			`return`
			`}`
			`wctx.rowsWritten++`

			`rcs := wctx.rcs`

			`areEqualColumns := len(rcs) == len(rowFields)`
			`if areEqualColumns {`
			`for i, f := range rowFields {`
			`if rcs[i].name != f.Name {`
			`areEqualColumns = false`
			`break`
			`}`
			`}`
			`}`
			`if !areEqualColumns {`
			`// send the current block to bbBase and construct a block with new set of columns`
			`wctx.flush()`

			`rcs = wctx.rcs[:0]`
			`for _, f := range rowFields {`
			`rcs = append(rcs, resultColumn{`
			`name: f.Name,`
			`})`
			`}`
			`wctx.rcs = rcs`
			`}`

			`for i, f := range rowFields {`
			`v := f.Value`
			`rcs[i].addValue(v)`
			`wctx.valuesLen += len(v)`
			`}`
			`if wctx.valuesLen >= 1_000_000 {`
			`wctx.flush()`
			`}`
			`}`

			`func (wctx *pipeUniqWriteContext) flush() {`
			`rcs := wctx.rcs`
			`br := &wctx.br`

			`wctx.valuesLen = 0`

			`if len(rcs) == 0 {`
			`return`
			`}`

			`// Flush rcs to ppBase`
			`br.setResultColumns(rcs)`
			`wctx.pup.ppBase.writeBlock(0, br)`
			`br.reset()`
			`for i := range rcs {`
			`rcs[i].resetKeepName()`
			`}`
			`}`

			`func parsePipeUniq(lex lexer) (pipeUniq, error) {`
			`if !lex.isKeyword("uniq") {`
			`return nil, fmt.Errorf("expecting 'uniq'; got %q", lex.token)`
			`}`
			`lex.nextToken()`

			`var pu pipeUniq`
			`if lex.isKeyword("by") {`
			`lex.nextToken()`
			`bfs, err := parseFieldNamesInParens(lex)`
			`if err != nil {`
			`return nil, fmt.Errorf("cannot parse 'by' clause: %w", err)`
			`}`
			`if slices.Contains(bfs, "*") {`
			`bfs = nil`
			`}`
			`pu.byFields = bfs`
			`}`

			`if lex.isKeyword("limit") {`
			`lex.nextToken()`
			`n, ok := tryParseUint64(lex.token)`
			`if !ok {`
			`return nil, fmt.Errorf("cannot parse 'limit %s'", lex.token)`
			`}`
			`lex.nextToken()`
			`pu.limit = n`
			`}`

			`return &pu, nil`
			`}`