package logstorage import ( "container/heap" "fmt" "math" "slices" "sort" "strings" "sync" "sync/atomic" "unsafe" "github.com/VictoriaMetrics/VictoriaMetrics/lib/contextutil" "github.com/VictoriaMetrics/VictoriaMetrics/lib/logger" "github.com/VictoriaMetrics/VictoriaMetrics/lib/memory" ) // pipeStreamContext processes '| stream_context ...' queries. // // See https://docs.victoriametrics.com/victorialogs/logsql/#stream_context-pipe type pipeStreamContext struct { // linesBefore is the number of lines to return before the matching line linesBefore int // linesAfter is the number of lines to return after the matching line linesAfter int } func (pc *pipeStreamContext) String() string { s := "stream_context" if pc.linesBefore > 0 { s += fmt.Sprintf(" before %d", pc.linesBefore) } if pc.linesAfter > 0 { s += fmt.Sprintf(" after %d", pc.linesAfter) } return s } func (pc *pipeStreamContext) canLiveTail() bool { return false } var neededFieldsForStreamContext = []string{ "_time", "_stream_id", } func (pc *pipeStreamContext) updateNeededFields(neededFields, unneededFields fieldsSet) { neededFields.addFields(neededFieldsForStreamContext) unneededFields.removeFields(neededFieldsForStreamContext) } func (pc *pipeStreamContext) optimize() { // nothing to do } func (pc *pipeStreamContext) hasFilterInWithQuery() bool { return false } func (pc *pipeStreamContext) initFilterInValues(_ map[string][]string, _ getFieldValuesFunc) (pipe, error) { return pc, nil } func (pc *pipeStreamContext) newPipeProcessor(workersCount int, stopCh <-chan struct{}, cancel func(), ppNext pipeProcessor) pipeProcessor { maxStateSize := int64(float64(memory.Allowed()) * 0.2) shards := make([]pipeStreamContextProcessorShard, workersCount) for i := range shards { shards[i] = pipeStreamContextProcessorShard{ pipeStreamContextProcessorShardNopad: pipeStreamContextProcessorShardNopad{ pc: pc, stateSizeBudget: stateSizeBudgetChunk, }, } maxStateSize -= stateSizeBudgetChunk } pcp := &pipeStreamContextProcessor{ pc: pc, stopCh: stopCh, cancel: cancel, ppNext: ppNext, shards: shards, maxStateSize: maxStateSize, } pcp.stateSizeBudget.Store(maxStateSize) return pcp } type pipeStreamContextProcessor struct { pc *pipeStreamContext stopCh <-chan struct{} cancel func() ppNext pipeProcessor s *Storage neededColumnNames []string unneededColumnNames []string shards []pipeStreamContextProcessorShard maxStateSize int64 stateSizeBudget atomic.Int64 } func (pcp *pipeStreamContextProcessor) init(s *Storage, neededColumnNames, unneededColumnNames []string) { pcp.s = s pcp.neededColumnNames = neededColumnNames pcp.unneededColumnNames = unneededColumnNames } func (pcp *pipeStreamContextProcessor) getStreamRowss(streamID string, neededRows []streamContextRow, stateSizeBudget int) ([][]*streamContextRow, error) { tenantID, ok := getTenantIDFromStreamIDString(streamID) if !ok { logger.Panicf("BUG: cannot obtain tenantID from streamID %q", streamID) } // construct the query for selecting all the rows for the given streamID qStr := "_stream_id:" + streamID if slices.Contains(pcp.neededColumnNames, "*") { if len(pcp.unneededColumnNames) > 0 { qStr += " | delete " + fieldNamesString(pcp.unneededColumnNames) } } else { if len(pcp.neededColumnNames) > 0 { qStr += " | fields " + fieldNamesString(pcp.neededColumnNames) } } q, err := ParseQuery(qStr) if err != nil { logger.Panicf("BUG: cannot parse query [%s]: %s", qStr, err) } // mu protects contextRows and stateSize inside writeBlock callback. var mu sync.Mutex contextRows := make([]streamContextRows, len(neededRows)) for i := range neededRows { contextRows[i] = streamContextRows{ neededTimestamp: neededRows[i].timestamp, linesBefore: pcp.pc.linesBefore, linesAfter: pcp.pc.linesAfter, } } sort.Slice(contextRows, func(i, j int) bool { return contextRows[i].neededTimestamp < contextRows[j].neededTimestamp }) stateSize := 0 ctxWithCancel, cancel := contextutil.NewStopChanContext(pcp.stopCh) defer cancel() writeBlock := func(_ uint, br *blockResult) { mu.Lock() defer mu.Unlock() if stateSize > stateSizeBudget { cancel() } timestamps := br.getTimestamps() for i, timestamp := range timestamps { if needStop(pcp.stopCh) { break } for j := range contextRows { if j > 0 && timestamp <= contextRows[j-1].neededTimestamp { continue } if j+1 < len(contextRows) && timestamp >= contextRows[j+1].neededTimestamp { continue } stateSize += contextRows[j].update(br, i, timestamp) } } } if err := pcp.s.runQuery(ctxWithCancel, []TenantID{tenantID}, q, writeBlock); err != nil { return nil, err } if stateSize > stateSizeBudget { return nil, fmt.Errorf("more than %dMB of memory is needed for fetching the surrounding logs for %d matching logs", stateSizeBudget/(1<<20), len(neededRows)) } // return sorted results from contextRows rowss := make([][]*streamContextRow, len(contextRows)) for i, ctx := range contextRows { rowss[i] = ctx.getSortedRows() } rowss = deduplicateStreamRowss(rowss) return rowss, nil } func deduplicateStreamRowss(streamRowss [][]*streamContextRow) [][]*streamContextRow { var lastSeenRow *streamContextRow for _, streamRows := range streamRowss { if len(streamRows) > 0 { lastSeenRow = streamRows[len(streamRows)-1] break } } if lastSeenRow == nil { return nil } resultRowss := streamRowss[:1] for _, streamRows := range streamRowss[1:] { i := 0 for i < len(streamRows) && !lastSeenRow.less(streamRows[i]) { i++ } streamRows = streamRows[i:] if len(streamRows) == 0 { continue } resultRowss = append(resultRowss, streamRows) lastSeenRow = streamRows[len(streamRows)-1] } return resultRowss } type streamContextRows struct { neededTimestamp int64 linesBefore int linesAfter int rowsBefore streamContextRowsHeapMin rowsAfter streamContextRowsHeapMax rowsMatched []*streamContextRow } func (ctx *streamContextRows) getSortedRows() []*streamContextRow { var rows []*streamContextRow rows = append(rows, ctx.rowsBefore...) rows = append(rows, ctx.rowsMatched...) rows = append(rows, ctx.rowsAfter...) sort.Slice(rows, func(i, j int) bool { return rows[i].less(rows[j]) }) return rows } func (ctx *streamContextRows) update(br *blockResult, rowIdx int, rowTimestamp int64) int { if rowTimestamp < ctx.neededTimestamp { if ctx.linesBefore <= 0 { return 0 } if len(ctx.rowsBefore) < ctx.linesBefore { r := ctx.copyRowAtIdx(br, rowIdx, rowTimestamp) heap.Push(&ctx.rowsBefore, r) return r.sizeBytes() } if rowTimestamp <= ctx.rowsBefore[0].timestamp { return 0 } r := ctx.copyRowAtIdx(br, rowIdx, rowTimestamp) stateSizeChange := r.sizeBytes() - ctx.rowsBefore[0].sizeBytes() ctx.rowsBefore[0] = r heap.Fix(&ctx.rowsBefore, 0) return stateSizeChange } if rowTimestamp > ctx.neededTimestamp { if ctx.linesAfter <= 0 { return 0 } if len(ctx.rowsAfter) < ctx.linesAfter { r := ctx.copyRowAtIdx(br, rowIdx, rowTimestamp) heap.Push(&ctx.rowsAfter, r) return r.sizeBytes() } if rowTimestamp >= ctx.rowsAfter[0].timestamp { return 0 } r := ctx.copyRowAtIdx(br, rowIdx, rowTimestamp) stateSizeChange := r.sizeBytes() - ctx.rowsAfter[0].sizeBytes() ctx.rowsAfter[0] = r heap.Fix(&ctx.rowsAfter, 0) return stateSizeChange } // rowTimestamp == ctx.neededTimestamp r := ctx.copyRowAtIdx(br, rowIdx, rowTimestamp) ctx.rowsMatched = append(ctx.rowsMatched, r) return r.sizeBytes() } func (ctx *streamContextRows) copyRowAtIdx(br *blockResult, rowIdx int, rowTimestamp int64) *streamContextRow { cs := br.getColumns() fields := make([]Field, len(cs)) for i, c := range cs { v := c.getValueAtRow(br, rowIdx) fields[i] = Field{ Name: strings.Clone(c.name), Value: strings.Clone(v), } } return &streamContextRow{ timestamp: rowTimestamp, fields: fields, } } func getTenantIDFromStreamIDString(s string) (TenantID, bool) { var sid streamID if !sid.tryUnmarshalFromString(s) { return TenantID{}, false } return sid.tenantID, true } type pipeStreamContextProcessorShard struct { pipeStreamContextProcessorShardNopad // The padding prevents false sharing on widespread platforms with 128 mod (cache line size) = 0 . _ [128 - unsafe.Sizeof(pipeStreamContextProcessorShardNopad{})%128]byte } type streamContextRow struct { timestamp int64 fields []Field } func (r *streamContextRow) sizeBytes() int { n := 0 fields := r.fields for _, f := range fields { n += len(f.Name) + len(f.Value) + int(unsafe.Sizeof(f)) } n += int(unsafe.Sizeof(*r) + unsafe.Sizeof(r)) return n } func (r *streamContextRow) less(other *streamContextRow) bool { // compare timestamps at first if r.timestamp != other.timestamp { return r.timestamp < other.timestamp } // compare fields then i := 0 aFields := r.fields bFields := other.fields for i < len(aFields) && i < len(bFields) { af := &aFields[i] bf := &bFields[i] if af.Name != bf.Name { return af.Name < bf.Name } if af.Value != bf.Value { return af.Value < bf.Value } i++ } if len(aFields) != len(bFields) { return len(aFields) < len(bFields) } return false } type pipeStreamContextProcessorShardNopad struct { // pc points to the parent pipeStreamContext. pc *pipeStreamContext // m holds per-stream matching rows m map[string][]streamContextRow // stateSizeBudget is the remaining budget for the whole state size for the shard. // The per-shard budget is provided in chunks from the parent pipeStreamContextProcessor. stateSizeBudget int } // writeBlock writes br to shard. func (shard *pipeStreamContextProcessorShard) writeBlock(br *blockResult) { m := shard.getM() cs := br.getColumns() cStreamID := br.getColumnByName("_stream_id") stateSize := 0 timestamps := br.getTimestamps() for i, timestamp := range timestamps { fields := make([]Field, len(cs)) stateSize += int(unsafe.Sizeof(fields[0])) * len(fields) for j, c := range cs { v := c.getValueAtRow(br, i) fields[j] = Field{ Name: strings.Clone(c.name), Value: strings.Clone(v), } stateSize += len(c.name) + len(v) } row := streamContextRow{ timestamp: timestamp, fields: fields, } stateSize += int(unsafe.Sizeof(row)) streamID := cStreamID.getValueAtRow(br, i) rows, ok := m[streamID] if !ok { stateSize += len(streamID) } rows = append(rows, row) streamID = strings.Clone(streamID) m[streamID] = rows } shard.stateSizeBudget -= stateSize } func (shard *pipeStreamContextProcessorShard) getM() map[string][]streamContextRow { if shard.m == nil { shard.m = make(map[string][]streamContextRow) } return shard.m } func (pcp *pipeStreamContextProcessor) writeBlock(workerID uint, br *blockResult) { if br.rowsLen == 0 { return } if pcp.pc.linesBefore <= 0 && pcp.pc.linesAfter <= 0 { // Fast path - there is no need to fetch stream context. pcp.ppNext.writeBlock(workerID, br) return } shard := &pcp.shards[workerID] for shard.stateSizeBudget < 0 { // steal some budget for the state size from the global budget. remaining := pcp.stateSizeBudget.Add(-stateSizeBudgetChunk) if remaining < 0 { // The state size is too big. Stop processing data in order to avoid OOM crash. if remaining+stateSizeBudgetChunk >= 0 { // Notify worker goroutines to stop calling writeBlock() in order to save CPU time. pcp.cancel() } return } shard.stateSizeBudget += stateSizeBudgetChunk } shard.writeBlock(br) } func (pcp *pipeStreamContextProcessor) flush() error { if pcp.pc.linesBefore <= 0 && pcp.pc.linesAfter <= 0 { // Fast path - nothing to do. return nil } n := pcp.stateSizeBudget.Load() if n <= 0 { return fmt.Errorf("cannot calculate [%s], since it requires more than %dMB of memory", pcp.pc.String(), pcp.maxStateSize/(1<<20)) } if n > math.MaxInt { logger.Panicf("BUG: stateSizeBudget shouldn't exceed math.MaxInt=%v; got %d", math.MaxInt, n) } stateSizeBudget := int(n) // merge state across shards shards := pcp.shards m := shards[0].getM() shards = shards[1:] for i := range shards { if needStop(pcp.stopCh) { return nil } for streamID, rowsSrc := range shards[i].getM() { rows, ok := m[streamID] if !ok { m[streamID] = rowsSrc } else { m[streamID] = append(rows, rowsSrc...) } } } // write result wctx := &pipeStreamContextWriteContext{ pcp: pcp, } for streamID, rows := range m { streamRowss, err := pcp.getStreamRowss(streamID, rows, stateSizeBudget) if err != nil { return err } if needStop(pcp.stopCh) { return nil } // Write streamRows to the output. for _, streamRows := range streamRowss { for _, streamRow := range streamRows { wctx.writeRow(streamRow.fields) } if len(streamRowss) > 1 { lastRow := streamRows[len(streamRows)-1] fields := newDelimiterRowFields(lastRow, streamID) wctx.writeRow(fields) } } } wctx.flush() return nil } func newDelimiterRowFields(r *streamContextRow, streamID string) []Field { return []Field{ { Name: "_time", Value: string(marshalTimestampRFC3339NanoString(nil, r.timestamp+1)), }, { Name: "_stream_id", Value: streamID, }, { Name: "_stream", Value: getFieldValue(r.fields, "_stream"), }, { Name: "_msg", Value: "---", }, } } type pipeStreamContextWriteContext struct { pcp *pipeStreamContextProcessor rcs []resultColumn br blockResult // rowsCount is the number of rows in the current block rowsCount int // valuesLen is the total length of values in the current block valuesLen int } func (wctx *pipeStreamContextWriteContext) writeRow(rowFields []Field) { rcs := wctx.rcs areEqualColumns := len(rcs) == len(rowFields) if areEqualColumns { for i, f := range rowFields { if rcs[i].name != f.Name { areEqualColumns = false break } } } if !areEqualColumns { // send the current block to ppNext and construct a block with new set of columns wctx.flush() rcs = wctx.rcs[:0] for _, f := range rowFields { rcs = appendResultColumnWithName(rcs, f.Name) } wctx.rcs = rcs } for i, f := range rowFields { v := f.Value rcs[i].addValue(v) wctx.valuesLen += len(v) } wctx.rowsCount++ if wctx.valuesLen >= 1_000_000 { wctx.flush() } } func (wctx *pipeStreamContextWriteContext) flush() { rcs := wctx.rcs br := &wctx.br wctx.valuesLen = 0 // Flush rcs to ppNext br.setResultColumns(rcs, wctx.rowsCount) wctx.rowsCount = 0 wctx.pcp.ppNext.writeBlock(0, br) br.reset() for i := range rcs { rcs[i].resetValues() } } func parsePipeStreamContext(lex *lexer) (*pipeStreamContext, error) { if !lex.isKeyword("stream_context") { return nil, fmt.Errorf("expecting 'stream_context'; got %q", lex.token) } lex.nextToken() linesBefore, linesAfter, err := parsePipeStreamContextBeforeAfter(lex) if err != nil { return nil, err } pc := &pipeStreamContext{ linesBefore: linesBefore, linesAfter: linesAfter, } return pc, nil } func parsePipeStreamContextBeforeAfter(lex *lexer) (int, int, error) { linesBefore := 0 linesAfter := 0 beforeSet := false afterSet := false for { switch { case lex.isKeyword("before"): lex.nextToken() f, s, err := parseNumber(lex) if err != nil { return 0, 0, fmt.Errorf("cannot parse 'before' value in 'stream_context': %w", err) } if f < 0 { return 0, 0, fmt.Errorf("'before' value cannot be smaller than 0; got %q", s) } linesBefore = int(f) beforeSet = true case lex.isKeyword("after"): lex.nextToken() f, s, err := parseNumber(lex) if err != nil { return 0, 0, fmt.Errorf("cannot parse 'after' value in 'stream_context': %w", err) } if f < 0 { return 0, 0, fmt.Errorf("'after' value cannot be smaller than 0; got %q", s) } linesAfter = int(f) afterSet = true default: if !beforeSet && !afterSet { return 0, 0, fmt.Errorf("missing 'before N' or 'after N' in 'stream_context'") } return linesBefore, linesAfter, nil } } } type streamContextRowsHeapMax []*streamContextRow func (h *streamContextRowsHeapMax) Len() int { return len(*h) } func (h *streamContextRowsHeapMax) Less(i, j int) bool { a := *h return a[i].timestamp > a[j].timestamp } func (h *streamContextRowsHeapMax) Swap(i, j int) { a := *h a[i], a[j] = a[j], a[i] } func (h *streamContextRowsHeapMax) Push(v any) { x := v.(*streamContextRow) *h = append(*h, x) } func (h *streamContextRowsHeapMax) Pop() any { a := *h x := a[len(a)-1] a[len(a)-1] = nil *h = a[:len(a)-1] return x } type streamContextRowsHeapMin streamContextRowsHeapMax func (h *streamContextRowsHeapMin) Len() int { return len(*h) } func (h *streamContextRowsHeapMin) Less(i, j int) bool { a := *h return a[i].timestamp < a[j].timestamp } func (h *streamContextRowsHeapMin) Swap(i, j int) { a := *h a[i], a[j] = a[j], a[i] } func (h *streamContextRowsHeapMin) Push(v any) { x := v.(*streamContextRow) *h = append(*h, x) } func (h *streamContextRowsHeapMin) Pop() any { a := *h x := a[len(a)-1] a[len(a)-1] = nil *h = a[:len(a)-1] return x }