2023-06-20 07:55:12 +02:00
package logstorage
import (
"fmt"
"sort"
"sync"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/encoding"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
)
// block represents a block of log entries.
type block struct {
// timestamps contains timestamps for log entries.
timestamps [ ] int64
// columns contains values for fields seen in log entries.
columns [ ] column
// constColumns contains fields with constant values across all the block entries.
constColumns [ ] Field
}
func ( b * block ) reset ( ) {
b . timestamps = b . timestamps [ : 0 ]
cs := b . columns
for i := range cs {
cs [ i ] . reset ( )
}
b . columns = cs [ : 0 ]
ccs := b . constColumns
for i := range ccs {
ccs [ i ] . Reset ( )
}
b . constColumns = ccs [ : 0 ]
}
// uncompressedSizeBytes returns the total size of the origianl log entries stored in b.
//
// It is supposed that every log entry has the following format:
//
// 2006-01-02T15:04:05.999999999Z07:00 field1=value1 ... fieldN=valueN
func ( b * block ) uncompressedSizeBytes ( ) uint64 {
rowsCount := uint64 ( b . Len ( ) )
// Take into account timestamps
n := rowsCount * uint64 ( len ( time . RFC3339Nano ) )
// Take into account columns
cs := b . columns
for i := range cs {
c := & cs [ i ]
nameLen := uint64 ( len ( c . name ) )
if nameLen == 0 {
nameLen = uint64 ( len ( "_msg" ) )
}
for _ , v := range c . values {
if len ( v ) > 0 {
n += nameLen + 2 + uint64 ( len ( v ) )
}
}
}
// Take into account constColumns
ccs := b . constColumns
for i := range ccs {
cc := & ccs [ i ]
nameLen := uint64 ( len ( cc . Name ) )
if nameLen == 0 {
nameLen = uint64 ( len ( "_msg" ) )
}
n += rowsCount * ( 2 + nameLen + uint64 ( len ( cc . Value ) ) )
}
return n
}
// uncompressedRowsSizeBytes returns the size of the uncompressed rows.
//
// It is supposed that every row has the following format:
//
// 2006-01-02T15:04:05.999999999Z07:00 field1=value1 ... fieldN=valueN
func uncompressedRowsSizeBytes ( rows [ ] [ ] Field ) uint64 {
n := uint64 ( 0 )
for _ , fields := range rows {
n += uncompressedRowSizeBytes ( fields )
}
return n
}
// uncompressedRowSizeBytes returns the size of uncompressed row.
//
// It is supposed that the row has the following format:
//
// 2006-01-02T15:04:05.999999999Z07:00 field1=value1 ... fieldN=valueN
func uncompressedRowSizeBytes ( fields [ ] Field ) uint64 {
n := uint64 ( len ( time . RFC3339Nano ) ) // log timestamp
for _ , f := range fields {
nameLen := len ( f . Name )
if nameLen == 0 {
nameLen = len ( "_msg" )
}
n += uint64 ( 2 + nameLen + len ( f . Value ) )
}
return n
}
// column contains values for the given field name seen in log entries.
type column struct {
// name is the field name
name string
// values is the values seen for the given log entries.
values [ ] string
}
func ( c * column ) reset ( ) {
c . name = ""
values := c . values
for i := range values {
values [ i ] = ""
}
c . values = values [ : 0 ]
}
2023-10-02 19:01:17 +02:00
func ( c * column ) canStoreInConstColumn ( ) bool {
2023-06-20 07:55:12 +02:00
values := c . values
2023-10-02 19:01:17 +02:00
if len ( values ) == 0 {
2023-06-20 07:55:12 +02:00
return true
}
value := values [ 0 ]
2023-10-02 19:01:17 +02:00
if len ( value ) > maxConstColumnValueSize {
return false
}
2023-06-20 07:55:12 +02:00
for _ , v := range values [ 1 : ] {
if value != v {
return false
}
}
return true
}
func ( c * column ) resizeValues ( valuesLen int ) [ ] string {
values := c . values
if n := valuesLen - cap ( values ) ; n > 0 {
values = append ( values [ : cap ( values ) ] , make ( [ ] string , n ) ... )
}
values = values [ : valuesLen ]
c . values = values
return values
}
// mustWriteTo writes c to sw and updates ch accordingly.
func ( c * column ) mustWriteTo ( ch * columnHeader , sw * streamWriters ) {
ch . reset ( )
valuesWriter := & sw . fieldValuesWriter
bloomFilterWriter := & sw . fieldBloomFilterWriter
if c . name == "" {
valuesWriter = & sw . messageValuesWriter
bloomFilterWriter = & sw . messageBloomFilterWriter
}
ch . name = c . name
// encode values
ve := getValuesEncoder ( )
ch . valueType , ch . minValue , ch . maxValue = ve . encode ( c . values , & ch . valuesDict )
bb := longTermBufPool . Get ( )
defer longTermBufPool . Put ( bb )
// marshal values
bb . B = marshalStringsBlock ( bb . B [ : 0 ] , ve . values )
putValuesEncoder ( ve )
ch . valuesSize = uint64 ( len ( bb . B ) )
if ch . valuesSize > maxValuesBlockSize {
logger . Panicf ( "BUG: too valuesSize: %d bytes; mustn't exceed %d bytes" , ch . valuesSize , maxValuesBlockSize )
}
ch . valuesOffset = valuesWriter . bytesWritten
valuesWriter . MustWrite ( bb . B )
// create and marshal bloom filter for c.values
if ch . valueType != valueTypeDict {
tokensBuf := getTokensBuf ( )
tokensBuf . A = tokenizeStrings ( tokensBuf . A [ : 0 ] , c . values )
bb . B = bloomFilterMarshal ( bb . B [ : 0 ] , tokensBuf . A )
putTokensBuf ( tokensBuf )
} else {
// there is no need in ecoding bloom filter for dictiory type,
// since it isn't used during querying - all the dictionary values are available in ch.valuesDict
bb . B = bb . B [ : 0 ]
}
ch . bloomFilterSize = uint64 ( len ( bb . B ) )
if ch . bloomFilterSize > maxBloomFilterBlockSize {
logger . Panicf ( "BUG: too big bloomFilterSize: %d bytes; mustn't exceed %d bytes" , ch . bloomFilterSize , maxBloomFilterBlockSize )
}
ch . bloomFilterOffset = bloomFilterWriter . bytesWritten
bloomFilterWriter . MustWrite ( bb . B )
}
func ( b * block ) assertValid ( ) {
// Check that timestamps are in ascending order
timestamps := b . timestamps
for i := 1 ; i < len ( timestamps ) ; i ++ {
if timestamps [ i - 1 ] > timestamps [ i ] {
logger . Panicf ( "BUG: log entries must be sorted by timestamp; got the previous entry with bigger timestamp %d than the current entry with timestamp %d" ,
timestamps [ i - 1 ] , timestamps [ i ] )
}
}
// Check that the number of items in each column matches the number of items in the block.
itemsCount := len ( timestamps )
columns := b . columns
for _ , c := range columns {
if len ( c . values ) != itemsCount {
logger . Panicf ( "BUG: unexpected number of values for column %q: got %d; want %d" , c . name , len ( c . values ) , itemsCount )
}
}
}
// MustInitFromRows initializes b from the given timestamps and rows.
//
// It is expected that timestamps are sorted.
func ( b * block ) MustInitFromRows ( timestamps [ ] int64 , rows [ ] [ ] Field ) {
b . reset ( )
assertTimestampsSorted ( timestamps )
b . timestamps = append ( b . timestamps , timestamps ... )
b . mustInitFromRows ( rows )
b . sortColumnsByName ( )
}
func ( b * block ) mustInitFromRows ( rows [ ] [ ] Field ) {
rowsLen := len ( rows )
if rowsLen == 0 {
// Nothing to do
return
}
if areSameFieldsInRows ( rows ) {
// Fast path - all the log entries have the same fields
fields := rows [ 0 ]
for i := range fields {
f := & fields [ i ]
2023-10-02 19:01:17 +02:00
if canStoreInConstColumn ( rows , i ) {
2023-06-20 07:55:12 +02:00
cc := b . extendConstColumns ( )
cc . Name = f . Name
cc . Value = f . Value
} else {
c := b . extendColumns ( )
c . name = f . Name
values := c . resizeValues ( rowsLen )
for j := range rows {
values [ j ] = rows [ j ] [ i ] . Value
}
}
}
return
}
// Slow path - log entries contain different set of fields
// Determine indexes for columns
columnIdxs := getColumnIdxs ( )
for i := range rows {
fields := rows [ i ]
for j := range fields {
name := fields [ j ] . Name
if _ , ok := columnIdxs [ name ] ; ! ok {
columnIdxs [ name ] = len ( columnIdxs )
}
}
}
// Initialize columns
cs := b . resizeColumns ( len ( columnIdxs ) )
for name , idx := range columnIdxs {
c := & cs [ idx ]
c . name = name
c . resizeValues ( rowsLen )
}
// Write rows to block
for i := range rows {
for _ , f := range rows [ i ] {
idx := columnIdxs [ f . Name ]
cs [ idx ] . values [ i ] = f . Value
}
}
putColumnIdxs ( columnIdxs )
// Detect const columns
for i := len ( cs ) - 1 ; i >= 0 ; i -- {
c := & cs [ i ]
2023-10-02 19:01:17 +02:00
if ! c . canStoreInConstColumn ( ) {
2023-06-20 07:55:12 +02:00
continue
}
cc := b . extendConstColumns ( )
cc . Name = c . name
cc . Value = c . values [ 0 ]
c . reset ( )
if i < len ( cs ) - 1 {
swapColumns ( c , & cs [ len ( cs ) - 1 ] )
}
cs = cs [ : len ( cs ) - 1 ]
}
b . columns = cs
}
func swapColumns ( a , b * column ) {
* a , * b = * b , * a
}
2023-10-02 19:01:17 +02:00
func canStoreInConstColumn ( rows [ ] [ ] Field , colIdx int ) bool {
if len ( rows ) == 0 {
2023-06-20 07:55:12 +02:00
return true
}
value := rows [ 0 ] [ colIdx ] . Value
2023-10-02 19:01:17 +02:00
if len ( value ) > maxConstColumnValueSize {
return false
}
2023-06-20 07:55:12 +02:00
rows = rows [ 1 : ]
for i := range rows {
if value != rows [ i ] [ colIdx ] . Value {
return false
}
}
return true
}
func assertTimestampsSorted ( timestamps [ ] int64 ) {
for i := range timestamps {
if i > 0 && timestamps [ i - 1 ] > timestamps [ i ] {
logger . Panicf ( "BUG: log entries must be sorted by timestamp; got the previous entry with bigger timestamp %d than the current entry with timestamp %d" ,
timestamps [ i - 1 ] , timestamps [ i ] )
}
}
}
func ( b * block ) extendConstColumns ( ) * Field {
ccs := b . constColumns
if cap ( ccs ) > len ( ccs ) {
ccs = ccs [ : len ( ccs ) + 1 ]
} else {
ccs = append ( ccs , Field { } )
}
b . constColumns = ccs
return & ccs [ len ( ccs ) - 1 ]
}
func ( b * block ) extendColumns ( ) * column {
cs := b . columns
if cap ( cs ) > len ( cs ) {
cs = cs [ : len ( cs ) + 1 ]
} else {
cs = append ( cs , column { } )
}
b . columns = cs
return & cs [ len ( cs ) - 1 ]
}
func ( b * block ) resizeColumns ( columnsLen int ) [ ] column {
cs := b . columns [ : 0 ]
if n := columnsLen - cap ( cs ) ; n > 0 {
cs = append ( cs [ : cap ( cs ) ] , make ( [ ] column , n ) ... )
}
cs = cs [ : columnsLen ]
b . columns = cs
return cs
}
func ( b * block ) sortColumnsByName ( ) {
if len ( b . columns ) + len ( b . constColumns ) > maxColumnsPerBlock {
logger . Panicf ( "BUG: too big number of columns detected in the block: %d; the number of columns mustn't exceed %d" ,
len ( b . columns ) + len ( b . constColumns ) , maxColumnsPerBlock )
}
cs := getColumnsSorter ( )
cs . columns = b . columns
sort . Sort ( cs )
putColumnsSorter ( cs )
ccs := getConstColumnsSorter ( )
ccs . columns = b . constColumns
sort . Sort ( ccs )
putConstColumnsSorter ( ccs )
}
// Len returns the number of log entries in b.
func ( b * block ) Len ( ) int {
return len ( b . timestamps )
}
// InitFromBlockData unmarshals bd to b.
//
// sbu and vd are used as a temporary storage for unmarshaled column values.
//
// The b becomes outdated after sbu or vd is reset.
func ( b * block ) InitFromBlockData ( bd * blockData , sbu * stringsBlockUnmarshaler , vd * valuesDecoder ) error {
b . reset ( )
if bd . rowsCount > maxRowsPerBlock {
return fmt . Errorf ( "too many entries found in the block: %d; mustn't exceed %d" , bd . rowsCount , maxRowsPerBlock )
}
rowsCount := int ( bd . rowsCount )
// unmarshal timestamps
td := & bd . timestampsData
var err error
b . timestamps , err = encoding . UnmarshalTimestamps ( b . timestamps [ : 0 ] , td . data , td . marshalType , td . minTimestamp , rowsCount )
if err != nil {
return fmt . Errorf ( "cannot unmarshal timestamps: %w" , err )
}
// unmarshal columns
cds := bd . columnsData
cs := b . resizeColumns ( len ( cds ) )
for i := range cds {
cd := & cds [ i ]
c := & cs [ i ]
c . name = cd . name
c . values , err = sbu . unmarshal ( c . values [ : 0 ] , cd . valuesData , uint64 ( rowsCount ) )
if err != nil {
return fmt . Errorf ( "cannot unmarshal column %d: %w" , i , err )
}
if err = vd . decodeInplace ( c . values , cd . valueType , & cd . valuesDict ) ; err != nil {
return fmt . Errorf ( "cannot decode column values: %w" , err )
}
}
// unmarshal constColumns
b . constColumns = append ( b . constColumns [ : 0 ] , bd . constColumns ... )
return nil
}
// mustWriteTo writes b with the given sid to sw and updates bh accordingly
func ( b * block ) mustWriteTo ( sid * streamID , bh * blockHeader , sw * streamWriters ) {
// Do not store the version used for encoding directly in the block data, since:
// - all the blocks in the same part use the same encoding
// - the block encoding version can be put in metadata file for the part (aka metadataFilename)
b . assertValid ( )
bh . reset ( )
bh . streamID = * sid
bh . uncompressedSizeBytes = b . uncompressedSizeBytes ( )
bh . rowsCount = uint64 ( b . Len ( ) )
// Marshal timestamps
mustWriteTimestampsTo ( & bh . timestampsHeader , b . timestamps , sw )
// Marshal columns
cs := b . columns
csh := getColumnsHeader ( )
chs := csh . resizeColumnHeaders ( len ( cs ) )
for i := range cs {
cs [ i ] . mustWriteTo ( & chs [ i ] , sw )
}
csh . constColumns = append ( csh . constColumns [ : 0 ] , b . constColumns ... )
bb := longTermBufPool . Get ( )
bb . B = csh . marshal ( bb . B )
putColumnsHeader ( csh )
bh . columnsHeaderOffset = sw . columnsHeaderWriter . bytesWritten
bh . columnsHeaderSize = uint64 ( len ( bb . B ) )
if bh . columnsHeaderSize > maxColumnsHeaderSize {
logger . Panicf ( "BUG: too big columnsHeaderSize: %d bytes; mustn't exceed %d bytes" , bh . columnsHeaderSize , maxColumnsHeaderSize )
}
sw . columnsHeaderWriter . MustWrite ( bb . B )
longTermBufPool . Put ( bb )
}
2023-10-02 19:01:17 +02:00
// appendRowsTo appends log entries from b to dst.
func ( b * block ) appendRowsTo ( dst * rows ) {
2023-06-20 07:55:12 +02:00
// copy timestamps
dst . timestamps = append ( dst . timestamps , b . timestamps ... )
// copy columns
fieldsBuf := dst . fieldsBuf
ccs := b . constColumns
cs := b . columns
for i := range b . timestamps {
fieldsLen := len ( fieldsBuf )
// copy const columns
for j := range ccs {
cc := & ccs [ j ]
fieldsBuf = append ( fieldsBuf , Field {
Name : cc . Name ,
Value : cc . Value ,
} )
}
// copy other columns
for j := range cs {
c := & cs [ j ]
value := c . values [ i ]
if len ( value ) == 0 {
continue
}
fieldsBuf = append ( fieldsBuf , Field {
Name : c . name ,
Value : value ,
} )
}
dst . rows = append ( dst . rows , fieldsBuf [ fieldsLen : ] )
}
dst . fieldsBuf = fieldsBuf
}
func areSameFieldsInRows ( rows [ ] [ ] Field ) bool {
if len ( rows ) < 2 {
return true
}
fields := rows [ 0 ]
2023-10-02 19:01:17 +02:00
// Verify that all the field names are unique
m := make ( map [ string ] struct { } , len ( fields ) )
for i := range fields {
f := & fields [ i ]
if _ , ok := m [ f . Name ] ; ok {
// Field name isn't unique
return false
}
m [ f . Name ] = struct { } { }
}
// Verify that all the fields are the same across rows
2023-06-20 07:55:12 +02:00
rows = rows [ 1 : ]
for i := range rows {
leFields := rows [ i ]
if len ( fields ) != len ( leFields ) {
return false
}
for j := range leFields {
if leFields [ j ] . Name != fields [ j ] . Name {
return false
}
}
}
return true
}
var columnIdxsPool sync . Pool
func getColumnIdxs ( ) map [ string ] int {
v := columnIdxsPool . Get ( )
if v == nil {
return make ( map [ string ] int )
}
return v . ( map [ string ] int )
}
func putColumnIdxs ( m map [ string ] int ) {
for k := range m {
delete ( m , k )
}
columnIdxsPool . Put ( m )
}
func getBlock ( ) * block {
v := blockPool . Get ( )
if v == nil {
return & block { }
}
return v . ( * block )
}
func putBlock ( b * block ) {
b . reset ( )
blockPool . Put ( b )
}
var blockPool sync . Pool
type columnsSorter struct {
columns [ ] column
}
func ( cs * columnsSorter ) reset ( ) {
cs . columns = nil
}
func ( cs * columnsSorter ) Len ( ) int {
return len ( cs . columns )
}
func ( cs * columnsSorter ) Less ( i , j int ) bool {
columns := cs . columns
return columns [ i ] . name < columns [ j ] . name
}
func ( cs * columnsSorter ) Swap ( i , j int ) {
columns := cs . columns
columns [ i ] , columns [ j ] = columns [ j ] , columns [ i ]
}
func getColumnsSorter ( ) * columnsSorter {
v := columnsSorterPool . Get ( )
if v == nil {
return & columnsSorter { }
}
return v . ( * columnsSorter )
}
func putColumnsSorter ( cs * columnsSorter ) {
cs . reset ( )
columnsSorterPool . Put ( cs )
}
var columnsSorterPool sync . Pool
type constColumnsSorter struct {
columns [ ] Field
}
func ( ccs * constColumnsSorter ) reset ( ) {
ccs . columns = nil
}
func ( ccs * constColumnsSorter ) Len ( ) int {
return len ( ccs . columns )
}
func ( ccs * constColumnsSorter ) Less ( i , j int ) bool {
columns := ccs . columns
return columns [ i ] . Name < columns [ j ] . Name
}
func ( ccs * constColumnsSorter ) Swap ( i , j int ) {
columns := ccs . columns
columns [ i ] , columns [ j ] = columns [ j ] , columns [ i ]
}
func getConstColumnsSorter ( ) * constColumnsSorter {
v := constColumnsSorterPool . Get ( )
if v == nil {
return & constColumnsSorter { }
}
return v . ( * constColumnsSorter )
}
func putConstColumnsSorter ( ccs * constColumnsSorter ) {
ccs . reset ( )
constColumnsSorterPool . Put ( ccs )
}
var constColumnsSorterPool sync . Pool
// mustWriteTimestampsTo writes timestamps to sw and updates th accordingly
func mustWriteTimestampsTo ( th * timestampsHeader , timestamps [ ] int64 , sw * streamWriters ) {
th . reset ( )
bb := longTermBufPool . Get ( )
bb . B , th . marshalType , th . minTimestamp = encoding . MarshalTimestamps ( bb . B [ : 0 ] , timestamps , 64 )
if len ( bb . B ) > maxTimestampsBlockSize {
logger . Panicf ( "BUG: too big block with timestamps: %d bytes; the maximum supported size is %d bytes" , len ( bb . B ) , maxTimestampsBlockSize )
}
th . maxTimestamp = timestamps [ len ( timestamps ) - 1 ]
th . blockOffset = sw . timestampsWriter . bytesWritten
th . blockSize = uint64 ( len ( bb . B ) )
sw . timestampsWriter . MustWrite ( bb . B )
longTermBufPool . Put ( bb )
}