2023-06-20 07:55:12 +02:00
|
|
|
package logstorage
|
|
|
|
|
|
|
|
import (
|
|
|
|
"reflect"
|
|
|
|
"testing"
|
|
|
|
|
|
|
|
"github.com/VictoriaMetrics/VictoriaMetrics/lib/encoding"
|
|
|
|
)
|
|
|
|
|
|
|
|
func TestBlockHeaderMarshalUnmarshal(t *testing.T) {
|
|
|
|
f := func(bh *blockHeader, marshaledLen int) {
|
|
|
|
t.Helper()
|
|
|
|
data := bh.marshal(nil)
|
|
|
|
if len(data) != marshaledLen {
|
|
|
|
t.Fatalf("unexpected lengths of the marshaled blockHeader; got %d; want %d", len(data), marshaledLen)
|
|
|
|
}
|
|
|
|
bh2 := &blockHeader{}
|
lib/logstorage: refactor storage format to be more efficient for querying wide events
It has been appeared that VictoriaLogs is frequently used for collecting logs with tens of fields.
For example, standard Kuberntes setup on top of Filebeat generates more than 20 fields per each log.
Such logs are also known as "wide events".
The previous storage format was optimized for logs with a few fields. When at least a single field
was referenced in the query, then the all the meta-information about all the log fields was unpacked
and parsed per each scanned block during the query. This could require a lot of additional disk IO
and CPU time when logs contain many fields. Resolve this issue by providing an (field -> metainfo_offset)
index per each field in every data block. This index allows reading and extracting only the needed
metainfo for fields used in the query. This index is stored in columnsHeaderIndexFilename ( columns_header_index.bin ).
This allows increasing performance for queries over wide events by 10x and more.
Another issue was that the data for bloom filters and field values across all the log fields except of _msg
was intermixed in two files - fieldBloomFilename ( field_bloom.bin ) and fieldValuesFilename ( field_values.bin ).
This could result in huge disk read IO overhead when some small field was referred in the query,
since the Operating System usually reads more data than requested. It reads the data from disk
in at least 4KiB blocks (usually the block size is much bigger in the range 64KiB - 512KiB).
So, if 512-byte bloom filter or values' block is read from the file, then the Operating System
reads up to 512KiB of data from disk, which results in 1000x disk read IO overhead. This overhead isn't visible
for recently accessed data, since this data is usually stored in RAM (aka Operating System page cache),
but this overhead may become very annoying when performing the query over large volumes of data
which isn't present in OS page cache.
The solution for this issue is to split bloom filters and field values across multiple shards.
This reduces the worst-case disk read IO overhead by at least Nx where N is the number of shards,
while the disk read IO overhead is completely removed in best case when the number of columns doesn't exceed N.
Currently the number of shards is 8 - see bloomValuesShardsCount . This solution increases
performance for queries over large volumes of newly ingested data by up to 1000x.
The new storage format is versioned as v1, while the old storage format is version as v0.
It is stored in the partHeader.FormatVersion.
Parts with the old storage format are converted into parts with the new storage format during background merge.
It is possible to force merge by querying /internal/force_merge HTTP endpoint - see https://docs.victoriametrics.com/victorialogs/#forced-merge .
2024-10-16 16:18:28 +02:00
|
|
|
tail, err := bh2.unmarshal(data, partFormatLatestVersion)
|
2023-06-20 07:55:12 +02:00
|
|
|
if err != nil {
|
|
|
|
t.Fatalf("unexpected error in unmarshal: %s", err)
|
|
|
|
}
|
|
|
|
if len(tail) > 0 {
|
|
|
|
t.Fatalf("unexpected non-empty tail after unmarshal: %X", tail)
|
|
|
|
}
|
|
|
|
if !reflect.DeepEqual(bh, bh2) {
|
|
|
|
t.Fatalf("unexpected blockHeader unmarshaled\ngot\n%v\nwant\n%v", bh2, bh)
|
|
|
|
}
|
|
|
|
}
|
lib/logstorage: refactor storage format to be more efficient for querying wide events
It has been appeared that VictoriaLogs is frequently used for collecting logs with tens of fields.
For example, standard Kuberntes setup on top of Filebeat generates more than 20 fields per each log.
Such logs are also known as "wide events".
The previous storage format was optimized for logs with a few fields. When at least a single field
was referenced in the query, then the all the meta-information about all the log fields was unpacked
and parsed per each scanned block during the query. This could require a lot of additional disk IO
and CPU time when logs contain many fields. Resolve this issue by providing an (field -> metainfo_offset)
index per each field in every data block. This index allows reading and extracting only the needed
metainfo for fields used in the query. This index is stored in columnsHeaderIndexFilename ( columns_header_index.bin ).
This allows increasing performance for queries over wide events by 10x and more.
Another issue was that the data for bloom filters and field values across all the log fields except of _msg
was intermixed in two files - fieldBloomFilename ( field_bloom.bin ) and fieldValuesFilename ( field_values.bin ).
This could result in huge disk read IO overhead when some small field was referred in the query,
since the Operating System usually reads more data than requested. It reads the data from disk
in at least 4KiB blocks (usually the block size is much bigger in the range 64KiB - 512KiB).
So, if 512-byte bloom filter or values' block is read from the file, then the Operating System
reads up to 512KiB of data from disk, which results in 1000x disk read IO overhead. This overhead isn't visible
for recently accessed data, since this data is usually stored in RAM (aka Operating System page cache),
but this overhead may become very annoying when performing the query over large volumes of data
which isn't present in OS page cache.
The solution for this issue is to split bloom filters and field values across multiple shards.
This reduces the worst-case disk read IO overhead by at least Nx where N is the number of shards,
while the disk read IO overhead is completely removed in best case when the number of columns doesn't exceed N.
Currently the number of shards is 8 - see bloomValuesShardsCount . This solution increases
performance for queries over large volumes of newly ingested data by up to 1000x.
The new storage format is versioned as v1, while the old storage format is version as v0.
It is stored in the partHeader.FormatVersion.
Parts with the old storage format are converted into parts with the new storage format during background merge.
It is possible to force merge by querying /internal/force_merge HTTP endpoint - see https://docs.victoriametrics.com/victorialogs/#forced-merge .
2024-10-16 16:18:28 +02:00
|
|
|
f(&blockHeader{}, 63)
|
2023-06-20 07:55:12 +02:00
|
|
|
f(&blockHeader{
|
|
|
|
streamID: streamID{
|
|
|
|
tenantID: TenantID{
|
|
|
|
AccountID: 123,
|
|
|
|
ProjectID: 456,
|
|
|
|
},
|
|
|
|
id: u128{
|
|
|
|
lo: 3443,
|
|
|
|
hi: 23434,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
uncompressedSizeBytes: 4344,
|
|
|
|
rowsCount: 1234,
|
|
|
|
timestampsHeader: timestampsHeader{
|
|
|
|
blockOffset: 13234,
|
|
|
|
blockSize: 8843,
|
|
|
|
minTimestamp: -4334,
|
|
|
|
maxTimestamp: 23434,
|
|
|
|
marshalType: encoding.MarshalTypeNearestDelta2,
|
|
|
|
},
|
lib/logstorage: refactor storage format to be more efficient for querying wide events
It has been appeared that VictoriaLogs is frequently used for collecting logs with tens of fields.
For example, standard Kuberntes setup on top of Filebeat generates more than 20 fields per each log.
Such logs are also known as "wide events".
The previous storage format was optimized for logs with a few fields. When at least a single field
was referenced in the query, then the all the meta-information about all the log fields was unpacked
and parsed per each scanned block during the query. This could require a lot of additional disk IO
and CPU time when logs contain many fields. Resolve this issue by providing an (field -> metainfo_offset)
index per each field in every data block. This index allows reading and extracting only the needed
metainfo for fields used in the query. This index is stored in columnsHeaderIndexFilename ( columns_header_index.bin ).
This allows increasing performance for queries over wide events by 10x and more.
Another issue was that the data for bloom filters and field values across all the log fields except of _msg
was intermixed in two files - fieldBloomFilename ( field_bloom.bin ) and fieldValuesFilename ( field_values.bin ).
This could result in huge disk read IO overhead when some small field was referred in the query,
since the Operating System usually reads more data than requested. It reads the data from disk
in at least 4KiB blocks (usually the block size is much bigger in the range 64KiB - 512KiB).
So, if 512-byte bloom filter or values' block is read from the file, then the Operating System
reads up to 512KiB of data from disk, which results in 1000x disk read IO overhead. This overhead isn't visible
for recently accessed data, since this data is usually stored in RAM (aka Operating System page cache),
but this overhead may become very annoying when performing the query over large volumes of data
which isn't present in OS page cache.
The solution for this issue is to split bloom filters and field values across multiple shards.
This reduces the worst-case disk read IO overhead by at least Nx where N is the number of shards,
while the disk read IO overhead is completely removed in best case when the number of columns doesn't exceed N.
Currently the number of shards is 8 - see bloomValuesShardsCount . This solution increases
performance for queries over large volumes of newly ingested data by up to 1000x.
The new storage format is versioned as v1, while the old storage format is version as v0.
It is stored in the partHeader.FormatVersion.
Parts with the old storage format are converted into parts with the new storage format during background merge.
It is possible to force merge by querying /internal/force_merge HTTP endpoint - see https://docs.victoriametrics.com/victorialogs/#forced-merge .
2024-10-16 16:18:28 +02:00
|
|
|
columnsHeaderIndexOffset: 8923481,
|
|
|
|
columnsHeaderIndexSize: 8989832,
|
|
|
|
columnsHeaderOffset: 4384,
|
|
|
|
columnsHeaderSize: 894,
|
|
|
|
}, 73)
|
|
|
|
}
|
|
|
|
|
|
|
|
func TestColumnsHeaderIndexMarshalUnmarshal(t *testing.T) {
|
|
|
|
f := func(cshIndex *columnsHeaderIndex, marshaledLen int) {
|
|
|
|
t.Helper()
|
|
|
|
|
|
|
|
data := cshIndex.marshal(nil)
|
|
|
|
if len(data) != marshaledLen {
|
|
|
|
t.Fatalf("unexpected lengths of the marshaled columnsHeader; got %d; want %d", len(data), marshaledLen)
|
|
|
|
}
|
|
|
|
cshIndex2 := &columnsHeaderIndex{}
|
|
|
|
if err := cshIndex2.unmarshalNoArena(data); err != nil {
|
|
|
|
t.Fatalf("unexpected error in unmarshal: %s", err)
|
|
|
|
}
|
|
|
|
|
|
|
|
if !reflect.DeepEqual(cshIndex, cshIndex2) {
|
|
|
|
t.Fatalf("unexpected blockHeaderIndex unmarshaled\ngot\n%v\nwant\n%v", cshIndex2, cshIndex)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
f(&columnsHeaderIndex{}, 2)
|
|
|
|
f(&columnsHeaderIndex{
|
|
|
|
columnHeadersRefs: []columnHeaderRef{
|
|
|
|
{
|
|
|
|
columnNameID: 234,
|
|
|
|
offset: 123432,
|
|
|
|
},
|
|
|
|
{
|
|
|
|
columnNameID: 23898,
|
|
|
|
offset: 0,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
constColumnsRefs: []columnHeaderRef{
|
|
|
|
{
|
|
|
|
columnNameID: 0,
|
|
|
|
offset: 8989,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
}, 14)
|
2023-06-20 07:55:12 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
func TestColumnsHeaderMarshalUnmarshal(t *testing.T) {
|
|
|
|
f := func(csh *columnsHeader, marshaledLen int) {
|
|
|
|
t.Helper()
|
2024-05-12 16:33:29 +02:00
|
|
|
|
lib/logstorage: refactor storage format to be more efficient for querying wide events
It has been appeared that VictoriaLogs is frequently used for collecting logs with tens of fields.
For example, standard Kuberntes setup on top of Filebeat generates more than 20 fields per each log.
Such logs are also known as "wide events".
The previous storage format was optimized for logs with a few fields. When at least a single field
was referenced in the query, then the all the meta-information about all the log fields was unpacked
and parsed per each scanned block during the query. This could require a lot of additional disk IO
and CPU time when logs contain many fields. Resolve this issue by providing an (field -> metainfo_offset)
index per each field in every data block. This index allows reading and extracting only the needed
metainfo for fields used in the query. This index is stored in columnsHeaderIndexFilename ( columns_header_index.bin ).
This allows increasing performance for queries over wide events by 10x and more.
Another issue was that the data for bloom filters and field values across all the log fields except of _msg
was intermixed in two files - fieldBloomFilename ( field_bloom.bin ) and fieldValuesFilename ( field_values.bin ).
This could result in huge disk read IO overhead when some small field was referred in the query,
since the Operating System usually reads more data than requested. It reads the data from disk
in at least 4KiB blocks (usually the block size is much bigger in the range 64KiB - 512KiB).
So, if 512-byte bloom filter or values' block is read from the file, then the Operating System
reads up to 512KiB of data from disk, which results in 1000x disk read IO overhead. This overhead isn't visible
for recently accessed data, since this data is usually stored in RAM (aka Operating System page cache),
but this overhead may become very annoying when performing the query over large volumes of data
which isn't present in OS page cache.
The solution for this issue is to split bloom filters and field values across multiple shards.
This reduces the worst-case disk read IO overhead by at least Nx where N is the number of shards,
while the disk read IO overhead is completely removed in best case when the number of columns doesn't exceed N.
Currently the number of shards is 8 - see bloomValuesShardsCount . This solution increases
performance for queries over large volumes of newly ingested data by up to 1000x.
The new storage format is versioned as v1, while the old storage format is version as v0.
It is stored in the partHeader.FormatVersion.
Parts with the old storage format are converted into parts with the new storage format during background merge.
It is possible to force merge by querying /internal/force_merge HTTP endpoint - see https://docs.victoriametrics.com/victorialogs/#forced-merge .
2024-10-16 16:18:28 +02:00
|
|
|
cshIndex := getColumnsHeaderIndex()
|
|
|
|
g := &columnNameIDGenerator{}
|
|
|
|
|
|
|
|
data := csh.marshal(nil, cshIndex, g)
|
2023-06-20 07:55:12 +02:00
|
|
|
if len(data) != marshaledLen {
|
lib/logstorage: refactor storage format to be more efficient for querying wide events
It has been appeared that VictoriaLogs is frequently used for collecting logs with tens of fields.
For example, standard Kuberntes setup on top of Filebeat generates more than 20 fields per each log.
Such logs are also known as "wide events".
The previous storage format was optimized for logs with a few fields. When at least a single field
was referenced in the query, then the all the meta-information about all the log fields was unpacked
and parsed per each scanned block during the query. This could require a lot of additional disk IO
and CPU time when logs contain many fields. Resolve this issue by providing an (field -> metainfo_offset)
index per each field in every data block. This index allows reading and extracting only the needed
metainfo for fields used in the query. This index is stored in columnsHeaderIndexFilename ( columns_header_index.bin ).
This allows increasing performance for queries over wide events by 10x and more.
Another issue was that the data for bloom filters and field values across all the log fields except of _msg
was intermixed in two files - fieldBloomFilename ( field_bloom.bin ) and fieldValuesFilename ( field_values.bin ).
This could result in huge disk read IO overhead when some small field was referred in the query,
since the Operating System usually reads more data than requested. It reads the data from disk
in at least 4KiB blocks (usually the block size is much bigger in the range 64KiB - 512KiB).
So, if 512-byte bloom filter or values' block is read from the file, then the Operating System
reads up to 512KiB of data from disk, which results in 1000x disk read IO overhead. This overhead isn't visible
for recently accessed data, since this data is usually stored in RAM (aka Operating System page cache),
but this overhead may become very annoying when performing the query over large volumes of data
which isn't present in OS page cache.
The solution for this issue is to split bloom filters and field values across multiple shards.
This reduces the worst-case disk read IO overhead by at least Nx where N is the number of shards,
while the disk read IO overhead is completely removed in best case when the number of columns doesn't exceed N.
Currently the number of shards is 8 - see bloomValuesShardsCount . This solution increases
performance for queries over large volumes of newly ingested data by up to 1000x.
The new storage format is versioned as v1, while the old storage format is version as v0.
It is stored in the partHeader.FormatVersion.
Parts with the old storage format are converted into parts with the new storage format during background merge.
It is possible to force merge by querying /internal/force_merge HTTP endpoint - see https://docs.victoriametrics.com/victorialogs/#forced-merge .
2024-10-16 16:18:28 +02:00
|
|
|
t.Fatalf("unexpected length of the marshaled columnsHeader; got %d; want %d", len(data), marshaledLen)
|
2023-06-20 07:55:12 +02:00
|
|
|
}
|
|
|
|
csh2 := &columnsHeader{}
|
lib/logstorage: refactor storage format to be more efficient for querying wide events
It has been appeared that VictoriaLogs is frequently used for collecting logs with tens of fields.
For example, standard Kuberntes setup on top of Filebeat generates more than 20 fields per each log.
Such logs are also known as "wide events".
The previous storage format was optimized for logs with a few fields. When at least a single field
was referenced in the query, then the all the meta-information about all the log fields was unpacked
and parsed per each scanned block during the query. This could require a lot of additional disk IO
and CPU time when logs contain many fields. Resolve this issue by providing an (field -> metainfo_offset)
index per each field in every data block. This index allows reading and extracting only the needed
metainfo for fields used in the query. This index is stored in columnsHeaderIndexFilename ( columns_header_index.bin ).
This allows increasing performance for queries over wide events by 10x and more.
Another issue was that the data for bloom filters and field values across all the log fields except of _msg
was intermixed in two files - fieldBloomFilename ( field_bloom.bin ) and fieldValuesFilename ( field_values.bin ).
This could result in huge disk read IO overhead when some small field was referred in the query,
since the Operating System usually reads more data than requested. It reads the data from disk
in at least 4KiB blocks (usually the block size is much bigger in the range 64KiB - 512KiB).
So, if 512-byte bloom filter or values' block is read from the file, then the Operating System
reads up to 512KiB of data from disk, which results in 1000x disk read IO overhead. This overhead isn't visible
for recently accessed data, since this data is usually stored in RAM (aka Operating System page cache),
but this overhead may become very annoying when performing the query over large volumes of data
which isn't present in OS page cache.
The solution for this issue is to split bloom filters and field values across multiple shards.
This reduces the worst-case disk read IO overhead by at least Nx where N is the number of shards,
while the disk read IO overhead is completely removed in best case when the number of columns doesn't exceed N.
Currently the number of shards is 8 - see bloomValuesShardsCount . This solution increases
performance for queries over large volumes of newly ingested data by up to 1000x.
The new storage format is versioned as v1, while the old storage format is version as v0.
It is stored in the partHeader.FormatVersion.
Parts with the old storage format are converted into parts with the new storage format during background merge.
It is possible to force merge by querying /internal/force_merge HTTP endpoint - see https://docs.victoriametrics.com/victorialogs/#forced-merge .
2024-10-16 16:18:28 +02:00
|
|
|
if err := csh2.unmarshalNoArena(data, partFormatLatestVersion); err != nil {
|
2023-06-20 07:55:12 +02:00
|
|
|
t.Fatalf("unexpected error in unmarshal: %s", err)
|
|
|
|
}
|
lib/logstorage: refactor storage format to be more efficient for querying wide events
It has been appeared that VictoriaLogs is frequently used for collecting logs with tens of fields.
For example, standard Kuberntes setup on top of Filebeat generates more than 20 fields per each log.
Such logs are also known as "wide events".
The previous storage format was optimized for logs with a few fields. When at least a single field
was referenced in the query, then the all the meta-information about all the log fields was unpacked
and parsed per each scanned block during the query. This could require a lot of additional disk IO
and CPU time when logs contain many fields. Resolve this issue by providing an (field -> metainfo_offset)
index per each field in every data block. This index allows reading and extracting only the needed
metainfo for fields used in the query. This index is stored in columnsHeaderIndexFilename ( columns_header_index.bin ).
This allows increasing performance for queries over wide events by 10x and more.
Another issue was that the data for bloom filters and field values across all the log fields except of _msg
was intermixed in two files - fieldBloomFilename ( field_bloom.bin ) and fieldValuesFilename ( field_values.bin ).
This could result in huge disk read IO overhead when some small field was referred in the query,
since the Operating System usually reads more data than requested. It reads the data from disk
in at least 4KiB blocks (usually the block size is much bigger in the range 64KiB - 512KiB).
So, if 512-byte bloom filter or values' block is read from the file, then the Operating System
reads up to 512KiB of data from disk, which results in 1000x disk read IO overhead. This overhead isn't visible
for recently accessed data, since this data is usually stored in RAM (aka Operating System page cache),
but this overhead may become very annoying when performing the query over large volumes of data
which isn't present in OS page cache.
The solution for this issue is to split bloom filters and field values across multiple shards.
This reduces the worst-case disk read IO overhead by at least Nx where N is the number of shards,
while the disk read IO overhead is completely removed in best case when the number of columns doesn't exceed N.
Currently the number of shards is 8 - see bloomValuesShardsCount . This solution increases
performance for queries over large volumes of newly ingested data by up to 1000x.
The new storage format is versioned as v1, while the old storage format is version as v0.
It is stored in the partHeader.FormatVersion.
Parts with the old storage format are converted into parts with the new storage format during background merge.
It is possible to force merge by querying /internal/force_merge HTTP endpoint - see https://docs.victoriametrics.com/victorialogs/#forced-merge .
2024-10-16 16:18:28 +02:00
|
|
|
if err := csh2.setColumnNames(cshIndex, g.columnNames); err != nil {
|
|
|
|
t.Fatalf("cannot set column names: %s", err)
|
|
|
|
}
|
|
|
|
|
2023-06-20 07:55:12 +02:00
|
|
|
if !reflect.DeepEqual(csh, csh2) {
|
|
|
|
t.Fatalf("unexpected blockHeader unmarshaled\ngot\n%v\nwant\n%v", csh2, csh)
|
|
|
|
}
|
|
|
|
}
|
2024-05-12 16:33:29 +02:00
|
|
|
|
2023-06-20 07:55:12 +02:00
|
|
|
f(&columnsHeader{}, 2)
|
|
|
|
f(&columnsHeader{
|
|
|
|
columnHeaders: []columnHeader{
|
|
|
|
{
|
|
|
|
name: "foobar",
|
|
|
|
valueType: valueTypeString,
|
|
|
|
valuesOffset: 12345,
|
|
|
|
valuesSize: 23434,
|
|
|
|
bloomFilterOffset: 89843,
|
|
|
|
bloomFilterSize: 8934,
|
|
|
|
},
|
|
|
|
{
|
|
|
|
name: "message",
|
|
|
|
valueType: valueTypeUint16,
|
|
|
|
minValue: 123,
|
|
|
|
maxValue: 456,
|
|
|
|
valuesOffset: 3412345,
|
|
|
|
valuesSize: 234434,
|
|
|
|
bloomFilterOffset: 83,
|
|
|
|
bloomFilterSize: 34,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
constColumns: []Field{
|
|
|
|
{
|
|
|
|
Name: "foo",
|
|
|
|
Value: "bar",
|
|
|
|
},
|
|
|
|
},
|
lib/logstorage: refactor storage format to be more efficient for querying wide events
It has been appeared that VictoriaLogs is frequently used for collecting logs with tens of fields.
For example, standard Kuberntes setup on top of Filebeat generates more than 20 fields per each log.
Such logs are also known as "wide events".
The previous storage format was optimized for logs with a few fields. When at least a single field
was referenced in the query, then the all the meta-information about all the log fields was unpacked
and parsed per each scanned block during the query. This could require a lot of additional disk IO
and CPU time when logs contain many fields. Resolve this issue by providing an (field -> metainfo_offset)
index per each field in every data block. This index allows reading and extracting only the needed
metainfo for fields used in the query. This index is stored in columnsHeaderIndexFilename ( columns_header_index.bin ).
This allows increasing performance for queries over wide events by 10x and more.
Another issue was that the data for bloom filters and field values across all the log fields except of _msg
was intermixed in two files - fieldBloomFilename ( field_bloom.bin ) and fieldValuesFilename ( field_values.bin ).
This could result in huge disk read IO overhead when some small field was referred in the query,
since the Operating System usually reads more data than requested. It reads the data from disk
in at least 4KiB blocks (usually the block size is much bigger in the range 64KiB - 512KiB).
So, if 512-byte bloom filter or values' block is read from the file, then the Operating System
reads up to 512KiB of data from disk, which results in 1000x disk read IO overhead. This overhead isn't visible
for recently accessed data, since this data is usually stored in RAM (aka Operating System page cache),
but this overhead may become very annoying when performing the query over large volumes of data
which isn't present in OS page cache.
The solution for this issue is to split bloom filters and field values across multiple shards.
This reduces the worst-case disk read IO overhead by at least Nx where N is the number of shards,
while the disk read IO overhead is completely removed in best case when the number of columns doesn't exceed N.
Currently the number of shards is 8 - see bloomValuesShardsCount . This solution increases
performance for queries over large volumes of newly ingested data by up to 1000x.
The new storage format is versioned as v1, while the old storage format is version as v0.
It is stored in the partHeader.FormatVersion.
Parts with the old storage format are converted into parts with the new storage format during background merge.
It is possible to force merge by querying /internal/force_merge HTTP endpoint - see https://docs.victoriametrics.com/victorialogs/#forced-merge .
2024-10-16 16:18:28 +02:00
|
|
|
}, 31)
|
2023-06-20 07:55:12 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
func TestBlockHeaderUnmarshalFailure(t *testing.T) {
|
|
|
|
f := func(data []byte) {
|
|
|
|
t.Helper()
|
|
|
|
dataOrig := append([]byte{}, data...)
|
|
|
|
bh := getBlockHeader()
|
|
|
|
defer putBlockHeader(bh)
|
lib/logstorage: refactor storage format to be more efficient for querying wide events
It has become apparent that VictoriaLogs is frequently used for collecting logs with tens of fields.
For example, a standard Kubernetes setup on top of Filebeat generates more than 20 fields per log.
Such logs are also known as "wide events".
The previous storage format was optimized for logs with a few fields. When at least a single field
was referenced in the query, then all the meta-information about all the log fields was unpacked
and parsed per each scanned block during the query. This could require a lot of additional disk IO
and CPU time when logs contain many fields. Resolve this issue by providing a (field -> metainfo_offset)
index per each field in every data block. This index allows reading and extracting only the needed
metainfo for fields used in the query. This index is stored in columnsHeaderIndexFilename ( columns_header_index.bin ).
This allows increasing performance for queries over wide events by 10x and more.
Another issue was that the data for bloom filters and field values across all the log fields except _msg
was intermixed in two files - fieldBloomFilename ( field_bloom.bin ) and fieldValuesFilename ( field_values.bin ).
This could result in huge disk read IO overhead when some small field was referred in the query,
since the Operating System usually reads more data than requested. It reads the data from disk
in at least 4KiB blocks (usually the block size is much bigger in the range 64KiB - 512KiB).
So, if 512-byte bloom filter or values' block is read from the file, then the Operating System
reads up to 512KiB of data from disk, which results in 1000x disk read IO overhead. This overhead isn't visible
for recently accessed data, since this data is usually stored in RAM (aka Operating System page cache),
but this overhead may become very annoying when performing the query over large volumes of data
which isn't present in OS page cache.
The solution for this issue is to split bloom filters and field values across multiple shards.
This reduces the worst-case disk read IO overhead by at least Nx where N is the number of shards,
while the disk read IO overhead is completely removed in best case when the number of columns doesn't exceed N.
Currently the number of shards is 8 - see bloomValuesShardsCount . This solution increases
performance for queries over large volumes of newly ingested data by up to 1000x.
The new storage format is versioned as v1, while the old storage format is versioned as v0.
It is stored in the partHeader.FormatVersion.
Parts with the old storage format are converted into parts with the new storage format during background merge.
It is possible to force merge by querying /internal/force_merge HTTP endpoint - see https://docs.victoriametrics.com/victorialogs/#forced-merge .
2024-10-16 16:18:28 +02:00
|
|
|
tail, err := bh.unmarshal(data, partFormatLatestVersion)
|
2023-06-20 07:55:12 +02:00
|
|
|
if err == nil {
|
|
|
|
t.Fatalf("expecting non-nil error")
|
|
|
|
}
|
|
|
|
if string(tail) != string(dataOrig) {
|
|
|
|
t.Fatalf("unexpected tail;\ngot\n%q\nwant\n%q", tail, dataOrig)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
f(nil)
|
|
|
|
f([]byte("foo"))
|
|
|
|
|
|
|
|
bh := blockHeader{
|
|
|
|
streamID: streamID{
|
|
|
|
tenantID: TenantID{
|
|
|
|
AccountID: 123,
|
|
|
|
ProjectID: 456,
|
|
|
|
},
|
|
|
|
id: u128{
|
|
|
|
lo: 3443,
|
|
|
|
hi: 23434,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
uncompressedSizeBytes: 4344,
|
|
|
|
rowsCount: 1234,
|
|
|
|
timestampsHeader: timestampsHeader{
|
|
|
|
blockOffset: 13234,
|
|
|
|
blockSize: 8843,
|
|
|
|
minTimestamp: -4334,
|
|
|
|
maxTimestamp: 23434,
|
|
|
|
marshalType: encoding.MarshalTypeNearestDelta2,
|
|
|
|
},
|
lib/logstorage: refactor storage format to be more efficient for querying wide events
It has become apparent that VictoriaLogs is frequently used for collecting logs with tens of fields.
For example, a standard Kubernetes setup on top of Filebeat generates more than 20 fields per log.
Such logs are also known as "wide events".
The previous storage format was optimized for logs with a few fields. When at least a single field
was referenced in the query, then all the meta-information about all the log fields was unpacked
and parsed per each scanned block during the query. This could require a lot of additional disk IO
and CPU time when logs contain many fields. Resolve this issue by providing an (field -> metainfo_offset)
index per each field in every data block. This index allows reading and extracting only the needed
metainfo for fields used in the query. This index is stored in columnsHeaderIndexFilename ( columns_header_index.bin ).
This allows increasing performance for queries over wide events by 10x and more.
Another issue was that the data for bloom filters and field values across all the log fields except of _msg
was intermixed in two files - fieldBloomFilename ( field_bloom.bin ) and fieldValuesFilename ( field_values.bin ).
This could result in huge disk read IO overhead when some small field was referred in the query,
since the Operating System usually reads more data than requested. It reads the data from disk
in at least 4KiB blocks (usually the block size is much bigger in the range 64KiB - 512KiB).
So, if 512-byte bloom filter or values' block is read from the file, then the Operating System
reads up to 512KiB of data from disk, which results in 1000x disk read IO overhead. This overhead isn't visible
for recently accessed data, since this data is usually stored in RAM (aka Operating System page cache),
but this overhead may become very annoying when performing the query over large volumes of data
which isn't present in OS page cache.
The solution for this issue is to split bloom filters and field values across multiple shards.
This reduces the worst-case disk read IO overhead by at least Nx where N is the number of shards,
while the disk read IO overhead is completely removed in best case when the number of columns doesn't exceed N.
Currently the number of shards is 8 - see bloomValuesShardsCount . This solution increases
performance for queries over large volumes of newly ingested data by up to 1000x.
The new storage format is versioned as v1, while the old storage format is versioned as v0.
It is stored in the partHeader.FormatVersion.
Parts with the old storage format are converted into parts with the new storage format during background merge.
It is possible to force merge by querying /internal/force_merge HTTP endpoint - see https://docs.victoriametrics.com/victorialogs/#forced-merge .
2024-10-16 16:18:28 +02:00
|
|
|
columnsHeaderIndexOffset: 89434,
|
|
|
|
columnsHeaderIndexSize: 89123,
|
|
|
|
columnsHeaderOffset: 4384,
|
|
|
|
columnsHeaderSize: 894,
|
2023-06-20 07:55:12 +02:00
|
|
|
}
|
|
|
|
data := bh.marshal(nil)
|
|
|
|
for len(data) > 0 {
|
|
|
|
data = data[:len(data)-1]
|
|
|
|
f(data)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
lib/logstorage: refactor storage format to be more efficient for querying wide events
It turned out that VictoriaLogs is frequently used for collecting logs with tens of fields.
For example, a standard Kubernetes setup on top of Filebeat generates more than 20 fields per log.
Such logs are also known as "wide events".
The previous storage format was optimized for logs with a few fields. When at least a single field
was referenced in the query, then all the meta-information about all the log fields was unpacked
and parsed per each scanned block during the query. This could require a lot of additional disk IO
and CPU time when logs contain many fields. Resolve this issue by providing an (field -> metainfo_offset)
index per each field in every data block. This index allows reading and extracting only the needed
metainfo for fields used in the query. This index is stored in columnsHeaderIndexFilename ( columns_header_index.bin ).
This allows increasing performance for queries over wide events by 10x and more.
Another issue was that the data for bloom filters and field values across all the log fields except of _msg
was intermixed in two files - fieldBloomFilename ( field_bloom.bin ) and fieldValuesFilename ( field_values.bin ).
This could result in huge disk read IO overhead when some small field was referred in the query,
since the Operating System usually reads more data than requested. It reads the data from disk
in at least 4KiB blocks (usually the block size is much bigger in the range 64KiB - 512KiB).
So, if 512-byte bloom filter or values' block is read from the file, then the Operating System
reads up to 512KiB of data from disk, which results in 1000x disk read IO overhead. This overhead isn't visible
for recently accessed data, since this data is usually stored in RAM (aka Operating System page cache),
but this overhead may become very annoying when performing the query over large volumes of data
which isn't present in OS page cache.
The solution for this issue is to split bloom filters and field values across multiple shards.
This reduces the worst-case disk read IO overhead by at least Nx where N is the number of shards,
while the disk read IO overhead is completely removed in best case when the number of columns doesn't exceed N.
Currently the number of shards is 8 - see bloomValuesShardsCount . This solution increases
performance for queries over large volumes of newly ingested data by up to 1000x.
The new storage format is versioned as v1, while the old storage format is versioned as v0.
It is stored in the partHeader.FormatVersion.
Parts with the old storage format are converted into parts with the new storage format during background merge.
It is possible to force merge by querying /internal/force_merge HTTP endpoint - see https://docs.victoriametrics.com/victorialogs/#forced-merge .
2024-10-16 16:18:28 +02:00
|
|
|
func TestColumnsHeaderIndexUnmarshalFailure(t *testing.T) {
|
|
|
|
f := func(data []byte) {
|
|
|
|
t.Helper()
|
|
|
|
|
|
|
|
cshIndex := getColumnsHeaderIndex()
|
|
|
|
defer putColumnsHeaderIndex(cshIndex)
|
|
|
|
if err := cshIndex.unmarshalNoArena(data); err == nil {
|
|
|
|
t.Fatalf("expecting non-nil error")
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
f(nil)
|
|
|
|
f([]byte("foo"))
|
|
|
|
|
|
|
|
cshIndex := &columnsHeaderIndex{
|
|
|
|
columnHeadersRefs: []columnHeaderRef{
|
|
|
|
{
|
|
|
|
columnNameID: 0,
|
|
|
|
offset: 123,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
constColumnsRefs: []columnHeaderRef{
|
|
|
|
{
|
|
|
|
columnNameID: 2,
|
|
|
|
offset: 89834,
|
|
|
|
},
|
|
|
|
{
|
|
|
|
columnNameID: 234,
|
|
|
|
offset: 8934,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
}
|
|
|
|
data := cshIndex.marshal(nil)
|
|
|
|
for len(data) > 0 {
|
|
|
|
data = data[:len(data)-1]
|
|
|
|
f(data)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2023-06-20 07:55:12 +02:00
|
|
|
func TestColumnsHeaderUnmarshalFailure(t *testing.T) {
|
|
|
|
f := func(data []byte) {
|
|
|
|
t.Helper()
|
2024-05-12 16:33:29 +02:00
|
|
|
|
2023-06-20 07:55:12 +02:00
|
|
|
csh := getColumnsHeader()
|
|
|
|
defer putColumnsHeader(csh)
|
lib/logstorage: refactor storage format to be more efficient for querying wide events
It has been appeared that VictoriaLogs is frequently used for collecting logs with tens of fields.
For example, standard Kuberntes setup on top of Filebeat generates more than 20 fields per each log.
Such logs are also known as "wide events".
The previous storage format was optimized for logs with a few fields. When at least a single field
was referenced in the query, then the all the meta-information about all the log fields was unpacked
and parsed per each scanned block during the query. This could require a lot of additional disk IO
and CPU time when logs contain many fields. Resolve this issue by providing an (field -> metainfo_offset)
index per each field in every data block. This index allows reading and extracting only the needed
metainfo for fields used in the query. This index is stored in columnsHeaderIndexFilename ( columns_header_index.bin ).
This allows increasing performance for queries over wide events by 10x and more.
Another issue was that the data for bloom filters and field values across all the log fields except of _msg
was intermixed in two files - fieldBloomFilename ( field_bloom.bin ) and fieldValuesFilename ( field_values.bin ).
This could result in huge disk read IO overhead when some small field was referred in the query,
since the Operating System usually reads more data than requested. It reads the data from disk
in at least 4KiB blocks (usually the block size is much bigger in the range 64KiB - 512KiB).
So, if 512-byte bloom filter or values' block is read from the file, then the Operating System
reads up to 512KiB of data from disk, which results in 1000x disk read IO overhead. This overhead isn't visible
for recently accessed data, since this data is usually stored in RAM (aka Operating System page cache),
but this overhead may become very annoying when performing the query over large volumes of data
which isn't present in OS page cache.
The solution for this issue is to split bloom filters and field values across multiple shards.
This reduces the worst-case disk read IO overhead by at least Nx where N is the number of shards,
while the disk read IO overhead is completely removed in best case when the number of columns doesn't exceed N.
Currently the number of shards is 8 - see bloomValuesShardsCount . This solution increases
performance for queries over large volumes of newly ingested data by up to 1000x.
The new storage format is versioned as v1, while the old storage format is version as v0.
It is stored in the partHeader.FormatVersion.
Parts with the old storage format are converted into parts with the new storage format during background merge.
It is possible to force merge by querying /internal/force_merge HTTP endpoint - see https://docs.victoriametrics.com/victorialogs/#forced-merge .
2024-10-16 16:18:28 +02:00
|
|
|
if err := csh.unmarshalNoArena(data, partFormatLatestVersion); err == nil {
|
2023-06-20 07:55:12 +02:00
|
|
|
t.Fatalf("expecting non-nil error")
|
|
|
|
}
|
|
|
|
}
|
2024-05-12 16:33:29 +02:00
|
|
|
|
2023-06-20 07:55:12 +02:00
|
|
|
f(nil)
|
|
|
|
f([]byte("foo"))
|
|
|
|
|
lib/logstorage: refactor storage format to be more efficient for querying wide events
It has been appeared that VictoriaLogs is frequently used for collecting logs with tens of fields.
For example, standard Kuberntes setup on top of Filebeat generates more than 20 fields per each log.
Such logs are also known as "wide events".
The previous storage format was optimized for logs with a few fields. When at least a single field
was referenced in the query, then the all the meta-information about all the log fields was unpacked
and parsed per each scanned block during the query. This could require a lot of additional disk IO
and CPU time when logs contain many fields. Resolve this issue by providing an (field -> metainfo_offset)
index per each field in every data block. This index allows reading and extracting only the needed
metainfo for fields used in the query. This index is stored in columnsHeaderIndexFilename ( columns_header_index.bin ).
This allows increasing performance for queries over wide events by 10x and more.
Another issue was that the data for bloom filters and field values across all the log fields except of _msg
was intermixed in two files - fieldBloomFilename ( field_bloom.bin ) and fieldValuesFilename ( field_values.bin ).
This could result in huge disk read IO overhead when some small field was referred in the query,
since the Operating System usually reads more data than requested. It reads the data from disk
in at least 4KiB blocks (usually the block size is much bigger in the range 64KiB - 512KiB).
So, if 512-byte bloom filter or values' block is read from the file, then the Operating System
reads up to 512KiB of data from disk, which results in 1000x disk read IO overhead. This overhead isn't visible
for recently accessed data, since this data is usually stored in RAM (aka Operating System page cache),
but this overhead may become very annoying when performing the query over large volumes of data
which isn't present in OS page cache.
The solution for this issue is to split bloom filters and field values across multiple shards.
This reduces the worst-case disk read IO overhead by at least Nx where N is the number of shards,
while the disk read IO overhead is completely removed in best case when the number of columns doesn't exceed N.
Currently the number of shards is 8 - see bloomValuesShardsCount . This solution increases
performance for queries over large volumes of newly ingested data by up to 1000x.
The new storage format is versioned as v1, while the old storage format is version as v0.
It is stored in the partHeader.FormatVersion.
Parts with the old storage format are converted into parts with the new storage format during background merge.
It is possible to force merge by querying /internal/force_merge HTTP endpoint - see https://docs.victoriametrics.com/victorialogs/#forced-merge .
2024-10-16 16:18:28 +02:00
|
|
|
csh := &columnsHeader{
|
2023-06-20 07:55:12 +02:00
|
|
|
columnHeaders: []columnHeader{
|
|
|
|
{
|
|
|
|
name: "foobar",
|
|
|
|
valueType: valueTypeString,
|
|
|
|
valuesOffset: 12345,
|
|
|
|
valuesSize: 23434,
|
|
|
|
bloomFilterOffset: 89843,
|
|
|
|
bloomFilterSize: 8934,
|
|
|
|
},
|
|
|
|
{
|
|
|
|
name: "message",
|
|
|
|
valueType: valueTypeUint16,
|
|
|
|
minValue: 123,
|
|
|
|
maxValue: 456,
|
|
|
|
valuesOffset: 3412345,
|
|
|
|
valuesSize: 234434,
|
|
|
|
bloomFilterOffset: 83,
|
|
|
|
bloomFilterSize: 34,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
constColumns: []Field{
|
|
|
|
{
|
|
|
|
Name: "foo",
|
|
|
|
Value: "bar",
|
|
|
|
},
|
|
|
|
},
|
|
|
|
}
|
lib/logstorage: refactor storage format to be more efficient for querying wide events
It has been appeared that VictoriaLogs is frequently used for collecting logs with tens of fields.
For example, standard Kuberntes setup on top of Filebeat generates more than 20 fields per each log.
Such logs are also known as "wide events".
The previous storage format was optimized for logs with a few fields. When at least a single field
was referenced in the query, then the all the meta-information about all the log fields was unpacked
and parsed per each scanned block during the query. This could require a lot of additional disk IO
and CPU time when logs contain many fields. Resolve this issue by providing an (field -> metainfo_offset)
index per each field in every data block. This index allows reading and extracting only the needed
metainfo for fields used in the query. This index is stored in columnsHeaderIndexFilename ( columns_header_index.bin ).
This allows increasing performance for queries over wide events by 10x and more.
Another issue was that the data for bloom filters and field values across all the log fields except of _msg
was intermixed in two files - fieldBloomFilename ( field_bloom.bin ) and fieldValuesFilename ( field_values.bin ).
This could result in huge disk read IO overhead when some small field was referred in the query,
since the Operating System usually reads more data than requested. It reads the data from disk
in at least 4KiB blocks (usually the block size is much bigger in the range 64KiB - 512KiB).
So, if 512-byte bloom filter or values' block is read from the file, then the Operating System
reads up to 512KiB of data from disk, which results in 1000x disk read IO overhead. This overhead isn't visible
for recently accessed data, since this data is usually stored in RAM (aka Operating System page cache),
but this overhead may become very annoying when performing the query over large volumes of data
which isn't present in OS page cache.
The solution for this issue is to split bloom filters and field values across multiple shards.
This reduces the worst-case disk read IO overhead by at least Nx where N is the number of shards,
while the disk read IO overhead is completely removed in best case when the number of columns doesn't exceed N.
Currently the number of shards is 8 - see bloomValuesShardsCount . This solution increases
performance for queries over large volumes of newly ingested data by up to 1000x.
The new storage format is versioned as v1, while the old storage format is version as v0.
It is stored in the partHeader.FormatVersion.
Parts with the old storage format are converted into parts with the new storage format during background merge.
It is possible to force merge by querying /internal/force_merge HTTP endpoint - see https://docs.victoriametrics.com/victorialogs/#forced-merge .
2024-10-16 16:18:28 +02:00
|
|
|
cshIndex := getColumnsHeaderIndex()
|
|
|
|
g := &columnNameIDGenerator{}
|
|
|
|
data := csh.marshal(nil, cshIndex, g)
|
2023-06-20 07:55:12 +02:00
|
|
|
for len(data) > 0 {
|
|
|
|
data = data[:len(data)-1]
|
|
|
|
f(data)
|
|
|
|
}
|
lib/logstorage: refactor storage format to be more efficient for querying wide events
It has been appeared that VictoriaLogs is frequently used for collecting logs with tens of fields.
For example, standard Kuberntes setup on top of Filebeat generates more than 20 fields per each log.
Such logs are also known as "wide events".
The previous storage format was optimized for logs with a few fields. When at least a single field
was referenced in the query, then the all the meta-information about all the log fields was unpacked
and parsed per each scanned block during the query. This could require a lot of additional disk IO
and CPU time when logs contain many fields. Resolve this issue by providing an (field -> metainfo_offset)
index per each field in every data block. This index allows reading and extracting only the needed
metainfo for fields used in the query. This index is stored in columnsHeaderIndexFilename ( columns_header_index.bin ).
This allows increasing performance for queries over wide events by 10x and more.
Another issue was that the data for bloom filters and field values across all the log fields except of _msg
was intermixed in two files - fieldBloomFilename ( field_bloom.bin ) and fieldValuesFilename ( field_values.bin ).
This could result in huge disk read IO overhead when some small field was referred in the query,
since the Operating System usually reads more data than requested. It reads the data from disk
in at least 4KiB blocks (usually the block size is much bigger in the range 64KiB - 512KiB).
So, if 512-byte bloom filter or values' block is read from the file, then the Operating System
reads up to 512KiB of data from disk, which results in 1000x disk read IO overhead. This overhead isn't visible
for recently accessed data, since this data is usually stored in RAM (aka Operating System page cache),
but this overhead may become very annoying when performing the query over large volumes of data
which isn't present in OS page cache.
The solution for this issue is to split bloom filters and field values across multiple shards.
This reduces the worst-case disk read IO overhead by at least Nx where N is the number of shards,
while the disk read IO overhead is completely removed in best case when the number of columns doesn't exceed N.
Currently the number of shards is 8 - see bloomValuesShardsCount . This solution increases
performance for queries over large volumes of newly ingested data by up to 1000x.
The new storage format is versioned as v1, while the old storage format is version as v0.
It is stored in the partHeader.FormatVersion.
Parts with the old storage format are converted into parts with the new storage format during background merge.
It is possible to force merge by querying /internal/force_merge HTTP endpoint - see https://docs.victoriametrics.com/victorialogs/#forced-merge .
2024-10-16 16:18:28 +02:00
|
|
|
putColumnsHeaderIndex(cshIndex)
|
2023-06-20 07:55:12 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
func TestBlockHeaderReset(t *testing.T) {
|
|
|
|
bh := &blockHeader{
|
|
|
|
streamID: streamID{
|
|
|
|
tenantID: TenantID{
|
|
|
|
AccountID: 123,
|
|
|
|
ProjectID: 456,
|
|
|
|
},
|
|
|
|
id: u128{
|
|
|
|
lo: 3443,
|
|
|
|
hi: 23434,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
uncompressedSizeBytes: 8984,
|
|
|
|
rowsCount: 1234,
|
|
|
|
timestampsHeader: timestampsHeader{
|
|
|
|
blockOffset: 13234,
|
|
|
|
blockSize: 8843,
|
|
|
|
minTimestamp: -4334,
|
|
|
|
maxTimestamp: 23434,
|
|
|
|
marshalType: encoding.MarshalTypeNearestDelta2,
|
|
|
|
},
|
lib/logstorage: refactor storage format to be more efficient for querying wide events
It has been appeared that VictoriaLogs is frequently used for collecting logs with tens of fields.
For example, standard Kuberntes setup on top of Filebeat generates more than 20 fields per each log.
Such logs are also known as "wide events".
The previous storage format was optimized for logs with a few fields. When at least a single field
was referenced in the query, then the all the meta-information about all the log fields was unpacked
and parsed per each scanned block during the query. This could require a lot of additional disk IO
and CPU time when logs contain many fields. Resolve this issue by providing an (field -> metainfo_offset)
index per each field in every data block. This index allows reading and extracting only the needed
metainfo for fields used in the query. This index is stored in columnsHeaderIndexFilename ( columns_header_index.bin ).
This allows increasing performance for queries over wide events by 10x and more.
Another issue was that the data for bloom filters and field values across all the log fields except of _msg
was intermixed in two files - fieldBloomFilename ( field_bloom.bin ) and fieldValuesFilename ( field_values.bin ).
This could result in huge disk read IO overhead when some small field was referred in the query,
since the Operating System usually reads more data than requested. It reads the data from disk
in at least 4KiB blocks (usually the block size is much bigger in the range 64KiB - 512KiB).
So, if 512-byte bloom filter or values' block is read from the file, then the Operating System
reads up to 512KiB of data from disk, which results in 1000x disk read IO overhead. This overhead isn't visible
for recently accessed data, since this data is usually stored in RAM (aka Operating System page cache),
but this overhead may become very annoying when performing the query over large volumes of data
which isn't present in OS page cache.
The solution for this issue is to split bloom filters and field values across multiple shards.
This reduces the worst-case disk read IO overhead by at least Nx where N is the number of shards,
while the disk read IO overhead is completely removed in best case when the number of columns doesn't exceed N.
Currently the number of shards is 8 - see bloomValuesShardsCount . This solution increases
performance for queries over large volumes of newly ingested data by up to 1000x.
The new storage format is versioned as v1, while the old storage format is version as v0.
It is stored in the partHeader.FormatVersion.
Parts with the old storage format are converted into parts with the new storage format during background merge.
It is possible to force merge by querying /internal/force_merge HTTP endpoint - see https://docs.victoriametrics.com/victorialogs/#forced-merge .
2024-10-16 16:18:28 +02:00
|
|
|
columnsHeaderIndexOffset: 18934,
|
|
|
|
columnsHeaderIndexSize: 8912,
|
|
|
|
columnsHeaderOffset: 12332,
|
|
|
|
columnsHeaderSize: 234,
|
2023-06-20 07:55:12 +02:00
|
|
|
}
|
|
|
|
bh.reset()
|
|
|
|
bhZero := &blockHeader{}
|
|
|
|
if !reflect.DeepEqual(bh, bhZero) {
|
|
|
|
t.Fatalf("unexpected non-zero blockHeader after reset: %v", bh)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
lib/logstorage: refactor storage format to be more efficient for querying wide events
It turned out that VictoriaLogs is frequently used for collecting logs with tens of fields.
For example, a standard Kubernetes setup on top of Filebeat generates more than 20 fields per log.
Such logs are also known as "wide events".
The previous storage format was optimized for logs with a few fields. When at least a single field
was referenced in the query, then all the meta-information about all the log fields was unpacked
and parsed per each scanned block during the query. This could require a lot of additional disk IO
and CPU time when logs contain many fields. Resolve this issue by providing an (field -> metainfo_offset)
index per each field in every data block. This index allows reading and extracting only the needed
metainfo for fields used in the query. This index is stored in columnsHeaderIndexFilename ( columns_header_index.bin ).
This allows increasing performance for queries over wide events by 10x and more.
Another issue was that the data for bloom filters and field values across all the log fields except of _msg
was intermixed in two files - fieldBloomFilename ( field_bloom.bin ) and fieldValuesFilename ( field_values.bin ).
This could result in huge disk read IO overhead when some small field was referred in the query,
since the Operating System usually reads more data than requested. It reads the data from disk
in at least 4KiB blocks (usually the block size is much bigger in the range 64KiB - 512KiB).
So, if 512-byte bloom filter or values' block is read from the file, then the Operating System
reads up to 512KiB of data from disk, which results in 1000x disk read IO overhead. This overhead isn't visible
for recently accessed data, since this data is usually stored in RAM (aka Operating System page cache),
but this overhead may become very annoying when performing the query over large volumes of data
which isn't present in OS page cache.
The solution for this issue is to split bloom filters and field values across multiple shards.
This reduces the worst-case disk read IO overhead by at least Nx where N is the number of shards,
while the disk read IO overhead is completely removed in best case when the number of columns doesn't exceed N.
Currently the number of shards is 8 - see bloomValuesShardsCount . This solution increases
performance for queries over large volumes of newly ingested data by up to 1000x.
The new storage format is versioned as v1, while the old storage format is versioned as v0.
It is stored in the partHeader.FormatVersion.
Parts with the old storage format are converted into parts with the new storage format during background merge.
It is possible to force merge by querying /internal/force_merge HTTP endpoint - see https://docs.victoriametrics.com/victorialogs/#forced-merge .
2024-10-16 16:18:28 +02:00
|
|
|
func TestColumnsHeaderIndexReset(t *testing.T) {
|
|
|
|
cshIndex := &columnsHeaderIndex{
|
|
|
|
columnHeadersRefs: []columnHeaderRef{
|
|
|
|
{
|
|
|
|
columnNameID: 234,
|
|
|
|
offset: 1234,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
constColumnsRefs: []columnHeaderRef{
|
|
|
|
{
|
|
|
|
columnNameID: 328,
|
|
|
|
offset: 21344,
|
|
|
|
},
|
|
|
|
{
|
|
|
|
columnNameID: 1,
|
|
|
|
offset: 234,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
}
|
|
|
|
cshIndex.reset()
|
|
|
|
cshIndexZero := &columnsHeaderIndex{
|
|
|
|
columnHeadersRefs: []columnHeaderRef{},
|
|
|
|
constColumnsRefs: []columnHeaderRef{},
|
|
|
|
}
|
|
|
|
if !reflect.DeepEqual(cshIndex, cshIndexZero) {
|
|
|
|
t.Fatalf("unexpected non-zero columnsHeaderIndex after reset: %v", cshIndex)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2023-06-20 07:55:12 +02:00
|
|
|
func TestColumnsHeaderReset(t *testing.T) {
|
|
|
|
csh := &columnsHeader{
|
|
|
|
columnHeaders: []columnHeader{
|
|
|
|
{
|
|
|
|
name: "foobar",
|
|
|
|
valueType: valueTypeString,
|
|
|
|
valuesOffset: 12345,
|
|
|
|
valuesSize: 23434,
|
|
|
|
bloomFilterOffset: 89843,
|
|
|
|
bloomFilterSize: 8934,
|
|
|
|
},
|
|
|
|
{
|
|
|
|
name: "message",
|
|
|
|
valueType: valueTypeUint16,
|
|
|
|
minValue: 123,
|
|
|
|
maxValue: 456,
|
|
|
|
valuesOffset: 3412345,
|
|
|
|
valuesSize: 234434,
|
|
|
|
bloomFilterOffset: 83,
|
|
|
|
bloomFilterSize: 34,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
constColumns: []Field{
|
|
|
|
{
|
|
|
|
Name: "foo",
|
|
|
|
Value: "bar",
|
|
|
|
},
|
|
|
|
},
|
|
|
|
}
|
|
|
|
csh.reset()
|
|
|
|
cshZero := &columnsHeader{
|
|
|
|
columnHeaders: []columnHeader{},
|
|
|
|
constColumns: []Field{},
|
|
|
|
}
|
|
|
|
if !reflect.DeepEqual(csh, cshZero) {
|
|
|
|
t.Fatalf("unexpected non-zero columnsHeader after reset: %v", csh)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
func TestMarshalUnmarshalBlockHeaders(t *testing.T) {
|
|
|
|
f := func(bhs []blockHeader, marshaledLen int) {
|
|
|
|
t.Helper()
|
|
|
|
var data []byte
|
|
|
|
for i := range bhs {
|
|
|
|
data = bhs[i].marshal(data)
|
|
|
|
}
|
|
|
|
if len(data) != marshaledLen {
|
|
|
|
t.Fatalf("unexpected length for marshaled blockHeader entries; got %d; want %d", len(data), marshaledLen)
|
|
|
|
}
|
lib/logstorage: refactor storage format to be more efficient for querying wide events
It has been appeared that VictoriaLogs is frequently used for collecting logs with tens of fields.
For example, standard Kuberntes setup on top of Filebeat generates more than 20 fields per each log.
Such logs are also known as "wide events".
The previous storage format was optimized for logs with a few fields. When at least a single field
was referenced in the query, then the all the meta-information about all the log fields was unpacked
and parsed per each scanned block during the query. This could require a lot of additional disk IO
and CPU time when logs contain many fields. Resolve this issue by providing an (field -> metainfo_offset)
index per each field in every data block. This index allows reading and extracting only the needed
metainfo for fields used in the query. This index is stored in columnsHeaderIndexFilename ( columns_header_index.bin ).
This allows increasing performance for queries over wide events by 10x and more.
Another issue was that the data for bloom filters and field values across all the log fields except of _msg
was intermixed in two files - fieldBloomFilename ( field_bloom.bin ) and fieldValuesFilename ( field_values.bin ).
This could result in huge disk read IO overhead when some small field was referred in the query,
since the Operating System usually reads more data than requested. It reads the data from disk
in at least 4KiB blocks (usually the block size is much bigger in the range 64KiB - 512KiB).
So, if 512-byte bloom filter or values' block is read from the file, then the Operating System
reads up to 512KiB of data from disk, which results in 1000x disk read IO overhead. This overhead isn't visible
for recently accessed data, since this data is usually stored in RAM (aka Operating System page cache),
but this overhead may become very annoying when performing the query over large volumes of data
which isn't present in OS page cache.
The solution for this issue is to split bloom filters and field values across multiple shards.
This reduces the worst-case disk read IO overhead by at least Nx where N is the number of shards,
while the disk read IO overhead is completely removed in best case when the number of columns doesn't exceed N.
Currently the number of shards is 8 - see bloomValuesShardsCount . This solution increases
performance for queries over large volumes of newly ingested data by up to 1000x.
The new storage format is versioned as v1, while the old storage format is version as v0.
It is stored in the partHeader.FormatVersion.
Parts with the old storage format are converted into parts with the new storage format during background merge.
It is possible to force merge by querying /internal/force_merge HTTP endpoint - see https://docs.victoriametrics.com/victorialogs/#forced-merge .
2024-10-16 16:18:28 +02:00
|
|
|
bhs2, err := unmarshalBlockHeaders(nil, data, partFormatLatestVersion)
|
2023-06-20 07:55:12 +02:00
|
|
|
if err != nil {
|
|
|
|
t.Fatalf("unexpected error when unmarshaling blockHeader entries: %s", err)
|
|
|
|
}
|
|
|
|
if !reflect.DeepEqual(bhs, bhs2) {
|
|
|
|
t.Fatalf("unexpected blockHeader entries unmarshaled\ngot\n%v\nwant\n%v", bhs2, bhs)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
f(nil, 0)
|
lib/logstorage: refactor storage format to be more efficient for querying wide events
It has been appeared that VictoriaLogs is frequently used for collecting logs with tens of fields.
For example, standard Kuberntes setup on top of Filebeat generates more than 20 fields per each log.
Such logs are also known as "wide events".
The previous storage format was optimized for logs with a few fields. When at least a single field
was referenced in the query, then the all the meta-information about all the log fields was unpacked
and parsed per each scanned block during the query. This could require a lot of additional disk IO
and CPU time when logs contain many fields. Resolve this issue by providing an (field -> metainfo_offset)
index per each field in every data block. This index allows reading and extracting only the needed
metainfo for fields used in the query. This index is stored in columnsHeaderIndexFilename ( columns_header_index.bin ).
This allows increasing performance for queries over wide events by 10x and more.
Another issue was that the data for bloom filters and field values across all the log fields except of _msg
was intermixed in two files - fieldBloomFilename ( field_bloom.bin ) and fieldValuesFilename ( field_values.bin ).
This could result in huge disk read IO overhead when some small field was referred in the query,
since the Operating System usually reads more data than requested. It reads the data from disk
in at least 4KiB blocks (usually the block size is much bigger in the range 64KiB - 512KiB).
So, if 512-byte bloom filter or values' block is read from the file, then the Operating System
reads up to 512KiB of data from disk, which results in 1000x disk read IO overhead. This overhead isn't visible
for recently accessed data, since this data is usually stored in RAM (aka Operating System page cache),
but this overhead may become very annoying when performing the query over large volumes of data
which isn't present in OS page cache.
The solution for this issue is to split bloom filters and field values across multiple shards.
This reduces the worst-case disk read IO overhead by at least Nx where N is the number of shards,
while the disk read IO overhead is completely removed in best case when the number of columns doesn't exceed N.
Currently the number of shards is 8 - see bloomValuesShardsCount . This solution increases
performance for queries over large volumes of newly ingested data by up to 1000x.
The new storage format is versioned as v1, while the old storage format is version as v0.
It is stored in the partHeader.FormatVersion.
Parts with the old storage format are converted into parts with the new storage format during background merge.
It is possible to force merge by querying /internal/force_merge HTTP endpoint - see https://docs.victoriametrics.com/victorialogs/#forced-merge .
2024-10-16 16:18:28 +02:00
|
|
|
f([]blockHeader{{}}, 63)
|
2023-06-20 07:55:12 +02:00
|
|
|
f([]blockHeader{
|
|
|
|
{},
|
|
|
|
{
|
|
|
|
streamID: streamID{
|
|
|
|
tenantID: TenantID{
|
|
|
|
AccountID: 123,
|
|
|
|
ProjectID: 456,
|
|
|
|
},
|
|
|
|
id: u128{
|
|
|
|
lo: 3443,
|
|
|
|
hi: 23434,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
uncompressedSizeBytes: 89894,
|
|
|
|
rowsCount: 1234,
|
|
|
|
timestampsHeader: timestampsHeader{
|
|
|
|
blockOffset: 13234,
|
|
|
|
blockSize: 8843,
|
|
|
|
minTimestamp: -4334,
|
|
|
|
maxTimestamp: 23434,
|
|
|
|
marshalType: encoding.MarshalTypeNearestDelta2,
|
|
|
|
},
|
lib/logstorage: refactor storage format to be more efficient for querying wide events
It has been appeared that VictoriaLogs is frequently used for collecting logs with tens of fields.
For example, standard Kuberntes setup on top of Filebeat generates more than 20 fields per each log.
Such logs are also known as "wide events".
The previous storage format was optimized for logs with a few fields. When at least a single field
was referenced in the query, then the all the meta-information about all the log fields was unpacked
and parsed per each scanned block during the query. This could require a lot of additional disk IO
and CPU time when logs contain many fields. Resolve this issue by providing an (field -> metainfo_offset)
index per each field in every data block. This index allows reading and extracting only the needed
metainfo for fields used in the query. This index is stored in columnsHeaderIndexFilename ( columns_header_index.bin ).
This allows increasing performance for queries over wide events by 10x and more.
Another issue was that the data for bloom filters and field values across all the log fields except of _msg
was intermixed in two files - fieldBloomFilename ( field_bloom.bin ) and fieldValuesFilename ( field_values.bin ).
This could result in huge disk read IO overhead when some small field was referred in the query,
since the Operating System usually reads more data than requested. It reads the data from disk
in at least 4KiB blocks (usually the block size is much bigger in the range 64KiB - 512KiB).
So, if 512-byte bloom filter or values' block is read from the file, then the Operating System
reads up to 512KiB of data from disk, which results in 1000x disk read IO overhead. This overhead isn't visible
for recently accessed data, since this data is usually stored in RAM (aka Operating System page cache),
but this overhead may become very annoying when performing the query over large volumes of data
which isn't present in OS page cache.
The solution for this issue is to split bloom filters and field values across multiple shards.
This reduces the worst-case disk read IO overhead by at least Nx where N is the number of shards,
while the disk read IO overhead is completely removed in best case when the number of columns doesn't exceed N.
Currently the number of shards is 8 - see bloomValuesShardsCount . This solution increases
performance for queries over large volumes of newly ingested data by up to 1000x.
The new storage format is versioned as v1, while the old storage format is version as v0.
It is stored in the partHeader.FormatVersion.
Parts with the old storage format are converted into parts with the new storage format during background merge.
It is possible to force merge by querying /internal/force_merge HTTP endpoint - see https://docs.victoriametrics.com/victorialogs/#forced-merge .
2024-10-16 16:18:28 +02:00
|
|
|
columnsHeaderIndexOffset: 1234,
|
|
|
|
columnsHeaderIndexSize: 89324,
|
|
|
|
columnsHeaderOffset: 12332,
|
|
|
|
columnsHeaderSize: 234,
|
2023-06-20 07:55:12 +02:00
|
|
|
},
|
lib/logstorage: refactor storage format to be more efficient for querying wide events
It has been appeared that VictoriaLogs is frequently used for collecting logs with tens of fields.
For example, standard Kuberntes setup on top of Filebeat generates more than 20 fields per each log.
Such logs are also known as "wide events".
The previous storage format was optimized for logs with a few fields. When at least a single field
was referenced in the query, then the all the meta-information about all the log fields was unpacked
and parsed per each scanned block during the query. This could require a lot of additional disk IO
and CPU time when logs contain many fields. Resolve this issue by providing an (field -> metainfo_offset)
index per each field in every data block. This index allows reading and extracting only the needed
metainfo for fields used in the query. This index is stored in columnsHeaderIndexFilename ( columns_header_index.bin ).
This allows increasing performance for queries over wide events by 10x and more.
Another issue was that the data for bloom filters and field values across all the log fields except of _msg
was intermixed in two files - fieldBloomFilename ( field_bloom.bin ) and fieldValuesFilename ( field_values.bin ).
This could result in huge disk read IO overhead when some small field was referred in the query,
since the Operating System usually reads more data than requested. It reads the data from disk
in at least 4KiB blocks (usually the block size is much bigger in the range 64KiB - 512KiB).
So, if 512-byte bloom filter or values' block is read from the file, then the Operating System
reads up to 512KiB of data from disk, which results in 1000x disk read IO overhead. This overhead isn't visible
for recently accessed data, since this data is usually stored in RAM (aka Operating System page cache),
but this overhead may become very annoying when performing the query over large volumes of data
which isn't present in OS page cache.
The solution for this issue is to split bloom filters and field values across multiple shards.
This reduces the worst-case disk read IO overhead by at least Nx where N is the number of shards,
while the disk read IO overhead is completely removed in best case when the number of columns doesn't exceed N.
Currently the number of shards is 8 - see bloomValuesShardsCount . This solution increases
performance for queries over large volumes of newly ingested data by up to 1000x.
The new storage format is versioned as v1, while the old storage format is version as v0.
It is stored in the partHeader.FormatVersion.
Parts with the old storage format are converted into parts with the new storage format during background merge.
It is possible to force merge by querying /internal/force_merge HTTP endpoint - see https://docs.victoriametrics.com/victorialogs/#forced-merge .
2024-10-16 16:18:28 +02:00
|
|
|
}, 134)
|
2023-06-20 07:55:12 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
func TestColumnHeaderMarshalUnmarshal(t *testing.T) {
|
|
|
|
f := func(ch *columnHeader, marshaledLen int) {
|
|
|
|
t.Helper()
|
2024-05-12 16:33:29 +02:00
|
|
|
|
2023-06-20 07:55:12 +02:00
|
|
|
data := ch.marshal(nil)
|
|
|
|
if len(data) != marshaledLen {
|
|
|
|
t.Fatalf("unexpected marshaled length of columnHeader; got %d; want %d", len(data), marshaledLen)
|
|
|
|
}
|
|
|
|
var ch2 columnHeader
|
lib/logstorage: refactor storage format to be more efficient for querying wide events
It has been appeared that VictoriaLogs is frequently used for collecting logs with tens of fields.
For example, standard Kuberntes setup on top of Filebeat generates more than 20 fields per each log.
Such logs are also known as "wide events".
The previous storage format was optimized for logs with a few fields. When at least a single field
was referenced in the query, then the all the meta-information about all the log fields was unpacked
and parsed per each scanned block during the query. This could require a lot of additional disk IO
and CPU time when logs contain many fields. Resolve this issue by providing an (field -> metainfo_offset)
index per each field in every data block. This index allows reading and extracting only the needed
metainfo for fields used in the query. This index is stored in columnsHeaderIndexFilename ( columns_header_index.bin ).
This allows increasing performance for queries over wide events by 10x and more.
Another issue was that the data for bloom filters and field values across all the log fields except of _msg
was intermixed in two files - fieldBloomFilename ( field_bloom.bin ) and fieldValuesFilename ( field_values.bin ).
This could result in huge disk read IO overhead when some small field was referred in the query,
since the Operating System usually reads more data than requested. It reads the data from disk
in at least 4KiB blocks (usually the block size is much bigger in the range 64KiB - 512KiB).
So, if 512-byte bloom filter or values' block is read from the file, then the Operating System
reads up to 512KiB of data from disk, which results in 1000x disk read IO overhead. This overhead isn't visible
for recently accessed data, since this data is usually stored in RAM (aka Operating System page cache),
but this overhead may become very annoying when performing the query over large volumes of data
which isn't present in OS page cache.
The solution for this issue is to split bloom filters and field values across multiple shards.
This reduces the worst-case disk read IO overhead by at least Nx where N is the number of shards,
while the disk read IO overhead is completely removed in best case when the number of columns doesn't exceed N.
Currently the number of shards is 8 - see bloomValuesShardsCount . This solution increases
performance for queries over large volumes of newly ingested data by up to 1000x.
The new storage format is versioned as v1, while the old storage format is version as v0.
It is stored in the partHeader.FormatVersion.
Parts with the old storage format are converted into parts with the new storage format during background merge.
It is possible to force merge by querying /internal/force_merge HTTP endpoint - see https://docs.victoriametrics.com/victorialogs/#forced-merge .
2024-10-16 16:18:28 +02:00
|
|
|
tail, err := ch2.unmarshalNoArena(data, partFormatLatestVersion)
|
2023-06-20 07:55:12 +02:00
|
|
|
if err != nil {
|
|
|
|
t.Fatalf("unexpected error in umarshal(%v): %s", ch, err)
|
|
|
|
}
|
|
|
|
if len(tail) > 0 {
|
|
|
|
t.Fatalf("unexpected non-empty tail after unmarshal(%v): %X", ch, tail)
|
|
|
|
}
|
lib/logstorage: refactor storage format to be more efficient for querying wide events
It has been appeared that VictoriaLogs is frequently used for collecting logs with tens of fields.
For example, standard Kuberntes setup on top of Filebeat generates more than 20 fields per each log.
Such logs are also known as "wide events".
The previous storage format was optimized for logs with a few fields. When at least a single field
was referenced in the query, then the all the meta-information about all the log fields was unpacked
and parsed per each scanned block during the query. This could require a lot of additional disk IO
and CPU time when logs contain many fields. Resolve this issue by providing an (field -> metainfo_offset)
index per each field in every data block. This index allows reading and extracting only the needed
metainfo for fields used in the query. This index is stored in columnsHeaderIndexFilename ( columns_header_index.bin ).
This allows increasing performance for queries over wide events by 10x and more.
Another issue was that the data for bloom filters and field values across all the log fields except of _msg
was intermixed in two files - fieldBloomFilename ( field_bloom.bin ) and fieldValuesFilename ( field_values.bin ).
This could result in huge disk read IO overhead when some small field was referred in the query,
since the Operating System usually reads more data than requested. It reads the data from disk
in at least 4KiB blocks (usually the block size is much bigger in the range 64KiB - 512KiB).
So, if 512-byte bloom filter or values' block is read from the file, then the Operating System
reads up to 512KiB of data from disk, which results in 1000x disk read IO overhead. This overhead isn't visible
for recently accessed data, since this data is usually stored in RAM (aka Operating System page cache),
but this overhead may become very annoying when performing the query over large volumes of data
which isn't present in OS page cache.
The solution for this issue is to split bloom filters and field values across multiple shards.
This reduces the worst-case disk read IO overhead by at least Nx where N is the number of shards,
while the disk read IO overhead is completely removed in best case when the number of columns doesn't exceed N.
Currently the number of shards is 8 - see bloomValuesShardsCount . This solution increases
performance for queries over large volumes of newly ingested data by up to 1000x.
The new storage format is versioned as v1, while the old storage format is version as v0.
It is stored in the partHeader.FormatVersion.
Parts with the old storage format are converted into parts with the new storage format during background merge.
It is possible to force merge by querying /internal/force_merge HTTP endpoint - see https://docs.victoriametrics.com/victorialogs/#forced-merge .
2024-10-16 16:18:28 +02:00
|
|
|
|
|
|
|
// columnHeader.name isn't marshaled, since it is marshaled via columnsHeaderIndex starting from part format v1.
|
|
|
|
ch2.name = ch.name
|
|
|
|
|
2023-06-20 07:55:12 +02:00
|
|
|
if !reflect.DeepEqual(ch, &ch2) {
|
|
|
|
t.Fatalf("unexpected columnHeader after unmarshal;\ngot\n%v\nwant\n%v", &ch2, ch)
|
|
|
|
}
|
|
|
|
}
|
2024-05-12 16:33:29 +02:00
|
|
|
|
2023-06-20 07:55:12 +02:00
|
|
|
f(&columnHeader{
|
|
|
|
name: "foo",
|
|
|
|
valueType: valueTypeUint8,
|
lib/logstorage: refactor storage format to be more efficient for querying wide events
It has been appeared that VictoriaLogs is frequently used for collecting logs with tens of fields.
For example, standard Kuberntes setup on top of Filebeat generates more than 20 fields per each log.
Such logs are also known as "wide events".
The previous storage format was optimized for logs with a few fields. When at least a single field
was referenced in the query, then the all the meta-information about all the log fields was unpacked
and parsed per each scanned block during the query. This could require a lot of additional disk IO
and CPU time when logs contain many fields. Resolve this issue by providing an (field -> metainfo_offset)
index per each field in every data block. This index allows reading and extracting only the needed
metainfo for fields used in the query. This index is stored in columnsHeaderIndexFilename ( columns_header_index.bin ).
This allows increasing performance for queries over wide events by 10x and more.
Another issue was that the data for bloom filters and field values across all the log fields except of _msg
was intermixed in two files - fieldBloomFilename ( field_bloom.bin ) and fieldValuesFilename ( field_values.bin ).
This could result in huge disk read IO overhead when some small field was referred in the query,
since the Operating System usually reads more data than requested. It reads the data from disk
in at least 4KiB blocks (usually the block size is much bigger in the range 64KiB - 512KiB).
So, if 512-byte bloom filter or values' block is read from the file, then the Operating System
reads up to 512KiB of data from disk, which results in 1000x disk read IO overhead. This overhead isn't visible
for recently accessed data, since this data is usually stored in RAM (aka Operating System page cache),
but this overhead may become very annoying when performing the query over large volumes of data
which isn't present in OS page cache.
The solution for this issue is to split bloom filters and field values across multiple shards.
This reduces the worst-case disk read IO overhead by at least Nx where N is the number of shards,
while the disk read IO overhead is completely removed in best case when the number of columns doesn't exceed N.
Currently the number of shards is 8 - see bloomValuesShardsCount . This solution increases
performance for queries over large volumes of newly ingested data by up to 1000x.
The new storage format is versioned as v1, while the old storage format is version as v0.
It is stored in the partHeader.FormatVersion.
Parts with the old storage format are converted into parts with the new storage format during background merge.
It is possible to force merge by querying /internal/force_merge HTTP endpoint - see https://docs.victoriametrics.com/victorialogs/#forced-merge .
2024-10-16 16:18:28 +02:00
|
|
|
}, 7)
|
2023-06-20 07:55:12 +02:00
|
|
|
ch := &columnHeader{
|
|
|
|
name: "foobar",
|
|
|
|
valueType: valueTypeDict,
|
|
|
|
|
|
|
|
valuesOffset: 12345,
|
|
|
|
valuesSize: 254452,
|
|
|
|
}
|
|
|
|
ch.valuesDict.getOrAdd("abc")
|
lib/logstorage: refactor storage format to be more efficient for querying wide events
It has been appeared that VictoriaLogs is frequently used for collecting logs with tens of fields.
For example, standard Kuberntes setup on top of Filebeat generates more than 20 fields per each log.
Such logs are also known as "wide events".
The previous storage format was optimized for logs with a few fields. When at least a single field
was referenced in the query, then the all the meta-information about all the log fields was unpacked
and parsed per each scanned block during the query. This could require a lot of additional disk IO
and CPU time when logs contain many fields. Resolve this issue by providing an (field -> metainfo_offset)
index per each field in every data block. This index allows reading and extracting only the needed
metainfo for fields used in the query. This index is stored in columnsHeaderIndexFilename ( columns_header_index.bin ).
This allows increasing performance for queries over wide events by 10x and more.
Another issue was that the data for bloom filters and field values across all the log fields except of _msg
was intermixed in two files - fieldBloomFilename ( field_bloom.bin ) and fieldValuesFilename ( field_values.bin ).
This could result in huge disk read IO overhead when some small field was referred in the query,
since the Operating System usually reads more data than requested. It reads the data from disk
in at least 4KiB blocks (usually the block size is much bigger in the range 64KiB - 512KiB).
So, if 512-byte bloom filter or values' block is read from the file, then the Operating System
reads up to 512KiB of data from disk, which results in 1000x disk read IO overhead. This overhead isn't visible
for recently accessed data, since this data is usually stored in RAM (aka Operating System page cache),
but this overhead may become very annoying when performing the query over large volumes of data
which isn't present in OS page cache.
The solution for this issue is to split bloom filters and field values across multiple shards.
This reduces the worst-case disk read IO overhead by at least Nx where N is the number of shards,
while the disk read IO overhead is completely removed in best case when the number of columns doesn't exceed N.
Currently the number of shards is 8 - see bloomValuesShardsCount . This solution increases
performance for queries over large volumes of newly ingested data by up to 1000x.
The new storage format is versioned as v1, while the old storage format is version as v0.
It is stored in the partHeader.FormatVersion.
Parts with the old storage format are converted into parts with the new storage format during background merge.
It is possible to force merge by querying /internal/force_merge HTTP endpoint - see https://docs.victoriametrics.com/victorialogs/#forced-merge .
2024-10-16 16:18:28 +02:00
|
|
|
f(ch, 11)
|
2023-06-20 07:55:12 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
func TestColumnHeaderUnmarshalFailure(t *testing.T) {
|
|
|
|
f := func(data []byte) {
|
|
|
|
t.Helper()
|
2024-05-12 16:33:29 +02:00
|
|
|
|
2023-06-20 07:55:12 +02:00
|
|
|
dataOrig := append([]byte{}, data...)
|
|
|
|
var ch columnHeader
|
lib/logstorage: refactor storage format to be more efficient for querying wide events
It has been appeared that VictoriaLogs is frequently used for collecting logs with tens of fields.
For example, standard Kuberntes setup on top of Filebeat generates more than 20 fields per each log.
Such logs are also known as "wide events".
The previous storage format was optimized for logs with a few fields. When at least a single field
was referenced in the query, then the all the meta-information about all the log fields was unpacked
and parsed per each scanned block during the query. This could require a lot of additional disk IO
and CPU time when logs contain many fields. Resolve this issue by providing an (field -> metainfo_offset)
index per each field in every data block. This index allows reading and extracting only the needed
metainfo for fields used in the query. This index is stored in columnsHeaderIndexFilename ( columns_header_index.bin ).
This allows increasing performance for queries over wide events by 10x and more.
Another issue was that the data for bloom filters and field values across all the log fields except of _msg
was intermixed in two files - fieldBloomFilename ( field_bloom.bin ) and fieldValuesFilename ( field_values.bin ).
This could result in huge disk read IO overhead when some small field was referred in the query,
since the Operating System usually reads more data than requested. It reads the data from disk
in at least 4KiB blocks (usually the block size is much bigger in the range 64KiB - 512KiB).
So, if 512-byte bloom filter or values' block is read from the file, then the Operating System
reads up to 512KiB of data from disk, which results in 1000x disk read IO overhead. This overhead isn't visible
for recently accessed data, since this data is usually stored in RAM (aka Operating System page cache),
but this overhead may become very annoying when performing the query over large volumes of data
which isn't present in OS page cache.
The solution for this issue is to split bloom filters and field values across multiple shards.
This reduces the worst-case disk read IO overhead by at least Nx where N is the number of shards,
while the disk read IO overhead is completely removed in best case when the number of columns doesn't exceed N.
Currently the number of shards is 8 - see bloomValuesShardsCount . This solution increases
performance for queries over large volumes of newly ingested data by up to 1000x.
The new storage format is versioned as v1, while the old storage format is version as v0.
It is stored in the partHeader.FormatVersion.
Parts with the old storage format are converted into parts with the new storage format during background merge.
It is possible to force merge by querying /internal/force_merge HTTP endpoint - see https://docs.victoriametrics.com/victorialogs/#forced-merge .
2024-10-16 16:18:28 +02:00
|
|
|
tail, err := ch.unmarshalNoArena(data, partFormatLatestVersion)
|
2023-06-20 07:55:12 +02:00
|
|
|
if err == nil {
|
|
|
|
t.Fatalf("expecting non-nil error")
|
|
|
|
}
|
|
|
|
if string(tail) != string(dataOrig) {
|
|
|
|
t.Fatalf("unexpected tail left; got %q; want %q", tail, dataOrig)
|
|
|
|
}
|
|
|
|
}
|
2024-05-12 16:33:29 +02:00
|
|
|
|
2023-06-20 07:55:12 +02:00
|
|
|
f(nil)
|
|
|
|
f([]byte("foo"))
|
|
|
|
|
|
|
|
ch := &columnHeader{
|
|
|
|
name: "abc",
|
|
|
|
valueType: valueTypeUint16,
|
|
|
|
bloomFilterSize: 3244,
|
|
|
|
}
|
|
|
|
data := ch.marshal(nil)
|
|
|
|
f(data[:len(data)-1])
|
|
|
|
}
|
|
|
|
|
|
|
|
func TestColumnHeaderReset(t *testing.T) {
|
|
|
|
ch := &columnHeader{
|
|
|
|
name: "foobar",
|
|
|
|
valueType: valueTypeUint16,
|
|
|
|
|
|
|
|
valuesOffset: 12345,
|
|
|
|
valuesSize: 254452,
|
|
|
|
|
|
|
|
bloomFilterOffset: 34898234,
|
|
|
|
bloomFilterSize: 873434,
|
|
|
|
}
|
|
|
|
ch.valuesDict.getOrAdd("abc")
|
|
|
|
ch.reset()
|
|
|
|
chZero := &columnHeader{}
|
|
|
|
chZero.valuesDict.values = []string{}
|
|
|
|
if !reflect.DeepEqual(ch, chZero) {
|
|
|
|
t.Fatalf("unexpected non-zero columnHeader after reset: %v", ch)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
func TestTimestampsHeaderMarshalUnmarshal(t *testing.T) {
|
|
|
|
f := func(th *timestampsHeader, marshaledLen int) {
|
|
|
|
t.Helper()
|
|
|
|
data := th.marshal(nil)
|
|
|
|
if len(data) != marshaledLen {
|
|
|
|
t.Fatalf("unexpected length of marshaled timestampsHeader; got %d; want %d", len(data), marshaledLen)
|
|
|
|
}
|
|
|
|
var th2 timestampsHeader
|
|
|
|
tail, err := th2.unmarshal(data)
|
|
|
|
if err != nil {
|
|
|
|
t.Fatalf("unexpected error in unmarshal(%v): %s", th, err)
|
|
|
|
}
|
|
|
|
if len(tail) > 0 {
|
|
|
|
t.Fatalf("unexpected non-nil tail after unmarshal(%v): %X", th, tail)
|
|
|
|
}
|
|
|
|
if !reflect.DeepEqual(th, &th2) {
|
|
|
|
t.Fatalf("unexpected timestampsHeader after unmarshal; got\n%v\nwant\n%v", &th2, th)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
f(×tampsHeader{}, 33)
|
|
|
|
|
|
|
|
f(×tampsHeader{
|
|
|
|
blockOffset: 12345,
|
|
|
|
blockSize: 3424834,
|
|
|
|
minTimestamp: -123443,
|
|
|
|
maxTimestamp: 234343,
|
|
|
|
marshalType: encoding.MarshalTypeZSTDNearestDelta,
|
|
|
|
}, 33)
|
|
|
|
}
|
|
|
|
|
|
|
|
func TestTimestampsHeaderUnmarshalFailure(t *testing.T) {
|
|
|
|
f := func(data []byte) {
|
|
|
|
t.Helper()
|
|
|
|
dataOrig := append([]byte{}, data...)
|
|
|
|
var th timestampsHeader
|
|
|
|
tail, err := th.unmarshal(data)
|
|
|
|
if err == nil {
|
|
|
|
t.Fatalf("expecting non-nil error")
|
|
|
|
}
|
|
|
|
if string(tail) != string(dataOrig) {
|
|
|
|
t.Fatalf("unexpected tail left; got %q; want %q", tail, dataOrig)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
f(nil)
|
|
|
|
f([]byte("foo"))
|
|
|
|
}
|
|
|
|
|
|
|
|
func TestTimestampsHeaderReset(t *testing.T) {
|
|
|
|
th := ×tampsHeader{
|
|
|
|
blockOffset: 12345,
|
|
|
|
blockSize: 3424834,
|
|
|
|
minTimestamp: -123443,
|
|
|
|
maxTimestamp: 234343,
|
|
|
|
marshalType: encoding.MarshalTypeZSTDNearestDelta,
|
|
|
|
}
|
|
|
|
th.reset()
|
|
|
|
thZero := ×tampsHeader{}
|
|
|
|
if !reflect.DeepEqual(th, thZero) {
|
|
|
|
t.Fatalf("unexpected non-zero timestampsHeader after reset: %v", th)
|
|
|
|
}
|
|
|
|
}
|