From 7d13c31566e038f378e1e069b612bcb8543ee1ca Mon Sep 17 00:00:00 2001 From: Aliaksandr Valialkin Date: Fri, 20 Sep 2019 19:46:47 +0300 Subject: [PATCH] lib/{storage,mergeset}: merge `tag->metricID` rows into `tag->metricIDs` rows for common `tag` values This should improve lookup performance if the same `label=value` pair exists in a big number of time series. This should also reduce memory usage for the mergeset data cache, since `tag->metricIDs` rows occupy less space than the original `tag->metricID` rows. --- lib/mergeset/encoding.go | 2 +- lib/mergeset/merge.go | 30 +- lib/mergeset/merge_test.go | 10 +- lib/mergeset/part_search_test.go | 2 +- lib/mergeset/table.go | 98 +++- lib/mergeset/table_search_test.go | 6 +- lib/mergeset/table_search_timing_test.go | 2 +- lib/mergeset/table_test.go | 33 +- lib/storage/index_db.go | 541 ++++++++++++++++------- lib/storage/metric_name.go | 11 + lib/storage/partition.go | 2 +- lib/storage/tag_filters.go | 8 +- 12 files changed, 554 insertions(+), 191 deletions(-) diff --git a/lib/mergeset/encoding.go b/lib/mergeset/encoding.go index 94eaa14157..9eab5ebd1f 100644 --- a/lib/mergeset/encoding.go +++ b/lib/mergeset/encoding.go @@ -185,7 +185,7 @@ func (ib *inmemoryBlock) marshalData(sb *storageBlock, firstItemDst, commonPrefi firstItemDst = append(firstItemDst, ib.items[0]...) commonPrefixDst = append(commonPrefixDst, ib.commonPrefix...) - if len(ib.data)-len(ib.commonPrefix)*len(ib.items) < 64 || len(ib.items) < 10 { + if len(ib.data)-len(ib.commonPrefix)*len(ib.items) < 64 || len(ib.items) < 2 { // Use plain encoding form small block, since it is cheaper. ib.marshalDataPlain(sb) return firstItemDst, commonPrefixDst, uint32(len(ib.items)), marshalTypePlain diff --git a/lib/mergeset/merge.go b/lib/mergeset/merge.go index 27a6a014db..398fffa9d8 100644 --- a/lib/mergeset/merge.go +++ b/lib/mergeset/merge.go @@ -7,16 +7,27 @@ import ( "sync/atomic" ) +// PrepareBlockCallback can transform the passed items allocated at the given data. 
+// +// The callback is called during merge before flushing full block of the given items +// to persistent storage. +// +// The callback must return sorted items. +// The callback can re-use data and items for storing the result. +type PrepareBlockCallback func(data []byte, items [][]byte) ([]byte, [][]byte) + // mergeBlockStreams merges bsrs and writes result to bsw. // // It also fills ph. // +// prepareBlock is optional. +// // The function immediately returns when stopCh is closed. // // It also atomically adds the number of items merged to itemsMerged. -func mergeBlockStreams(ph *partHeader, bsw *blockStreamWriter, bsrs []*blockStreamReader, stopCh <-chan struct{}, itemsMerged *uint64) error { +func mergeBlockStreams(ph *partHeader, bsw *blockStreamWriter, bsrs []*blockStreamReader, prepareBlock PrepareBlockCallback, stopCh <-chan struct{}, itemsMerged *uint64) error { bsm := bsmPool.Get().(*blockStreamMerger) - if err := bsm.Init(bsrs); err != nil { + if err := bsm.Init(bsrs, prepareBlock); err != nil { return fmt.Errorf("cannot initialize blockStreamMerger: %s", err) } err := bsm.Merge(bsw, ph, stopCh, itemsMerged) @@ -39,6 +50,8 @@ var bsmPool = &sync.Pool{ } type blockStreamMerger struct { + prepareBlock PrepareBlockCallback + bsrHeap bsrHeap // ib is a scratch block with pending items. 
@@ -48,6 +61,8 @@ type blockStreamMerger struct { } func (bsm *blockStreamMerger) reset() { + bsm.prepareBlock = nil + for i := range bsm.bsrHeap { bsm.bsrHeap[i] = nil } @@ -57,8 +72,9 @@ func (bsm *blockStreamMerger) reset() { bsm.phFirstItemCaught = false } -func (bsm *blockStreamMerger) Init(bsrs []*blockStreamReader) error { +func (bsm *blockStreamMerger) Init(bsrs []*blockStreamReader, prepareBlock PrepareBlockCallback) error { bsm.reset() + bsm.prepareBlock = prepareBlock for _, bsr := range bsrs { if bsr.Next() { bsm.bsrHeap = append(bsm.bsrHeap, bsr) @@ -134,9 +150,11 @@ func (bsm *blockStreamMerger) flushIB(bsw *blockStreamWriter, ph *partHeader, it // Nothing to flush. return } - itemsCount := uint64(len(bsm.ib.items)) - ph.itemsCount += itemsCount - atomic.AddUint64(itemsMerged, itemsCount) + atomic.AddUint64(itemsMerged, uint64(len(bsm.ib.items))) + if bsm.prepareBlock != nil { + bsm.ib.data, bsm.ib.items = bsm.prepareBlock(bsm.ib.data, bsm.ib.items) + } + ph.itemsCount += uint64(len(bsm.ib.items)) if !bsm.phFirstItemCaught { ph.firstItem = append(ph.firstItem[:0], bsm.ib.items[0]...) 
bsm.phFirstItemCaught = true diff --git a/lib/mergeset/merge_test.go b/lib/mergeset/merge_test.go index 8fb80023dc..02499ffd57 100644 --- a/lib/mergeset/merge_test.go +++ b/lib/mergeset/merge_test.go @@ -30,14 +30,14 @@ func TestMultilevelMerge(t *testing.T) { var dstIP1 inmemoryPart var bsw1 blockStreamWriter bsw1.InitFromInmemoryPart(&dstIP1, 0) - if err := mergeBlockStreams(&dstIP1.ph, &bsw1, bsrs[:5], nil, &itemsMerged); err != nil { + if err := mergeBlockStreams(&dstIP1.ph, &bsw1, bsrs[:5], nil, nil, &itemsMerged); err != nil { t.Fatalf("cannot merge first level part 1: %s", err) } var dstIP2 inmemoryPart var bsw2 blockStreamWriter bsw2.InitFromInmemoryPart(&dstIP2, 0) - if err := mergeBlockStreams(&dstIP2.ph, &bsw2, bsrs[5:], nil, &itemsMerged); err != nil { + if err := mergeBlockStreams(&dstIP2.ph, &bsw2, bsrs[5:], nil, nil, &itemsMerged); err != nil { t.Fatalf("cannot merge first level part 2: %s", err) } @@ -54,7 +54,7 @@ func TestMultilevelMerge(t *testing.T) { newTestBlockStreamReader(&dstIP2), } bsw.InitFromInmemoryPart(&dstIP, 0) - if err := mergeBlockStreams(&dstIP.ph, &bsw, bsrsTop, nil, &itemsMerged); err != nil { + if err := mergeBlockStreams(&dstIP.ph, &bsw, bsrsTop, nil, nil, &itemsMerged); err != nil { t.Fatalf("cannot merge second level: %s", err) } if itemsMerged != uint64(len(items)) { @@ -76,7 +76,7 @@ func TestMergeForciblyStop(t *testing.T) { ch := make(chan struct{}) var itemsMerged uint64 close(ch) - if err := mergeBlockStreams(&dstIP.ph, &bsw, bsrs, ch, &itemsMerged); err != errForciblyStopped { + if err := mergeBlockStreams(&dstIP.ph, &bsw, bsrs, nil, ch, &itemsMerged); err != errForciblyStopped { t.Fatalf("unexpected error during merge: got %v; want %v", err, errForciblyStopped) } if itemsMerged != 0 { @@ -120,7 +120,7 @@ func testMergeBlockStreamsSerial(blocksToMerge, maxItemsPerBlock int) error { var dstIP inmemoryPart var bsw blockStreamWriter bsw.InitFromInmemoryPart(&dstIP, 0) - if err := mergeBlockStreams(&dstIP.ph, &bsw, bsrs, 
nil, &itemsMerged); err != nil { + if err := mergeBlockStreams(&dstIP.ph, &bsw, bsrs, nil, nil, &itemsMerged); err != nil { return fmt.Errorf("cannot merge block streams: %s", err) } if itemsMerged != uint64(len(items)) { diff --git a/lib/mergeset/part_search_test.go b/lib/mergeset/part_search_test.go index 3e35f54a44..29dce77093 100644 --- a/lib/mergeset/part_search_test.go +++ b/lib/mergeset/part_search_test.go @@ -150,7 +150,7 @@ func newTestPart(blocksCount, maxItemsPerBlock int) (*part, []string, error) { var ip inmemoryPart var bsw blockStreamWriter bsw.InitFromInmemoryPart(&ip, 0) - if err := mergeBlockStreams(&ip.ph, &bsw, bsrs, nil, &itemsMerged); err != nil { + if err := mergeBlockStreams(&ip.ph, &bsw, bsrs, nil, nil, &itemsMerged); err != nil { return nil, nil, fmt.Errorf("cannot merge blocks: %s", err) } if itemsMerged != uint64(len(items)) { diff --git a/lib/mergeset/table.go b/lib/mergeset/table.go index 458b09a7e0..77d4fb3257 100644 --- a/lib/mergeset/table.go +++ b/lib/mergeset/table.go @@ -74,6 +74,8 @@ type Table struct { flushCallback func() + prepareBlock PrepareBlockCallback + partsLock sync.Mutex parts []*partWrapper @@ -94,6 +96,8 @@ type Table struct { rawItemsFlusherWG sync.WaitGroup + convertersWG sync.WaitGroup + // Use syncwg instead of sync, since Add/Wait may be called from concurrent goroutines. rawItemsPendingFlushesWG syncwg.WaitGroup @@ -139,8 +143,11 @@ func (pw *partWrapper) decRef() { // Optional flushCallback is called every time new data batch is flushed // to the underlying storage and becomes visible to search. // +// Optional prepareBlock is called during merge before flushing the prepared block +// to persistent storage. +// // The table is created if it doesn't exist yet. 
-func OpenTable(path string, flushCallback func()) (*Table, error) { +func OpenTable(path string, flushCallback func(), prepareBlock PrepareBlockCallback) (*Table, error) { path = filepath.Clean(path) logger.Infof("opening table %q...", path) startTime := time.Now() @@ -165,6 +172,7 @@ func OpenTable(path string, flushCallback func()) (*Table, error) { tb := &Table{ path: path, flushCallback: flushCallback, + prepareBlock: prepareBlock, parts: pws, mergeIdx: uint64(time.Now().UnixNano()), flockF: flockF, @@ -178,6 +186,12 @@ func OpenTable(path string, flushCallback func()) (*Table, error) { logger.Infof("table %q has been opened in %s; partsCount: %d; blocksCount: %d, itemsCount: %d; sizeBytes: %d", path, time.Since(startTime), m.PartsCount, m.BlocksCount, m.ItemsCount, m.SizeBytes) + tb.convertersWG.Add(1) + go func() { + tb.convertToV1280() + tb.convertersWG.Done() + }() + return tb, nil } @@ -190,6 +204,11 @@ func (tb *Table) MustClose() { tb.rawItemsFlusherWG.Wait() logger.Infof("raw items flusher stopped in %s on %q", time.Since(startTime), tb.path) + logger.Infof("waiting for converters to stop on %q...", tb.path) + startTime = time.Now() + tb.convertersWG.Wait() + logger.Infof("converters stopped in %s on %q", time.Since(startTime), tb.path) + logger.Infof("waiting for part mergers to stop on %q...", tb.path) startTime = time.Now() tb.partMergersWG.Wait() @@ -216,7 +235,7 @@ func (tb *Table) MustClose() { } tb.partsLock.Unlock() - if err := tb.mergePartsOptimal(pws); err != nil { + if err := tb.mergePartsOptimal(pws, nil); err != nil { logger.Panicf("FATAL: cannot flush inmemory parts to files in %q: %s", tb.path, err) } logger.Infof("%d inmemory parts have been flushed to files in %s on %q", len(pws), time.Since(startTime), tb.path) @@ -393,15 +412,67 @@ func (tb *Table) rawItemsFlusher() { } } -func (tb *Table) mergePartsOptimal(pws []*partWrapper) error { +const convertToV1280FileName = "converted-to-v1.28.0" + +func (tb *Table) convertToV1280() { + // 
Convert tag->metricID rows into tag->metricIDs rows when upgrading to v1.28.0+. + flagFilePath := tb.path + "/" + convertToV1280FileName + if fs.IsPathExist(flagFilePath) { + // The conversion has been already performed. + return + } + + getAllPartsForMerge := func() []*partWrapper { + var pws []*partWrapper + tb.partsLock.Lock() + for _, pw := range tb.parts { + if pw.isInMerge { + continue + } + pw.isInMerge = true + pws = append(pws, pw) + } + tb.partsLock.Unlock() + return pws + } + pws := getAllPartsForMerge() + if len(pws) > 0 { + logger.Infof("started round 1 of background conversion of %q to v1.28.0 format; merge %d parts", tb.path, len(pws)) + startTime := time.Now() + if err := tb.mergePartsOptimal(pws, tb.stopCh); err != nil { + logger.Errorf("failed round 1 of background conversion of %q to v1.28.0 format: %s", tb.path, err) + return + } + logger.Infof("finished round 1 of background conversion of %q to v1.28.0 format in %s", tb.path, time.Since(startTime)) + + // The second round is needed in order to merge small blocks + // with tag->metricIDs rows left after the first round. 
+ pws = getAllPartsForMerge() + logger.Infof("started round 2 of background conversion of %q to v1.28.0 format; merge %d parts", tb.path, len(pws)) + startTime = time.Now() + if len(pws) > 0 { + if err := tb.mergePartsOptimal(pws, tb.stopCh); err != nil { + logger.Errorf("failed round 2 of background conversion of %q to v1.28.0 format: %s", tb.path, err) + return + } + } + logger.Infof("finished round 2 of background conversion of %q to v1.28.0 format in %s", tb.path, time.Since(startTime)) + } + + if err := fs.WriteFileAtomically(flagFilePath, []byte("ok")); err != nil { + logger.Panicf("FATAL: cannot create %q: %s", flagFilePath, err) + } +} + +func (tb *Table) mergePartsOptimal(pws []*partWrapper, stopCh <-chan struct{}) error { for len(pws) > defaultPartsToMerge { - if err := tb.mergeParts(pws[:defaultPartsToMerge], nil, false); err != nil { + if err := tb.mergeParts(pws[:defaultPartsToMerge], stopCh, false); err != nil { return fmt.Errorf("cannot merge %d parts: %s", defaultPartsToMerge, err) } pws = pws[defaultPartsToMerge:] } if len(pws) > 0 { - if err := tb.mergeParts(pws, nil, false); err != nil { + if err := tb.mergeParts(pws, stopCh, false); err != nil { return fmt.Errorf("cannot merge %d parts: %s", len(pws), err) } } @@ -541,7 +612,7 @@ func (tb *Table) mergeInmemoryBlocks(blocksToMerge []*inmemoryBlock) *partWrappe // Merge parts. // The merge shouldn't be interrupted by stopCh, // since it may be final after stopCh is closed. - if err := mergeBlockStreams(&mpDst.ph, bsw, bsrs, nil, &tb.itemsMerged); err != nil { + if err := mergeBlockStreams(&mpDst.ph, bsw, bsrs, tb.prepareBlock, nil, &tb.itemsMerged); err != nil { logger.Panicf("FATAL: cannot merge inmemoryBlocks: %s", err) } putBlockStreamWriter(bsw) @@ -700,7 +771,7 @@ func (tb *Table) mergeParts(pws []*partWrapper, stopCh <-chan struct{}, isOuterP // Merge parts into a temporary location. 
var ph partHeader - err := mergeBlockStreams(&ph, bsw, bsrs, stopCh, &tb.itemsMerged) + err := mergeBlockStreams(&ph, bsw, bsrs, tb.prepareBlock, stopCh, &tb.itemsMerged) putBlockStreamWriter(bsw) if err != nil { if err == errForciblyStopped { @@ -950,11 +1021,20 @@ func (tb *Table) CreateSnapshotAt(dstDir string) error { return fmt.Errorf("cannot read directory: %s", err) } for _, fi := range fis { + fn := fi.Name() if !fs.IsDirOrSymlink(fi) { - // Skip non-directories. + switch fn { + case convertToV1280FileName: + srcPath := srcDir + "/" + fn + dstPath := dstDir + "/" + fn + if err := os.Link(srcPath, dstPath); err != nil { + return fmt.Errorf("cannot hard link from %q to %q: %s", srcPath, dstPath, err) + } + default: + // Skip other non-directories. + } continue } - fn := fi.Name() if isSpecialDir(fn) { // Skip special dirs. continue diff --git a/lib/mergeset/table_search_test.go b/lib/mergeset/table_search_test.go index 806e2a8878..3673ac8ab2 100644 --- a/lib/mergeset/table_search_test.go +++ b/lib/mergeset/table_search_test.go @@ -41,7 +41,7 @@ func TestTableSearchSerial(t *testing.T) { func() { // Re-open the table and verify the search works. - tb, err := OpenTable(path, nil) + tb, err := OpenTable(path, nil, nil) if err != nil { t.Fatalf("cannot open table: %s", err) } @@ -76,7 +76,7 @@ func TestTableSearchConcurrent(t *testing.T) { // Re-open the table and verify the search works. 
func() { - tb, err := OpenTable(path, nil) + tb, err := OpenTable(path, nil, nil) if err != nil { t.Fatalf("cannot open table: %s", err) } @@ -152,7 +152,7 @@ func newTestTable(path string, itemsCount int) (*Table, []string, error) { flushCallback := func() { atomic.AddUint64(&flushes, 1) } - tb, err := OpenTable(path, flushCallback) + tb, err := OpenTable(path, flushCallback, nil) if err != nil { return nil, nil, fmt.Errorf("cannot open table: %s", err) } diff --git a/lib/mergeset/table_search_timing_test.go b/lib/mergeset/table_search_timing_test.go index 16c2f6035f..f8e0c9851b 100644 --- a/lib/mergeset/table_search_timing_test.go +++ b/lib/mergeset/table_search_timing_test.go @@ -32,7 +32,7 @@ func benchmarkTableSearch(b *testing.B, itemsCount int) { // Force finishing pending merges tb.MustClose() - tb, err = OpenTable(path, nil) + tb, err = OpenTable(path, nil, nil) if err != nil { b.Fatalf("unexpected error when re-opening table %q: %s", path, err) } diff --git a/lib/mergeset/table_test.go b/lib/mergeset/table_test.go index db063f1ebf..aa43fe9e11 100644 --- a/lib/mergeset/table_test.go +++ b/lib/mergeset/table_test.go @@ -21,7 +21,7 @@ func TestTableOpenClose(t *testing.T) { }() // Create a new table - tb, err := OpenTable(path, nil) + tb, err := OpenTable(path, nil, nil) if err != nil { t.Fatalf("cannot create new table: %s", err) } @@ -31,7 +31,7 @@ func TestTableOpenClose(t *testing.T) { // Re-open created table multiple times. 
for i := 0; i < 10; i++ { - tb, err := OpenTable(path, nil) + tb, err := OpenTable(path, nil, nil) if err != nil { t.Fatalf("cannot open created table: %s", err) } @@ -45,14 +45,14 @@ func TestTableOpenMultipleTimes(t *testing.T) { _ = os.RemoveAll(path) }() - tb1, err := OpenTable(path, nil) + tb1, err := OpenTable(path, nil, nil) if err != nil { t.Fatalf("cannot open table: %s", err) } defer tb1.MustClose() for i := 0; i < 10; i++ { - tb2, err := OpenTable(path, nil) + tb2, err := OpenTable(path, nil, nil) if err == nil { tb2.MustClose() t.Fatalf("expecting non-nil error when opening already opened table") @@ -73,7 +73,7 @@ func TestTableAddItemSerial(t *testing.T) { flushCallback := func() { atomic.AddUint64(&flushes, 1) } - tb, err := OpenTable(path, flushCallback) + tb, err := OpenTable(path, flushCallback, nil) if err != nil { t.Fatalf("cannot open %q: %s", path, err) } @@ -99,7 +99,7 @@ func TestTableAddItemSerial(t *testing.T) { testReopenTable(t, path, itemsCount) // Add more items in order to verify merge between inmemory parts and file-based parts. - tb, err = OpenTable(path, nil) + tb, err = OpenTable(path, nil, nil) if err != nil { t.Fatalf("cannot open %q: %s", path, err) } @@ -132,7 +132,7 @@ func TestTableCreateSnapshotAt(t *testing.T) { _ = os.RemoveAll(path) }() - tb, err := OpenTable(path, nil) + tb, err := OpenTable(path, nil, nil) if err != nil { t.Fatalf("cannot open %q: %s", path, err) } @@ -163,13 +163,13 @@ func TestTableCreateSnapshotAt(t *testing.T) { }() // Verify snapshots contain all the data. 
- tb1, err := OpenTable(snapshot1, nil) + tb1, err := OpenTable(snapshot1, nil, nil) if err != nil { t.Fatalf("cannot open %q: %s", path, err) } defer tb1.MustClose() - tb2, err := OpenTable(snapshot2, nil) + tb2, err := OpenTable(snapshot2, nil, nil) if err != nil { t.Fatalf("cannot open %q: %s", path, err) } @@ -217,7 +217,12 @@ func TestTableAddItemsConcurrent(t *testing.T) { flushCallback := func() { atomic.AddUint64(&flushes, 1) } - tb, err := OpenTable(path, flushCallback) + var itemsMerged uint64 + prepareBlock := func(data []byte, items [][]byte) ([]byte, [][]byte) { + atomic.AddUint64(&itemsMerged, uint64(len(items))) + return data, items + } + tb, err := OpenTable(path, flushCallback, prepareBlock) if err != nil { t.Fatalf("cannot open %q: %s", path, err) } @@ -230,6 +235,10 @@ func TestTableAddItemsConcurrent(t *testing.T) { if atomic.LoadUint64(&flushes) == 0 { t.Fatalf("unexpected zero flushes") } + n := atomic.LoadUint64(&itemsMerged) + if n < itemsCount { + t.Fatalf("too low number of items merged; got %v; must be at least %v", n, itemsCount) + } var m TableMetrics tb.UpdateMetrics(&m) @@ -243,7 +252,7 @@ func TestTableAddItemsConcurrent(t *testing.T) { testReopenTable(t, path, itemsCount) // Add more items in order to verify merge between inmemory parts and file-based parts. - tb, err = OpenTable(path, nil) + tb, err = OpenTable(path, nil, nil) if err != nil { t.Fatalf("cannot open %q: %s", path, err) } @@ -285,7 +294,7 @@ func testReopenTable(t *testing.T, path string, itemsCount int) { t.Helper() for i := 0; i < 10; i++ { - tb, err := OpenTable(path, nil) + tb, err := OpenTable(path, nil, nil) if err != nil { t.Fatalf("cannot re-open %q: %s", path, err) } diff --git a/lib/storage/index_db.go b/lib/storage/index_db.go index bacf2c4dbf..43d5e73925 100644 --- a/lib/storage/index_db.go +++ b/lib/storage/index_db.go @@ -28,7 +28,7 @@ const ( nsPrefixMetricNameToTSID = 0 // Prefix for Tag->MetricID entries. 
- nsPrefixTagToMetricID = 1 + nsPrefixTagToMetricIDs = 1 // Prefix for MetricID->TSID entries. nsPrefixMetricIDToTSID = 2 @@ -116,7 +116,7 @@ func openIndexDB(path string, metricIDCache, metricNameCache *workingsetcache.Ca logger.Panicf("BUG: prevHourMetricIDs must be non-nil") } - tb, err := mergeset.OpenTable(path, invalidateTagCache) + tb, err := mergeset.OpenTable(path, invalidateTagCache, mergeTagToMetricIDsRows) if err != nil { return nil, fmt.Errorf("cannot open indexDB %q: %s", path, err) } @@ -451,6 +451,7 @@ type indexSearch struct { db *indexDB ts mergeset.TableSearch kb bytesutil.ByteBuffer + mp tagToMetricIDsRowParser // tsidByNameMisses and tsidByNameSkips is used for a performance // hack in GetOrCreateTSIDByName. See the comment there. @@ -505,6 +506,7 @@ func (db *indexDB) getIndexSearch() *indexSearch { func (db *indexDB) putIndexSearch(is *indexSearch) { is.ts.MustClose() is.kb.Reset() + is.mp.Reset() // Do not reset tsidByNameMisses and tsidByNameSkips, // since they are used in GetOrCreateTSIDByName across call boundaries. @@ -548,7 +550,7 @@ func (db *indexDB) generateTSID(dst *TSID, metricName []byte, mn *MetricName) er } } - // The TSID wan't found in the external storage. + // The TSID wasn't found in the external storage. // Generate it locally. dst.AccountID = mn.AccountID dst.ProjectID = mn.ProjectID @@ -589,7 +591,7 @@ func (db *indexDB) createIndexes(tsid *TSID, mn *MetricName) error { items.Next() commonPrefix := kbPool.Get() - commonPrefix.B = marshalCommonPrefix(commonPrefix.B[:0], nsPrefixTagToMetricID, mn.AccountID, mn.ProjectID) + commonPrefix.B = marshalCommonPrefix(commonPrefix.B[:0], nsPrefixTagToMetricIDs, mn.AccountID, mn.ProjectID) // Create MetricGroup -> MetricID index. items.B = append(items.B, commonPrefix.B...) 
@@ -680,50 +682,37 @@ func (db *indexDB) SearchTagKeys(accountID, projectID uint32, maxTagKeys int) ([ func (is *indexSearch) searchTagKeys(accountID, projectID uint32, tks map[string]struct{}, maxTagKeys int) error { ts := &is.ts kb := &is.kb + mp := &is.mp + mp.Reset() dmis := is.db.getDeletedMetricIDs() - commonPrefix := marshalCommonPrefix(nil, nsPrefixTagToMetricID, accountID, projectID) - ts.Seek(commonPrefix) + kb.B = marshalCommonPrefix(kb.B[:0], nsPrefixTagToMetricIDs, accountID, projectID) + prefix := kb.B + ts.Seek(prefix) for len(tks) < maxTagKeys && ts.NextItem() { item := ts.Item - if !bytes.HasPrefix(item, commonPrefix) { + if !bytes.HasPrefix(item, prefix) { break } - tail := item[len(commonPrefix):] - - // Unmarshal tag key into kb.B - var err error - tail, kb.B, err = unmarshalTagValue(kb.B[:0], tail) - if err != nil { - return fmt.Errorf("cannot unmarshal tagKey from %X: %s", item, err) + if err := mp.Init(item); err != nil { + return err } - - // Verify that the tag key points to existing metric. - if len(tail) < 8 { - return fmt.Errorf("cannot unmarshal metricID from less than 8 bytes; got %d bytes; item=%X", len(tail), tail) - } - metricID := encoding.UnmarshalUint64(tail[len(tail)-8:]) - if _, deleted := dmis[metricID]; deleted { - // The given metric is deleted. Skip it. + if mp.IsDeletedTag(dmis) { continue } // Store tag key. - tks[string(kb.B)] = struct{}{} + tks[string(mp.Tag.Key)] = struct{}{} // Search for the next tag key. - // tkp (tag key prefix) contains (commonPrefix + encoded tag key). - // The last char must be tagSeparatorChar. Just increment it - // in order to jump to the next tag key. - tkp := item[:len(item)-len(tail)] - if len(tkp) == 0 || tkp[len(tkp)-1] != tagSeparatorChar || tagSeparatorChar >= 0xff { - logger.Panicf("BUG: the last char in tkp=%X must be %X. Check unmarshalTagValue code", tkp, tagSeparatorChar) - } - kb.B = append(kb.B[:0], tkp...) + // The last char in kb.B must be tagSeparatorChar. 
+ // Just increment it in order to jump to the next tag key. + kb.B = marshalCommonPrefix(kb.B[:0], nsPrefixTagToMetricIDs, accountID, projectID) + kb.B = marshalTagValue(kb.B, mp.Tag.Key) kb.B[len(kb.B)-1]++ ts.Seek(kb.B) } if err := ts.Error(); err != nil { - return fmt.Errorf("error during search for commonPrefix %q: %s", commonPrefix, err) + return fmt.Errorf("error during search for prefix %q: %s", prefix, err) } return nil } @@ -732,24 +721,18 @@ func (is *indexSearch) searchTagKeys(accountID, projectID uint32, tks map[string func (db *indexDB) SearchTagValues(accountID, projectID uint32, tagKey []byte, maxTagValues int) ([]string, error) { // TODO: cache results? - kb := kbPool.Get() - kb.B = marshalCommonPrefix(kb.B[:0], nsPrefixTagToMetricID, accountID, projectID) - kb.B = marshalTagValue(kb.B, tagKey) - tvs := make(map[string]struct{}) is := db.getIndexSearch() - err := is.searchTagValues(tvs, kb.B, maxTagValues) + err := is.searchTagValues(accountID, projectID, tvs, tagKey, maxTagValues) db.putIndexSearch(is) if err != nil { - kbPool.Put(kb) return nil, err } ok := db.doExtDB(func(extDB *indexDB) { is := extDB.getIndexSearch() - err = is.searchTagValues(tvs, kb.B, maxTagValues) + err = is.searchTagValues(accountID, projectID, tvs, tagKey, maxTagValues) extDB.putIndexSearch(is) }) - kbPool.Put(kb) if ok && err != nil { return nil, err } @@ -763,49 +746,37 @@ func (db *indexDB) SearchTagValues(accountID, projectID uint32, tagKey []byte, m return tagValues, nil } -func (is *indexSearch) searchTagValues(tvs map[string]struct{}, prefix []byte, maxTagValues int) error { +func (is *indexSearch) searchTagValues(accountID, projectID uint32, tvs map[string]struct{}, tagKey []byte, maxTagValues int) error { ts := &is.ts kb := &is.kb + mp := &is.mp + mp.Reset() dmis := is.db.getDeletedMetricIDs() + kb.B = marshalCommonPrefix(kb.B[:0], nsPrefixTagToMetricIDs, accountID, projectID) + kb.B = marshalTagValue(kb.B, tagKey) + prefix := kb.B ts.Seek(prefix) for len(tvs) < 
maxTagValues && ts.NextItem() { - k := ts.Item - if !bytes.HasPrefix(k, prefix) { + item := ts.Item + if !bytes.HasPrefix(item, prefix) { break } - - // Get TagValue - k = k[len(prefix):] - var err error - k, kb.B, err = unmarshalTagValue(kb.B[:0], k) - if err != nil { - return fmt.Errorf("cannot unmarshal tagValue: %s", err) + if err := mp.Init(item); err != nil { + return err } - if len(k) != 8 { - return fmt.Errorf("unexpected suffix after tag value; want %d bytes; got %d bytes", 8, len(k)) - } - - // Verify whether the corresponding metric is deleted. - if len(dmis) > 0 { - metricID := encoding.UnmarshalUint64(k) - if _, deleted := dmis[metricID]; deleted { - // The metric is deleted. - continue - } + if mp.IsDeletedTag(dmis) { + continue } // Store tag value - tvs[string(kb.B)] = struct{}{} + tvs[string(mp.Tag.Value)] = struct{}{} // Search for the next tag value. - // tkp (tag key prefix) contains (commonPrefix + encoded tag value). - // The last char must be tagSeparatorChar. Just increment it - // in order to jump to the next tag key. - tkp := ts.Item[:len(ts.Item)-8] - if len(tkp) == 0 || tkp[len(tkp)-1] != tagSeparatorChar || tagSeparatorChar >= 0xff { - logger.Panicf("BUG: the last char in tkp=%X must be %X. Check unmarshalTagValue code", tkp, tagSeparatorChar) - } - kb.B = append(kb.B[:0], tkp...) + // The last char in kb.B must be tagSeparatorChar. + // Just increment it in order to jump to the next tag value. 
+ kb.B = marshalCommonPrefix(kb.B[:0], nsPrefixTagToMetricIDs, accountID, projectID) + kb.B = marshalTagValue(kb.B, mp.Tag.Key) + kb.B = marshalTagValue(kb.B, mp.Tag.Value) kb.B[len(kb.B)-1]++ ts.Seek(kb.B) } @@ -1460,7 +1431,7 @@ func (is *indexSearch) getTagFilterWithMinMetricIDsCount(tfs *TagFilters, maxMet } func matchTagFilters(mn *MetricName, tfs []*tagFilter, kb *bytesutil.ByteBuffer) (bool, error) { - kb.B = marshalCommonPrefix(kb.B[:0], nsPrefixTagToMetricID, mn.AccountID, mn.ProjectID) + kb.B = marshalCommonPrefix(kb.B[:0], nsPrefixTagToMetricIDs, mn.AccountID, mn.ProjectID) for _, tf := range tfs { if len(tf.key) == 0 { // Match against mn.MetricGroup. @@ -1628,7 +1599,10 @@ func (is *indexSearch) getMetricIDsForTagFilter(tf *tagFilter, maxMetrics int) ( if len(tf.orSuffixes) > 0 { // Fast path for orSuffixes - seek for rows for each value from orSuffxies. if err := is.updateMetricIDsForOrSuffixesNoFilter(tf, maxMetrics, metricIDs); err != nil { - return nil, err + if err == errFallbackToMetricNameMatch { + return nil, err + } + return nil, fmt.Errorf("error when searching for metricIDs for tagFilter in fast path: %s; tagFilter=%s", err, tf) } return metricIDs, nil } @@ -1640,7 +1614,10 @@ func (is *indexSearch) getMetricIDsForTagFilter(tf *tagFilter, maxMetrics int) ( return len(metricIDs) < maxMetrics }) if err != nil { - return nil, err + if err == errFallbackToMetricNameMatch { + return nil, err + } + return nil, fmt.Errorf("error when searching for metricIDs for tagFilter in slow path: %s; tagFilter=%s", err, tf) } return metricIDs, nil } @@ -1654,46 +1631,53 @@ func (is *indexSearch) getMetricIDsForTagFilterSlow(tf *tagFilter, maxLoops int, loops := 0 ts := &is.ts kb := &is.kb - var prevMatchingK []byte + mp := &is.mp + mp.Reset() + var prevMatchingSuffix []byte var prevMatch bool - ts.Seek(tf.prefix) + prefix := tf.prefix + ts.Seek(prefix) for ts.NextItem() { - loops++ - if loops > maxLoops { - return errFallbackToMetricNameMatch + item := ts.Item 
+ if !bytes.HasPrefix(item, prefix) { + return nil } - k := ts.Item - if !bytes.HasPrefix(k, tf.prefix) { - break + tail := item[len(prefix):] + n := bytes.IndexByte(tail, tagSeparatorChar) + if n < 0 { + return fmt.Errorf("invalid tag->metricIDs line %q: cannot find tagSeparatorChar=%d", item, tagSeparatorChar) } - - // Get MetricID from k (the last 8 bytes). - k = k[len(tf.prefix):] - if len(k) < 8 { - return fmt.Errorf("invald key suffix size; want at least %d bytes; got %d bytes", 8, len(k)) + suffix := tail[:n+1] + tail = tail[n+1:] + if err := mp.InitOnlyTail(item, tail); err != nil { + return err } - v := k[len(k)-8:] - k = k[:len(k)-8] - metricID := encoding.UnmarshalUint64(v) - - if prevMatch && string(k) == string(prevMatchingK) { + if prevMatch && string(suffix) == string(prevMatchingSuffix) { // Fast path: the same tag value found. // There is no need in checking it again with potentially // slow tf.matchSuffix, which may call regexp. - if !f(metricID) { - break + mp.ParseMetricIDs() + loops += len(mp.MetricIDs) + if loops > maxLoops { + return errFallbackToMetricNameMatch + } + for _, metricID := range mp.MetricIDs { + if !f(metricID) { + return nil + } } continue } - ok, err := tf.matchSuffix(k) + // Slow path: need tf.matchSuffix call. + ok, err := tf.matchSuffix(suffix) if err != nil { - return fmt.Errorf("error when matching %s: %s", tf, err) + return fmt.Errorf("error when matching %s against suffix %q: %s", tf, suffix, err) } if !ok { prevMatch = false // Optimization: skip all the metricIDs for the given tag value - kb.B = append(kb.B[:0], ts.Item[:len(ts.Item)-8]...) + kb.B = append(kb.B[:0], item[:len(item)-len(tail)]...) // The last char in kb.B must be tagSeparatorChar. Just increment it // in order to jump to the next tag value. 
if len(kb.B) == 0 || kb.B[len(kb.B)-1] != tagSeparatorChar || tagSeparatorChar >= 0xff { @@ -1704,13 +1688,20 @@ func (is *indexSearch) getMetricIDsForTagFilterSlow(tf *tagFilter, maxLoops int, continue } prevMatch = true - prevMatchingK = append(prevMatchingK[:0], k...) - if !f(metricID) { - break + prevMatchingSuffix = append(prevMatchingSuffix[:0], suffix...) + mp.ParseMetricIDs() + loops += len(mp.MetricIDs) + if loops > maxLoops { + return errFallbackToMetricNameMatch + } + for _, metricID := range mp.MetricIDs { + if !f(metricID) { + return nil + } } } if err := ts.Error(); err != nil { - return fmt.Errorf("error when searching for tag filter prefix %q: %s", tf.prefix, err) + return fmt.Errorf("error when searching for tag filter prefix %q: %s", prefix, err) } return nil } @@ -1752,24 +1743,27 @@ func (is *indexSearch) updateMetricIDsForOrSuffixesWithFilter(tf *tagFilter, met func (is *indexSearch) updateMetricIDsForOrSuffixNoFilter(prefix []byte, maxMetrics int, metricIDs map[uint64]struct{}) error { ts := &is.ts + mp := &is.mp + mp.Reset() maxLoops := maxMetrics * maxIndexScanLoopsPerMetric loops := 0 ts.Seek(prefix) for len(metricIDs) < maxMetrics && ts.NextItem() { - loops++ + item := ts.Item + if !bytes.HasPrefix(item, prefix) { + return nil + } + if err := mp.InitOnlyTail(item, item[len(prefix):]); err != nil { + return err + } + mp.ParseMetricIDs() + loops += len(mp.MetricIDs) if loops > maxLoops { return errFallbackToMetricNameMatch } - if !bytes.HasPrefix(ts.Item, prefix) { - break + for _, metricID := range mp.MetricIDs { + metricIDs[metricID] = struct{}{} } - // Get MetricID from ts.Item (the last 8 bytes). 
- v := ts.Item[len(prefix):] - if len(v) != 8 { - return fmt.Errorf("invalid key suffix size for prefix=%q; want %d bytes; got %d bytes; value=%q", 8, prefix, len(v), v) - } - metricID := encoding.UnmarshalUint64(v) - metricIDs[metricID] = struct{}{} } if err := ts.Error(); err != nil { return fmt.Errorf("error when searching for tag filter prefix %q: %s", prefix, err) @@ -1778,48 +1772,67 @@ func (is *indexSearch) updateMetricIDsForOrSuffixNoFilter(prefix []byte, maxMetr } func (is *indexSearch) updateMetricIDsForOrSuffixWithFilter(prefix []byte, metricIDs map[uint64]struct{}, sortedFilter []uint64, isNegative bool) error { + if len(sortedFilter) == 0 { + return nil + } + firstFilterMetricID := sortedFilter[0] + lastFilterMetricID := sortedFilter[len(sortedFilter)-1] ts := &is.ts - kb := &is.kb - for { - // Seek for the next metricID from sortedFilter. - if len(sortedFilter) == 0 { - // All the sorteFilter entries have been searched. - break + mp := &is.mp + mp.Reset() + maxLoops := len(sortedFilter) * maxIndexScanLoopsPerMetric + loops := 0 + ts.Seek(prefix) + var sf []uint64 + var metricID uint64 + for ts.NextItem() { + item := ts.Item + if !bytes.HasPrefix(item, prefix) { + return nil } - nextMetricID := sortedFilter[0] - sortedFilter = sortedFilter[1:] - kb.B = append(kb.B[:0], prefix...) - kb.B = encoding.MarshalUint64(kb.B, nextMetricID) - ts.Seek(kb.B) - if !ts.NextItem() { - break + if err := mp.InitOnlyTail(item, item[len(prefix):]); err != nil { + return err } - if !bytes.HasPrefix(ts.Item, prefix) { - break + firstMetricID, lastMetricID := mp.FirstAndLastMetricIDs() + if lastMetricID < firstFilterMetricID { + // Skip the item, since it contains metricIDs lower + // than metricIDs in sortedFilter. + continue } - // Get MetricID from ts.Item (the last 8 bytes). 
- v := ts.Item[len(prefix):] - if len(v) != 8 { - return fmt.Errorf("invalid key suffix size for prefix=%q; want %d bytes; got %d bytes; value=%q", 8, prefix, len(v), v) + if firstMetricID > lastFilterMetricID { + // Stop searching, since the current item and all the subsequent items + // contain metricIDs higher than metricIDs in sortedFilter. + return nil } - metricID := encoding.UnmarshalUint64(v) - if metricID != nextMetricID { - // Skip metricIDs smaller than the found metricID, since they don't - // match anything. - if len(sortedFilter) > 0 && metricID > sortedFilter[0] { - sortedFilter = sortedFilter[1:] - n := sort.Search(len(sortedFilter), func(i int) bool { - return metricID <= sortedFilter[i] - }) - sortedFilter = sortedFilter[n:] + sf = sortedFilter + mp.ParseMetricIDs() + loops += len(mp.MetricIDs) + if loops > maxLoops { + return errFallbackToMetricNameMatch + } + for _, metricID = range mp.MetricIDs { + if len(sf) == 0 { + break } - continue + if metricID > sf[0] { + n := sort.Search(len(sf), func(i int) bool { + return i >= 0 && i < len(sf) && sf[i] >= metricID + }) + sf = sf[n:] + if len(sf) == 0 { + break + } + } + if metricID < sf[0] { + continue + } + if isNegative { + delete(metricIDs, metricID) + } else { + metricIDs[metricID] = struct{}{} + } + sf = sf[1:] } - if isNegative { - delete(metricIDs, metricID) - continue - } - metricIDs[metricID] = struct{}{} } if err := ts.Error(); err != nil { return fmt.Errorf("error when searching for tag filter prefix %q: %s", prefix, err) @@ -2071,7 +2084,7 @@ func (is *indexSearch) updateMetricIDsAll(metricIDs map[uint64]struct{}, account // The maximum number of index scan loops per already found metric. // Bigger number of loops is slower than updateMetricIDsByMetricNameMatch // over the found metrics. 
-const maxIndexScanLoopsPerMetric = 32 +const maxIndexScanLoopsPerMetric = 400 func (is *indexSearch) intersectMetricIDsWithTagFilter(tf *tagFilter, filter map[uint64]struct{}) (map[uint64]struct{}, error) { if len(filter) == 0 { @@ -2084,7 +2097,10 @@ func (is *indexSearch) intersectMetricIDsWithTagFilter(tf *tagFilter, filter map if len(tf.orSuffixes) > 0 { // Fast path for orSuffixes - seek for rows for each value from orSuffixes. if err := is.updateMetricIDsForOrSuffixesWithFilter(tf, metricIDs, filter); err != nil { - return nil, err + if err == errFallbackToMetricNameMatch { + return nil, err + } + return nil, fmt.Errorf("error when intersecting metricIDs for tagFilter in fast path: %s; tagFilter=%s", err, tf) } return metricIDs, nil } @@ -2103,7 +2119,10 @@ func (is *indexSearch) intersectMetricIDsWithTagFilter(tf *tagFilter, filter map return true }) if err != nil { - return nil, err + if err == errFallbackToMetricNameMatch { + return nil, err + } + return nil, fmt.Errorf("error when intersecting metricIDs for tagFilter in slow path: %s; tagFilter=%s", err, tf) } return metricIDs, nil } @@ -2127,6 +2146,19 @@ func marshalCommonPrefix(dst []byte, nsPrefix byte, accountID, projectID uint32) return dst } +func unmarshalCommonPrefix(src []byte) ([]byte, byte, uint32, uint32, error) { + if len(src) < commonPrefixLen { + return nil, 0, 0, 0, fmt.Errorf("cannot unmarshal common prefix from %d bytes; need at least %d bytes; data=%X", len(src), commonPrefixLen, src) + } + prefix := src[0] + accountID := encoding.UnmarshalUint32(src[1:]) + projectID := encoding.UnmarshalUint32(src[5:]) + return src[commonPrefixLen:], prefix, accountID, projectID, nil +} + +// 1 byte for prefix, 4 bytes for accountID, 4 bytes for projectID +const commonPrefixLen = 9 + func getSortedMetricIDs(m map[uint64]struct{}) []uint64 { a := make(uint64Sorter, len(m)) i := 0 @@ -2139,6 +2171,179 @@ func getSortedMetricIDs(m map[uint64]struct{}) []uint64 { return a } +type tagToMetricIDsRowParser 
struct {
+	// AccountID contains parsed value after Init call
+	AccountID uint32
+
+	// ProjectID contains parsed value after Init call
+	ProjectID uint32
+
+	// MetricIDs contains parsed MetricIDs after ParseMetricIDs call
+	MetricIDs []uint64
+
+	// Tag contains parsed tag after Init call
+	Tag Tag
+
+	// tail contains the remaining unparsed metricIDs
+	tail []byte
+}
+
+func (mp *tagToMetricIDsRowParser) Reset() {
+	mp.AccountID = 0
+	mp.ProjectID = 0
+	mp.MetricIDs = mp.MetricIDs[:0]
+	mp.Tag.Reset()
+	mp.tail = nil
+}
+
+// Init initializes mp from b, which should contain encoded tag->metricIDs row.
+//
+// b cannot be re-used until Reset call.
+func (mp *tagToMetricIDsRowParser) Init(b []byte) error {
+	tail, prefix, accountID, projectID, err := unmarshalCommonPrefix(b)
+	if err != nil {
+		return fmt.Errorf("invalid tag->metricIDs row %q: %s", b, err)
+	}
+	if prefix != nsPrefixTagToMetricIDs {
+		return fmt.Errorf("invalid prefix for tag->metricIDs row %q; got %d; want %d", b, prefix, nsPrefixTagToMetricIDs)
+	}
+	mp.AccountID = accountID
+	mp.ProjectID = projectID
+	tail, err = mp.Tag.Unmarshal(tail)
+	if err != nil {
+		return fmt.Errorf("cannot unmarshal tag from tag->metricIDs row %q: %s", b, err)
+	}
+	return mp.InitOnlyTail(b, tail)
+}
+
+// InitOnlyTail initializes mp.tail from tail, which must hold one or more 8-byte metricIDs.
+//
+// b must contain tag->metricIDs row; it is used here only for error messages.
+// b cannot be re-used until Reset call, since mp.tail keeps referencing the passed tail.
+func (mp *tagToMetricIDsRowParser) InitOnlyTail(b, tail []byte) error {
+	if len(tail) == 0 {
+		return fmt.Errorf("missing metricID in the tag->metricIDs row %q", b)
+	}
+	if len(tail)%8 != 0 {
+		return fmt.Errorf("invalid tail length in the tag->metricIDs row %q; got %d bytes; must be multiple of 8 bytes", b, len(tail))
+	}
+	mp.tail = tail
+	return nil
+}
+
+// EqualPrefix returns true if prefixes for mp and x are equal.
+//
+// Prefix contains (tag, accountID, projectID)
+func (mp *tagToMetricIDsRowParser) EqualPrefix(x *tagToMetricIDsRowParser) bool {
+	if !mp.Tag.Equal(&x.Tag) {
+		return false
+	}
+	return mp.ProjectID == x.ProjectID && mp.AccountID == x.AccountID
+}
+
+// FirstAndLastMetricIDs returns the first and the last metricIDs in the mp.tail (equal for a single-metricID tail).
+func (mp *tagToMetricIDsRowParser) FirstAndLastMetricIDs() (uint64, uint64) {
+	tail := mp.tail
+	if len(tail) < 8 {
+		logger.Panicf("BUG: cannot unmarshal metricID from %d bytes; need 8 bytes", len(tail))
+		return 0, 0
+	}
+	firstMetricID := encoding.UnmarshalUint64(tail)
+	lastMetricID := firstMetricID
+	if len(tail) > 8 {
+		lastMetricID = encoding.UnmarshalUint64(tail[len(tail)-8:])
+	}
+	return firstMetricID, lastMetricID
+}
+
+// ParseMetricIDs parses MetricIDs from mp.tail into mp.MetricIDs; an empty mp.tail yields empty mp.MetricIDs.
+func (mp *tagToMetricIDsRowParser) ParseMetricIDs() {
+	tail := mp.tail
+	mp.MetricIDs = mp.MetricIDs[:0]
+	n := len(tail) / 8
+	if n <= cap(mp.MetricIDs) {
+		mp.MetricIDs = mp.MetricIDs[:n]
+	} else {
+		mp.MetricIDs = append(mp.MetricIDs[:cap(mp.MetricIDs)], make([]uint64, n-cap(mp.MetricIDs))...)
+	}
+	metricIDs := mp.MetricIDs
+	// Range over metricIDs: a metricIDs[n-1] hint would panic for n == 0 (e.g. right after Reset).
+	for i := range metricIDs {
+		if len(tail) < 8 {
+			logger.Panicf("BUG: tail cannot be smaller than 8 bytes; got %d bytes; tail=%X", len(tail), tail)
+			return
+		}
+		metricID := encoding.UnmarshalUint64(tail)
+		metricIDs[i] = metricID
+		tail = tail[8:]
+	}
+}
+
+// IsDeletedTag verifies whether the tag from mp is deleted according to dmis.
+//
+// dmis must contain deleted MetricIDs.
+func (mp *tagToMetricIDsRowParser) IsDeletedTag(dmis map[uint64]struct{}) bool { + if len(dmis) == 0 { + return false + } + mp.ParseMetricIDs() + for _, metricID := range mp.MetricIDs { + if _, ok := dmis[metricID]; !ok { + return false + } + } + return true +} + +func mergeTagToMetricIDsRows(data []byte, items [][]byte) ([]byte, [][]byte) { + // Perform quick checks whether items contain tag->metricIDs rows + // based on the fact that items are sorted. + if len(items) == 0 { + return data, items + } + firstItem := items[0] + if len(firstItem) > 0 && firstItem[0] > nsPrefixTagToMetricIDs { + return data, items + } + lastItem := items[len(items)-1] + if len(lastItem) > 0 && lastItem[0] < nsPrefixTagToMetricIDs { + return data, items + } + + // items contain at least one tag->metricIDs row. Merge rows with common tag. + dstData := data[:0] + dstItems := items[:0] + + tmm := getTagToMetricIDsRowsMerger() + defer putTagToMetricIDsRowsMerger(tmm) + + mp := &tmm.mp + mpPrev := &tmm.mpPrev + for _, item := range items { + if len(item) == 0 || item[0] != nsPrefixTagToMetricIDs { + if len(tmm.pendingMetricIDs) > 0 { + dstData, dstItems = tmm.flushPendingMetricIDs(dstData, dstItems, mpPrev) + } + dstData = append(dstData, item...) + dstItems = append(dstItems, dstData[len(dstData)-len(item):]) + continue + } + if err := mp.Init(item); err != nil { + logger.Panicf("FATAL: cannot parse tag->metricIDs row during merge: %s", err) + } + if len(tmm.pendingMetricIDs) > 0 && !mp.EqualPrefix(mpPrev) { + dstData, dstItems = tmm.flushPendingMetricIDs(dstData, dstItems, mpPrev) + } + mp.ParseMetricIDs() + tmm.pendingMetricIDs = append(tmm.pendingMetricIDs, mp.MetricIDs...) 
+ mpPrev, mp = mp, mpPrev + } + if len(tmm.pendingMetricIDs) > 0 { + dstData, dstItems = tmm.flushPendingMetricIDs(dstData, dstItems, mpPrev) + } + return dstData, dstItems +} + type uint64Sorter []uint64 func (s uint64Sorter) Len() int { return len(s) } @@ -2148,3 +2353,43 @@ func (s uint64Sorter) Less(i, j int) bool { func (s uint64Sorter) Swap(i, j int) { s[i], s[j] = s[j], s[i] } + +type tagToMetricIDsRowsMerger struct { + pendingMetricIDs uint64Sorter + mp tagToMetricIDsRowParser + mpPrev tagToMetricIDsRowParser +} + +func (tmm *tagToMetricIDsRowsMerger) flushPendingMetricIDs(dstData []byte, dstItems [][]byte, mp *tagToMetricIDsRowParser) ([]byte, [][]byte) { + if len(tmm.pendingMetricIDs) == 0 { + logger.Panicf("BUG: pendingMetricIDs must be non-empty") + } + dstDataLen := len(dstData) + dstData = marshalCommonPrefix(dstData, nsPrefixTagToMetricIDs, mp.AccountID, mp.ProjectID) + dstData = mp.Tag.Marshal(dstData) + // Use sort.Sort instead of sort.Slice in order to reduce memory allocations + sort.Sort(&tmm.pendingMetricIDs) + for _, metricID := range tmm.pendingMetricIDs { + dstData = encoding.MarshalUint64(dstData, metricID) + } + tmm.pendingMetricIDs = tmm.pendingMetricIDs[:0] + dstItems = append(dstItems, dstData[dstDataLen:]) + return dstData, dstItems +} + +func getTagToMetricIDsRowsMerger() *tagToMetricIDsRowsMerger { + v := tmmPool.Get() + if v == nil { + return &tagToMetricIDsRowsMerger{} + } + return v.(*tagToMetricIDsRowsMerger) +} + +func putTagToMetricIDsRowsMerger(tmm *tagToMetricIDsRowsMerger) { + tmm.pendingMetricIDs = tmm.pendingMetricIDs[:0] + tmm.mp.Reset() + tmm.mpPrev.Reset() + tmmPool.Put(tmm) +} + +var tmmPool sync.Pool diff --git a/lib/storage/metric_name.go b/lib/storage/metric_name.go index b416aff80d..bc24a6ec63 100644 --- a/lib/storage/metric_name.go +++ b/lib/storage/metric_name.go @@ -25,6 +25,17 @@ type Tag struct { Value []byte } +// Reset resets the tag. 
+func (tag *Tag) Reset() { + tag.Key = tag.Key[:0] + tag.Value = tag.Value[:0] +} + +// Equal returns true if tag equals t +func (tag *Tag) Equal(t *Tag) bool { + return string(tag.Key) == string(t.Key) && string(tag.Value) == string(t.Value) +} + // Marshal appends marshaled tag to dst and returns the result. func (tag *Tag) Marshal(dst []byte) []byte { dst = marshalTagValue(dst, tag.Key) diff --git a/lib/storage/partition.go b/lib/storage/partition.go index 3f77782a37..b9fe8f6cbf 100644 --- a/lib/storage/partition.go +++ b/lib/storage/partition.go @@ -881,7 +881,7 @@ func (pt *partition) mergeSmallParts(isFinal bool) error { maxRows := maxRowsByPath(pt.smallPartsPath) if maxRows > maxRowsPerSmallPart() { // The output part may go to big part, - // so make sure it as enough space. + // so make sure it has enough space. maxBigPartRows := maxRowsByPath(pt.bigPartsPath) if maxRows > maxBigPartRows { maxRows = maxBigPartRows diff --git a/lib/storage/tag_filters.go b/lib/storage/tag_filters.go index d576cfe439..da03186481 100644 --- a/lib/storage/tag_filters.go +++ b/lib/storage/tag_filters.go @@ -23,7 +23,7 @@ type TagFilters struct { tfs []tagFilter // Common prefix for all the tag filters. - // Contains encoded nsPrefixTagToMetricID + accountID + projectID + // Contains encoded nsPrefixTagToMetricIDs + accountID + projectID. 
commonPrefix []byte } @@ -32,7 +32,7 @@ func NewTagFilters(accountID, projectID uint32) *TagFilters { return &TagFilters{ accountID: accountID, projectID: projectID, - commonPrefix: marshalCommonPrefix(nil, nsPrefixTagToMetricID, accountID, projectID), + commonPrefix: marshalCommonPrefix(nil, nsPrefixTagToMetricIDs, accountID, projectID), } } @@ -87,7 +87,7 @@ func (tfs *TagFilters) Reset(accountID, projectID uint32) { tfs.accountID = accountID tfs.projectID = projectID tfs.tfs = tfs.tfs[:0] - tfs.commonPrefix = marshalCommonPrefix(tfs.commonPrefix[:0], nsPrefixTagToMetricID, accountID, projectID) + tfs.commonPrefix = marshalCommonPrefix(tfs.commonPrefix[:0], nsPrefixTagToMetricIDs, accountID, projectID) } func (tfs *TagFilters) marshal(dst []byte) []byte { @@ -106,7 +106,7 @@ type tagFilter struct { isNegative bool isRegexp bool - // Prefix always contains {nsPrefixTagToMetricID, AccountID, ProjectID, key}. + // Prefix always contains {nsPrefixTagToMetricIDs, AccountID, ProjectID, key}. // Additionally it contains: // - value ending with tagSeparatorChar if !isRegexp. // - non-regexp prefix if isRegexp.