Mirror of https://github.com/VictoriaMetrics/VictoriaMetrics.git, synced 2024-12-15 08:23:34 +01:00
app/vminsert/netstorage: make sure that the data is always replicated among -replicationFactor vmstorage nodes

Previously vminsert could write multiple copies of the data to a single vmstorage node when the ingestion rate exceeded the maximum throughput of the connections to vmstorage nodes.
This commit is contained in:
parent 981caa6f0b
commit 7a61357b5d
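The gist of the change, before the diff: each per-node buffer is now sent to -replicationFactor distinct vmstorage nodes, starting from the owning node's index and walking forward with wrap-around, so copies can no longer pile up on a single node. Below is a minimal Go sketch of that fan-out; the node type, its send method and the function name are illustrative stand-ins, not the identifiers used in the diff (which adds sendBufToReplicas and sendBufRows).

package main

import "fmt"

// node stands in for a vmstorage connection; send reports whether the
// buffer was accepted (a hypothetical simplification of sn.sendBufRows).
type node struct {
	addr string
	up   bool
}

func (n *node) send(buf []byte) bool { return n.up }

// sendToReplicas mirrors the fan-out added by this commit, simplified:
// place buf on `replicas` distinct nodes, starting at startIdx and walking
// forward with wrap-around, skipping nodes that already hold a copy.
// It reports false only when not even the first copy could be written.
func sendToReplicas(nodes []*node, startIdx, replicas int, buf []byte) bool {
	if replicas <= 0 {
		replicas = 1
	}
	if replicas > len(nodes) {
		replicas = len(nodes)
	}
	used := make(map[*node]bool, replicas)
	for i := 0; i < replicas; i++ {
		idx := startIdx + i
		attempts := 0
		for {
			attempts++
			if attempts > len(nodes) {
				// Every node was tried for this copy and none accepted it.
				// Partial replication (i > 0) still counts as success.
				return i > 0
			}
			if idx >= len(nodes) {
				idx = 0
			}
			n := nodes[idx]
			idx++
			if used[n] || !n.send(buf) {
				continue // already holds a copy, or temporarily unavailable
			}
			used[n] = true
			break
		}
	}
	return true
}

func main() {
	nodes := []*node{{addr: "a", up: true}, {addr: "b", up: false}, {addr: "c", up: true}}
	fmt.Println(sendToReplicas(nodes, 1, 2, []byte("rows"))) // true: copies land on c and a
}

As in the diff, a copy beyond the first that cannot be placed anywhere is only counted and logged rather than retried, since retrying could produce uncontrolled duplicate data.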
@@ -1,7 +1,6 @@
 package netstorage
 
 import (
-	"flag"
 	"fmt"
 	"net/http"
 
@@ -15,10 +14,6 @@ import (
 	jump "github.com/lithammer/go-jump-consistent-hash"
 )
 
-var replicationFactor = flag.Int("replicationFactor", 1, "Replication factor for the ingested data, i.e. how many copies to make among distinct -storageNode instances. "+
-	"Note that vmselect must run with -dedup.minScrapeInterval=1ms for data de-duplication when replicationFactor is greater than 1. "+
-	"Higher values for -dedup.minScrapeInterval at vmselect is OK")
-
 // InsertCtx is a generic context for inserting data.
 //
 // InsertCtx.Reset must be called before the first usage.
@@ -120,17 +115,8 @@ func (ctx *InsertCtx) WriteDataPoint(at *auth.Token, labels []prompb.Label, time
 
 // WriteDataPointExt writes the given metricNameRaw with (timestmap, value) to ctx buffer with the given storageNodeIdx.
 func (ctx *InsertCtx) WriteDataPointExt(at *auth.Token, storageNodeIdx int, metricNameRaw []byte, timestamp int64, value float64) error {
-	idx := storageNodeIdx
-	replicas := *replicationFactor
-	if replicas <= 0 {
-		replicas = 1
-	}
-	if replicas > len(storageNodes) {
-		replicas = len(storageNodes)
-	}
-	for {
-		br := &ctx.bufRowss[idx]
-		sn := storageNodes[idx]
+	br := &ctx.bufRowss[storageNodeIdx]
+	sn := storageNodes[storageNodeIdx]
 	bufNew := storage.MarshalMetricRow(br.buf, metricNameRaw, timestamp, value)
 	if len(bufNew) >= maxBufSizePerStorageNode {
 		// Send buf to storageNode, since it is too big.
@@ -142,16 +128,7 @@ func (ctx *InsertCtx) WriteDataPointExt(at *auth.Token, storageNodeIdx int, metr
 		br.buf = bufNew
 	}
 	br.rows++
-
-	replicas--
-	if replicas == 0 {
 	return nil
-	}
-	idx++
-	if idx >= len(storageNodes) {
-		idx = 0
-	}
-	}
 }
 
 // FlushBufs flushes ctx bufs to remote storage nodes.
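The hunks above cover the InsertCtx write path: WriteDataPointExt now appends a row only to the buffer of its hash-selected node and no longer loops over replicas itself; replication is deferred to the per-node send loop changed in the hunks below. A rough sketch of that append-then-flush buffering, assuming a hypothetical marshalRow and flush callback in place of storage.MarshalMetricRow and the real send path:

package main

import "fmt"

// bufRows mirrors the per-node buffer from the diff: raw bytes plus a row count.
type bufRows struct {
	buf  []byte
	rows int
}

// maxBufSizePerNode stands in for maxBufSizePerStorageNode.
const maxBufSizePerNode = 256

// marshalRow is a hypothetical stand-in for storage.MarshalMetricRow.
func marshalRow(dst []byte, name string, ts int64, v float64) []byte {
	return append(dst, fmt.Sprintf("%s %d %g\n", name, ts, v)...)
}

// writeRow appends one row to the node's buffer and flushes the buffer
// once it grows past the cap, matching the shape of WriteDataPointExt above.
func writeRow(br *bufRows, flush func([]byte, int) error, name string, ts int64, v float64) error {
	bufNew := marshalRow(br.buf, name, ts, v)
	if len(bufNew) >= maxBufSizePerNode {
		// The buffer is too big: send what is already buffered,
		// then start a fresh buffer holding only the new row.
		if err := flush(br.buf, br.rows); err != nil {
			return err
		}
		br.buf = marshalRow(br.buf[:0], name, ts, v)
		br.rows = 0
	} else {
		br.buf = bufNew
	}
	br.rows++
	return nil
}

func main() {
	br := &bufRows{}
	flush := func(b []byte, rows int) error {
		fmt.Printf("flushing %d bytes, %d rows\n", len(b), rows)
		return nil
	}
	for i := 0; i < 20; i++ {
		_ = writeRow(br, flush, "requests_total", int64(i), float64(i))
	}
	fmt.Printf("%d rows still buffered (%d bytes)\n", br.rows, len(br.buf))
}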
@@ -21,7 +21,12 @@ import (
 	jump "github.com/lithammer/go-jump-consistent-hash"
 )
 
-var disableRPCCompression = flag.Bool(`rpc.disableCompression`, false, "Disable compression of RPC traffic. This reduces CPU usage at the cost of higher network bandwidth usage")
+var (
+	disableRPCCompression = flag.Bool(`rpc.disableCompression`, false, "Disable compression of RPC traffic. This reduces CPU usage at the cost of higher network bandwidth usage")
+	replicationFactor = flag.Int("replicationFactor", 1, "Replication factor for the ingested data, i.e. how many copies to make among distinct -storageNode instances. "+
+		"Note that vmselect must run with -dedup.minScrapeInterval=1ms for data de-duplication when replicationFactor is greater than 1. "+
+		"Higher values for -dedup.minScrapeInterval at vmselect is OK")
+)
 
 func (sn *storageNode) isBroken() bool {
 	return atomic.LoadUint32(&sn.broken) != 0
@@ -78,12 +83,18 @@ var closedCh = func() <-chan struct{} {
 	return ch
 }()
 
-func (sn *storageNode) run(stopCh <-chan struct{}) {
+func (sn *storageNode) run(stopCh <-chan struct{}, snIdx int) {
+	replicas := *replicationFactor
+	if replicas <= 0 {
+		replicas = 1
+	}
+	if replicas > len(storageNodes) {
+		replicas = len(storageNodes)
+	}
+
 	ticker := time.NewTicker(time.Second)
 	defer ticker.Stop()
 	var br bufRows
-	var bc *handshake.BufferedConn
-	var err error
 	var waitCh <-chan struct{}
 	mustStop := false
 	for !mustStop {
@@ -108,44 +119,94 @@ func (sn *storageNode) run(stopCh <-chan struct{}) {
 			sn.br, br = br, sn.br
 			sn.brLock.Unlock()
 		}
-		if bc == nil {
-			bc, err = sn.dial()
-			if err != nil {
-				// Mark sn as broken in order to prevent sending additional data to it until it is recovered.
-				atomic.StoreUint32(&sn.broken, 1)
-				if len(br.buf) == 0 {
-					continue
-				}
-				logger.Warnf("re-routing %d bytes with %d rows to other storage nodes because cannot dial storageNode %q: %s",
-					len(br.buf), br.rows, sn.dialer.Addr(), err)
-				if addToReroutedBufNonblock(br.buf, br.rows) {
-					sn.rowsReroutedFromHere.Add(br.rows)
-					br.reset()
-				}
-				continue
-			}
-		}
-		if err = sendToConn(bc, br.buf); err == nil {
-			// Successfully sent buf to bc. Remove broken flag from sn.
-			atomic.StoreUint32(&sn.broken, 0)
-			sn.rowsSent.Add(br.rows)
-			br.reset()
-			continue
-		}
-		// Couldn't flush buf to sn. Mark sn as broken
-		// and try re-routing buf to healthy vmstorage nodes.
-		if err = bc.Close(); err != nil {
-			logger.Warnf("cannot close connection to storageNode %q: %s", sn.dialer.Addr(), err)
-			// continue executing the code below.
-		}
-		bc = nil
-		sn.connectionErrors.Inc()
-		atomic.StoreUint32(&sn.broken, 1)
-		if addToReroutedBufNonblock(br.buf, br.rows) {
-			sn.rowsReroutedFromHere.Add(br.rows)
-			br.reset()
-		}
+		if len(br.buf) == 0 {
+			// Nothing to send.
+			continue
+		}
+
+		// Send br to replicas storageNodes starting from snIdx.
+		if !sendBufToReplicas(&br, snIdx, replicas) {
+			// do not reset br in the hope it will be sent next time.
+			continue
+		}
+		br.reset()
 	}
 }
+
+func sendBufToReplicas(br *bufRows, snIdx, replicas int) bool {
+	usedStorageNodes := make(map[*storageNode]bool, replicas)
+	for i := 0; i < replicas; i++ {
+		idx := snIdx + i
+		attempts := 0
+		for {
+			attempts++
+			if attempts > len(storageNodes) {
+				if i == 0 {
+					// The data wasn't replicated at all.
+					rowsLostTotal.Add(br.rows)
+					logger.Warnf("cannot push %d bytes with %d rows to storage nodes, since all the nodes are temporarily unavailable; "+
+						"re-trying to send the data soon", len(br.buf), br.rows)
+					return false
+				}
+				// The data is partially replicated, so just emit a warning and return true.
+				// We could retry sending the data again, but this may result in uncontrolled duplicate data.
+				// So it is better returning true.
+				rowsIncompletelyReplicatedTotal.Add(br.rows)
+				logger.Warnf("cannot make a copy #%d out of %d copies according to -replicationFactor=%d for %d bytes with %d rows, "+
+					"since a part of storage nodes is temporarily unavailable", i+1, replicas, *replicationFactor, len(br.buf), br.rows)
+				return true
+			}
+			if idx >= len(storageNodes) {
+				idx = 0
+			}
+			sn := storageNodes[idx]
+			idx++
+			if usedStorageNodes[sn] {
+				// The br has been already replicated to sn. Skip it.
+				continue
+			}
+			if !sn.sendBufRows(br) {
+				// Cannot send data to sn. Go to the next sn.
+				continue
+			}
+			// Successfully sent data to sn.
+			usedStorageNodes[sn] = true
+			break
+		}
+	}
+	return true
+}
+
+func (sn *storageNode) sendBufRows(br *bufRows) bool {
+	sn.bcLock.Lock()
+	defer sn.bcLock.Unlock()
+
+	if sn.bc == nil {
+		bc, err := sn.dial()
+		if err != nil {
+			// Mark sn as broken in order to prevent sending additional data to it until it is recovered.
+			atomic.StoreUint32(&sn.broken, 1)
+			logger.Warnf("cannot dial storageNode %q: %s", sn.dialer.Addr(), err)
+			return false
+		}
+		sn.bc = bc
+	}
+	err := sendToConn(sn.bc, br.buf)
+	if err == nil {
+		// Successfully sent buf to bc. Remove broken flag from sn.
+		atomic.StoreUint32(&sn.broken, 0)
+		sn.rowsSent.Add(br.rows)
+		return true
+	}
+	// Couldn't flush buf to sn. Mark sn as broken.
+	logger.Warnf("cannot send %d bytes with %d rows to %q: %s; re-routing this data to healthy storage nodes", len(br.buf), br.rows, sn.dialer.Addr(), err)
+	if err = sn.bc.Close(); err != nil {
+		logger.Warnf("cannot close connection to storageNode %q: %s", sn.dialer.Addr(), err)
+	}
+	sn.bc = nil
+	sn.connectionErrors.Inc()
+	atomic.StoreUint32(&sn.broken, 1)
+	return false
+}
 
 func sendToConn(bc *handshake.BufferedConn, buf []byte) error {
@@ -272,8 +333,16 @@ type storageNode struct {
 	brLock sync.Mutex
 
 	// Buffer with data that needs to be written to the storage node.
+	// It must be accessed under brLock.
 	br bufRows
 
+	// bcLock protects bc.
+	bcLock sync.Mutex
+
+	// bc is a single connection to vmstorage for data transfer.
+	// It must be accessed under bcLock.
+	bc *handshake.BufferedConn
+
 	dialer *netutil.TCPDialer
 
 	// The number of dial errors to vmstorage node.
@@ -322,6 +391,7 @@ func InitStorageNodes(addrs []string) {
 		logger.Panicf("BUG: too much addresses: %d; max supported %d addresses", len(addrs), 255)
 	}
 
+	storageNodes = storageNodes[:0]
 	for _, addr := range addrs {
 		sn := &storageNode{
 			dialer: netutil.NewTCPDialer("vminsert", addr),
@@ -347,11 +417,6 @@ func InitStorageNodes(addrs []string) {
 			return float64(n)
 		})
 		storageNodes = append(storageNodes, sn)
-		storageNodesWG.Add(1)
-		go func(addr string) {
-			sn.run(storageNodesStopCh)
-			storageNodesWG.Done()
-		}(addr)
 	}
 
 	maxBufSizePerStorageNode = memory.Allowed() / 8 / len(storageNodes)
@@ -365,6 +430,15 @@ func InitStorageNodes(addrs []string) {
 	if reroutedBufMaxSize > maxBufSizePerStorageNode*len(storageNodes) {
 		reroutedBufMaxSize = maxBufSizePerStorageNode * len(storageNodes)
 	}
+
+	for idx, sn := range storageNodes {
+		storageNodesWG.Add(1)
+		go func(sn *storageNode, idx int) {
+			sn.run(storageNodesStopCh, idx)
+			storageNodesWG.Done()
+		}(sn, idx)
+	}
+
 	rerouteWorkerWG.Add(1)
 	go func() {
 		rerouteWorker(rerouteWorkerStopCh)
@@ -422,24 +496,6 @@ func addToReroutedBuf(buf []byte, rows int) error {
 	return nil
 }
 
-// addToReroutedBufNonblock adds buf to reroutedBR.
-//
-// It returns true if buf has been successfully added to reroutedBR.
-func addToReroutedBufNonblock(buf []byte, rows int) bool {
-	if len(buf) > reroutedBufMaxSize {
-		logger.Panicf("BUG: len(buf)=%d cannot exceed reroutedBufMaxSize=%d", len(buf), reroutedBufMaxSize)
-	}
-	reroutedBRLock.Lock()
-	ok := len(reroutedBR.buf)+len(buf) <= reroutedBufMaxSize
-	if ok {
-		reroutedBR.buf = append(reroutedBR.buf, buf...)
-		reroutedBR.rows += rows
-		reroutesTotal.Inc()
-	}
-	reroutedBRLock.Unlock()
-	return ok
-}
-
 func getHealthyStorageNodesCount() int {
 	n := 0
 	for _, sn := range storageNodes {
@@ -552,4 +608,5 @@ var (
 
 	rerouteErrors = metrics.NewCounter(`vm_rpc_reroute_errors_total{name="vminsert"}`)
 	rowsLostTotal = metrics.NewCounter(`vm_rpc_rows_lost_total{name="vminsert"}`)
+	rowsIncompletelyReplicatedTotal = metrics.NewCounter(`vm_rpc_rows_incompletely_replicated_total{name="vminsert"}`)
 )