diff --git a/README.md b/README.md
index a5a93f33a4..3ee4f65b44 100644
--- a/README.md
+++ b/README.md
@@ -16,8 +16,9 @@ Join [our Slack](http://slack.victoriametrics.com/) or [contact us](mailto:info@
 ## Prominent features
 
 - Supports all the features of [single-node version](https://github.com/VictoriaMetrics/VictoriaMetrics).
-- Performance and capacity scales horizontally.
-- Supports multiple independent namespaces for time series data (aka multi-tenancy).
+- Performance and capacity scales horizontally. See [these docs for details](#cluster-resizing-and-scalability).
+- Supports multiple independent namespaces for time series data (aka multi-tenancy). See [these docs for details](#multitenancy).
+- Supports replication. See [these docs for details](#replication-and-data-safety).
 
 
 ## Architecture overview
@@ -203,7 +204,7 @@ or [an alternative dashboard for VictoriaMetrics cluster](https://grafana.com/gr
 across `vmstorage` nodes.
 
 
-### Cluster resizing and scalability.
+### Cluster resizing and scalability
 
 Cluster performance and capacity scales with adding new nodes.
 
@@ -283,7 +284,7 @@ Upgrade follows `Cluster resizing procedure` under the hood.
 
 ### Replication and data safety
 
-VictoriaMetrics offloads replication to the underlying storage pointed by `-storageDataPath`.
+By default VictoriaMetrics offloads replication to the underlying storage pointed by `-storageDataPath`.
 It is recommended storing data on [Google Compute Engine persistent disks](https://cloud.google.com/compute/docs/disks/#pdspecs),
 since they are protected from data loss and data corruption.
 They also provide consistently high performance and [may be resized](https://cloud.google.com/compute/docs/disks/add-persistent-disk) without downtime.
@@ -291,7 +292,13 @@ HDD-based persistent disks should be enough for the majority of use cases.
 
 It is recommended using durable replicated persistent volumes in Kubernetes.
 
-Note that [replication doesn't save from disaster](https://medium.com/@valyala/speeding-up-backups-for-big-time-series-databases-533c1a927883).
+If `-replicationFactor=N` command-line flag is passed to `vminsert`, then `vminsert` puts `N` copies of the ingested data to distinct `vmstorage` nodes.
+This guarantees that all the data remains available for querying if up to `N-1` `vmstorage` nodes are unavailable. Note that `-dedup.minScrapeInterval=1ms` command-line
+flag must be passed to `vmselect` if `-replicationFactor` exceeds 1 in order to de-duplicate replicated data during queries.
+It is OK if `-dedup.minScrapeInterval` exceeds 1ms.
+
+Note that [replication doesn't save from disaster](https://medium.com/@valyala/speeding-up-backups-for-big-time-series-databases-533c1a927883),
+so it is recommended performing regular backups. See [these docs](#backups) for details.
 
 
 ### Backups
diff --git a/app/vminsert/netstorage/insert_ctx.go b/app/vminsert/netstorage/insert_ctx.go
index 54639ad096..1a7728d3aa 100644
--- a/app/vminsert/netstorage/insert_ctx.go
+++ b/app/vminsert/netstorage/insert_ctx.go
@@ -1,6 +1,7 @@
 package netstorage
 
 import (
+	"flag"
 	"fmt"
 	"net/http"
 
@@ -14,6 +15,10 @@ import (
 	jump "github.com/lithammer/go-jump-consistent-hash"
 )
 
+var replicationFactor = flag.Int("replicationFactor", 1, "Replication factor for the ingested data, i.e. how many copies to make among distinct -storageNode instances. "+
+	"Note that vmselect must run with -dedup.minScrapeInterval=1ms for data de-duplication when replicationFactor is greater than 1. "+
+	"Higher values for -dedup.minScrapeInterval at vmselect is OK")
+
 // InsertCtx is a generic context for inserting data.
 //
 // InsertCtx.Reset must be called before the first usage.
@@ -115,20 +120,38 @@ func (ctx *InsertCtx) WriteDataPoint(at *auth.Token, labels []prompb.Label, time
 
 // WriteDataPointExt writes the given metricNameRaw with (timestmap, value) to ctx buffer with the given storageNodeIdx.
 func (ctx *InsertCtx) WriteDataPointExt(at *auth.Token, storageNodeIdx int, metricNameRaw []byte, timestamp int64, value float64) error {
-	br := &ctx.bufRowss[storageNodeIdx]
-	sn := storageNodes[storageNodeIdx]
-	bufNew := storage.MarshalMetricRow(br.buf, metricNameRaw, timestamp, value)
-	if len(bufNew) >= maxBufSizePerStorageNode {
-		// Send buf to storageNode, since it is too big.
-		if err := br.pushTo(sn); err != nil {
-			return err
-		}
-		br.buf = storage.MarshalMetricRow(bufNew[:0], metricNameRaw, timestamp, value)
-	} else {
-		br.buf = bufNew
+	idx := storageNodeIdx
+	replicas := *replicationFactor
+	if replicas <= 0 {
+		replicas = 1
+	}
+	if replicas > len(storageNodes) {
+		replicas = len(storageNodes)
+	}
+	for {
+		br := &ctx.bufRowss[idx]
+		sn := storageNodes[idx]
+		bufNew := storage.MarshalMetricRow(br.buf, metricNameRaw, timestamp, value)
+		if len(bufNew) >= maxBufSizePerStorageNode {
+			// Send buf to storageNode, since it is too big.
+			if err := br.pushTo(sn); err != nil {
+				return err
+			}
+			br.buf = storage.MarshalMetricRow(bufNew[:0], metricNameRaw, timestamp, value)
+		} else {
+			br.buf = bufNew
+		}
+		br.rows++
+
+		replicas--
+		if replicas == 0 {
+			return nil
+		}
+		idx++
+		if idx >= len(storageNodes) {
+			idx = 0
+		}
 	}
-	br.rows++
-	return nil
 }
 
 // FlushBufs flushes ctx bufs to remote storage nodes.
diff --git a/docs/Cluster-VictoriaMetrics.md b/docs/Cluster-VictoriaMetrics.md
index a5a93f33a4..3ee4f65b44 100644
--- a/docs/Cluster-VictoriaMetrics.md
+++ b/docs/Cluster-VictoriaMetrics.md
@@ -16,8 +16,9 @@ Join [our Slack](http://slack.victoriametrics.com/) or [contact us](mailto:info@
 ## Prominent features
 
 - Supports all the features of [single-node version](https://github.com/VictoriaMetrics/VictoriaMetrics).
-- Performance and capacity scales horizontally.
-- Supports multiple independent namespaces for time series data (aka multi-tenancy).
+- Performance and capacity scales horizontally. See [these docs for details](#cluster-resizing-and-scalability).
+- Supports multiple independent namespaces for time series data (aka multi-tenancy). See [these docs for details](#multitenancy).
+- Supports replication. See [these docs for details](#replication-and-data-safety).
 
 
 ## Architecture overview
@@ -203,7 +204,7 @@ or [an alternative dashboard for VictoriaMetrics cluster](https://grafana.com/gr
 across `vmstorage` nodes.
 
 
-### Cluster resizing and scalability.
+### Cluster resizing and scalability
 
 Cluster performance and capacity scales with adding new nodes.
 
@@ -283,7 +284,7 @@ Upgrade follows `Cluster resizing procedure` under the hood.
 
 ### Replication and data safety
 
-VictoriaMetrics offloads replication to the underlying storage pointed by `-storageDataPath`.
+By default VictoriaMetrics offloads replication to the underlying storage pointed by `-storageDataPath`.
 It is recommended storing data on [Google Compute Engine persistent disks](https://cloud.google.com/compute/docs/disks/#pdspecs),
 since they are protected from data loss and data corruption.
 They also provide consistently high performance and [may be resized](https://cloud.google.com/compute/docs/disks/add-persistent-disk) without downtime.
@@ -291,7 +292,13 @@ HDD-based persistent disks should be enough for the majority of use cases.
 
 It is recommended using durable replicated persistent volumes in Kubernetes.
 
-Note that [replication doesn't save from disaster](https://medium.com/@valyala/speeding-up-backups-for-big-time-series-databases-533c1a927883).
+If `-replicationFactor=N` command-line flag is passed to `vminsert`, then `vminsert` puts `N` copies of the ingested data to distinct `vmstorage` nodes.
+This guarantees that all the data remains available for querying if up to `N-1` `vmstorage` nodes are unavailable. Note that `-dedup.minScrapeInterval=1ms` command-line
+flag must be passed to `vmselect` if `-replicationFactor` exceeds 1 in order to de-duplicate replicated data during queries.
+It is OK if `-dedup.minScrapeInterval` exceeds 1ms.
+
+Note that [replication doesn't save from disaster](https://medium.com/@valyala/speeding-up-backups-for-big-time-series-databases-533c1a927883),
+so it is recommended performing regular backups. See [these docs](#backups) for details.
 
 
 ### Backups
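For reference, the replica placement added to `WriteDataPointExt` in the patch above boils down to: start at the `vmstorage` node index chosen by the consistent hash and write the row to `-replicationFactor` consecutive nodes, wrapping around the end of the `-storageNode` list. A minimal standalone Go sketch of that selection logic follows; `pickReplicaNodes` and the plain `[]string` node list are hypothetical simplifications for illustration, not the actual `netstorage` types.

```go
package main

import "fmt"

// pickReplicaNodes mirrors the replica placement in the patch: starting from
// the node chosen by consistent hashing, take replicationFactor consecutive
// nodes, wrapping around the end of the node list.
// It is a hypothetical helper for illustration only.
func pickReplicaNodes(nodes []string, startIdx, replicationFactor int) []string {
	// Clamp the replication factor the same way the patch does.
	if replicationFactor <= 0 {
		replicationFactor = 1
	}
	if replicationFactor > len(nodes) {
		replicationFactor = len(nodes)
	}
	replicas := make([]string, 0, replicationFactor)
	idx := startIdx
	for i := 0; i < replicationFactor; i++ {
		replicas = append(replicas, nodes[idx])
		idx++
		if idx >= len(nodes) {
			idx = 0 // wrap around to the first -storageNode
		}
	}
	return replicas
}

func main() {
	nodes := []string{"vmstorage-0:8400", "vmstorage-1:8400", "vmstorage-2:8400", "vmstorage-3:8400"}
	// With -replicationFactor=2 and the hash pointing at node 3,
	// the copies land on node 3 and, after wrapping, node 0.
	fmt.Println(pickReplicaNodes(nodes, 3, 2)) // [vmstorage-3:8400 vmstorage-0:8400]
}
```

The wrap-around keeps the copies on distinct nodes as long as `-replicationFactor` does not exceed the number of `-storageNode` instances (the patch caps it at that number), and it is why the docs above require `-dedup.minScrapeInterval=1ms` on `vmselect`: every healthy replica returns the same samples, which must be de-duplicated at query time.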