mirror of
https://github.com/VictoriaMetrics/VictoriaMetrics.git
synced 2024-12-24 03:06:48 +01:00
2b55d167d7
The standard Snappy encoder from github.com/golang/snappy shows quite good performance numbers for compressing the Prometheus remote_write proto messages according to the added benchmarks, so there is no need to switch to github.com/klauspost/compress/s2 yet.
575 lines
15 KiB
ArmAsm
575 lines
15 KiB
ArmAsm
// Copyright 2020 The Go Authors. All rights reserved.
|
|
// Use of this source code is governed by a BSD-style
|
|
// license that can be found in the LICENSE file.
|
|
|
|
// +build !appengine
|
|
// +build gc
|
|
// +build !noasm
|
|
|
|
#include "textflag.h"
|
|
|
|
// Symbolic names for the general-purpose registers used by the decoder.
// The role noted next to each alias mirrors the register allocation table
// in the decoder's header comment.

// Scratch registers.
#define R_TMP0  R2
#define R_TMP1  R3

// Per-element state: current length (or x), and copy offset.
#define R_LEN   R4
#define R_OFF   R5

// Cursors: &src[s] and &dst[d].
#define R_SRC   R6
#define R_DST   R7

// Destination bounds: base, length, and base + length.
#define R_DBASE R8
#define R_DLEN  R9
#define R_DEND  R10

// Source bounds: base, length, and base + length.
#define R_SBASE R11
#define R_SLEN  R12
#define R_SEND  R13

// Extra temporaries used by the copy paths.
#define R_TMP2  R14
#define R_TMP3  R15
|
|
|
|
// TEST_SRC() is the source bounds check: it branches to errCorrupt when
// R_SRC > R_SEND, i.e. when s has been advanced past len(src). R_SRC == R_SEND
// is accepted. The comparison is signed (BGT). The macro expands to a branch
// to the label errCorrupt, so that label must exist in the function using it.
#define TEST_SRC() \
	CMP R_SEND, R_SRC \
	BGT errCorrupt

// Offset-based form of the same check, kept for reference (it clobbers
// R_TMP1):
//
// MOVD R_SRC, R_TMP1
// SUB  R_SBASE, R_TMP1, R_TMP1
// CMP  R_SLEN, R_TMP1
// BGT  errCorrupt
|
|
|
|
// The asm code generally follows the pure Go code in decode_other.go, except
|
|
// where marked with a "!!!".
|
|
|
|
// func s2Decode(dst, src []byte) int
//
// Decodes the block in src into dst. The result is 0 when the whole of src
// was consumed and dst was filled exactly, and 1 (decodeErrCodeCorrupt)
// otherwise.
//
// Argument layout (56 bytes in total):
//	- dst_base+0(FP), dst_len+8(FP), dst_cap+16(FP)
//	- src_base+24(FP), src_len+32(FP), src_cap+40(FP)
//	- ret+48(FP)
//
// All local variables fit into registers. The non-zero stack size is only to
// spill registers and push args when issuing a CALL. The 56-byte frame holds
// seven 8-byte slots:
//	-  8(RSP), 16(RSP), 24(RSP)           arguments for runtime·memmove
//	- 32(RSP), 40(RSP), 48(RSP), 56(RSP)  spilled R_DST, R_SRC, R_LEN, R_OFF
//
// The register allocation:
//	- R_TMP0	scratch
//	- R_TMP1	scratch
//	- R_LEN		length or x
//	- R_OFF		offset
//	- R_SRC		&src[s]
//	- R_DST		&dst[d]
//	+ R_DBASE	dst_base
//	+ R_DLEN	dst_len
//	+ R_DEND	dst_base + dst_len
//	+ R_SBASE	src_base
//	+ R_SLEN	src_len
//	+ R_SEND	src_base + src_len
//	- R_TMP2	used by doCopy
//	- R_TMP3	used by doCopy
//
// In addition, R1 is used as a short-lived scratch register for immediates.
//
// The registers R_DBASE-R_SEND (marked with a "+") are set at the start of the
// function, and after a CALL returns, and are not otherwise modified.
//
// The d variable is implicitly R_DST - R_DBASE, and len(dst)-d is R_DEND - R_DST.
// The s variable is implicitly R_SRC - R_SBASE, and len(src)-s is R_SEND - R_SRC.
TEXT ·s2Decode(SB), NOSPLIT, $56-56
	// Initialize R_SRC, R_DST and R_DBASE-R_SEND.
	MOVD dst_base+0(FP), R_DBASE
	MOVD dst_len+8(FP), R_DLEN
	MOVD R_DBASE, R_DST
	MOVD R_DBASE, R_DEND
	ADD  R_DLEN, R_DEND, R_DEND
	MOVD src_base+24(FP), R_SBASE
	MOVD src_len+32(FP), R_SLEN
	MOVD R_SBASE, R_SRC
	MOVD R_SBASE, R_SEND
	ADD  R_SLEN, R_SEND, R_SEND

	// No previous offset yet: a repeat code before any regular copy is
	// rejected by the "offset <= 0" check in doCopyRepeat.
	MOVD $0, R_OFF

loop:
	// for s < len(src)
	CMP R_SEND, R_SRC
	BEQ end

	// R_LEN = uint32(src[s])
	//
	// switch src[s] & 0x03
	MOVBU (R_SRC), R_LEN
	MOVW  R_LEN, R_TMP1
	ANDW  $3, R_TMP1
	MOVW  $1, R1
	CMPW  R1, R_TMP1
	BGE   tagCopy

	// ----------------------------------------
	// The code below handles literal tags.

	// case tagLiteral:
	// x := uint32(src[s] >> 2)
	// switch
	MOVW $60, R1
	LSRW $2, R_LEN, R_LEN
	CMPW R_LEN, R1
	BLS  tagLit60Plus

	// case x < 60:
	// s++
	ADD $1, R_SRC, R_SRC

doLit:
	// This is the end of the inner "switch", when we have a literal tag.
	//
	// We assume that R_LEN == x and x fits in a uint32, where x is the variable
	// used in the pure Go decode_other.go code.

	// length = int(x) + 1
	//
	// Unlike the pure Go code, we don't need to check if length <= 0 because
	// R_LEN can hold 64 bits, so the increment cannot overflow.
	ADD $1, R_LEN, R_LEN

	// Prepare to check if copying length bytes will run past the end of dst or
	// src.
	//
	// R_TMP0 = len(dst) - d
	// R_TMP1 = len(src) - s
	MOVD R_DEND, R_TMP0
	SUB  R_DST, R_TMP0, R_TMP0
	MOVD R_SEND, R_TMP1
	SUB  R_SRC, R_TMP1, R_TMP1

	// !!! Try a faster technique for short (16 or fewer bytes) copies.
	//
	// if length > 16 || len(dst)-d < 16 || len(src)-s < 16 {
	//   goto callMemmove // Fall back on calling runtime·memmove.
	// }
	//
	// The C++ snappy code calls this TryFastAppend. It also checks len(src)-s
	// against 21 instead of 16, because it cannot assume that all of its input
	// is contiguous in memory and so it needs to leave enough source bytes to
	// read the next tag without refilling buffers, but Go's Decode assumes
	// contiguousness (the src argument is a []byte).
	CMP $16, R_LEN
	BGT callMemmove
	CMP $16, R_TMP0
	BLT callMemmove
	CMP $16, R_TMP1
	BLT callMemmove

	// !!! Implement the copy from src to dst as a 16-byte load and store.
	// (Decode's documentation says that dst and src must not overlap.)
	//
	// This always copies 16 bytes, instead of only length bytes, but that's
	// OK. If the input is a valid Snappy encoding then subsequent iterations
	// will fix up the overrun. Otherwise, Decode returns a nil []byte (and a
	// non-nil error), so the overrun will be ignored.
	//
	// Note that on arm64, it is legal and cheap to issue unaligned 8-byte or
	// 16-byte loads and stores. This technique probably wouldn't be as
	// effective on architectures that are fussier about alignment.
	LDP 0(R_SRC), (R_TMP2, R_TMP3)
	STP (R_TMP2, R_TMP3), 0(R_DST)

	// d += length
	// s += length
	ADD R_LEN, R_DST, R_DST
	ADD R_LEN, R_SRC, R_SRC
	B   loop

callMemmove:
	// if length > len(dst)-d || length > len(src)-s { etc }
	CMP R_TMP0, R_LEN
	BGT errCorrupt
	CMP R_TMP1, R_LEN
	BGT errCorrupt

	// copy(dst[d:], src[s:s+length])
	//
	// This means calling runtime·memmove(&dst[d], &src[s], length), so we push
	// R_DST, R_SRC and R_LEN as arguments. We also need to spill those three
	// registers, plus R_OFF (the last offset, needed by later repeat codes),
	// to the stack, to save local variables across the CALL.
	MOVD R_DST, 8(RSP)
	MOVD R_SRC, 16(RSP)
	MOVD R_LEN, 24(RSP)
	MOVD R_DST, 32(RSP)
	MOVD R_SRC, 40(RSP)
	MOVD R_LEN, 48(RSP)
	MOVD R_OFF, 56(RSP)
	CALL runtime·memmove(SB)

	// Restore local variables: unspill registers from the stack and
	// re-calculate R_DBASE-R_SEND.
	MOVD 32(RSP), R_DST
	MOVD 40(RSP), R_SRC
	MOVD 48(RSP), R_LEN
	MOVD 56(RSP), R_OFF
	MOVD dst_base+0(FP), R_DBASE
	MOVD dst_len+8(FP), R_DLEN
	MOVD R_DBASE, R_DEND
	ADD  R_DLEN, R_DEND, R_DEND
	MOVD src_base+24(FP), R_SBASE
	MOVD src_len+32(FP), R_SLEN
	MOVD R_SBASE, R_SEND
	ADD  R_SLEN, R_SEND, R_SEND

	// d += length
	// s += length
	ADD R_LEN, R_DST, R_DST
	ADD R_LEN, R_SRC, R_SRC
	B   loop

tagLit60Plus:
	// !!! This fragment does the
	//
	// s += x - 58; if uint(s) > uint(len(src)) { etc }
	//
	// checks. In the asm version, we code it once instead of once per switch case.
	ADD R_LEN, R_SRC, R_SRC
	SUB $58, R_SRC, R_SRC
	TEST_SRC()

	// Dispatch on x (which is >= 60 here):
	//   x == 61 -> tagLit61
	//   x >  61 -> tagLit62Plus
	//   x == 60 -> fall through
	MOVW $61, R1
	CMPW R1, R_LEN
	BEQ  tagLit61
	BGT  tagLit62Plus

	// case x == 60:
	// x = uint32(src[s-1])
	MOVBU -1(R_SRC), R_LEN
	B     doLit

tagLit61:
	// case x == 61:
	// x = uint32(src[s-2]) | uint32(src[s-1])<<8
	MOVHU -2(R_SRC), R_LEN
	B     doLit

tagLit62Plus:
	// x > 62 can only be x == 63 here.
	CMPW $62, R_LEN
	BHI  tagLit63

	// case x == 62:
	// x = uint32(src[s-3]) | uint32(src[s-2])<<8 | uint32(src[s-1])<<16
	MOVHU -3(R_SRC), R_LEN
	MOVBU -1(R_SRC), R_TMP1
	ORR   R_TMP1<<16, R_LEN
	B     doLit

tagLit63:
	// case x == 63:
	// x = uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24
	MOVWU -4(R_SRC), R_LEN
	B     doLit

	// The code above handles literal tags.
	// ----------------------------------------
	// The code below handles copy tags.

tagCopy4:
	// case tagCopy4:
	// s += 5
	ADD $5, R_SRC, R_SRC

	// if uint(s) > uint(len(src)) { etc }
	MOVD R_SRC, R_TMP1
	SUB  R_SBASE, R_TMP1, R_TMP1
	CMP  R_SLEN, R_TMP1
	BGT  errCorrupt

	// length = 1 + int(src[s-5])>>2
	MOVD $1, R1
	ADD  R_LEN>>2, R1, R_LEN

	// offset = int(uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24)
	MOVWU -4(R_SRC), R_OFF
	B     doCopy

tagCopy2:
	// case tagCopy2:
	// s += 3
	ADD $3, R_SRC, R_SRC

	// if uint(s) > uint(len(src)) { etc }
	TEST_SRC()

	// length = 1 + int(src[s-3])>>2
	MOVD $1, R1
	ADD  R_LEN>>2, R1, R_LEN

	// offset = int(uint32(src[s-2]) | uint32(src[s-1])<<8)
	MOVHU -2(R_SRC), R_OFF
	B     doCopy

tagCopy:
	// We have a copy tag. We assume that:
	//	- R_TMP1 == src[s] & 0x03
	//	- R_LEN == src[s]
	CMP $2, R_TMP1
	BEQ tagCopy2
	BGT tagCopy4

	// case tagCopy1:
	// s += 2
	ADD $2, R_SRC, R_SRC

	// if uint(s) > uint(len(src)) { etc }
	TEST_SRC()

	// offset = int(uint32(src[s-2])&0xe0<<3 | uint32(src[s-1]))
	// Calculate offset in R_TMP0 in case it is a repeat.
	MOVD  R_LEN, R_TMP0
	AND   $0xe0, R_TMP0
	MOVBU -1(R_SRC), R_TMP1
	ORR   R_TMP0<<3, R_TMP1, R_TMP0

	// length = 4 + int(src[s-2])>>2&0x7
	MOVD $7, R1
	AND  R_LEN>>2, R1, R_LEN
	ADD  $4, R_LEN, R_LEN

	// check if repeat code with offset 0.
	CMP $0, R_TMP0
	BEQ repeatCode

	// This is a regular copy, transfer our temporary value to R_OFF (offset)
	MOVD R_TMP0, R_OFF
	B    doCopy

	// This is a repeat code.
repeatCode:
	// If length < 9, reuse last offset, with the length already calculated.
	// Lengths 9, 10 and 11 mean that the real length follows in 1, 2 or 3
	// extra source bytes respectively.
	CMP $9, R_LEN
	BLT doCopyRepeat
	BEQ repeatLen1
	CMP $10, R_LEN
	BEQ repeatLen2

repeatLen3:
	// s += 3
	ADD $3, R_SRC, R_SRC

	// if uint(s) > uint(len(src)) { etc }
	TEST_SRC()

	// length = uint32(src[s-3]) | (uint32(src[s-2])<<8) | (uint32(src[s-1])<<16) + 65540
	MOVBU -1(R_SRC), R_TMP0
	MOVHU -3(R_SRC), R_LEN
	ORR   R_TMP0<<16, R_LEN, R_LEN
	ADD   $65540, R_LEN, R_LEN
	B     doCopyRepeat

repeatLen2:
	// s += 2
	ADD $2, R_SRC, R_SRC

	// if uint(s) > uint(len(src)) { etc }
	TEST_SRC()

	// length = uint32(src[s-2]) | (uint32(src[s-1])<<8) + 260
	MOVHU -2(R_SRC), R_LEN
	ADD   $260, R_LEN, R_LEN
	B     doCopyRepeat

repeatLen1:
	// s += 1
	ADD $1, R_SRC, R_SRC

	// if uint(s) > uint(len(src)) { etc }
	TEST_SRC()

	// length = src[s-1] + 8
	MOVBU -1(R_SRC), R_LEN
	ADD   $8, R_LEN, R_LEN
	B     doCopyRepeat

doCopy:
	// This is the end of the outer "switch", when we have a copy tag.
	//
	// We assume that:
	//	- R_LEN == length && R_LEN > 0
	//	- R_OFF == offset

	// if d < offset { etc }
	MOVD R_DST, R_TMP1
	SUB  R_DBASE, R_TMP1, R_TMP1
	CMP  R_OFF, R_TMP1
	BLT  errCorrupt

	// Repeat values can skip the test above, since any offset > 0 will be in dst.
doCopyRepeat:

	// if offset <= 0 { etc }
	CMP $0, R_OFF
	BLE errCorrupt

	// if length > len(dst)-d { etc }
	MOVD R_DEND, R_TMP1
	SUB  R_DST, R_TMP1, R_TMP1
	CMP  R_TMP1, R_LEN
	BGT  errCorrupt

	// forwardCopy(dst[d:d+length], dst[d-offset:]); d += length
	//
	// Set:
	//	- R_TMP2 = len(dst)-d
	//	- R_TMP3 = &dst[d-offset]
	MOVD R_DEND, R_TMP2
	SUB  R_DST, R_TMP2, R_TMP2
	MOVD R_DST, R_TMP3
	SUB  R_OFF, R_TMP3, R_TMP3

	// !!! Try a faster technique for short (16 or fewer bytes) forward copies.
	//
	// First, try using two 8-byte load/stores, similar to the doLit technique
	// above. Even if dst[d:d+length] and dst[d-offset:] can overlap, this is
	// still OK if offset >= 8. Note that this has to be two 8-byte load/stores
	// and not one 16-byte load/store, and the first store has to be before the
	// second load, due to the overlap if offset is in the range [8, 16).
	//
	// if length > 16 || offset < 8 || len(dst)-d < 16 {
	//   goto slowForwardCopy
	// }
	// copy 16 bytes
	// d += length
	CMP  $16, R_LEN
	BGT  slowForwardCopy
	CMP  $8, R_OFF
	BLT  slowForwardCopy
	CMP  $16, R_TMP2
	BLT  slowForwardCopy
	MOVD 0(R_TMP3), R_TMP0
	MOVD R_TMP0, 0(R_DST)
	MOVD 8(R_TMP3), R_TMP1
	MOVD R_TMP1, 8(R_DST)
	ADD  R_LEN, R_DST, R_DST
	B    loop

slowForwardCopy:
	// !!! If the forward copy is longer than 16 bytes, or if offset < 8, we
	// can still try 8-byte load stores, provided we can overrun up to 10 extra
	// bytes. As above, the overrun will be fixed up by subsequent iterations
	// of the outermost loop.
	//
	// The C++ snappy code calls this technique IncrementalCopyFastPath. Its
	// commentary says:
	//
	// ----
	//
	// The main part of this loop is a simple copy of eight bytes at a time
	// until we've copied (at least) the requested amount of bytes.  However,
	// if d and d-offset are less than eight bytes apart (indicating a
	// repeating pattern of length < 8), we first need to expand the pattern in
	// order to get the correct results. For instance, if the buffer looks like
	// this, with the eight-byte <d-offset> and <d> patterns marked as
	// intervals:
	//
	//    abxxxxxxxxxxxx
	//    [------]           d-offset
	//      [------]         d
	//
	// a single eight-byte copy from <d-offset> to <d> will repeat the pattern
	// once, after which we can move <d> two bytes without moving <d-offset>:
	//
	//    ababxxxxxxxxxx
	//    [------]           d-offset
	//        [------]       d
	//
	// and repeat the exercise until the two no longer overlap.
	//
	// This allows us to do very well in the special case of one single byte
	// repeated many times, without taking a big hit for more general cases.
	//
	// The worst case of extra writing past the end of the match occurs when
	// offset == 1 and length == 1; the last copy will read from byte positions
	// [0..7] and write to [4..11], whereas it was only supposed to write to
	// position 1. Thus, ten excess bytes.
	//
	// ----
	//
	// That "10 byte overrun" worst case is confirmed by Go's
	// TestSlowForwardCopyOverrun, which also tests the fixUpSlowForwardCopy
	// and finishSlowForwardCopy algorithm.
	//
	// if length > len(dst)-d-10 {
	//   goto verySlowForwardCopy
	// }
	SUB $10, R_TMP2, R_TMP2
	CMP R_TMP2, R_LEN
	BGT verySlowForwardCopy

	// We want to keep the offset, so we use R_TMP2 from here.
	MOVD R_OFF, R_TMP2

makeOffsetAtLeast8:
	// !!! As above, expand the pattern so that offset >= 8 and we can use
	// 8-byte load/stores.
	//
	// for offset < 8 {
	//   copy 8 bytes from dst[d-offset:] to dst[d:]
	//   length -= offset
	//   d      += offset
	//   offset += offset
	//   // The two previous lines together means that d-offset, and therefore
	//   // R_TMP3, is unchanged.
	// }
	CMP  $8, R_TMP2
	BGE  fixUpSlowForwardCopy
	MOVD (R_TMP3), R_TMP1
	MOVD R_TMP1, (R_DST)
	SUB  R_TMP2, R_LEN, R_LEN
	ADD  R_TMP2, R_DST, R_DST
	ADD  R_TMP2, R_TMP2, R_TMP2
	B    makeOffsetAtLeast8

fixUpSlowForwardCopy:
	// !!! Add length (which might be negative now) to d (implied by R_DST being
	// &dst[d]) so that d ends up at the right place when we jump back to the
	// top of the loop. Before we do that, though, we save R_DST to R_TMP0 so that, if
	// length is positive, copying the remaining length bytes will write to the
	// right place.
	MOVD R_DST, R_TMP0
	ADD  R_LEN, R_DST, R_DST

finishSlowForwardCopy:
	// !!! Repeat 8-byte load/stores until length <= 0. Ending with a negative
	// length means that we overrun, but as above, that will be fixed up by
	// subsequent iterations of the outermost loop.
	MOVD $0, R1
	CMP  R1, R_LEN
	BLE  loop
	MOVD (R_TMP3), R_TMP1
	MOVD R_TMP1, (R_TMP0)
	ADD  $8, R_TMP3, R_TMP3
	ADD  $8, R_TMP0, R_TMP0
	SUB  $8, R_LEN, R_LEN
	B    finishSlowForwardCopy

verySlowForwardCopy:
	// verySlowForwardCopy is a simple implementation of forward copy. In C
	// parlance, this is a do/while loop instead of a while loop, since we know
	// that length > 0. In Go syntax:
	//
	// for {
	//   dst[d] = dst[d - offset]
	//   d++
	//   length--
	//   if length == 0 {
	//     break
	//   }
	// }
	MOVB (R_TMP3), R_TMP1
	MOVB R_TMP1, (R_DST)
	ADD  $1, R_TMP3, R_TMP3
	ADD  $1, R_DST, R_DST
	SUB  $1, R_LEN, R_LEN
	CBNZ R_LEN, verySlowForwardCopy
	B    loop

	// The code above handles copy tags.
	// ----------------------------------------

end:
	// This is the end of the "for s < len(src)".
	//
	// if d != len(dst) { etc }
	CMP R_DEND, R_DST
	BNE errCorrupt

	// return 0
	MOVD $0, ret+48(FP)
	RET

errCorrupt:
	// return decodeErrCodeCorrupt
	MOVD $1, R_TMP0
	MOVD R_TMP0, ret+48(FP)
	RET
|