// Source path: VictoriaMetrics/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.s
// Go assembler (Plan 9 syntax) for amd64.

// Code generated by command: go run gen.go -out ../seqdec_amd64.s -pkg=zstd. DO NOT EDIT.
//go:build !appengine && !noasm && gc && !noasm
// func sequenceDecs_decode_amd64(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
// Requires: CMOV
//
// Decodes a run of sequences. Each iteration writes one 24-byte record through
// R10 (literal length at +0, match length at +8, offset at +16) and the loop
// runs until the counter at 96(ctx) goes negative. The bit container is
// refilled twice per sequence: before offset/match length and again before
// literal length.
//
// Return value (ret+24(FP)):
//   0  success
//   1  non-zero match length together with a zero offset
//   2  match length above 0x20002
//   4  the literal budget at 128(ctx) went negative
//   6  more than 64 bits consumed with no input bytes left
// Only the success path writes the offsets and bit-reader state back.
//
// Register roles, as established by the loads below. Struct field names are
// not visible in this file; ll/ml/of follow the in-line comments.
//   DX             64-bit bit container, from 24(br)
//   BX             bits already consumed from DX, from 32(br)
//   SI             input bytes not yet pulled in, from 8(br)
//   (SP)           input cursor: (br)+8(br) at entry, moves downward
//   DI, R8, R9     literal-length / match-length / offset state words
//   R10            output record cursor, from 104(ctx)
//   R11, R12, R13  the three offsets kept at 144/152/160(s)
//   AX, CX, R14, R15, BP  scratch
//
// State word layout as used below: bits 0-7 = bit count read for the next
// state, bits 8-15 = extra-bit count for the value, bits 16-31 = base index of
// the next state, bits 32-63 = value baseline.
TEXT ·sequenceDecs_decode_amd64(SB), $8-32
	MOVQ br+8(FP), CX
	MOVQ 24(CX), DX
	MOVBQZX 32(CX), BX
	MOVQ (CX), AX
	MOVQ 8(CX), SI
	ADDQ SI, AX
	MOVQ AX, (SP)
	MOVQ ctx+16(FP), AX
	MOVQ 72(AX), DI
	MOVQ 80(AX), R8
	MOVQ 88(AX), R9
	MOVQ 104(AX), R10
	MOVQ s+0(FP), AX
	MOVQ 144(AX), R11
	MOVQ 152(AX), R12
	MOVQ 160(AX), R13

sequenceDecs_decode_amd64_main_loop:
	MOVQ (SP), R14

	// Fill bitreader to have enough for the offset and match length.
	// With at least 8 bytes left: step the cursor back by the whole bytes
	// already consumed, reload 8 bytes, keep only the sub-byte bit position.
	CMPQ SI, $0x08
	JL sequenceDecs_decode_amd64_fill_byte_by_byte
	MOVQ BX, AX
	SHRQ $0x03, AX
	SUBQ AX, R14
	MOVQ (R14), DX
	SUBQ AX, SI
	ANDQ $0x07, BX
	JMP sequenceDecs_decode_amd64_fill_end

sequenceDecs_decode_amd64_fill_byte_by_byte:
	// Fewer than 8 bytes left: shift in one byte at a time while input
	// remains and at least 8 bits have been consumed.
	CMPQ SI, $0x00
	JLE sequenceDecs_decode_amd64_fill_check_overread
	CMPQ BX, $0x07
	JLE sequenceDecs_decode_amd64_fill_end
	SHLQ $0x08, DX
	SUBQ $0x01, R14
	SUBQ $0x01, SI
	SUBQ $0x08, BX
	MOVBQZX (R14), AX
	ORQ AX, DX
	JMP sequenceDecs_decode_amd64_fill_byte_by_byte

sequenceDecs_decode_amd64_fill_check_overread:
	// Input exhausted: consuming more than the 64 container bits is an error.
	CMPQ BX, $0x40
	JA error_overread

sequenceDecs_decode_amd64_fill_end:
	// Update offset
	// R15 = container with consumed bits shifted out; CL = extra-bit count
	// (bits 8-15 of the state word); AX = baseline (bits 32-63).
	MOVQ R9, AX
	MOVQ BX, CX
	MOVQ DX, R15
	SHLQ CL, R15
	MOVB AH, CL
	SHRQ $0x20, AX
	TESTQ CX, CX
	JZ sequenceDecs_decode_amd64_of_update_zero
	ADDQ CX, BX
	CMPQ BX, $0x40
	JA sequenceDecs_decode_amd64_of_update_zero
	CMPQ CX, $0x40
	JAE sequenceDecs_decode_amd64_of_update_zero

	// Shift right by (64 - count): the shift count is taken modulo 64.
	NEGQ CX
	SHRQ CL, R15
	ADDQ R15, AX

sequenceDecs_decode_amd64_of_update_zero:
	MOVQ AX, 16(R10)

	// Update match length
	MOVQ R8, AX
	MOVQ BX, CX
	MOVQ DX, R15
	SHLQ CL, R15
	MOVB AH, CL
	SHRQ $0x20, AX
	TESTQ CX, CX
	JZ sequenceDecs_decode_amd64_ml_update_zero
	ADDQ CX, BX
	CMPQ BX, $0x40
	JA sequenceDecs_decode_amd64_ml_update_zero
	CMPQ CX, $0x40
	JAE sequenceDecs_decode_amd64_ml_update_zero
	NEGQ CX
	SHRQ CL, R15
	ADDQ R15, AX

sequenceDecs_decode_amd64_ml_update_zero:
	MOVQ AX, 8(R10)

	// Fill bitreader to have enough for the remaining
	CMPQ SI, $0x08
	JL sequenceDecs_decode_amd64_fill_2_byte_by_byte
	MOVQ BX, AX
	SHRQ $0x03, AX
	SUBQ AX, R14
	MOVQ (R14), DX
	SUBQ AX, SI
	ANDQ $0x07, BX
	JMP sequenceDecs_decode_amd64_fill_2_end

sequenceDecs_decode_amd64_fill_2_byte_by_byte:
	CMPQ SI, $0x00
	JLE sequenceDecs_decode_amd64_fill_2_check_overread
	CMPQ BX, $0x07
	JLE sequenceDecs_decode_amd64_fill_2_end
	SHLQ $0x08, DX
	SUBQ $0x01, R14
	SUBQ $0x01, SI
	SUBQ $0x08, BX
	MOVBQZX (R14), AX
	ORQ AX, DX
	JMP sequenceDecs_decode_amd64_fill_2_byte_by_byte

sequenceDecs_decode_amd64_fill_2_check_overread:
	CMPQ BX, $0x40
	JA error_overread

sequenceDecs_decode_amd64_fill_2_end:
	// Update literal length
	MOVQ DI, AX
	MOVQ BX, CX
	MOVQ DX, R15
	SHLQ CL, R15
	MOVB AH, CL
	SHRQ $0x20, AX
	TESTQ CX, CX
	JZ sequenceDecs_decode_amd64_ll_update_zero
	ADDQ CX, BX
	CMPQ BX, $0x40
	JA sequenceDecs_decode_amd64_ll_update_zero
	CMPQ CX, $0x40
	JAE sequenceDecs_decode_amd64_ll_update_zero
	NEGQ CX
	SHRQ CL, R15
	ADDQ R15, AX

sequenceDecs_decode_amd64_ll_update_zero:
	MOVQ AX, (R10)

	// Fill bitreader for state updates
	// Persist the input cursor, and capture the offset's extra-bit count in
	// AX before R9 is replaced; the offset adjustment below depends on it.
	MOVQ R14, (SP)
	MOVQ R9, AX
	SHRQ $0x08, AX
	MOVBQZX AL, AX

	// The state update is skipped when the counter at 96(ctx) is zero.
	MOVQ ctx+16(FP), CX
	CMPQ 96(CX), $0x00
	JZ sequenceDecs_decode_amd64_skip_update

	// Update Literal Length State
	// R14 = bit count, DI = base index; read R14 bits via rotate-and-mask
	// (BP = (1 << R14) - 1) and add them to the base index.
	MOVBQZX DI, R14
	SHRQ $0x10, DI
	MOVWQZX DI, DI
	LEAQ (BX)(R14*1), CX
	MOVQ DX, R15
	MOVQ CX, BX
	ROLQ CL, R15
	MOVL $0x00000001, BP
	MOVB R14, CL
	SHLL CL, BP
	DECL BP
	ANDQ BP, R15
	ADDQ R15, DI

	// Load ctx.llTable
	MOVQ ctx+16(FP), CX
	MOVQ (CX), CX
	MOVQ (CX)(DI*8), DI

	// Update Match Length State
	MOVBQZX R8, R14
	SHRQ $0x10, R8
	MOVWQZX R8, R8
	LEAQ (BX)(R14*1), CX
	MOVQ DX, R15
	MOVQ CX, BX
	ROLQ CL, R15
	MOVL $0x00000001, BP
	MOVB R14, CL
	SHLL CL, BP
	DECL BP
	ANDQ BP, R15
	ADDQ R15, R8

	// Load ctx.mlTable
	MOVQ ctx+16(FP), CX
	MOVQ 24(CX), CX
	MOVQ (CX)(R8*8), R8

	// Update Offset State
	MOVBQZX R9, R14
	SHRQ $0x10, R9
	MOVWQZX R9, R9
	LEAQ (BX)(R14*1), CX
	MOVQ DX, R15
	MOVQ CX, BX
	ROLQ CL, R15
	MOVL $0x00000001, BP
	MOVB R14, CL
	SHLL CL, BP
	DECL BP
	ANDQ BP, R15
	ADDQ R15, R9

	// Load ctx.ofTable
	MOVQ ctx+16(FP), CX
	MOVQ 48(CX), CX
	MOVQ (CX)(R9*8), R9

sequenceDecs_decode_amd64_skip_update:
	// Adjust offset
	// More than one extra bit: the decoded value is used as-is and pushed
	// onto the front of the R11/R12/R13 history.
	MOVQ 16(R10), CX
	CMPQ AX, $0x01
	JBE sequenceDecs_decode_amd64_adjust_offsetB_1_or_0
	MOVQ R12, R13
	MOVQ R11, R12
	MOVQ CX, R11
	JMP sequenceDecs_decode_amd64_after_adjust

sequenceDecs_decode_amd64_adjust_offsetB_1_or_0:
	// A zero literal length bumps the decoded value by one.
	CMPQ (R10), $0x00000000
	JNE sequenceDecs_decode_amd64_adjust_offset_maybezero
	INCQ CX
	JMP sequenceDecs_decode_amd64_adjust_offset_nonzero

sequenceDecs_decode_amd64_adjust_offset_maybezero:
	// Decoded value zero with non-zero literal length: reuse R11, history
	// left untouched.
	TESTQ CX, CX
	JNZ sequenceDecs_decode_amd64_adjust_offset_nonzero
	MOVQ R11, CX
	JMP sequenceDecs_decode_amd64_after_adjust

sequenceDecs_decode_amd64_adjust_offset_nonzero:
	// Pick the candidate by CX: 0 -> R11, 1 -> R12, 2 -> R13, above 2 -> R11-1.
	CMPQ CX, $0x01
	JB sequenceDecs_decode_amd64_adjust_zero
	JEQ sequenceDecs_decode_amd64_adjust_one
	CMPQ CX, $0x02
	JA sequenceDecs_decode_amd64_adjust_three
	JMP sequenceDecs_decode_amd64_adjust_two

sequenceDecs_decode_amd64_adjust_zero:
	MOVQ R11, AX
	JMP sequenceDecs_decode_amd64_adjust_test_temp_valid

sequenceDecs_decode_amd64_adjust_one:
	MOVQ R12, AX
	JMP sequenceDecs_decode_amd64_adjust_test_temp_valid

sequenceDecs_decode_amd64_adjust_two:
	MOVQ R13, AX
	JMP sequenceDecs_decode_amd64_adjust_test_temp_valid

sequenceDecs_decode_amd64_adjust_three:
	LEAQ -1(R11), AX

sequenceDecs_decode_amd64_adjust_test_temp_valid:
	// A zero candidate is replaced by 1.
	TESTQ AX, AX
	JNZ sequenceDecs_decode_amd64_adjust_temp_valid
	MOVQ $0x00000001, AX

sequenceDecs_decode_amd64_adjust_temp_valid:
	// R13 takes R12 unless CX == 1; the candidate then becomes the newest
	// offset and the value stored for this sequence.
	CMPQ CX, $0x01
	CMOVQNE R12, R13
	MOVQ R11, R12
	MOVQ AX, R11
	MOVQ AX, CX

sequenceDecs_decode_amd64_after_adjust:
	MOVQ CX, 16(R10)

	// Check values
	// AX = match length, R14 = literal length. Their sum is accumulated at
	// 256(s); the literal length is deducted from the budget at 128(ctx).
	MOVQ 8(R10), AX
	MOVQ (R10), R14
	LEAQ (AX)(R14*1), R15
	MOVQ s+0(FP), BP
	ADDQ R15, 256(BP)
	MOVQ ctx+16(FP), R15
	SUBQ R14, 128(R15)
	JS error_not_enough_literals
	CMPQ AX, $0x00020002
	JA sequenceDecs_decode_amd64_error_match_len_too_big
	TESTQ CX, CX
	JNZ sequenceDecs_decode_amd64_match_len_ofs_ok
	TESTQ AX, AX
	JNZ sequenceDecs_decode_amd64_error_match_len_ofs_mismatch

sequenceDecs_decode_amd64_match_len_ofs_ok:
	// Advance to the next 24-byte record; loop while the counter stays >= 0.
	ADDQ $0x18, R10
	MOVQ ctx+16(FP), AX
	DECQ 96(AX)
	JNS sequenceDecs_decode_amd64_main_loop

	// Write back the offset history and the bit-reader state.
	MOVQ s+0(FP), AX
	MOVQ R11, 144(AX)
	MOVQ R12, 152(AX)
	MOVQ R13, 160(AX)
	MOVQ br+8(FP), AX
	MOVQ DX, 24(AX)
	MOVB BL, 32(AX)
	MOVQ SI, 8(AX)

	// Return success
	MOVQ $0x00000000, ret+24(FP)
	RET

	// Return with match length error
sequenceDecs_decode_amd64_error_match_len_ofs_mismatch:
	MOVQ $0x00000001, ret+24(FP)
	RET

	// Return with match too long error
sequenceDecs_decode_amd64_error_match_len_too_big:
	MOVQ $0x00000002, ret+24(FP)
	RET

	// Return with match offset too long error
	// No label precedes this pair and it follows a RET, so nothing in this
	// function reaches it.
	MOVQ $0x00000003, ret+24(FP)
	RET

	// Return with not enough literals error
error_not_enough_literals:
	MOVQ $0x00000004, ret+24(FP)
	RET

	// Return with overread error
error_overread:
	MOVQ $0x00000006, ret+24(FP)
	RET
// func sequenceDecs_decode_56_amd64(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
// Requires: CMOV
//
// Decodes a run of sequences. Each iteration writes one 24-byte record through
// R10 (literal length at +0, match length at +8, offset at +16) and the loop
// runs until the counter at 96(ctx) goes negative. The bit container is
// refilled only once per sequence, before all three values are read.
// NOTE(review): the "56" in the name presumably refers to an upper bound on
// the bits one sequence can need; that bound is not checked here - confirm
// against the Go caller that selects this variant.
//
// Return value (ret+24(FP)):
//   0  success
//   1  non-zero match length together with a zero offset
//   2  match length above 0x20002
//   4  the literal budget at 128(ctx) went negative
//   6  more than 64 bits consumed with no input bytes left
// Only the success path writes the offsets and bit-reader state back.
//
// Register roles, as established by the loads below. Struct field names are
// not visible in this file; ll/ml/of follow the in-line comments.
//   DX             64-bit bit container, from 24(br)
//   BX             bits already consumed from DX, from 32(br)
//   SI             input bytes not yet pulled in, from 8(br)
//   (SP)           input cursor: (br)+8(br) at entry, moves downward
//   DI, R8, R9     literal-length / match-length / offset state words
//   R10            output record cursor, from 104(ctx)
//   R11, R12, R13  the three offsets kept at 144/152/160(s)
//   AX, CX, R14, R15, BP  scratch
//
// State word layout as used below: bits 0-7 = bit count read for the next
// state, bits 8-15 = extra-bit count for the value, bits 16-31 = base index of
// the next state, bits 32-63 = value baseline.
TEXT ·sequenceDecs_decode_56_amd64(SB), $8-32
	MOVQ br+8(FP), CX
	MOVQ 24(CX), DX
	MOVBQZX 32(CX), BX
	MOVQ (CX), AX
	MOVQ 8(CX), SI
	ADDQ SI, AX
	MOVQ AX, (SP)
	MOVQ ctx+16(FP), AX
	MOVQ 72(AX), DI
	MOVQ 80(AX), R8
	MOVQ 88(AX), R9
	MOVQ 104(AX), R10
	MOVQ s+0(FP), AX
	MOVQ 144(AX), R11
	MOVQ 152(AX), R12
	MOVQ 160(AX), R13

sequenceDecs_decode_56_amd64_main_loop:
	MOVQ (SP), R14

	// Fill bitreader to have enough for the offset and match length.
	// With at least 8 bytes left: step the cursor back by the whole bytes
	// already consumed, reload 8 bytes, keep only the sub-byte bit position.
	CMPQ SI, $0x08
	JL sequenceDecs_decode_56_amd64_fill_byte_by_byte
	MOVQ BX, AX
	SHRQ $0x03, AX
	SUBQ AX, R14
	MOVQ (R14), DX
	SUBQ AX, SI
	ANDQ $0x07, BX
	JMP sequenceDecs_decode_56_amd64_fill_end

sequenceDecs_decode_56_amd64_fill_byte_by_byte:
	// Fewer than 8 bytes left: shift in one byte at a time while input
	// remains and at least 8 bits have been consumed.
	CMPQ SI, $0x00
	JLE sequenceDecs_decode_56_amd64_fill_check_overread
	CMPQ BX, $0x07
	JLE sequenceDecs_decode_56_amd64_fill_end
	SHLQ $0x08, DX
	SUBQ $0x01, R14
	SUBQ $0x01, SI
	SUBQ $0x08, BX
	MOVBQZX (R14), AX
	ORQ AX, DX
	JMP sequenceDecs_decode_56_amd64_fill_byte_by_byte

sequenceDecs_decode_56_amd64_fill_check_overread:
	// Input exhausted: consuming more than the 64 container bits is an error.
	CMPQ BX, $0x40
	JA error_overread

sequenceDecs_decode_56_amd64_fill_end:
	// Update offset
	// R15 = container with consumed bits shifted out; CL = extra-bit count
	// (bits 8-15 of the state word); AX = baseline (bits 32-63).
	MOVQ R9, AX
	MOVQ BX, CX
	MOVQ DX, R15
	SHLQ CL, R15
	MOVB AH, CL
	SHRQ $0x20, AX
	TESTQ CX, CX
	JZ sequenceDecs_decode_56_amd64_of_update_zero
	ADDQ CX, BX
	CMPQ BX, $0x40
	JA sequenceDecs_decode_56_amd64_of_update_zero
	CMPQ CX, $0x40
	JAE sequenceDecs_decode_56_amd64_of_update_zero

	// Shift right by (64 - count): the shift count is taken modulo 64.
	NEGQ CX
	SHRQ CL, R15
	ADDQ R15, AX

sequenceDecs_decode_56_amd64_of_update_zero:
	MOVQ AX, 16(R10)

	// Update match length
	MOVQ R8, AX
	MOVQ BX, CX
	MOVQ DX, R15
	SHLQ CL, R15
	MOVB AH, CL
	SHRQ $0x20, AX
	TESTQ CX, CX
	JZ sequenceDecs_decode_56_amd64_ml_update_zero
	ADDQ CX, BX
	CMPQ BX, $0x40
	JA sequenceDecs_decode_56_amd64_ml_update_zero
	CMPQ CX, $0x40
	JAE sequenceDecs_decode_56_amd64_ml_update_zero
	NEGQ CX
	SHRQ CL, R15
	ADDQ R15, AX

sequenceDecs_decode_56_amd64_ml_update_zero:
	MOVQ AX, 8(R10)

	// Update literal length
	MOVQ DI, AX
	MOVQ BX, CX
	MOVQ DX, R15
	SHLQ CL, R15
	MOVB AH, CL
	SHRQ $0x20, AX
	TESTQ CX, CX
	JZ sequenceDecs_decode_56_amd64_ll_update_zero
	ADDQ CX, BX
	CMPQ BX, $0x40
	JA sequenceDecs_decode_56_amd64_ll_update_zero
	CMPQ CX, $0x40
	JAE sequenceDecs_decode_56_amd64_ll_update_zero
	NEGQ CX
	SHRQ CL, R15
	ADDQ R15, AX

sequenceDecs_decode_56_amd64_ll_update_zero:
	MOVQ AX, (R10)

	// Fill bitreader for state updates
	// Persist the input cursor, and capture the offset's extra-bit count in
	// AX before R9 is replaced; the offset adjustment below depends on it.
	MOVQ R14, (SP)
	MOVQ R9, AX
	SHRQ $0x08, AX
	MOVBQZX AL, AX

	// The state update is skipped when the counter at 96(ctx) is zero.
	MOVQ ctx+16(FP), CX
	CMPQ 96(CX), $0x00
	JZ sequenceDecs_decode_56_amd64_skip_update

	// Update Literal Length State
	// R14 = bit count, DI = base index; read R14 bits via rotate-and-mask
	// (BP = (1 << R14) - 1) and add them to the base index.
	MOVBQZX DI, R14
	SHRQ $0x10, DI
	MOVWQZX DI, DI
	LEAQ (BX)(R14*1), CX
	MOVQ DX, R15
	MOVQ CX, BX
	ROLQ CL, R15
	MOVL $0x00000001, BP
	MOVB R14, CL
	SHLL CL, BP
	DECL BP
	ANDQ BP, R15
	ADDQ R15, DI

	// Load ctx.llTable
	MOVQ ctx+16(FP), CX
	MOVQ (CX), CX
	MOVQ (CX)(DI*8), DI

	// Update Match Length State
	MOVBQZX R8, R14
	SHRQ $0x10, R8
	MOVWQZX R8, R8
	LEAQ (BX)(R14*1), CX
	MOVQ DX, R15
	MOVQ CX, BX
	ROLQ CL, R15
	MOVL $0x00000001, BP
	MOVB R14, CL
	SHLL CL, BP
	DECL BP
	ANDQ BP, R15
	ADDQ R15, R8

	// Load ctx.mlTable
	MOVQ ctx+16(FP), CX
	MOVQ 24(CX), CX
	MOVQ (CX)(R8*8), R8

	// Update Offset State
	MOVBQZX R9, R14
	SHRQ $0x10, R9
	MOVWQZX R9, R9
	LEAQ (BX)(R14*1), CX
	MOVQ DX, R15
	MOVQ CX, BX
	ROLQ CL, R15
	MOVL $0x00000001, BP
	MOVB R14, CL
	SHLL CL, BP
	DECL BP
	ANDQ BP, R15
	ADDQ R15, R9

	// Load ctx.ofTable
	MOVQ ctx+16(FP), CX
	MOVQ 48(CX), CX
	MOVQ (CX)(R9*8), R9

sequenceDecs_decode_56_amd64_skip_update:
	// Adjust offset
	// More than one extra bit: the decoded value is used as-is and pushed
	// onto the front of the R11/R12/R13 history.
	MOVQ 16(R10), CX
	CMPQ AX, $0x01
	JBE sequenceDecs_decode_56_amd64_adjust_offsetB_1_or_0
	MOVQ R12, R13
	MOVQ R11, R12
	MOVQ CX, R11
	JMP sequenceDecs_decode_56_amd64_after_adjust

sequenceDecs_decode_56_amd64_adjust_offsetB_1_or_0:
	// A zero literal length bumps the decoded value by one.
	CMPQ (R10), $0x00000000
	JNE sequenceDecs_decode_56_amd64_adjust_offset_maybezero
	INCQ CX
	JMP sequenceDecs_decode_56_amd64_adjust_offset_nonzero

sequenceDecs_decode_56_amd64_adjust_offset_maybezero:
	// Decoded value zero with non-zero literal length: reuse R11, history
	// left untouched.
	TESTQ CX, CX
	JNZ sequenceDecs_decode_56_amd64_adjust_offset_nonzero
	MOVQ R11, CX
	JMP sequenceDecs_decode_56_amd64_after_adjust

sequenceDecs_decode_56_amd64_adjust_offset_nonzero:
	// Pick the candidate by CX: 0 -> R11, 1 -> R12, 2 -> R13, above 2 -> R11-1.
	CMPQ CX, $0x01
	JB sequenceDecs_decode_56_amd64_adjust_zero
	JEQ sequenceDecs_decode_56_amd64_adjust_one
	CMPQ CX, $0x02
	JA sequenceDecs_decode_56_amd64_adjust_three
	JMP sequenceDecs_decode_56_amd64_adjust_two

sequenceDecs_decode_56_amd64_adjust_zero:
	MOVQ R11, AX
	JMP sequenceDecs_decode_56_amd64_adjust_test_temp_valid

sequenceDecs_decode_56_amd64_adjust_one:
	MOVQ R12, AX
	JMP sequenceDecs_decode_56_amd64_adjust_test_temp_valid

sequenceDecs_decode_56_amd64_adjust_two:
	MOVQ R13, AX
	JMP sequenceDecs_decode_56_amd64_adjust_test_temp_valid

sequenceDecs_decode_56_amd64_adjust_three:
	LEAQ -1(R11), AX

sequenceDecs_decode_56_amd64_adjust_test_temp_valid:
	// A zero candidate is replaced by 1.
	TESTQ AX, AX
	JNZ sequenceDecs_decode_56_amd64_adjust_temp_valid
	MOVQ $0x00000001, AX

sequenceDecs_decode_56_amd64_adjust_temp_valid:
	// R13 takes R12 unless CX == 1; the candidate then becomes the newest
	// offset and the value stored for this sequence.
	CMPQ CX, $0x01
	CMOVQNE R12, R13
	MOVQ R11, R12
	MOVQ AX, R11
	MOVQ AX, CX

sequenceDecs_decode_56_amd64_after_adjust:
	MOVQ CX, 16(R10)

	// Check values
	// AX = match length, R14 = literal length. Their sum is accumulated at
	// 256(s); the literal length is deducted from the budget at 128(ctx).
	MOVQ 8(R10), AX
	MOVQ (R10), R14
	LEAQ (AX)(R14*1), R15
	MOVQ s+0(FP), BP
	ADDQ R15, 256(BP)
	MOVQ ctx+16(FP), R15
	SUBQ R14, 128(R15)
	JS error_not_enough_literals
	CMPQ AX, $0x00020002
	JA sequenceDecs_decode_56_amd64_error_match_len_too_big
	TESTQ CX, CX
	JNZ sequenceDecs_decode_56_amd64_match_len_ofs_ok
	TESTQ AX, AX
	JNZ sequenceDecs_decode_56_amd64_error_match_len_ofs_mismatch

sequenceDecs_decode_56_amd64_match_len_ofs_ok:
	// Advance to the next 24-byte record; loop while the counter stays >= 0.
	ADDQ $0x18, R10
	MOVQ ctx+16(FP), AX
	DECQ 96(AX)
	JNS sequenceDecs_decode_56_amd64_main_loop

	// Write back the offset history and the bit-reader state.
	MOVQ s+0(FP), AX
	MOVQ R11, 144(AX)
	MOVQ R12, 152(AX)
	MOVQ R13, 160(AX)
	MOVQ br+8(FP), AX
	MOVQ DX, 24(AX)
	MOVB BL, 32(AX)
	MOVQ SI, 8(AX)

	// Return success
	MOVQ $0x00000000, ret+24(FP)
	RET

	// Return with match length error
sequenceDecs_decode_56_amd64_error_match_len_ofs_mismatch:
	MOVQ $0x00000001, ret+24(FP)
	RET

	// Return with match too long error
sequenceDecs_decode_56_amd64_error_match_len_too_big:
	MOVQ $0x00000002, ret+24(FP)
	RET

	// Return with match offset too long error
	// No label precedes this pair and it follows a RET, so nothing in this
	// function reaches it.
	MOVQ $0x00000003, ret+24(FP)
	RET

	// Return with not enough literals error
error_not_enough_literals:
	MOVQ $0x00000004, ret+24(FP)
	RET

	// Return with overread error
error_overread:
	MOVQ $0x00000006, ret+24(FP)
	RET
// func sequenceDecs_decode_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
// Requires: BMI, BMI2, CMOV
//
// Decodes a run of sequences using BEXTR/BZHI/SHRX for the bit-field work.
// Each iteration writes one 24-byte record through R9 (literal length at +0,
// match length at +8, offset at +16) and the loop runs until the counter at
// 96(ctx) goes negative. The bit container is refilled twice per sequence:
// before offset/match length and again before literal length.
//
// Return value (ret+24(FP)):
//   0  success
//   1  non-zero match length together with a zero offset
//   2  match length above 0x20002
//   4  the literal budget at 128(ctx) went negative
//   6  more than 64 bits consumed with no input bytes left
// Only the success path writes the offsets and bit-reader state back.
//
// Register roles, as established by the loads below. Struct field names are
// not visible in this file; ll/ml/of follow the in-line comments.
//   AX             64-bit bit container, from 24(br)
//   DX             bits already consumed from AX, from 32(br)
//   BX             input bytes not yet pulled in, from 8(br)
//   (SP)           input cursor: (br)+8(br) at entry, moves downward
//   SI, DI, R8     literal-length / match-length / offset state words
//   R9             output record cursor, from 104(ctx)
//   R10, R11, R12  the three offsets kept at 144/152/160(s)
//   R13            input cursor copy, then offset extra-bit count / temporary
//   CX, R14, R15, BP  scratch
//
// State word layout as used below: bits 0-7 = bit count read for the next
// state, bits 8-15 = extra-bit count for the value, bits 16-31 = base index of
// the next state, bits 32-63 = value baseline. BEXTR control 0x0808 selects
// bits 8-15; control 0x1010 selects bits 16-31.
TEXT ·sequenceDecs_decode_bmi2(SB), $8-32
	MOVQ br+8(FP), BX
	MOVQ 24(BX), AX
	MOVBQZX 32(BX), DX
	MOVQ (BX), CX
	MOVQ 8(BX), BX
	ADDQ BX, CX
	MOVQ CX, (SP)
	MOVQ ctx+16(FP), CX
	MOVQ 72(CX), SI
	MOVQ 80(CX), DI
	MOVQ 88(CX), R8
	MOVQ 104(CX), R9
	MOVQ s+0(FP), CX
	MOVQ 144(CX), R10
	MOVQ 152(CX), R11
	MOVQ 160(CX), R12

sequenceDecs_decode_bmi2_main_loop:
	MOVQ (SP), R13

	// Fill bitreader to have enough for the offset and match length.
	// With at least 8 bytes left: step the cursor back by the whole bytes
	// already consumed, reload 8 bytes, keep only the sub-byte bit position.
	CMPQ BX, $0x08
	JL sequenceDecs_decode_bmi2_fill_byte_by_byte
	MOVQ DX, CX
	SHRQ $0x03, CX
	SUBQ CX, R13
	MOVQ (R13), AX
	SUBQ CX, BX
	ANDQ $0x07, DX
	JMP sequenceDecs_decode_bmi2_fill_end

sequenceDecs_decode_bmi2_fill_byte_by_byte:
	// Fewer than 8 bytes left: shift in one byte at a time while input
	// remains and at least 8 bits have been consumed.
	CMPQ BX, $0x00
	JLE sequenceDecs_decode_bmi2_fill_check_overread
	CMPQ DX, $0x07
	JLE sequenceDecs_decode_bmi2_fill_end
	SHLQ $0x08, AX
	SUBQ $0x01, R13
	SUBQ $0x01, BX
	SUBQ $0x08, DX
	MOVBQZX (R13), CX
	ORQ CX, AX
	JMP sequenceDecs_decode_bmi2_fill_byte_by_byte

sequenceDecs_decode_bmi2_fill_check_overread:
	// Input exhausted: consuming more than the 64 container bits is an error.
	CMPQ DX, $0x40
	JA error_overread

sequenceDecs_decode_bmi2_fill_end:
	// Update offset
	// R14 = extra-bit count; rotate the container left by (consumed + count)
	// so the wanted bits land at the bottom, keep the low R14 bits, and add
	// the baseline (upper 32 bits of the state word).
	MOVQ $0x00000808, CX
	BEXTRQ CX, R8, R14
	MOVQ AX, R15
	LEAQ (DX)(R14*1), CX
	ROLQ CL, R15
	BZHIQ R14, R15, R15
	MOVQ CX, DX
	MOVQ R8, CX
	SHRQ $0x20, CX
	ADDQ R15, CX
	MOVQ CX, 16(R9)

	// Update match length
	MOVQ $0x00000808, CX
	BEXTRQ CX, DI, R14
	MOVQ AX, R15
	LEAQ (DX)(R14*1), CX
	ROLQ CL, R15
	BZHIQ R14, R15, R15
	MOVQ CX, DX
	MOVQ DI, CX
	SHRQ $0x20, CX
	ADDQ R15, CX
	MOVQ CX, 8(R9)

	// Fill bitreader to have enough for the remaining
	CMPQ BX, $0x08
	JL sequenceDecs_decode_bmi2_fill_2_byte_by_byte
	MOVQ DX, CX
	SHRQ $0x03, CX
	SUBQ CX, R13
	MOVQ (R13), AX
	SUBQ CX, BX
	ANDQ $0x07, DX
	JMP sequenceDecs_decode_bmi2_fill_2_end

sequenceDecs_decode_bmi2_fill_2_byte_by_byte:
	CMPQ BX, $0x00
	JLE sequenceDecs_decode_bmi2_fill_2_check_overread
	CMPQ DX, $0x07
	JLE sequenceDecs_decode_bmi2_fill_2_end
	SHLQ $0x08, AX
	SUBQ $0x01, R13
	SUBQ $0x01, BX
	SUBQ $0x08, DX
	MOVBQZX (R13), CX
	ORQ CX, AX
	JMP sequenceDecs_decode_bmi2_fill_2_byte_by_byte

sequenceDecs_decode_bmi2_fill_2_check_overread:
	CMPQ DX, $0x40
	JA error_overread

sequenceDecs_decode_bmi2_fill_2_end:
	// Update literal length
	MOVQ $0x00000808, CX
	BEXTRQ CX, SI, R14
	MOVQ AX, R15
	LEAQ (DX)(R14*1), CX
	ROLQ CL, R15
	BZHIQ R14, R15, R15
	MOVQ CX, DX
	MOVQ SI, CX
	SHRQ $0x20, CX
	ADDQ R15, CX
	MOVQ CX, (R9)

	// Fill bitreader for state updates
	// Persist the input cursor, and capture the offset's extra-bit count in
	// R13 before R8 is replaced; the offset adjustment below depends on it.
	MOVQ R13, (SP)
	MOVQ $0x00000808, CX
	BEXTRQ CX, R8, R13

	// The state update is skipped when the counter at 96(ctx) is zero.
	MOVQ ctx+16(FP), CX
	CMPQ 96(CX), $0x00
	JZ sequenceDecs_decode_bmi2_skip_update

	// R14 = low byte of SI+DI+R8, i.e. the three per-state bit counts added
	// together (modulo 256). All of those bits are pulled into R15 in one
	// read and then split up below, offset first.
	LEAQ (SI)(DI*1), R14
	ADDQ R8, R14
	MOVBQZX R14, R14
	LEAQ (DX)(R14*1), CX
	MOVQ AX, R15
	MOVQ CX, DX
	ROLQ CL, R15
	BZHIQ R14, R15, R15

	// Update Offset State
	// BZHI takes its bit index from the low byte of R8 and SHRX its count
	// from the low bits of R8, so the state word itself serves as the count.
	BZHIQ R8, R15, CX
	SHRXQ R8, R15, R15
	MOVQ $0x00001010, R14
	BEXTRQ R14, R8, R8
	ADDQ CX, R8

	// Load ctx.ofTable
	MOVQ ctx+16(FP), CX
	MOVQ 48(CX), CX
	MOVQ (CX)(R8*8), R8

	// Update Match Length State
	BZHIQ DI, R15, CX
	SHRXQ DI, R15, R15
	MOVQ $0x00001010, R14
	BEXTRQ R14, DI, DI
	ADDQ CX, DI

	// Load ctx.mlTable
	MOVQ ctx+16(FP), CX
	MOVQ 24(CX), CX
	MOVQ (CX)(DI*8), DI

	// Update Literal Length State
	BZHIQ SI, R15, CX
	MOVQ $0x00001010, R14
	BEXTRQ R14, SI, SI
	ADDQ CX, SI

	// Load ctx.llTable
	MOVQ ctx+16(FP), CX
	MOVQ (CX), CX
	MOVQ (CX)(SI*8), SI

sequenceDecs_decode_bmi2_skip_update:
	// Adjust offset
	// More than one extra bit: the decoded value is used as-is and pushed
	// onto the front of the R10/R11/R12 history.
	MOVQ 16(R9), CX
	CMPQ R13, $0x01
	JBE sequenceDecs_decode_bmi2_adjust_offsetB_1_or_0
	MOVQ R11, R12
	MOVQ R10, R11
	MOVQ CX, R10
	JMP sequenceDecs_decode_bmi2_after_adjust

sequenceDecs_decode_bmi2_adjust_offsetB_1_or_0:
	// A zero literal length bumps the decoded value by one.
	CMPQ (R9), $0x00000000
	JNE sequenceDecs_decode_bmi2_adjust_offset_maybezero
	INCQ CX
	JMP sequenceDecs_decode_bmi2_adjust_offset_nonzero

sequenceDecs_decode_bmi2_adjust_offset_maybezero:
	// Decoded value zero with non-zero literal length: reuse R10, history
	// left untouched.
	TESTQ CX, CX
	JNZ sequenceDecs_decode_bmi2_adjust_offset_nonzero
	MOVQ R10, CX
	JMP sequenceDecs_decode_bmi2_after_adjust

sequenceDecs_decode_bmi2_adjust_offset_nonzero:
	// Pick the candidate by CX: 0 -> R10, 1 -> R11, 2 -> R12, above 2 -> R10-1.
	CMPQ CX, $0x01
	JB sequenceDecs_decode_bmi2_adjust_zero
	JEQ sequenceDecs_decode_bmi2_adjust_one
	CMPQ CX, $0x02
	JA sequenceDecs_decode_bmi2_adjust_three
	JMP sequenceDecs_decode_bmi2_adjust_two

sequenceDecs_decode_bmi2_adjust_zero:
	MOVQ R10, R13
	JMP sequenceDecs_decode_bmi2_adjust_test_temp_valid

sequenceDecs_decode_bmi2_adjust_one:
	MOVQ R11, R13
	JMP sequenceDecs_decode_bmi2_adjust_test_temp_valid

sequenceDecs_decode_bmi2_adjust_two:
	MOVQ R12, R13
	JMP sequenceDecs_decode_bmi2_adjust_test_temp_valid

sequenceDecs_decode_bmi2_adjust_three:
	LEAQ -1(R10), R13

sequenceDecs_decode_bmi2_adjust_test_temp_valid:
	// A zero candidate is replaced by 1.
	TESTQ R13, R13
	JNZ sequenceDecs_decode_bmi2_adjust_temp_valid
	MOVQ $0x00000001, R13

sequenceDecs_decode_bmi2_adjust_temp_valid:
	// R12 takes R11 unless CX == 1; the candidate then becomes the newest
	// offset and the value stored for this sequence.
	CMPQ CX, $0x01
	CMOVQNE R11, R12
	MOVQ R10, R11
	MOVQ R13, R10
	MOVQ R13, CX

sequenceDecs_decode_bmi2_after_adjust:
	MOVQ CX, 16(R9)

	// Check values
	// R13 = match length, R14 = literal length. Their sum is accumulated at
	// 256(s); the literal length is deducted from the budget at 128(ctx).
	MOVQ 8(R9), R13
	MOVQ (R9), R14
	LEAQ (R13)(R14*1), R15
	MOVQ s+0(FP), BP
	ADDQ R15, 256(BP)
	MOVQ ctx+16(FP), R15
	SUBQ R14, 128(R15)
	JS error_not_enough_literals
	CMPQ R13, $0x00020002
	JA sequenceDecs_decode_bmi2_error_match_len_too_big
	TESTQ CX, CX
	JNZ sequenceDecs_decode_bmi2_match_len_ofs_ok
	TESTQ R13, R13
	JNZ sequenceDecs_decode_bmi2_error_match_len_ofs_mismatch

sequenceDecs_decode_bmi2_match_len_ofs_ok:
	// Advance to the next 24-byte record; loop while the counter stays >= 0.
	ADDQ $0x18, R9
	MOVQ ctx+16(FP), CX
	DECQ 96(CX)
	JNS sequenceDecs_decode_bmi2_main_loop

	// Write back the offset history and the bit-reader state.
	MOVQ s+0(FP), CX
	MOVQ R10, 144(CX)
	MOVQ R11, 152(CX)
	MOVQ R12, 160(CX)
	MOVQ br+8(FP), CX
	MOVQ AX, 24(CX)
	MOVB DL, 32(CX)
	MOVQ BX, 8(CX)

	// Return success
	MOVQ $0x00000000, ret+24(FP)
	RET

	// Return with match length error
sequenceDecs_decode_bmi2_error_match_len_ofs_mismatch:
	MOVQ $0x00000001, ret+24(FP)
	RET

	// Return with match too long error
sequenceDecs_decode_bmi2_error_match_len_too_big:
	MOVQ $0x00000002, ret+24(FP)
	RET

	// Return with match offset too long error
	// No label precedes this pair and it follows a RET, so nothing in this
	// function reaches it.
	MOVQ $0x00000003, ret+24(FP)
	RET

	// Return with not enough literals error
error_not_enough_literals:
	MOVQ $0x00000004, ret+24(FP)
	RET

	// Return with overread error
error_overread:
	MOVQ $0x00000006, ret+24(FP)
	RET
// func sequenceDecs_decode_56_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
// Requires: BMI, BMI2, CMOV
//
// Decodes a run of sequences using BEXTR/BZHI/SHRX for the bit-field work.
// Each iteration writes one 24-byte record through R9 (literal length at +0,
// match length at +8, offset at +16) and the loop runs until the counter at
// 96(ctx) goes negative. The bit container is refilled only once per
// sequence, before all three values are read.
// NOTE(review): the "56" in the name presumably refers to an upper bound on
// the bits one sequence can need; that bound is not checked here - confirm
// against the Go caller that selects this variant.
//
// Return value (ret+24(FP)):
//   0  success
//   1  non-zero match length together with a zero offset
//   2  match length above 0x20002
//   4  the literal budget at 128(ctx) went negative
//   6  more than 64 bits consumed with no input bytes left
// Only the success path writes the offsets and bit-reader state back.
//
// Register roles, as established by the loads below. Struct field names are
// not visible in this file; ll/ml/of follow the in-line comments.
//   AX             64-bit bit container, from 24(br)
//   DX             bits already consumed from AX, from 32(br)
//   BX             input bytes not yet pulled in, from 8(br)
//   (SP)           input cursor: (br)+8(br) at entry, moves downward
//   SI, DI, R8     literal-length / match-length / offset state words
//   R9             output record cursor, from 104(ctx)
//   R10, R11, R12  the three offsets kept at 144/152/160(s)
//   R13            input cursor copy, then offset extra-bit count / temporary
//   CX, R14, R15, BP  scratch
//
// State word layout as used below: bits 0-7 = bit count read for the next
// state, bits 8-15 = extra-bit count for the value, bits 16-31 = base index of
// the next state, bits 32-63 = value baseline. BEXTR control 0x0808 selects
// bits 8-15; control 0x1010 selects bits 16-31.
TEXT ·sequenceDecs_decode_56_bmi2(SB), $8-32
	MOVQ br+8(FP), BX
	MOVQ 24(BX), AX
	MOVBQZX 32(BX), DX
	MOVQ (BX), CX
	MOVQ 8(BX), BX
	ADDQ BX, CX
	MOVQ CX, (SP)
	MOVQ ctx+16(FP), CX
	MOVQ 72(CX), SI
	MOVQ 80(CX), DI
	MOVQ 88(CX), R8
	MOVQ 104(CX), R9
	MOVQ s+0(FP), CX
	MOVQ 144(CX), R10
	MOVQ 152(CX), R11
	MOVQ 160(CX), R12

sequenceDecs_decode_56_bmi2_main_loop:
	MOVQ (SP), R13

	// Fill bitreader to have enough for the offset and match length.
	// With at least 8 bytes left: step the cursor back by the whole bytes
	// already consumed, reload 8 bytes, keep only the sub-byte bit position.
	CMPQ BX, $0x08
	JL sequenceDecs_decode_56_bmi2_fill_byte_by_byte
	MOVQ DX, CX
	SHRQ $0x03, CX
	SUBQ CX, R13
	MOVQ (R13), AX
	SUBQ CX, BX
	ANDQ $0x07, DX
	JMP sequenceDecs_decode_56_bmi2_fill_end

sequenceDecs_decode_56_bmi2_fill_byte_by_byte:
	// Fewer than 8 bytes left: shift in one byte at a time while input
	// remains and at least 8 bits have been consumed.
	CMPQ BX, $0x00
	JLE sequenceDecs_decode_56_bmi2_fill_check_overread
	CMPQ DX, $0x07
	JLE sequenceDecs_decode_56_bmi2_fill_end
	SHLQ $0x08, AX
	SUBQ $0x01, R13
	SUBQ $0x01, BX
	SUBQ $0x08, DX
	MOVBQZX (R13), CX
	ORQ CX, AX
	JMP sequenceDecs_decode_56_bmi2_fill_byte_by_byte

sequenceDecs_decode_56_bmi2_fill_check_overread:
	// Input exhausted: consuming more than the 64 container bits is an error.
	CMPQ DX, $0x40
	JA error_overread

sequenceDecs_decode_56_bmi2_fill_end:
	// Update offset
	// R14 = extra-bit count; rotate the container left by (consumed + count)
	// so the wanted bits land at the bottom, keep the low R14 bits, and add
	// the baseline (upper 32 bits of the state word).
	MOVQ $0x00000808, CX
	BEXTRQ CX, R8, R14
	MOVQ AX, R15
	LEAQ (DX)(R14*1), CX
	ROLQ CL, R15
	BZHIQ R14, R15, R15
	MOVQ CX, DX
	MOVQ R8, CX
	SHRQ $0x20, CX
	ADDQ R15, CX
	MOVQ CX, 16(R9)

	// Update match length
	MOVQ $0x00000808, CX
	BEXTRQ CX, DI, R14
	MOVQ AX, R15
	LEAQ (DX)(R14*1), CX
	ROLQ CL, R15
	BZHIQ R14, R15, R15
	MOVQ CX, DX
	MOVQ DI, CX
	SHRQ $0x20, CX
	ADDQ R15, CX
	MOVQ CX, 8(R9)

	// Update literal length
	MOVQ $0x00000808, CX
	BEXTRQ CX, SI, R14
	MOVQ AX, R15
	LEAQ (DX)(R14*1), CX
	ROLQ CL, R15
	BZHIQ R14, R15, R15
	MOVQ CX, DX
	MOVQ SI, CX
	SHRQ $0x20, CX
	ADDQ R15, CX
	MOVQ CX, (R9)

	// Fill bitreader for state updates
	// Persist the input cursor, and capture the offset's extra-bit count in
	// R13 before R8 is replaced; the offset adjustment below depends on it.
	MOVQ R13, (SP)
	MOVQ $0x00000808, CX
	BEXTRQ CX, R8, R13

	// The state update is skipped when the counter at 96(ctx) is zero.
	MOVQ ctx+16(FP), CX
	CMPQ 96(CX), $0x00
	JZ sequenceDecs_decode_56_bmi2_skip_update

	// R14 = low byte of SI+DI+R8, i.e. the three per-state bit counts added
	// together (modulo 256). All of those bits are pulled into R15 in one
	// read and then split up below, offset first.
	LEAQ (SI)(DI*1), R14
	ADDQ R8, R14
	MOVBQZX R14, R14
	LEAQ (DX)(R14*1), CX
	MOVQ AX, R15
	MOVQ CX, DX
	ROLQ CL, R15
	BZHIQ R14, R15, R15

	// Update Offset State
	// BZHI takes its bit index from the low byte of R8 and SHRX its count
	// from the low bits of R8, so the state word itself serves as the count.
	BZHIQ R8, R15, CX
	SHRXQ R8, R15, R15
	MOVQ $0x00001010, R14
	BEXTRQ R14, R8, R8
	ADDQ CX, R8

	// Load ctx.ofTable
	MOVQ ctx+16(FP), CX
	MOVQ 48(CX), CX
	MOVQ (CX)(R8*8), R8

	// Update Match Length State
	BZHIQ DI, R15, CX
	SHRXQ DI, R15, R15
	MOVQ $0x00001010, R14
	BEXTRQ R14, DI, DI
	ADDQ CX, DI

	// Load ctx.mlTable
	MOVQ ctx+16(FP), CX
	MOVQ 24(CX), CX
	MOVQ (CX)(DI*8), DI

	// Update Literal Length State
	BZHIQ SI, R15, CX
	MOVQ $0x00001010, R14
	BEXTRQ R14, SI, SI
	ADDQ CX, SI

	// Load ctx.llTable
	MOVQ ctx+16(FP), CX
	MOVQ (CX), CX
	MOVQ (CX)(SI*8), SI

sequenceDecs_decode_56_bmi2_skip_update:
	// Adjust offset
	// More than one extra bit: the decoded value is used as-is and pushed
	// onto the front of the R10/R11/R12 history.
	MOVQ 16(R9), CX
	CMPQ R13, $0x01
	JBE sequenceDecs_decode_56_bmi2_adjust_offsetB_1_or_0
	MOVQ R11, R12
	MOVQ R10, R11
	MOVQ CX, R10
	JMP sequenceDecs_decode_56_bmi2_after_adjust

sequenceDecs_decode_56_bmi2_adjust_offsetB_1_or_0:
	// A zero literal length bumps the decoded value by one.
	CMPQ (R9), $0x00000000
	JNE sequenceDecs_decode_56_bmi2_adjust_offset_maybezero
	INCQ CX
	JMP sequenceDecs_decode_56_bmi2_adjust_offset_nonzero

sequenceDecs_decode_56_bmi2_adjust_offset_maybezero:
	// Decoded value zero with non-zero literal length: reuse R10, history
	// left untouched.
	TESTQ CX, CX
	JNZ sequenceDecs_decode_56_bmi2_adjust_offset_nonzero
	MOVQ R10, CX
	JMP sequenceDecs_decode_56_bmi2_after_adjust

sequenceDecs_decode_56_bmi2_adjust_offset_nonzero:
	// Pick the candidate by CX: 0 -> R10, 1 -> R11, 2 -> R12, above 2 -> R10-1.
	CMPQ CX, $0x01
	JB sequenceDecs_decode_56_bmi2_adjust_zero
	JEQ sequenceDecs_decode_56_bmi2_adjust_one
	CMPQ CX, $0x02
	JA sequenceDecs_decode_56_bmi2_adjust_three
	JMP sequenceDecs_decode_56_bmi2_adjust_two

sequenceDecs_decode_56_bmi2_adjust_zero:
	MOVQ R10, R13
	JMP sequenceDecs_decode_56_bmi2_adjust_test_temp_valid

sequenceDecs_decode_56_bmi2_adjust_one:
	MOVQ R11, R13
	JMP sequenceDecs_decode_56_bmi2_adjust_test_temp_valid

sequenceDecs_decode_56_bmi2_adjust_two:
	MOVQ R12, R13
	JMP sequenceDecs_decode_56_bmi2_adjust_test_temp_valid

sequenceDecs_decode_56_bmi2_adjust_three:
	LEAQ -1(R10), R13

sequenceDecs_decode_56_bmi2_adjust_test_temp_valid:
	// A zero candidate is replaced by 1.
	TESTQ R13, R13
	JNZ sequenceDecs_decode_56_bmi2_adjust_temp_valid
	MOVQ $0x00000001, R13

sequenceDecs_decode_56_bmi2_adjust_temp_valid:
	// R12 takes R11 unless CX == 1; the candidate then becomes the newest
	// offset and the value stored for this sequence.
	CMPQ CX, $0x01
	CMOVQNE R11, R12
	MOVQ R10, R11
	MOVQ R13, R10
	MOVQ R13, CX

sequenceDecs_decode_56_bmi2_after_adjust:
	MOVQ CX, 16(R9)

	// Check values
	// R13 = match length, R14 = literal length. Their sum is accumulated at
	// 256(s); the literal length is deducted from the budget at 128(ctx).
	MOVQ 8(R9), R13
	MOVQ (R9), R14
	LEAQ (R13)(R14*1), R15
	MOVQ s+0(FP), BP
	ADDQ R15, 256(BP)
	MOVQ ctx+16(FP), R15
	SUBQ R14, 128(R15)
	JS error_not_enough_literals
	CMPQ R13, $0x00020002
	JA sequenceDecs_decode_56_bmi2_error_match_len_too_big
	TESTQ CX, CX
	JNZ sequenceDecs_decode_56_bmi2_match_len_ofs_ok
	TESTQ R13, R13
	JNZ sequenceDecs_decode_56_bmi2_error_match_len_ofs_mismatch

sequenceDecs_decode_56_bmi2_match_len_ofs_ok:
	// Advance to the next 24-byte record; loop while the counter stays >= 0.
	ADDQ $0x18, R9
	MOVQ ctx+16(FP), CX
	DECQ 96(CX)
	JNS sequenceDecs_decode_56_bmi2_main_loop

	// Write back the offset history and the bit-reader state.
	MOVQ s+0(FP), CX
	MOVQ R10, 144(CX)
	MOVQ R11, 152(CX)
	MOVQ R12, 160(CX)
	MOVQ br+8(FP), CX
	MOVQ AX, 24(CX)
	MOVB DL, 32(CX)
	MOVQ BX, 8(CX)

	// Return success
	MOVQ $0x00000000, ret+24(FP)
	RET

	// Return with match length error
sequenceDecs_decode_56_bmi2_error_match_len_ofs_mismatch:
	MOVQ $0x00000001, ret+24(FP)
	RET

	// Return with match too long error
sequenceDecs_decode_56_bmi2_error_match_len_too_big:
	MOVQ $0x00000002, ret+24(FP)
	RET

	// Return with match offset too long error
	// No label precedes this pair and it follows a RET, so nothing in this
	// function reaches it.
	MOVQ $0x00000003, ret+24(FP)
	RET

	// Return with not enough literals error
error_not_enough_literals:
	MOVQ $0x00000004, ret+24(FP)
	RET

	// Return with overread error
error_overread:
	MOVQ $0x00000006, ret+24(FP)
	RET
// func sequenceDecs_executeSimple_amd64(ctx *executeAsmContext) bool
// Requires: SSE
//
// Executes decoded sequences: for each 24-byte record (literal length at +0,
// match length at +8, match offset at +16) it copies the literals to the
// output and then copies the match, taken from the history buffer, from the
// output written so far, or from both.
//
// Returns true (ret+8(FP) = 1) when every sequence was processed or when the
// sequence count at 8(ctx) is zero; returns false when a match offset exceeds
// either outPosition+len(hist) or the window size. Except on the empty-input
// path, the sequence index (24), output position (104) and the number of
// literal bytes consumed (112) are written back to ctx before returning.
//
// Register roles, as established by the loads below (names follow the
// in-line comments; the struct definition is not visible here):
//   AX   current sequence record (seqs base + 24*seqIndex)
//   CX   number of sequences, from 8(ctx)
//   DX   sequence index, from 24(ctx)
//   BX   output write pointer (out base + outPosition)
//   SI   literals read pointer, from 80(ctx)
//   DI   output position, from 104(ctx)
//   R8   window size, from 120(ctx)
//   R9   end of history (56(ctx) + 64(ctx))
//   R10  history length, from 64(ctx)
//   R11  literal length, then scratch
//   R12  match offset
//   R13  match length (remaining)
//   R14, R15, BP, X0  scratch
//
// The literal copy (copy_1) and the non-overlapping match copy (copy_2) move
// whole 16-byte chunks and can read and write up to 15 bytes past the
// requested length.
// NOTE(review): this presumably relies on the caller providing slack in the
// literal and output buffers - confirm at the call site.
TEXT ·sequenceDecs_executeSimple_amd64(SB), $8-9
	MOVQ ctx+0(FP), R10
	MOVQ 8(R10), CX
	TESTQ CX, CX
	JZ empty_seqs
	MOVQ (R10), AX
	MOVQ 24(R10), DX
	MOVQ 32(R10), BX
	MOVQ 80(R10), SI
	MOVQ 104(R10), DI
	MOVQ 120(R10), R8
	MOVQ 56(R10), R9
	MOVQ 64(R10), R10
	ADDQ R10, R9

	// seqsBase += 24 * seqIndex
	LEAQ (DX)(DX*2), R11
	SHLQ $0x03, R11
	ADDQ R11, AX

	// outBase += outPosition
	ADDQ DI, BX

main_loop:
	MOVQ (AX), R11
	MOVQ 16(AX), R12
	MOVQ 8(AX), R13

	// Copy literals
	// 16 bytes per step until at least R11 bytes have been moved.
	TESTQ R11, R11
	JZ check_offset
	XORQ R14, R14

copy_1:
	MOVUPS (SI)(R14*1), X0
	MOVUPS X0, (BX)(R14*1)
	ADDQ $0x10, R14
	CMPQ R14, R11
	JB copy_1
	ADDQ R11, SI
	ADDQ R11, BX
	ADDQ R11, DI

	// Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize)
check_offset:
	LEAQ (DI)(R10*1), R11
	CMPQ R12, R11
	JG error_match_off_too_big
	CMPQ R12, R8
	JG error_match_off_too_big

	// Copy match from history
	// R11 = offset - outPosition = bytes the match reaches into history.
	// Zero or a borrow means the match lies entirely in the output buffer.
	MOVQ R12, R11
	SUBQ DI, R11
	JLS copy_match
	MOVQ R9, R14
	SUBQ R11, R14
	CMPQ R13, R11
	JG copy_all_from_history

	// The whole match comes from history: exact-length copy of R13 bytes.
	// NOTE(review): the small-size dispatch has no 1-2 byte case, so match
	// lengths below 3 are presumably never produced - confirm with decoder.
	MOVQ R13, R11
	SUBQ $0x10, R11
	JB copy_4_small

copy_4_loop:
	MOVUPS (R14), X0
	MOVUPS X0, (BX)
	ADDQ $0x10, R14
	ADDQ $0x10, BX
	SUBQ $0x10, R11
	JAE copy_4_loop

	// R11 is now negative: rewind both pointers to the exact end and copy
	// the final 16 bytes so the copy ends precisely at the requested length.
	LEAQ 16(R14)(R11*1), R14
	LEAQ 16(BX)(R11*1), BX
	MOVUPS -16(R14), X0
	MOVUPS X0, -16(BX)
	JMP copy_4_end

copy_4_small:
	CMPQ R13, $0x03
	JE copy_4_move_3
	CMPQ R13, $0x08
	JB copy_4_move_4through7
	JMP copy_4_move_8through16

copy_4_move_3:
	MOVW (R14), R11
	MOVB 2(R14), R12
	MOVW R11, (BX)
	MOVB R12, 2(BX)
	ADDQ R13, R14
	ADDQ R13, BX
	JMP copy_4_end

copy_4_move_4through7:
	// Two possibly overlapping 4-byte moves cover the range exactly.
	MOVL (R14), R11
	MOVL -4(R14)(R13*1), R12
	MOVL R11, (BX)
	MOVL R12, -4(BX)(R13*1)
	ADDQ R13, R14
	ADDQ R13, BX
	JMP copy_4_end

copy_4_move_8through16:
	// Two possibly overlapping 8-byte moves cover the range exactly.
	MOVQ (R14), R11
	MOVQ -8(R14)(R13*1), R12
	MOVQ R11, (BX)
	MOVQ R12, -8(BX)(R13*1)
	ADDQ R13, R14
	ADDQ R13, BX

copy_4_end:
	ADDQ R13, DI
	ADDQ $0x18, AX
	INCQ DX
	CMPQ DX, CX
	JB main_loop
	JMP loop_finished

copy_all_from_history:
	// The match starts in history and continues in the output buffer: copy
	// the R11 history bytes exactly, then fall through to copy_match with
	// the remaining length.
	MOVQ R11, R15
	SUBQ $0x10, R15
	JB copy_5_small

copy_5_loop:
	MOVUPS (R14), X0
	MOVUPS X0, (BX)
	ADDQ $0x10, R14
	ADDQ $0x10, BX
	SUBQ $0x10, R15
	JAE copy_5_loop
	LEAQ 16(R14)(R15*1), R14
	LEAQ 16(BX)(R15*1), BX
	MOVUPS -16(R14), X0
	MOVUPS X0, -16(BX)
	JMP copy_5_end

copy_5_small:
	CMPQ R11, $0x03
	JE copy_5_move_3
	JB copy_5_move_1or2
	CMPQ R11, $0x08
	JB copy_5_move_4through7
	JMP copy_5_move_8through16

copy_5_move_1or2:
	// First and last byte; for a length of 1 both moves hit the same byte.
	MOVB (R14), R15
	MOVB -1(R14)(R11*1), BP
	MOVB R15, (BX)
	MOVB BP, -1(BX)(R11*1)
	ADDQ R11, R14
	ADDQ R11, BX
	JMP copy_5_end

copy_5_move_3:
	MOVW (R14), R15
	MOVB 2(R14), BP
	MOVW R15, (BX)
	MOVB BP, 2(BX)
	ADDQ R11, R14
	ADDQ R11, BX
	JMP copy_5_end

copy_5_move_4through7:
	MOVL (R14), R15
	MOVL -4(R14)(R11*1), BP
	MOVL R15, (BX)
	MOVL BP, -4(BX)(R11*1)
	ADDQ R11, R14
	ADDQ R11, BX
	JMP copy_5_end

copy_5_move_8through16:
	MOVQ (R14), R15
	MOVQ -8(R14)(R11*1), BP
	MOVQ R15, (BX)
	MOVQ BP, -8(BX)(R11*1)
	ADDQ R11, R14
	ADDQ R11, BX

copy_5_end:
	ADDQ R11, DI
	SUBQ R11, R13

	// Copy match from the current buffer
copy_match:
	// R11 = source inside the output buffer (write pointer - offset).
	MOVQ BX, R11
	SUBQ R12, R11

	// ml <= mo
	CMPQ R13, R12
	JA copy_overlapping_match

	// Copy non-overlapping match
	// 16 bytes per step while the remaining count stays above zero.
	ADDQ R13, DI
	MOVQ BX, R12
	ADDQ R13, BX

copy_2:
	MOVUPS (R11), X0
	MOVUPS X0, (R12)
	ADDQ $0x10, R11
	ADDQ $0x10, R12
	SUBQ $0x10, R13
	JHI copy_2
	JMP handle_loop

	// Copy overlapping match
	// Source and destination overlap (length > offset), so bytes are moved
	// one at a time in ascending order.
copy_overlapping_match:
	ADDQ R13, DI

copy_slow_3:
	MOVB (R11), R12
	MOVB R12, (BX)
	INCQ R11
	INCQ BX
	DECQ R13
	JNZ copy_slow_3

handle_loop:
	ADDQ $0x18, AX
	INCQ DX
	CMPQ DX, CX
	JB main_loop

loop_finished:
	// Return value
	MOVB $0x01, ret+8(FP)

	// Update the context
	// 112(ctx) receives SI minus the literals pointer loaded at entry.
	MOVQ ctx+0(FP), AX
	MOVQ DX, 24(AX)
	MOVQ DI, 104(AX)
	SUBQ 80(AX), SI
	MOVQ SI, 112(AX)
	RET

error_match_off_too_big:
	// Return value
	MOVB $0x00, ret+8(FP)

	// Update the context
	MOVQ ctx+0(FP), AX
	MOVQ DX, 24(AX)
	MOVQ DI, 104(AX)
	SUBQ 80(AX), SI
	MOVQ SI, 112(AX)
	RET

empty_seqs:
	// Return value
	MOVB $0x01, ret+8(FP)
	RET
// func sequenceDecs_executeSimple_safe_amd64(ctx *executeAsmContext) bool
// Requires: SSE
//
// Executes a run of already-decoded sequence records: for each record the
// literals are copied to the output, the match offset is validated, and the
// match is copied either from the history buffer, from the output written so
// far, or from both. Returns true on success (also when there are no
// sequences) and false when a match offset is rejected. On both exits the
// sequence index, output position and literal position are written back to
// ctx.
//
// The literal/history/non-overlapping copies use a 16-byte loop plus an
// overlapping 16-byte tail for lengths >= 16, and overlapping head/tail moves
// for shorter lengths, so a copy of n bytes (for the n each path is entered
// with) does not touch bytes past n.
//
// Register use inside the loop:
//   AX  pointer to the current 24-byte sequence record
//   CX  number of sequences (value at ctx+8)
//   DX  index of the current sequence
//   BX  output write pointer
//   SI  literals read pointer
//   DI  output position
//   R8  upper bound for a match offset (value at ctx+120)
//   R9  pointer just past the end of the history buffer
//   R10 history length
//   R11 literal length, later scratch
//   R12 match offset
//   R13 match length
//   R14, R15, BP, X0 scratch
TEXT ·sequenceDecs_executeSimple_safe_amd64(SB), $8-9
	MOVQ ctx+0(FP), R10
	MOVQ 8(R10), CX
	TESTQ CX, CX
	JZ empty_seqs
	MOVQ (R10), AX
	MOVQ 24(R10), DX
	MOVQ 32(R10), BX
	MOVQ 80(R10), SI
	MOVQ 104(R10), DI
	MOVQ 120(R10), R8
	MOVQ 56(R10), R9
	MOVQ 64(R10), R10
	// R9 = history base + history length (one past the last history byte)
	ADDQ R10, R9
	// seqsBase += 24 * seqIndex
	LEAQ (DX)(DX*2), R11
	SHLQ $0x03, R11
	ADDQ R11, AX
	// outBase += outPosition
	ADDQ DI, BX

main_loop:
	// Load the record: literal length at +0, match length at +8, offset at +16.
	MOVQ (AX), R11
	MOVQ 16(AX), R12
	MOVQ 8(AX), R13
	// Copy literals
	TESTQ R11, R11
	JZ check_offset
	MOVQ R11, R14
	SUBQ $0x10, R14
	JB copy_1_small

copy_1_loop:
	// 16 bytes per iteration while at least 16 bytes remained before the SUBQ.
	MOVUPS (SI), X0
	MOVUPS X0, (BX)
	ADDQ $0x10, SI
	ADDQ $0x10, BX
	SUBQ $0x10, R14
	JAE copy_1_loop
	// R14 is now in [-16, -1]: move both pointers to the exact end of the
	// copy and redo the final 16 bytes so nothing past the length is touched.
	LEAQ 16(SI)(R14*1), SI
	LEAQ 16(BX)(R14*1), BX
	MOVUPS -16(SI), X0
	MOVUPS X0, -16(BX)
	JMP copy_1_end

copy_1_small:
	// Literal length < 16: dispatch on the exact size class.
	CMPQ R11, $0x03
	JE copy_1_move_3
	JB copy_1_move_1or2
	CMPQ R11, $0x08
	JB copy_1_move_4through7
	JMP copy_1_move_8through16

copy_1_move_1or2:
	// First and last byte (the same byte when the length is 1).
	MOVB (SI), R14
	MOVB -1(SI)(R11*1), R15
	MOVB R14, (BX)
	MOVB R15, -1(BX)(R11*1)
	ADDQ R11, SI
	ADDQ R11, BX
	JMP copy_1_end

copy_1_move_3:
	MOVW (SI), R14
	MOVB 2(SI), R15
	MOVW R14, (BX)
	MOVB R15, 2(BX)
	ADDQ R11, SI
	ADDQ R11, BX
	JMP copy_1_end

copy_1_move_4through7:
	// Two possibly overlapping 4-byte moves: head and tail.
	MOVL (SI), R14
	MOVL -4(SI)(R11*1), R15
	MOVL R14, (BX)
	MOVL R15, -4(BX)(R11*1)
	ADDQ R11, SI
	ADDQ R11, BX
	JMP copy_1_end

copy_1_move_8through16:
	// Two possibly overlapping 8-byte moves: head and tail.
	MOVQ (SI), R14
	MOVQ -8(SI)(R11*1), R15
	MOVQ R14, (BX)
	MOVQ R15, -8(BX)(R11*1)
	ADDQ R11, SI
	ADDQ R11, BX

copy_1_end:
	// outPosition += literal length
	ADDQ R11, DI

	// Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize
check_offset:
	LEAQ (DI)(R10*1), R11
	CMPQ R12, R11
	JG error_match_off_too_big
	CMPQ R12, R8
	JG error_match_off_too_big
	// Copy match from history
	// R11 = offset - outPosition; when that is <= 0 (unsigned borrow or zero)
	// the whole match lies in the output already written.
	MOVQ R12, R11
	SUBQ DI, R11
	JLS copy_match
	// R14 = source pointer inside the history buffer (history end - R11).
	MOVQ R9, R14
	SUBQ R11, R14
	// If the match is longer than the R11 bytes available from that point in
	// history, it continues in the current output buffer.
	CMPQ R13, R11
	JG copy_all_from_history
	MOVQ R13, R11
	SUBQ $0x10, R11
	JB copy_4_small

copy_4_loop:
	MOVUPS (R14), X0
	MOVUPS X0, (BX)
	ADDQ $0x10, R14
	ADDQ $0x10, BX
	SUBQ $0x10, R11
	JAE copy_4_loop
	LEAQ 16(R14)(R11*1), R14
	LEAQ 16(BX)(R11*1), BX
	MOVUPS -16(R14), X0
	MOVUPS X0, -16(BX)
	JMP copy_4_end

copy_4_small:
	// NOTE(review): unlike the other small-copy dispatchers there is no
	// 1-or-2 byte case here; presumably a match length below 3 never reaches
	// this path - confirm against the caller.
	CMPQ R13, $0x03
	JE copy_4_move_3
	CMPQ R13, $0x08
	JB copy_4_move_4through7
	JMP copy_4_move_8through16

copy_4_move_3:
	MOVW (R14), R11
	MOVB 2(R14), R12
	MOVW R11, (BX)
	MOVB R12, 2(BX)
	ADDQ R13, R14
	ADDQ R13, BX
	JMP copy_4_end

copy_4_move_4through7:
	MOVL (R14), R11
	MOVL -4(R14)(R13*1), R12
	MOVL R11, (BX)
	MOVL R12, -4(BX)(R13*1)
	ADDQ R13, R14
	ADDQ R13, BX
	JMP copy_4_end

copy_4_move_8through16:
	MOVQ (R14), R11
	MOVQ -8(R14)(R13*1), R12
	MOVQ R11, (BX)
	MOVQ R12, -8(BX)(R13*1)
	ADDQ R13, R14
	ADDQ R13, BX

copy_4_end:
	// Match came entirely from history: advance to the next sequence here.
	ADDQ R13, DI
	ADDQ $0x18, AX
	INCQ DX
	CMPQ DX, CX
	JB main_loop
	JMP loop_finished

copy_all_from_history:
	// Copy the R11 bytes that remain in history; the rest of the match is
	// taken from the output buffer at copy_match.
	MOVQ R11, R15
	SUBQ $0x10, R15
	JB copy_5_small

copy_5_loop:
	MOVUPS (R14), X0
	MOVUPS X0, (BX)
	ADDQ $0x10, R14
	ADDQ $0x10, BX
	SUBQ $0x10, R15
	JAE copy_5_loop
	LEAQ 16(R14)(R15*1), R14
	LEAQ 16(BX)(R15*1), BX
	MOVUPS -16(R14), X0
	MOVUPS X0, -16(BX)
	JMP copy_5_end

copy_5_small:
	CMPQ R11, $0x03
	JE copy_5_move_3
	JB copy_5_move_1or2
	CMPQ R11, $0x08
	JB copy_5_move_4through7
	JMP copy_5_move_8through16

copy_5_move_1or2:
	// BP is used as a second scratch register in the copy_5 paths.
	MOVB (R14), R15
	MOVB -1(R14)(R11*1), BP
	MOVB R15, (BX)
	MOVB BP, -1(BX)(R11*1)
	ADDQ R11, R14
	ADDQ R11, BX
	JMP copy_5_end

copy_5_move_3:
	MOVW (R14), R15
	MOVB 2(R14), BP
	MOVW R15, (BX)
	MOVB BP, 2(BX)
	ADDQ R11, R14
	ADDQ R11, BX
	JMP copy_5_end

copy_5_move_4through7:
	MOVL (R14), R15
	MOVL -4(R14)(R11*1), BP
	MOVL R15, (BX)
	MOVL BP, -4(BX)(R11*1)
	ADDQ R11, R14
	ADDQ R11, BX
	JMP copy_5_end

copy_5_move_8through16:
	MOVQ (R14), R15
	MOVQ -8(R14)(R11*1), BP
	MOVQ R15, (BX)
	MOVQ BP, -8(BX)(R11*1)
	ADDQ R11, R14
	ADDQ R11, BX

copy_5_end:
	// Account for the bytes taken from history; R13 = bytes still to copy.
	ADDQ R11, DI
	SUBQ R11, R13

	// Copy match from the current buffer
copy_match:
	// R11 = source pointer = write pointer - offset
	MOVQ BX, R11
	SUBQ R12, R11
	// ml <= mo
	CMPQ R13, R12
	JA copy_overlapping_match
	// Copy non-overlapping match
	ADDQ R13, DI
	MOVQ R13, R12
	SUBQ $0x10, R12
	JB copy_2_small

copy_2_loop:
	MOVUPS (R11), X0
	MOVUPS X0, (BX)
	ADDQ $0x10, R11
	ADDQ $0x10, BX
	SUBQ $0x10, R12
	JAE copy_2_loop
	LEAQ 16(R11)(R12*1), R11
	LEAQ 16(BX)(R12*1), BX
	MOVUPS -16(R11), X0
	MOVUPS X0, -16(BX)
	JMP copy_2_end

copy_2_small:
	CMPQ R13, $0x03
	JE copy_2_move_3
	JB copy_2_move_1or2
	CMPQ R13, $0x08
	JB copy_2_move_4through7
	JMP copy_2_move_8through16

copy_2_move_1or2:
	MOVB (R11), R12
	MOVB -1(R11)(R13*1), R14
	MOVB R12, (BX)
	MOVB R14, -1(BX)(R13*1)
	ADDQ R13, R11
	ADDQ R13, BX
	JMP copy_2_end

copy_2_move_3:
	MOVW (R11), R12
	MOVB 2(R11), R14
	MOVW R12, (BX)
	MOVB R14, 2(BX)
	ADDQ R13, R11
	ADDQ R13, BX
	JMP copy_2_end

copy_2_move_4through7:
	MOVL (R11), R12
	MOVL -4(R11)(R13*1), R14
	MOVL R12, (BX)
	MOVL R14, -4(BX)(R13*1)
	ADDQ R13, R11
	ADDQ R13, BX
	JMP copy_2_end

copy_2_move_8through16:
	MOVQ (R11), R12
	MOVQ -8(R11)(R13*1), R14
	MOVQ R12, (BX)
	MOVQ R14, -8(BX)(R13*1)
	ADDQ R13, R11
	ADDQ R13, BX

copy_2_end:
	JMP handle_loop

	// Copy overlapping match
copy_overlapping_match:
	ADDQ R13, DI

copy_slow_3:
	// Source and destination overlap (length > offset), so copy one byte at
	// a time; each byte written may be read again later in this loop.
	MOVB (R11), R12
	MOVB R12, (BX)
	INCQ R11
	INCQ BX
	DECQ R13
	JNZ copy_slow_3

handle_loop:
	// Next 24-byte record; continue while index < number of sequences.
	ADDQ $0x18, AX
	INCQ DX
	CMPQ DX, CX
	JB main_loop

loop_finished:
	// Return value
	MOVB $0x01, ret+8(FP)
	// Update the context
	MOVQ ctx+0(FP), AX
	MOVQ DX, 24(AX)
	MOVQ DI, 104(AX)
	// Literal position = literals read pointer - literals base (ctx+80).
	SUBQ 80(AX), SI
	MOVQ SI, 112(AX)
	RET

error_match_off_too_big:
	// Return value
	MOVB $0x00, ret+8(FP)
	// Update the context
	MOVQ ctx+0(FP), AX
	MOVQ DX, 24(AX)
	MOVQ DI, 104(AX)
	SUBQ 80(AX), SI
	MOVQ SI, 112(AX)
	RET

empty_seqs:
	// Return value
	MOVB $0x01, ret+8(FP)
	RET
// func sequenceDecs_decodeSync_amd64(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
// Requires: CMOV, SSE
//
// Decodes sequences from the bit reader and executes each one immediately:
// per iteration it reads offset, match length and literal length, advances
// the three decoder states, resolves the offset against the three previous
// offsets stored at s+144/152/160, validates the values, then copies the
// literals and the match to the output.
//
// Return codes written to ret+24(FP):
//   0 success                        4 not enough literals
//   1 match length/offset mismatch   5 not enough output space
//   2 match length too big           6 bit reader overread
//   3 match offset too big
//
// The literal copy and the non-overlapping match copy in this variant move
// whole 16-byte blocks and can therefore read and write up to 15 bytes past
// the requested length.
//
// Register use:
//   DX  bit reader value (loaded from br+24)
//   BX  number of bits already consumed from DX (br+32)
//   SI  input bytes remaining (br+8)
//   DI  literal-length state, R8 match-length state, R9 offset state
//   R10 output write pointer
//   R11 literals read pointer
//   R12 output position
//   R13 input pointer, later scratch / match length
//   AX, CX, R14, R15, BP, X0 scratch
//
// Stack slots:
//   0(SP)  input pointer           32(SP) past-end pointer of the output
//   8(SP)  match offset            40(SP) history length
//   16(SP) match length            48(SP) pointer past the end of history
//   24(SP) literal length          56(SP) upper bound for a match offset
//
// State word layout, as used below: bits 0-7 number of bits for the state
// transition, bits 8-15 number of extra bits, bits 16-31 new-state base,
// bits 32-63 baseline value.
TEXT ·sequenceDecs_decodeSync_amd64(SB), $64-32
	MOVQ br+8(FP), CX
	MOVQ 24(CX), DX
	MOVBQZX 32(CX), BX
	MOVQ (CX), AX
	MOVQ 8(CX), SI
	// 0(SP) = input base + remaining bytes
	ADDQ SI, AX
	MOVQ AX, (SP)
	MOVQ ctx+16(FP), AX
	MOVQ 72(AX), DI
	MOVQ 80(AX), R8
	MOVQ 88(AX), R9
	// Clear the offset / match length / literal length slots.
	XORQ CX, CX
	MOVQ CX, 8(SP)
	MOVQ CX, 16(SP)
	MOVQ CX, 24(SP)
	MOVQ 112(AX), R10
	MOVQ 128(AX), CX
	MOVQ CX, 32(SP)
	MOVQ 144(AX), R11
	MOVQ 136(AX), R12
	MOVQ 200(AX), CX
	MOVQ CX, 56(SP)
	MOVQ 176(AX), CX
	MOVQ CX, 48(SP)
	MOVQ 184(AX), AX
	MOVQ AX, 40(SP)
	// 48(SP) = history base + history length
	MOVQ 40(SP), AX
	ADDQ AX, 48(SP)
	// Calculate pointer to s.out[cap(s.out)] (a past-end pointer)
	ADDQ R10, 32(SP)
	// outBase += outPosition
	ADDQ R12, R10

sequenceDecs_decodeSync_amd64_main_loop:
	MOVQ (SP), R13
	// Fill bitreader to have enough for the offset and match length.
	CMPQ SI, $0x08
	JL sequenceDecs_decodeSync_amd64_fill_byte_by_byte
	// Fast path: step the input pointer back by the whole bytes consumed
	// (BX / 8) and reload 8 bytes; only BX % 8 bits stay consumed.
	MOVQ BX, AX
	SHRQ $0x03, AX
	SUBQ AX, R13
	MOVQ (R13), DX
	SUBQ AX, SI
	ANDQ $0x07, BX
	JMP sequenceDecs_decodeSync_amd64_fill_end

sequenceDecs_decodeSync_amd64_fill_byte_by_byte:
	// Slow path: shift in one byte at a time while input remains and at
	// least 8 bits have been consumed.
	CMPQ SI, $0x00
	JLE sequenceDecs_decodeSync_amd64_fill_check_overread
	CMPQ BX, $0x07
	JLE sequenceDecs_decodeSync_amd64_fill_end
	SHLQ $0x08, DX
	SUBQ $0x01, R13
	SUBQ $0x01, SI
	SUBQ $0x08, BX
	MOVBQZX (R13), AX
	ORQ AX, DX
	JMP sequenceDecs_decodeSync_amd64_fill_byte_by_byte

sequenceDecs_decodeSync_amd64_fill_check_overread:
	// No input left: more than 64 consumed bits means we read past the data.
	CMPQ BX, $0x40
	JA error_overread

sequenceDecs_decodeSync_amd64_fill_end:
	// Update offset
	// R14 = DX with the consumed bits shifted out; CL = extra-bit count n
	// (state bits 8-15); AX = baseline (state bits 32-63).
	MOVQ R9, AX
	MOVQ BX, CX
	MOVQ DX, R14
	SHLQ CL, R14
	MOVB AH, CL
	SHRQ $0x20, AX
	TESTQ CX, CX
	JZ sequenceDecs_decodeSync_amd64_of_update_zero
	ADDQ CX, BX
	CMPQ BX, $0x40
	JA sequenceDecs_decodeSync_amd64_of_update_zero
	CMPQ CX, $0x40
	JAE sequenceDecs_decodeSync_amd64_of_update_zero
	// SHRQ uses only the low 6 bits of CL, so shifting by -n is a shift by
	// 64-n: this leaves the top n bits of R14 as the extra value.
	NEGQ CX
	SHRQ CL, R14
	ADDQ R14, AX

sequenceDecs_decodeSync_amd64_of_update_zero:
	MOVQ AX, 8(SP)
	// Update match length
	MOVQ R8, AX
	MOVQ BX, CX
	MOVQ DX, R14
	SHLQ CL, R14
	MOVB AH, CL
	SHRQ $0x20, AX
	TESTQ CX, CX
	JZ sequenceDecs_decodeSync_amd64_ml_update_zero
	ADDQ CX, BX
	CMPQ BX, $0x40
	JA sequenceDecs_decodeSync_amd64_ml_update_zero
	CMPQ CX, $0x40
	JAE sequenceDecs_decodeSync_amd64_ml_update_zero
	NEGQ CX
	SHRQ CL, R14
	ADDQ R14, AX

sequenceDecs_decodeSync_amd64_ml_update_zero:
	MOVQ AX, 16(SP)
	// Fill bitreader to have enough for the remaining
	CMPQ SI, $0x08
	JL sequenceDecs_decodeSync_amd64_fill_2_byte_by_byte
	MOVQ BX, AX
	SHRQ $0x03, AX
	SUBQ AX, R13
	MOVQ (R13), DX
	SUBQ AX, SI
	ANDQ $0x07, BX
	JMP sequenceDecs_decodeSync_amd64_fill_2_end

sequenceDecs_decodeSync_amd64_fill_2_byte_by_byte:
	CMPQ SI, $0x00
	JLE sequenceDecs_decodeSync_amd64_fill_2_check_overread
	CMPQ BX, $0x07
	JLE sequenceDecs_decodeSync_amd64_fill_2_end
	SHLQ $0x08, DX
	SUBQ $0x01, R13
	SUBQ $0x01, SI
	SUBQ $0x08, BX
	MOVBQZX (R13), AX
	ORQ AX, DX
	JMP sequenceDecs_decodeSync_amd64_fill_2_byte_by_byte

sequenceDecs_decodeSync_amd64_fill_2_check_overread:
	CMPQ BX, $0x40
	JA error_overread

sequenceDecs_decodeSync_amd64_fill_2_end:
	// Update literal length
	MOVQ DI, AX
	MOVQ BX, CX
	MOVQ DX, R14
	SHLQ CL, R14
	MOVB AH, CL
	SHRQ $0x20, AX
	TESTQ CX, CX
	JZ sequenceDecs_decodeSync_amd64_ll_update_zero
	ADDQ CX, BX
	CMPQ BX, $0x40
	JA sequenceDecs_decodeSync_amd64_ll_update_zero
	CMPQ CX, $0x40
	JAE sequenceDecs_decodeSync_amd64_ll_update_zero
	NEGQ CX
	SHRQ CL, R14
	ADDQ R14, AX

sequenceDecs_decodeSync_amd64_ll_update_zero:
	MOVQ AX, 24(SP)
	// Fill bitreader for state updates
	MOVQ R13, (SP)
	// AX = extra-bit count of the offset state (bits 8-15); kept until the
	// "Adjust offset" step below.
	MOVQ R9, AX
	SHRQ $0x08, AX
	MOVBQZX AL, AX
	// When the counter at ctx+96 is zero this is the last sequence and the
	// states are not advanced.
	MOVQ ctx+16(FP), CX
	CMPQ 96(CX), $0x00
	JZ sequenceDecs_decodeSync_amd64_skip_update
	// Update Literal Length State
	// R13 = number of bits to read, DI = new-state base; the bits are taken
	// by rotating DX left by (consumed + n) and masking with (1<<n)-1.
	MOVBQZX DI, R13
	SHRQ $0x10, DI
	MOVWQZX DI, DI
	LEAQ (BX)(R13*1), CX
	MOVQ DX, R14
	MOVQ CX, BX
	ROLQ CL, R14
	MOVL $0x00000001, R15
	MOVB R13, CL
	SHLL CL, R15
	DECL R15
	ANDQ R15, R14
	ADDQ R14, DI
	// Load ctx.llTable
	MOVQ ctx+16(FP), CX
	MOVQ (CX), CX
	MOVQ (CX)(DI*8), DI
	// Update Match Length State
	MOVBQZX R8, R13
	SHRQ $0x10, R8
	MOVWQZX R8, R8
	LEAQ (BX)(R13*1), CX
	MOVQ DX, R14
	MOVQ CX, BX
	ROLQ CL, R14
	MOVL $0x00000001, R15
	MOVB R13, CL
	SHLL CL, R15
	DECL R15
	ANDQ R15, R14
	ADDQ R14, R8
	// Load ctx.mlTable
	MOVQ ctx+16(FP), CX
	MOVQ 24(CX), CX
	MOVQ (CX)(R8*8), R8
	// Update Offset State
	MOVBQZX R9, R13
	SHRQ $0x10, R9
	MOVWQZX R9, R9
	LEAQ (BX)(R13*1), CX
	MOVQ DX, R14
	MOVQ CX, BX
	ROLQ CL, R14
	MOVL $0x00000001, R15
	MOVB R13, CL
	SHLL CL, R15
	DECL R15
	ANDQ R15, R14
	ADDQ R14, R9
	// Load ctx.ofTable
	MOVQ ctx+16(FP), CX
	MOVQ 48(CX), CX
	MOVQ (CX)(R9*8), R9

sequenceDecs_decodeSync_amd64_skip_update:
	// Adjust offset
	MOVQ s+0(FP), CX
	MOVQ 8(SP), R13
	CMPQ AX, $0x01
	JBE sequenceDecs_decodeSync_amd64_adjust_offsetB_1_or_0
	// More than one extra offset bit: the decoded value is used as is. The
	// two slots at s+144/152 move to s+152/160 and s+144 takes the new offset.
	MOVUPS 144(CX), X0
	MOVQ R13, 144(CX)
	MOVUPS X0, 152(CX)
	JMP sequenceDecs_decodeSync_amd64_after_adjust

sequenceDecs_decodeSync_amd64_adjust_offsetB_1_or_0:
	// The decoded value selects one of the previous offsets. With a zero
	// literal length the selector is incremented first.
	CMPQ 24(SP), $0x00000000
	JNE sequenceDecs_decodeSync_amd64_adjust_offset_maybezero
	INCQ R13
	JMP sequenceDecs_decodeSync_amd64_adjust_offset_nonzero

sequenceDecs_decodeSync_amd64_adjust_offset_maybezero:
	TESTQ R13, R13
	JNZ sequenceDecs_decodeSync_amd64_adjust_offset_nonzero
	// Selector 0: reuse the slot at s+144 without reordering.
	MOVQ 144(CX), R13
	JMP sequenceDecs_decodeSync_amd64_after_adjust

sequenceDecs_decodeSync_amd64_adjust_offset_nonzero:
	// R14 = slot[selector], except selector 3 which yields slot[0] - 1
	// (index forced to 0 and -1 added via the two CMOVs).
	MOVQ R13, AX
	XORQ R14, R14
	MOVQ $-1, R15
	CMPQ R13, $0x03
	CMOVQEQ R14, AX
	CMOVQEQ R15, R14
	ADDQ 144(CX)(AX*8), R14
	JNZ sequenceDecs_decodeSync_amd64_adjust_temp_valid
	// A resulting offset of zero is replaced by 1.
	MOVQ $0x00000001, R14

sequenceDecs_decodeSync_amd64_adjust_temp_valid:
	// Selector 1 leaves the third slot untouched.
	CMPQ R13, $0x01
	JZ sequenceDecs_decodeSync_amd64_adjust_skip
	MOVQ 152(CX), AX
	MOVQ AX, 160(CX)

sequenceDecs_decodeSync_amd64_adjust_skip:
	MOVQ 144(CX), AX
	MOVQ AX, 152(CX)
	MOVQ R14, 144(CX)
	MOVQ R14, R13

sequenceDecs_decodeSync_amd64_after_adjust:
	MOVQ R13, 8(SP)
	// Check values
	MOVQ 16(SP), AX
	MOVQ 24(SP), CX
	// Add match length + literal length to the running total at s+256.
	LEAQ (AX)(CX*1), R14
	MOVQ s+0(FP), R15
	ADDQ R14, 256(R15)
	// Subtract the literal length from the count at ctx+104; a negative
	// result means the sequence asks for more literals than are left.
	MOVQ ctx+16(FP), R14
	SUBQ CX, 104(R14)
	JS error_not_enough_literals
	CMPQ AX, $0x00020002
	JA sequenceDecs_decodeSync_amd64_error_match_len_too_big
	// A zero offset is only acceptable together with a zero match length.
	TESTQ R13, R13
	JNZ sequenceDecs_decodeSync_amd64_match_len_ofs_ok
	TESTQ AX, AX
	JNZ sequenceDecs_decodeSync_amd64_error_match_len_ofs_mismatch

sequenceDecs_decodeSync_amd64_match_len_ofs_ok:
	// AX = literal length, CX = match offset, R13 = match length
	MOVQ 24(SP), AX
	MOVQ 8(SP), CX
	MOVQ 16(SP), R13
	// Check if we have enough space in s.out
	LEAQ (AX)(R13*1), R14
	ADDQ R10, R14
	CMPQ R14, 32(SP)
	JA error_not_enough_space
	// Copy literals
	TESTQ AX, AX
	JZ check_offset
	XORQ R14, R14

copy_1:
	// Whole 16-byte blocks until the index reaches the literal length; the
	// last block may extend past it.
	MOVUPS (R11)(R14*1), X0
	MOVUPS X0, (R10)(R14*1)
	ADDQ $0x10, R14
	CMPQ R14, AX
	JB copy_1
	ADDQ AX, R11
	ADDQ AX, R10
	ADDQ AX, R12

	// Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize
check_offset:
	MOVQ R12, AX
	ADDQ 40(SP), AX
	CMPQ CX, AX
	JG error_match_off_too_big
	CMPQ CX, 56(SP)
	JG error_match_off_too_big
	// Copy match from history
	// AX = offset - outPosition; <= 0 means the match lies entirely in the
	// output written so far.
	MOVQ CX, AX
	SUBQ R12, AX
	JLS copy_match
	// R14 = source pointer inside history (history end - AX).
	MOVQ 48(SP), R14
	SUBQ AX, R14
	CMPQ R13, AX
	JG copy_all_from_history
	MOVQ R13, AX
	SUBQ $0x10, AX
	JB copy_4_small

copy_4_loop:
	// Length-exact copy: 16-byte loop, then the final 16 bytes are redone
	// ending exactly at the end of the match.
	MOVUPS (R14), X0
	MOVUPS X0, (R10)
	ADDQ $0x10, R14
	ADDQ $0x10, R10
	SUBQ $0x10, AX
	JAE copy_4_loop
	LEAQ 16(R14)(AX*1), R14
	LEAQ 16(R10)(AX*1), R10
	MOVUPS -16(R14), X0
	MOVUPS X0, -16(R10)
	JMP copy_4_end

copy_4_small:
	// NOTE(review): no 1-or-2 byte case here; presumably a match length
	// below 3 never reaches this path - confirm against the caller.
	CMPQ R13, $0x03
	JE copy_4_move_3
	CMPQ R13, $0x08
	JB copy_4_move_4through7
	JMP copy_4_move_8through16

copy_4_move_3:
	MOVW (R14), AX
	MOVB 2(R14), CL
	MOVW AX, (R10)
	MOVB CL, 2(R10)
	ADDQ R13, R14
	ADDQ R13, R10
	JMP copy_4_end

copy_4_move_4through7:
	MOVL (R14), AX
	MOVL -4(R14)(R13*1), CX
	MOVL AX, (R10)
	MOVL CX, -4(R10)(R13*1)
	ADDQ R13, R14
	ADDQ R13, R10
	JMP copy_4_end

copy_4_move_8through16:
	MOVQ (R14), AX
	MOVQ -8(R14)(R13*1), CX
	MOVQ AX, (R10)
	MOVQ CX, -8(R10)(R13*1)
	ADDQ R13, R14
	ADDQ R13, R10

copy_4_end:
	ADDQ R13, R12
	JMP handle_loop
	// Not reachable: the JMP above is unconditional.
	JMP loop_finished

copy_all_from_history:
	// Copy the AX bytes that remain in history; the rest of the match is
	// taken from the output buffer at copy_match.
	MOVQ AX, R15
	SUBQ $0x10, R15
	JB copy_5_small

copy_5_loop:
	MOVUPS (R14), X0
	MOVUPS X0, (R10)
	ADDQ $0x10, R14
	ADDQ $0x10, R10
	SUBQ $0x10, R15
	JAE copy_5_loop
	LEAQ 16(R14)(R15*1), R14
	LEAQ 16(R10)(R15*1), R10
	MOVUPS -16(R14), X0
	MOVUPS X0, -16(R10)
	JMP copy_5_end

copy_5_small:
	CMPQ AX, $0x03
	JE copy_5_move_3
	JB copy_5_move_1or2
	CMPQ AX, $0x08
	JB copy_5_move_4through7
	JMP copy_5_move_8through16

copy_5_move_1or2:
	// BP is used as a second scratch register in the copy_5 paths.
	MOVB (R14), R15
	MOVB -1(R14)(AX*1), BP
	MOVB R15, (R10)
	MOVB BP, -1(R10)(AX*1)
	ADDQ AX, R14
	ADDQ AX, R10
	JMP copy_5_end

copy_5_move_3:
	MOVW (R14), R15
	MOVB 2(R14), BP
	MOVW R15, (R10)
	MOVB BP, 2(R10)
	ADDQ AX, R14
	ADDQ AX, R10
	JMP copy_5_end

copy_5_move_4through7:
	MOVL (R14), R15
	MOVL -4(R14)(AX*1), BP
	MOVL R15, (R10)
	MOVL BP, -4(R10)(AX*1)
	ADDQ AX, R14
	ADDQ AX, R10
	JMP copy_5_end

copy_5_move_8through16:
	MOVQ (R14), R15
	MOVQ -8(R14)(AX*1), BP
	MOVQ R15, (R10)
	MOVQ BP, -8(R10)(AX*1)
	ADDQ AX, R14
	ADDQ AX, R10

copy_5_end:
	ADDQ AX, R12
	SUBQ AX, R13

	// Copy match from the current buffer
copy_match:
	// AX = source pointer = write pointer - offset
	MOVQ R10, AX
	SUBQ CX, AX
	// ml <= mo
	CMPQ R13, CX
	JA copy_overlapping_match
	// Copy non-overlapping match
	ADDQ R13, R12
	MOVQ R10, CX
	ADDQ R13, R10

copy_2:
	// Whole 16-byte blocks while bytes remain; the last block may extend
	// past the match length.
	MOVUPS (AX), X0
	MOVUPS X0, (CX)
	ADDQ $0x10, AX
	ADDQ $0x10, CX
	SUBQ $0x10, R13
	JHI copy_2
	JMP handle_loop

	// Copy overlapping match
copy_overlapping_match:
	ADDQ R13, R12

copy_slow_3:
	// Length > offset: copy byte by byte so bytes written earlier in this
	// loop can be read again as source.
	MOVB (AX), CL
	MOVB CL, (R10)
	INCQ AX
	INCQ R10
	DECQ R13
	JNZ copy_slow_3

handle_loop:
	// Decrement the counter at ctx+96; loop while it stays non-negative.
	MOVQ ctx+16(FP), AX
	DECQ 96(AX)
	JNS sequenceDecs_decodeSync_amd64_main_loop

loop_finished:
	// Write the bit reader state back.
	MOVQ br+8(FP), AX
	MOVQ DX, 24(AX)
	MOVB BL, 32(AX)
	MOVQ SI, 8(AX)
	// Update the context
	MOVQ ctx+16(FP), AX
	MOVQ R12, 136(AX)
	// Literal position = literals read pointer - literals base (ctx+144).
	MOVQ 144(AX), CX
	SUBQ CX, R11
	MOVQ R11, 168(AX)
	// Return success
	MOVQ $0x00000000, ret+24(FP)
	RET

	// Return with match length error
sequenceDecs_decodeSync_amd64_error_match_len_ofs_mismatch:
	MOVQ 16(SP), AX
	MOVQ ctx+16(FP), CX
	MOVQ AX, 216(CX)
	MOVQ $0x00000001, ret+24(FP)
	RET

	// Return with match too long error
sequenceDecs_decodeSync_amd64_error_match_len_too_big:
	MOVQ ctx+16(FP), AX
	MOVQ 16(SP), CX
	MOVQ CX, 216(AX)
	MOVQ $0x00000002, ret+24(FP)
	RET

	// Return with match offset too long error
error_match_off_too_big:
	MOVQ ctx+16(FP), AX
	MOVQ 8(SP), CX
	MOVQ CX, 224(AX)
	MOVQ R12, 136(AX)
	MOVQ $0x00000003, ret+24(FP)
	RET

	// Return with not enough literals error
error_not_enough_literals:
	MOVQ ctx+16(FP), AX
	MOVQ 24(SP), CX
	MOVQ CX, 208(AX)
	MOVQ $0x00000004, ret+24(FP)
	RET

	// Return with overread error
error_overread:
	MOVQ $0x00000006, ret+24(FP)
	RET

	// Return with not enough output space error
error_not_enough_space:
	MOVQ ctx+16(FP), AX
	MOVQ 24(SP), CX
	MOVQ CX, 208(AX)
	MOVQ 16(SP), CX
	MOVQ CX, 216(AX)
	MOVQ R12, 136(AX)
	MOVQ $0x00000005, ret+24(FP)
	RET
// func sequenceDecs_decodeSync_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
// Requires: BMI, BMI2, CMOV, SSE
//
// BMI/BMI2 variant of the combined decode-and-execute loop: per iteration it
// reads offset, match length and literal length from the bit reader (using
// BEXTR/BZHI/SHRX for the bit-field work), advances the three decoder states,
// resolves the offset against the three previous offsets stored at
// s+144/152/160, validates the values, then copies literals and match to the
// output.
//
// Return codes written to ret+24(FP):
//   0 success                        4 not enough literals
//   1 match length/offset mismatch   5 not enough output space
//   2 match length too big           6 bit reader overread
//   3 match offset too big
//
// The literal copy and the non-overlapping match copy in this variant move
// whole 16-byte blocks and can therefore read and write up to 15 bytes past
// the requested length.
//
// Register use:
//   AX  bit reader value (loaded from br+24)
//   DX  number of bits already consumed from AX (br+32)
//   BX  input bytes remaining (br+8)
//   SI  literal-length state, DI match-length state, R8 offset state
//   R9  output write pointer
//   R10 literals read pointer
//   R11 output position
//   R12 input pointer, later scratch / match offset
//   R13 scratch, later match length
//   CX, R14, R15, BP, X0 scratch
//
// Stack slots:
//   0(SP)  input pointer           32(SP) past-end pointer of the output
//   8(SP)  match offset            40(SP) history length
//   16(SP) match length            48(SP) pointer past the end of history
//   24(SP) literal length          56(SP) upper bound for a match offset
//
// State word layout, as used below: bits 0-7 number of bits for the state
// transition, bits 8-15 number of extra bits, bits 16-31 new-state base,
// bits 32-63 baseline value.
TEXT ·sequenceDecs_decodeSync_bmi2(SB), $64-32
	MOVQ br+8(FP), BX
	MOVQ 24(BX), AX
	MOVBQZX 32(BX), DX
	MOVQ (BX), CX
	MOVQ 8(BX), BX
	// 0(SP) = input base + remaining bytes
	ADDQ BX, CX
	MOVQ CX, (SP)
	MOVQ ctx+16(FP), CX
	MOVQ 72(CX), SI
	MOVQ 80(CX), DI
	MOVQ 88(CX), R8
	// Clear the offset / match length / literal length slots.
	XORQ R9, R9
	MOVQ R9, 8(SP)
	MOVQ R9, 16(SP)
	MOVQ R9, 24(SP)
	MOVQ 112(CX), R9
	MOVQ 128(CX), R10
	MOVQ R10, 32(SP)
	MOVQ 144(CX), R10
	MOVQ 136(CX), R11
	MOVQ 200(CX), R12
	MOVQ R12, 56(SP)
	MOVQ 176(CX), R12
	MOVQ R12, 48(SP)
	MOVQ 184(CX), CX
	MOVQ CX, 40(SP)
	// 48(SP) = history base + history length
	MOVQ 40(SP), CX
	ADDQ CX, 48(SP)
	// Calculate pointer to s.out[cap(s.out)] (a past-end pointer)
	ADDQ R9, 32(SP)
	// outBase += outPosition
	ADDQ R11, R9

sequenceDecs_decodeSync_bmi2_main_loop:
	MOVQ (SP), R12
	// Fill bitreader to have enough for the offset and match length.
	CMPQ BX, $0x08
	JL sequenceDecs_decodeSync_bmi2_fill_byte_by_byte
	// Fast path: step the input pointer back by the whole bytes consumed
	// (DX / 8) and reload 8 bytes; only DX % 8 bits stay consumed.
	MOVQ DX, CX
	SHRQ $0x03, CX
	SUBQ CX, R12
	MOVQ (R12), AX
	SUBQ CX, BX
	ANDQ $0x07, DX
	JMP sequenceDecs_decodeSync_bmi2_fill_end

sequenceDecs_decodeSync_bmi2_fill_byte_by_byte:
	// Slow path: shift in one byte at a time while input remains and at
	// least 8 bits have been consumed.
	CMPQ BX, $0x00
	JLE sequenceDecs_decodeSync_bmi2_fill_check_overread
	CMPQ DX, $0x07
	JLE sequenceDecs_decodeSync_bmi2_fill_end
	SHLQ $0x08, AX
	SUBQ $0x01, R12
	SUBQ $0x01, BX
	SUBQ $0x08, DX
	MOVBQZX (R12), CX
	ORQ CX, AX
	JMP sequenceDecs_decodeSync_bmi2_fill_byte_by_byte

sequenceDecs_decodeSync_bmi2_fill_check_overread:
	// No input left: more than 64 consumed bits means we read past the data.
	CMPQ DX, $0x40
	JA error_overread

sequenceDecs_decodeSync_bmi2_fill_end:
	// Update offset
	// R13 = extra-bit count n (BEXTR start 8, length 8 of the state).
	// Rotating AX left by (consumed + n) brings the next n bits to the
	// bottom, BZHI keeps just those, and the baseline (bits 32-63) is added.
	MOVQ $0x00000808, CX
	BEXTRQ CX, R8, R13
	MOVQ AX, R14
	LEAQ (DX)(R13*1), CX
	ROLQ CL, R14
	BZHIQ R13, R14, R14
	MOVQ CX, DX
	MOVQ R8, CX
	SHRQ $0x20, CX
	ADDQ R14, CX
	MOVQ CX, 8(SP)
	// Update match length
	MOVQ $0x00000808, CX
	BEXTRQ CX, DI, R13
	MOVQ AX, R14
	LEAQ (DX)(R13*1), CX
	ROLQ CL, R14
	BZHIQ R13, R14, R14
	MOVQ CX, DX
	MOVQ DI, CX
	SHRQ $0x20, CX
	ADDQ R14, CX
	MOVQ CX, 16(SP)
	// Fill bitreader to have enough for the remaining
	CMPQ BX, $0x08
	JL sequenceDecs_decodeSync_bmi2_fill_2_byte_by_byte
	MOVQ DX, CX
	SHRQ $0x03, CX
	SUBQ CX, R12
	MOVQ (R12), AX
	SUBQ CX, BX
	ANDQ $0x07, DX
	JMP sequenceDecs_decodeSync_bmi2_fill_2_end

sequenceDecs_decodeSync_bmi2_fill_2_byte_by_byte:
	CMPQ BX, $0x00
	JLE sequenceDecs_decodeSync_bmi2_fill_2_check_overread
	CMPQ DX, $0x07
	JLE sequenceDecs_decodeSync_bmi2_fill_2_end
	SHLQ $0x08, AX
	SUBQ $0x01, R12
	SUBQ $0x01, BX
	SUBQ $0x08, DX
	MOVBQZX (R12), CX
	ORQ CX, AX
	JMP sequenceDecs_decodeSync_bmi2_fill_2_byte_by_byte

sequenceDecs_decodeSync_bmi2_fill_2_check_overread:
	CMPQ DX, $0x40
	JA error_overread

sequenceDecs_decodeSync_bmi2_fill_2_end:
	// Update literal length
	MOVQ $0x00000808, CX
	BEXTRQ CX, SI, R13
	MOVQ AX, R14
	LEAQ (DX)(R13*1), CX
	ROLQ CL, R14
	BZHIQ R13, R14, R14
	MOVQ CX, DX
	MOVQ SI, CX
	SHRQ $0x20, CX
	ADDQ R14, CX
	MOVQ CX, 24(SP)
	// Fill bitreader for state updates
	MOVQ R12, (SP)
	// R12 = extra-bit count of the offset state (bits 8-15); kept until the
	// "Adjust offset" step below.
	MOVQ $0x00000808, CX
	BEXTRQ CX, R8, R12
	// When the counter at ctx+96 is zero this is the last sequence and the
	// states are not advanced.
	MOVQ ctx+16(FP), CX
	CMPQ 96(CX), $0x00
	JZ sequenceDecs_decodeSync_bmi2_skip_update
	// R13 = low byte of (SI + DI + R8): the combined number of transition
	// bits of the three states. All of them are fetched into R14 at once.
	LEAQ (SI)(DI*1), R13
	ADDQ R8, R13
	MOVBQZX R13, R13
	LEAQ (DX)(R13*1), CX
	MOVQ AX, R14
	MOVQ CX, DX
	ROLQ CL, R14
	BZHIQ R13, R14, R14
	// Update Offset State
	// CX = lowest n bits of R14 (n = low byte of the state), R14 is shifted
	// down by n for the next state, R8 = new-state base (bits 16-31) + CX.
	BZHIQ R8, R14, CX
	SHRXQ R8, R14, R14
	MOVQ $0x00001010, R13
	BEXTRQ R13, R8, R8
	ADDQ CX, R8
	// Load ctx.ofTable
	MOVQ ctx+16(FP), CX
	MOVQ 48(CX), CX
	MOVQ (CX)(R8*8), R8
	// Update Match Length State
	BZHIQ DI, R14, CX
	SHRXQ DI, R14, R14
	MOVQ $0x00001010, R13
	BEXTRQ R13, DI, DI
	ADDQ CX, DI
	// Load ctx.mlTable
	MOVQ ctx+16(FP), CX
	MOVQ 24(CX), CX
	MOVQ (CX)(DI*8), DI
	// Update Literal Length State
	BZHIQ SI, R14, CX
	MOVQ $0x00001010, R13
	BEXTRQ R13, SI, SI
	ADDQ CX, SI
	// Load ctx.llTable
	MOVQ ctx+16(FP), CX
	MOVQ (CX), CX
	MOVQ (CX)(SI*8), SI

sequenceDecs_decodeSync_bmi2_skip_update:
	// Adjust offset
	MOVQ s+0(FP), CX
	MOVQ 8(SP), R13
	CMPQ R12, $0x01
	JBE sequenceDecs_decodeSync_bmi2_adjust_offsetB_1_or_0
	// More than one extra offset bit: the decoded value is used as is. The
	// two slots at s+144/152 move to s+152/160 and s+144 takes the new offset.
	MOVUPS 144(CX), X0
	MOVQ R13, 144(CX)
	MOVUPS X0, 152(CX)
	JMP sequenceDecs_decodeSync_bmi2_after_adjust

sequenceDecs_decodeSync_bmi2_adjust_offsetB_1_or_0:
	// The decoded value selects one of the previous offsets. With a zero
	// literal length the selector is incremented first.
	CMPQ 24(SP), $0x00000000
	JNE sequenceDecs_decodeSync_bmi2_adjust_offset_maybezero
	INCQ R13
	JMP sequenceDecs_decodeSync_bmi2_adjust_offset_nonzero

sequenceDecs_decodeSync_bmi2_adjust_offset_maybezero:
	TESTQ R13, R13
	JNZ sequenceDecs_decodeSync_bmi2_adjust_offset_nonzero
	// Selector 0: reuse the slot at s+144 without reordering.
	MOVQ 144(CX), R13
	JMP sequenceDecs_decodeSync_bmi2_after_adjust

sequenceDecs_decodeSync_bmi2_adjust_offset_nonzero:
	// R14 = slot[selector], except selector 3 which yields slot[0] - 1
	// (index forced to 0 and -1 added via the two CMOVs).
	MOVQ R13, R12
	XORQ R14, R14
	MOVQ $-1, R15
	CMPQ R13, $0x03
	CMOVQEQ R14, R12
	CMOVQEQ R15, R14
	ADDQ 144(CX)(R12*8), R14
	JNZ sequenceDecs_decodeSync_bmi2_adjust_temp_valid
	// A resulting offset of zero is replaced by 1.
	MOVQ $0x00000001, R14

sequenceDecs_decodeSync_bmi2_adjust_temp_valid:
	// Selector 1 leaves the third slot untouched.
	CMPQ R13, $0x01
	JZ sequenceDecs_decodeSync_bmi2_adjust_skip
	MOVQ 152(CX), R12
	MOVQ R12, 160(CX)

sequenceDecs_decodeSync_bmi2_adjust_skip:
	MOVQ 144(CX), R12
	MOVQ R12, 152(CX)
	MOVQ R14, 144(CX)
	MOVQ R14, R13

sequenceDecs_decodeSync_bmi2_after_adjust:
	MOVQ R13, 8(SP)
	// Check values
	MOVQ 16(SP), CX
	MOVQ 24(SP), R12
	// Add match length + literal length to the running total at s+256.
	LEAQ (CX)(R12*1), R14
	MOVQ s+0(FP), R15
	ADDQ R14, 256(R15)
	// Subtract the literal length from the count at ctx+104; a negative
	// result means the sequence asks for more literals than are left.
	MOVQ ctx+16(FP), R14
	SUBQ R12, 104(R14)
	JS error_not_enough_literals
	CMPQ CX, $0x00020002
	JA sequenceDecs_decodeSync_bmi2_error_match_len_too_big
	// A zero offset is only acceptable together with a zero match length.
	TESTQ R13, R13
	JNZ sequenceDecs_decodeSync_bmi2_match_len_ofs_ok
	TESTQ CX, CX
	JNZ sequenceDecs_decodeSync_bmi2_error_match_len_ofs_mismatch

sequenceDecs_decodeSync_bmi2_match_len_ofs_ok:
	// CX = literal length, R12 = match offset, R13 = match length
	MOVQ 24(SP), CX
	MOVQ 8(SP), R12
	MOVQ 16(SP), R13
	// Check if we have enough space in s.out
	LEAQ (CX)(R13*1), R14
	ADDQ R9, R14
	CMPQ R14, 32(SP)
	JA error_not_enough_space
	// Copy literals
	TESTQ CX, CX
	JZ check_offset
	XORQ R14, R14

copy_1:
	// Whole 16-byte blocks until the index reaches the literal length; the
	// last block may extend past it.
	MOVUPS (R10)(R14*1), X0
	MOVUPS X0, (R9)(R14*1)
	ADDQ $0x10, R14
	CMPQ R14, CX
	JB copy_1
	ADDQ CX, R10
	ADDQ CX, R9
	ADDQ CX, R11

	// Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize
check_offset:
	MOVQ R11, CX
	ADDQ 40(SP), CX
	CMPQ R12, CX
	JG error_match_off_too_big
	CMPQ R12, 56(SP)
	JG error_match_off_too_big
	// Copy match from history
	// CX = offset - outPosition; <= 0 means the match lies entirely in the
	// output written so far.
	MOVQ R12, CX
	SUBQ R11, CX
	JLS copy_match
	// R14 = source pointer inside history (history end - CX).
	MOVQ 48(SP), R14
	SUBQ CX, R14
	CMPQ R13, CX
	JG copy_all_from_history
	MOVQ R13, CX
	SUBQ $0x10, CX
	JB copy_4_small

copy_4_loop:
	// Length-exact copy: 16-byte loop, then the final 16 bytes are redone
	// ending exactly at the end of the match.
	MOVUPS (R14), X0
	MOVUPS X0, (R9)
	ADDQ $0x10, R14
	ADDQ $0x10, R9
	SUBQ $0x10, CX
	JAE copy_4_loop
	LEAQ 16(R14)(CX*1), R14
	LEAQ 16(R9)(CX*1), R9
	MOVUPS -16(R14), X0
	MOVUPS X0, -16(R9)
	JMP copy_4_end

copy_4_small:
	// NOTE(review): no 1-or-2 byte case here; presumably a match length
	// below 3 never reaches this path - confirm against the caller.
	CMPQ R13, $0x03
	JE copy_4_move_3
	CMPQ R13, $0x08
	JB copy_4_move_4through7
	JMP copy_4_move_8through16

copy_4_move_3:
	MOVW (R14), CX
	MOVB 2(R14), R12
	MOVW CX, (R9)
	MOVB R12, 2(R9)
	ADDQ R13, R14
	ADDQ R13, R9
	JMP copy_4_end

copy_4_move_4through7:
	MOVL (R14), CX
	MOVL -4(R14)(R13*1), R12
	MOVL CX, (R9)
	MOVL R12, -4(R9)(R13*1)
	ADDQ R13, R14
	ADDQ R13, R9
	JMP copy_4_end

copy_4_move_8through16:
	MOVQ (R14), CX
	MOVQ -8(R14)(R13*1), R12
	MOVQ CX, (R9)
	MOVQ R12, -8(R9)(R13*1)
	ADDQ R13, R14
	ADDQ R13, R9

copy_4_end:
	ADDQ R13, R11
	JMP handle_loop
	// Not reachable: the JMP above is unconditional.
	JMP loop_finished

copy_all_from_history:
	// Copy the CX bytes that remain in history; the rest of the match is
	// taken from the output buffer at copy_match.
	MOVQ CX, R15
	SUBQ $0x10, R15
	JB copy_5_small

copy_5_loop:
	MOVUPS (R14), X0
	MOVUPS X0, (R9)
	ADDQ $0x10, R14
	ADDQ $0x10, R9
	SUBQ $0x10, R15
	JAE copy_5_loop
	LEAQ 16(R14)(R15*1), R14
	LEAQ 16(R9)(R15*1), R9
	MOVUPS -16(R14), X0
	MOVUPS X0, -16(R9)
	JMP copy_5_end

copy_5_small:
	CMPQ CX, $0x03
	JE copy_5_move_3
	JB copy_5_move_1or2
	CMPQ CX, $0x08
	JB copy_5_move_4through7
	JMP copy_5_move_8through16

copy_5_move_1or2:
	// BP is used as a second scratch register in the copy_5 paths.
	MOVB (R14), R15
	MOVB -1(R14)(CX*1), BP
	MOVB R15, (R9)
	MOVB BP, -1(R9)(CX*1)
	ADDQ CX, R14
	ADDQ CX, R9
	JMP copy_5_end

copy_5_move_3:
	MOVW (R14), R15
	MOVB 2(R14), BP
	MOVW R15, (R9)
	MOVB BP, 2(R9)
	ADDQ CX, R14
	ADDQ CX, R9
	JMP copy_5_end

copy_5_move_4through7:
	MOVL (R14), R15
	MOVL -4(R14)(CX*1), BP
	MOVL R15, (R9)
	MOVL BP, -4(R9)(CX*1)
	ADDQ CX, R14
	ADDQ CX, R9
	JMP copy_5_end

copy_5_move_8through16:
	MOVQ (R14), R15
	MOVQ -8(R14)(CX*1), BP
	MOVQ R15, (R9)
	MOVQ BP, -8(R9)(CX*1)
	ADDQ CX, R14
	ADDQ CX, R9

copy_5_end:
	ADDQ CX, R11
	SUBQ CX, R13

	// Copy match from the current buffer
copy_match:
	// CX = source pointer = write pointer - offset
	MOVQ R9, CX
	SUBQ R12, CX
	// ml <= mo
	CMPQ R13, R12
	JA copy_overlapping_match
	// Copy non-overlapping match
	ADDQ R13, R11
	MOVQ R9, R12
	ADDQ R13, R9

copy_2:
	// Whole 16-byte blocks while bytes remain; the last block may extend
	// past the match length.
	MOVUPS (CX), X0
	MOVUPS X0, (R12)
	ADDQ $0x10, CX
	ADDQ $0x10, R12
	SUBQ $0x10, R13
	JHI copy_2
	JMP handle_loop

	// Copy overlapping match
copy_overlapping_match:
	ADDQ R13, R11

copy_slow_3:
	// Length > offset: copy byte by byte so bytes written earlier in this
	// loop can be read again as source.
	MOVB (CX), R12
	MOVB R12, (R9)
	INCQ CX
	INCQ R9
	DECQ R13
	JNZ copy_slow_3

handle_loop:
	// Decrement the counter at ctx+96; loop while it stays non-negative.
	MOVQ ctx+16(FP), CX
	DECQ 96(CX)
	JNS sequenceDecs_decodeSync_bmi2_main_loop

loop_finished:
	// Write the bit reader state back.
	MOVQ br+8(FP), CX
	MOVQ AX, 24(CX)
	MOVB DL, 32(CX)
	MOVQ BX, 8(CX)
	// Update the context
	MOVQ ctx+16(FP), AX
	MOVQ R11, 136(AX)
	// Literal position = literals read pointer - literals base (ctx+144).
	MOVQ 144(AX), CX
	SUBQ CX, R10
	MOVQ R10, 168(AX)
	// Return success
	MOVQ $0x00000000, ret+24(FP)
	RET

	// Return with match length error
sequenceDecs_decodeSync_bmi2_error_match_len_ofs_mismatch:
	MOVQ 16(SP), AX
	MOVQ ctx+16(FP), CX
	MOVQ AX, 216(CX)
	MOVQ $0x00000001, ret+24(FP)
	RET

	// Return with match too long error
sequenceDecs_decodeSync_bmi2_error_match_len_too_big:
	MOVQ ctx+16(FP), AX
	MOVQ 16(SP), CX
	MOVQ CX, 216(AX)
	MOVQ $0x00000002, ret+24(FP)
	RET

	// Return with match offset too long error
error_match_off_too_big:
	MOVQ ctx+16(FP), AX
	MOVQ 8(SP), CX
	MOVQ CX, 224(AX)
	MOVQ R11, 136(AX)
	MOVQ $0x00000003, ret+24(FP)
	RET

	// Return with not enough literals error
error_not_enough_literals:
	MOVQ ctx+16(FP), AX
	MOVQ 24(SP), CX
	MOVQ CX, 208(AX)
	MOVQ $0x00000004, ret+24(FP)
	RET

	// Return with overread error
error_overread:
	MOVQ $0x00000006, ret+24(FP)
	RET

	// Return with not enough output space error
error_not_enough_space:
	MOVQ ctx+16(FP), AX
	MOVQ 24(SP), CX
	MOVQ CX, 208(AX)
	MOVQ 16(SP), CX
	MOVQ CX, 216(AX)
	MOVQ R11, 136(AX)
	MOVQ $0x00000005, ret+24(FP)
	RET
// func sequenceDecs_decodeSync_safe_amd64(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
// Requires: CMOV, SSE
TEXT ·sequenceDecs_decodeSync_safe_amd64(SB), $64-32
2023-10-02 21:49:16 +02:00
MOVQ br+8(FP), CX
MOVQ 24(CX), DX
MOVBQZX 32(CX), BX
MOVQ (CX), AX
MOVQ 8(CX), SI
2022-05-02 15:00:32 +02:00
ADDQ SI, AX
MOVQ AX, (SP)
MOVQ ctx+16(FP), AX
MOVQ 72(AX), DI
MOVQ 80(AX), R8
MOVQ 88(AX), R9
2022-07-21 20:10:25 +02:00
XORQ CX, CX
MOVQ CX, 8(SP)
MOVQ CX, 16(SP)
MOVQ CX, 24(SP)
2022-05-02 15:00:32 +02:00
MOVQ 112(AX), R10
MOVQ 128(AX), CX
MOVQ CX, 32(SP)
MOVQ 144(AX), R11
MOVQ 136(AX), R12
MOVQ 200(AX), CX
MOVQ CX, 56(SP)
MOVQ 176(AX), CX
MOVQ CX, 48(SP)
MOVQ 184(AX), AX
MOVQ AX, 40(SP)
MOVQ 40(SP), AX
ADDQ AX, 48(SP)
// Calculate pointer to s.out[cap(s.out)] (a past-end pointer)
ADDQ R10, 32(SP)
// outBase += outPosition
ADDQ R12, R10
sequenceDecs_decodeSync_safe_amd64_main_loop:
MOVQ (SP), R13
// Fill bitreader to have enough for the offset and match length.
CMPQ SI, $0x08
JL sequenceDecs_decodeSync_safe_amd64_fill_byte_by_byte
MOVQ BX, AX
SHRQ $0x03, AX
SUBQ AX, R13
MOVQ (R13), DX
SUBQ AX, SI
ANDQ $0x07, BX
JMP sequenceDecs_decodeSync_safe_amd64_fill_end
sequenceDecs_decodeSync_safe_amd64_fill_byte_by_byte:
CMPQ SI, $0x00
JLE sequenceDecs_decodeSync_safe_amd64_fill_check_overread
2022-05-02 15:00:32 +02:00
CMPQ BX, $0x07
JLE sequenceDecs_decodeSync_safe_amd64_fill_end
SHLQ $0x08, DX
SUBQ $0x01, R13
SUBQ $0x01, SI
SUBQ $0x08, BX
MOVBQZX (R13), AX
ORQ AX, DX
JMP sequenceDecs_decodeSync_safe_amd64_fill_byte_by_byte
sequenceDecs_decodeSync_safe_amd64_fill_check_overread:
CMPQ BX, $0x40
JA error_overread
2022-05-02 15:00:32 +02:00
sequenceDecs_decodeSync_safe_amd64_fill_end:
// Update offset
2022-07-21 20:10:25 +02:00
MOVQ R9, AX
MOVQ BX, CX
MOVQ DX, R14
SHLQ CL, R14
MOVB AH, CL
SHRQ $0x20, AX
TESTQ CX, CX
JZ sequenceDecs_decodeSync_safe_amd64_of_update_zero
ADDQ CX, BX
CMPQ BX, $0x40
JA sequenceDecs_decodeSync_safe_amd64_of_update_zero
CMPQ CX, $0x40
JAE sequenceDecs_decodeSync_safe_amd64_of_update_zero
NEGQ CX
SHRQ CL, R14
ADDQ R14, AX
sequenceDecs_decodeSync_safe_amd64_of_update_zero:
MOVQ AX, 8(SP)
2022-05-02 15:00:32 +02:00
// Update match length
2022-07-21 20:10:25 +02:00
MOVQ R8, AX
MOVQ BX, CX
MOVQ DX, R14
SHLQ CL, R14
MOVB AH, CL
SHRQ $0x20, AX
TESTQ CX, CX
JZ sequenceDecs_decodeSync_safe_amd64_ml_update_zero
ADDQ CX, BX
CMPQ BX, $0x40
JA sequenceDecs_decodeSync_safe_amd64_ml_update_zero
CMPQ CX, $0x40
JAE sequenceDecs_decodeSync_safe_amd64_ml_update_zero
NEGQ CX
SHRQ CL, R14
ADDQ R14, AX
sequenceDecs_decodeSync_safe_amd64_ml_update_zero:
MOVQ AX, 16(SP)
2022-05-02 15:00:32 +02:00
// Fill bitreader to have enough for the remaining
CMPQ SI, $0x08
JL sequenceDecs_decodeSync_safe_amd64_fill_2_byte_by_byte
MOVQ BX, AX
SHRQ $0x03, AX
SUBQ AX, R13
MOVQ (R13), DX
SUBQ AX, SI
ANDQ $0x07, BX
JMP sequenceDecs_decodeSync_safe_amd64_fill_2_end
sequenceDecs_decodeSync_safe_amd64_fill_2_byte_by_byte:
CMPQ SI, $0x00
JLE sequenceDecs_decodeSync_safe_amd64_fill_2_check_overread
2022-05-02 15:00:32 +02:00
CMPQ BX, $0x07
JLE sequenceDecs_decodeSync_safe_amd64_fill_2_end
SHLQ $0x08, DX
SUBQ $0x01, R13
SUBQ $0x01, SI
SUBQ $0x08, BX
MOVBQZX (R13), AX
ORQ AX, DX
JMP sequenceDecs_decodeSync_safe_amd64_fill_2_byte_by_byte
sequenceDecs_decodeSync_safe_amd64_fill_2_check_overread:
CMPQ BX, $0x40
JA error_overread
2022-05-02 15:00:32 +02:00
sequenceDecs_decodeSync_safe_amd64_fill_2_end:
// Update literal length
2022-07-21 20:10:25 +02:00
MOVQ DI, AX
MOVQ BX, CX
MOVQ DX, R14
SHLQ CL, R14
MOVB AH, CL
SHRQ $0x20, AX
TESTQ CX, CX
JZ sequenceDecs_decodeSync_safe_amd64_ll_update_zero
ADDQ CX, BX
CMPQ BX, $0x40
JA sequenceDecs_decodeSync_safe_amd64_ll_update_zero
CMPQ CX, $0x40
JAE sequenceDecs_decodeSync_safe_amd64_ll_update_zero
NEGQ CX
SHRQ CL, R14
ADDQ R14, AX
sequenceDecs_decodeSync_safe_amd64_ll_update_zero:
MOVQ AX, 24(SP)
2022-05-02 15:00:32 +02:00
// Fill bitreader for state updates
MOVQ R13, (SP)
MOVQ R9, AX
SHRQ $0x08, AX
MOVBQZX AL, AX
MOVQ ctx+16(FP), CX
CMPQ 96(CX), $0x00
JZ sequenceDecs_decodeSync_safe_amd64_skip_update
// Update Literal Length State
MOVBQZX DI, R13
SHRQ $0x10, DI
MOVWQZX DI, DI
2022-07-13 15:43:46 +02:00
LEAQ (BX)(R13*1), CX
2022-05-02 15:00:32 +02:00
MOVQ DX, R14
2022-07-13 15:43:46 +02:00
MOVQ CX, BX
ROLQ CL, R14
MOVL $0x00000001, R15
MOVB R13, CL
SHLL CL, R15
DECL R15
ANDQ R15, R14
2022-05-02 15:00:32 +02:00
ADDQ R14, DI
// Load ctx.llTable
MOVQ ctx+16(FP), CX
MOVQ (CX), CX
MOVQ (CX)(DI*8), DI
// Update Match Length State
MOVBQZX R8, R13
SHRQ $0x10, R8
MOVWQZX R8, R8
2022-07-13 15:43:46 +02:00
LEAQ (BX)(R13*1), CX
2022-05-02 15:00:32 +02:00
MOVQ DX, R14
2022-07-13 15:43:46 +02:00
MOVQ CX, BX
ROLQ CL, R14
MOVL $0x00000001, R15
MOVB R13, CL
SHLL CL, R15
DECL R15
ANDQ R15, R14
2022-05-02 15:00:32 +02:00
ADDQ R14, R8
// Load ctx.mlTable
MOVQ ctx+16(FP), CX
MOVQ 24(CX), CX
MOVQ (CX)(R8*8), R8
// Update Offset State
MOVBQZX R9, R13
SHRQ $0x10, R9
MOVWQZX R9, R9
2022-07-13 15:43:46 +02:00
LEAQ (BX)(R13*1), CX
2022-05-02 15:00:32 +02:00
MOVQ DX, R14
2022-07-13 15:43:46 +02:00
MOVQ CX, BX
ROLQ CL, R14
MOVL $0x00000001, R15
MOVB R13, CL
SHLL CL, R15
DECL R15
ANDQ R15, R14
2022-05-02 15:00:32 +02:00
ADDQ R14, R9
// Load ctx.ofTable
MOVQ ctx+16(FP), CX
MOVQ 48(CX), CX
MOVQ (CX)(R9*8), R9
sequenceDecs_decodeSync_safe_amd64_skip_update:
// Adjust offset
MOVQ s+0(FP), CX
MOVQ 8(SP), R13
CMPQ AX, $0x01
JBE sequenceDecs_decodeSync_safe_amd64_adjust_offsetB_1_or_0
MOVUPS 144(CX), X0
MOVQ R13, 144(CX)
MOVUPS X0, 152(CX)
2022-07-21 20:10:25 +02:00
JMP sequenceDecs_decodeSync_safe_amd64_after_adjust
2022-05-02 15:00:32 +02:00
sequenceDecs_decodeSync_safe_amd64_adjust_offsetB_1_or_0:
CMPQ 24(SP), $0x00000000
JNE sequenceDecs_decodeSync_safe_amd64_adjust_offset_maybezero
INCQ R13
JMP sequenceDecs_decodeSync_safe_amd64_adjust_offset_nonzero
sequenceDecs_decodeSync_safe_amd64_adjust_offset_maybezero:
TESTQ R13, R13
JNZ sequenceDecs_decodeSync_safe_amd64_adjust_offset_nonzero
MOVQ 144(CX), R13
2022-07-21 20:10:25 +02:00
JMP sequenceDecs_decodeSync_safe_amd64_after_adjust
2022-05-02 15:00:32 +02:00
sequenceDecs_decodeSync_safe_amd64_adjust_offset_nonzero:
MOVQ R13, AX
XORQ R14, R14
MOVQ $-1, R15
CMPQ R13, $0x03
CMOVQEQ R14, AX
CMOVQEQ R15, R14
2022-07-21 20:10:25 +02:00
ADDQ 144(CX)(AX*8), R14
2022-05-02 15:00:32 +02:00
JNZ sequenceDecs_decodeSync_safe_amd64_adjust_temp_valid
MOVQ $0x00000001, R14
sequenceDecs_decodeSync_safe_amd64_adjust_temp_valid:
CMPQ R13, $0x01
JZ sequenceDecs_decodeSync_safe_amd64_adjust_skip
MOVQ 152(CX), AX
MOVQ AX, 160(CX)
sequenceDecs_decodeSync_safe_amd64_adjust_skip:
MOVQ 144(CX), AX
MOVQ AX, 152(CX)
MOVQ R14, 144(CX)
MOVQ R14, R13
2022-07-21 20:10:25 +02:00
sequenceDecs_decodeSync_safe_amd64_after_adjust:
2022-05-02 15:00:32 +02:00
MOVQ R13, 8(SP)
// Check values
MOVQ 16(SP), AX
MOVQ 24(SP), CX
LEAQ (AX)(CX*1), R14
MOVQ s+0(FP), R15
ADDQ R14, 256(R15)
MOVQ ctx+16(FP), R14
SUBQ CX, 104(R14)
JS error_not_enough_literals
CMPQ AX, $0x00020002
JA sequenceDecs_decodeSync_safe_amd64_error_match_len_too_big
TESTQ R13, R13
JNZ sequenceDecs_decodeSync_safe_amd64_match_len_ofs_ok
TESTQ AX, AX
JNZ sequenceDecs_decodeSync_safe_amd64_error_match_len_ofs_mismatch
sequenceDecs_decodeSync_safe_amd64_match_len_ofs_ok:
MOVQ 24(SP), AX
MOVQ 8(SP), CX
MOVQ 16(SP), R13
// Check if we have enough space in s.out
LEAQ (AX)(R13*1), R14
ADDQ R10, R14
CMPQ R14, 32(SP)
JA error_not_enough_space
// Copy literals
TESTQ AX, AX
JZ check_offset
2022-07-13 15:43:46 +02:00
MOVQ AX, R14
SUBQ $0x10, R14
JB copy_1_small
2022-05-02 15:00:32 +02:00
2022-07-13 15:43:46 +02:00
copy_1_loop:
MOVUPS (R11), X0
MOVUPS X0, (R10)
ADDQ $0x10, R11
ADDQ $0x10, R10
SUBQ $0x10, R14
JAE copy_1_loop
LEAQ 16(R11)(R14*1), R11
LEAQ 16(R10)(R14*1), R10
MOVUPS -16(R11), X0
MOVUPS X0, -16(R10)
JMP copy_1_end
copy_1_small:
CMPQ AX, $0x03
JE copy_1_move_3
JB copy_1_move_1or2
CMPQ AX, $0x08
JB copy_1_move_4through7
JMP copy_1_move_8through16
copy_1_move_1or2:
MOVB (R11), R14
MOVB -1(R11)(AX*1), R15
MOVB R14, (R10)
MOVB R15, -1(R10)(AX*1)
ADDQ AX, R11
ADDQ AX, R10
JMP copy_1_end
copy_1_move_3:
MOVW (R11), R14
MOVB 2(R11), R15
MOVW R14, (R10)
MOVB R15, 2(R10)
ADDQ AX, R11
ADDQ AX, R10
JMP copy_1_end
copy_1_move_4through7:
MOVL (R11), R14
MOVL -4(R11)(AX*1), R15
MOVL R14, (R10)
MOVL R15, -4(R10)(AX*1)
ADDQ AX, R11
ADDQ AX, R10
JMP copy_1_end
2022-05-02 15:00:32 +02:00
2022-07-13 15:43:46 +02:00
copy_1_move_8through16:
MOVQ (R11), R14
MOVQ -8(R11)(AX*1), R15
MOVQ R14, (R10)
MOVQ R15, -8(R10)(AX*1)
2022-05-02 15:00:32 +02:00
ADDQ AX, R11
ADDQ AX, R10
2022-07-13 15:43:46 +02:00
copy_1_end:
2022-05-02 15:00:32 +02:00
ADDQ AX, R12
// Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize
check_offset:
MOVQ R12, AX
ADDQ 40(SP), AX
CMPQ CX, AX
JG error_match_off_too_big
CMPQ CX, 56(SP)
JG error_match_off_too_big
// Copy match from history
2022-07-13 15:43:46 +02:00
MOVQ CX, AX
SUBQ R12, AX
JLS copy_match
MOVQ 48(SP), R14
SUBQ AX, R14
CMPQ R13, AX
JG copy_all_from_history
MOVQ R13, AX
SUBQ $0x10, AX
JB copy_4_small
2022-05-02 15:00:32 +02:00
2022-07-13 15:43:46 +02:00
copy_4_loop:
MOVUPS (R14), X0
MOVUPS X0, (R10)
ADDQ $0x10, R14
ADDQ $0x10, R10
SUBQ $0x10, AX
JAE copy_4_loop
LEAQ 16(R14)(AX*1), R14
LEAQ 16(R10)(AX*1), R10
MOVUPS -16(R14), X0
MOVUPS X0, -16(R10)
JMP copy_4_end
copy_4_small:
CMPQ R13, $0x03
JE copy_4_move_3
CMPQ R13, $0x08
JB copy_4_move_4through7
JMP copy_4_move_8through16
copy_4_move_3:
MOVW (R14), AX
MOVB 2(R14), CL
MOVW AX, (R10)
MOVB CL, 2(R10)
ADDQ R13, R14
ADDQ R13, R10
JMP copy_4_end
copy_4_move_4through7:
MOVL (R14), AX
MOVL -4(R14)(R13*1), CX
MOVL AX, (R10)
MOVL CX, -4(R10)(R13*1)
ADDQ R13, R14
2022-05-02 15:00:32 +02:00
ADDQ R13, R10
2022-07-13 15:43:46 +02:00
JMP copy_4_end
copy_4_move_8through16:
MOVQ (R14), AX
MOVQ -8(R14)(R13*1), CX
MOVQ AX, (R10)
MOVQ CX, -8(R10)(R13*1)
ADDQ R13, R14
ADDQ R13, R10
copy_4_end:
ADDQ R13, R12
2022-05-02 15:00:32 +02:00
JMP handle_loop
JMP loop_finished
copy_all_from_history:
2022-07-13 15:43:46 +02:00
MOVQ AX, R15
SUBQ $0x10, R15
JB copy_5_small
copy_5_loop:
MOVUPS (R14), X0
MOVUPS X0, (R10)
ADDQ $0x10, R14
ADDQ $0x10, R10
SUBQ $0x10, R15
JAE copy_5_loop
LEAQ 16(R14)(R15*1), R14
LEAQ 16(R10)(R15*1), R10
MOVUPS -16(R14), X0
MOVUPS X0, -16(R10)
JMP copy_5_end
copy_5_small:
CMPQ AX, $0x03
JE copy_5_move_3
JB copy_5_move_1or2
CMPQ AX, $0x08
JB copy_5_move_4through7
JMP copy_5_move_8through16
copy_5_move_1or2:
MOVB (R14), R15
MOVB -1(R14)(AX*1), BP
MOVB R15, (R10)
MOVB BP, -1(R10)(AX*1)
ADDQ AX, R14
ADDQ AX, R10
JMP copy_5_end
copy_5_move_3:
MOVW (R14), R15
MOVB 2(R14), BP
MOVW R15, (R10)
MOVB BP, 2(R10)
ADDQ AX, R14
ADDQ AX, R10
JMP copy_5_end
copy_5_move_4through7:
MOVL (R14), R15
MOVL -4(R14)(AX*1), BP
MOVL R15, (R10)
MOVL BP, -4(R10)(AX*1)
ADDQ AX, R14
2022-05-02 15:00:32 +02:00
ADDQ AX, R10
2022-07-13 15:43:46 +02:00
JMP copy_5_end
copy_5_move_8through16:
MOVQ (R14), R15
MOVQ -8(R14)(AX*1), BP
MOVQ R15, (R10)
MOVQ BP, -8(R10)(AX*1)
ADDQ AX, R14
ADDQ AX, R10
copy_5_end:
2022-05-02 15:00:32 +02:00
ADDQ AX, R12
SUBQ AX, R13
// Copy match from the current buffer
copy_match:
2022-07-13 15:43:46 +02:00
MOVQ R10, AX
SUBQ CX, AX
2022-05-02 15:00:32 +02:00
// ml <= mo
CMPQ R13, CX
JA copy_overlapping_match
// Copy non-overlapping match
2022-07-13 15:43:46 +02:00
ADDQ R13, R12
MOVQ R13, CX
SUBQ $0x10, CX
JB copy_2_small
2022-05-02 15:00:32 +02:00
2022-07-13 15:43:46 +02:00
copy_2_loop:
MOVUPS (AX), X0
MOVUPS X0, (R10)
ADDQ $0x10, AX
ADDQ $0x10, R10
SUBQ $0x10, CX
JAE copy_2_loop
LEAQ 16(AX)(CX*1), AX
LEAQ 16(R10)(CX*1), R10
MOVUPS -16(AX), X0
MOVUPS X0, -16(R10)
JMP copy_2_end
copy_2_small:
CMPQ R13, $0x03
JE copy_2_move_3
JB copy_2_move_1or2
CMPQ R13, $0x08
JB copy_2_move_4through7
JMP copy_2_move_8through16
copy_2_move_1or2:
MOVB (AX), CL
MOVB -1(AX)(R13*1), R14
MOVB CL, (R10)
MOVB R14, -1(R10)(R13*1)
ADDQ R13, AX
2022-05-02 15:00:32 +02:00
ADDQ R13, R10
2022-07-13 15:43:46 +02:00
JMP copy_2_end
copy_2_move_3:
MOVW (AX), CX
MOVB 2(AX), R14
MOVW CX, (R10)
MOVB R14, 2(R10)
ADDQ R13, AX
ADDQ R13, R10
JMP copy_2_end
copy_2_move_4through7:
MOVL (AX), CX
MOVL -4(AX)(R13*1), R14
MOVL CX, (R10)
MOVL R14, -4(R10)(R13*1)
ADDQ R13, AX
ADDQ R13, R10
JMP copy_2_end
copy_2_move_8through16:
MOVQ (AX), CX
MOVQ -8(AX)(R13*1), R14
MOVQ CX, (R10)
MOVQ R14, -8(R10)(R13*1)
ADDQ R13, AX
ADDQ R13, R10
copy_2_end:
JMP handle_loop
2022-05-02 15:00:32 +02:00
// Copy overlapping match
copy_overlapping_match:
2022-05-20 13:45:24 +02:00
ADDQ R13, R12
2022-05-02 15:00:32 +02:00
copy_slow_3:
2022-05-20 13:45:24 +02:00
MOVB (AX), CL
MOVB CL, (R10)
INCQ AX
INCQ R10
DECQ R13
JNZ copy_slow_3
2022-05-02 15:00:32 +02:00
handle_loop:
MOVQ ctx+16(FP), AX
DECQ 96(AX)
JNS sequenceDecs_decodeSync_safe_amd64_main_loop
loop_finished:
MOVQ br+8(FP), AX
2023-10-02 21:49:16 +02:00
MOVQ DX, 24(AX)
MOVB BL, 32(AX)
MOVQ SI, 8(AX)
2022-05-02 15:00:32 +02:00
// Update the context
MOVQ ctx+16(FP), AX
MOVQ R12, 136(AX)
MOVQ 144(AX), CX
SUBQ CX, R11
MOVQ R11, 168(AX)
// Return success
MOVQ $0x00000000, ret+24(FP)
RET
// Return with match length error
sequenceDecs_decodeSync_safe_amd64_error_match_len_ofs_mismatch:
MOVQ 16(SP), AX
MOVQ ctx+16(FP), CX
MOVQ AX, 216(CX)
MOVQ $0x00000001, ret+24(FP)
RET
// Return with match too long error
sequenceDecs_decodeSync_safe_amd64_error_match_len_too_big:
MOVQ ctx+16(FP), AX
MOVQ 16(SP), CX
MOVQ CX, 216(AX)
MOVQ $0x00000002, ret+24(FP)
RET
// Return with match offset too long error
error_match_off_too_big:
MOVQ ctx+16(FP), AX
MOVQ 8(SP), CX
MOVQ CX, 224(AX)
MOVQ R12, 136(AX)
MOVQ $0x00000003, ret+24(FP)
RET
// Return with not enough literals error
error_not_enough_literals:
MOVQ ctx+16(FP), AX
MOVQ 24(SP), CX
MOVQ CX, 208(AX)
MOVQ $0x00000004, ret+24(FP)
RET
// Return with overread error
error_overread:
MOVQ $0x00000006, ret+24(FP)
RET
2022-05-02 15:00:32 +02:00
// Return with not enough output space error
error_not_enough_space:
MOVQ ctx+16(FP), AX
MOVQ 24(SP), CX
MOVQ CX, 208(AX)
MOVQ 16(SP), CX
MOVQ CX, 216(AX)
MOVQ R12, 136(AX)
MOVQ $0x00000005, ret+24(FP)
RET
// func sequenceDecs_decodeSync_safe_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
// Requires: BMI, BMI2, CMOV, SSE
//
// Decodes sequences one at a time and executes each of them immediately:
// the literals are copied to the output, then the match is copied from the
// history buffer and/or from the already written output. Bit fields are
// extracted with BEXTR/BZHI/SHRX. Every copy routine ends exactly at the
// requested length (tails are handled with overlapping moves that stay
// inside the copied range).
//
// Return value (written to ret+24(FP)):
//   0  success
//   1  match length is non-zero while the match offset is zero
//   2  match length above 0x20002
//   3  match offset too big (beyond history+output, or beyond 56(SP))
//   4  not enough literals left
//   5  not enough space in the output buffer
//   6  bit reader overread
//
// Register roles inside the main loop:
//   AX   bit reader: current 64-bit value        (br+24)
//   DX   bit reader: number of bits consumed     (br+32, one byte)
//   BX   bit reader: remaining input offset      (br+8)
//   SI   literal length state
//   DI   match length state
//   R8   offset state
//   R9   output write pointer
//   R10  literals read pointer
//   R11  output position (bytes written so far)
//   CX, R12-R15, BP, X0  scratch
//
// Stack frame (64 bytes):
//   0(SP)   current input read pointer (input base + remaining offset)
//   8(SP)   match offset of the current sequence
//   16(SP)  match length of the current sequence
//   24(SP)  literal length of the current sequence
//   32(SP)  past-end pointer of the output buffer
//   40(SP)  value of 184(ctx); per the check_offset comment, len(hist)
//   48(SP)  176(ctx)+184(ctx); history pointers are derived from this
//   56(SP)  value of 200(ctx); per the check_offset comment, the window size
//
// NOTE(review): the Go struct definitions are not visible from this file, so
// fields are referred to by offset; confirm the names against the Go source.
TEXT ·sequenceDecs_decodeSync_safe_bmi2(SB), $64-32
	// Load the bit reader state.
	MOVQ    br+8(FP), BX
	MOVQ    24(BX), AX
	MOVBQZX 32(BX), DX
	MOVQ    (BX), CX
	MOVQ    8(BX), BX

	// 0(SP) = input base + remaining offset.
	ADDQ    BX, CX
	MOVQ    CX, (SP)

	// Load the three decoder states from the context.
	MOVQ    ctx+16(FP), CX
	MOVQ    72(CX), SI
	MOVQ    80(CX), DI
	MOVQ    88(CX), R8

	// Clear the per-sequence slots (offset, match length, literal length).
	XORQ    R9, R9
	MOVQ    R9, 8(SP)
	MOVQ    R9, 16(SP)
	MOVQ    R9, 24(SP)

	// Output base, output capacity, literals pointer and output position.
	MOVQ    112(CX), R9
	MOVQ    128(CX), R10
	MOVQ    R10, 32(SP)
	MOVQ    144(CX), R10
	MOVQ    136(CX), R11

	// Limits used by check_offset and the history copy.
	MOVQ    200(CX), R12
	MOVQ    R12, 56(SP)
	MOVQ    176(CX), R12
	MOVQ    R12, 48(SP)
	MOVQ    184(CX), CX
	MOVQ    CX, 40(SP)
	MOVQ    40(SP), CX
	ADDQ    CX, 48(SP)

	// Calculate pointer to s.out[cap(s.out)] (a past-end pointer)
	ADDQ    R9, 32(SP)

	// outBase += outPosition
	ADDQ    R11, R9

sequenceDecs_decodeSync_safe_bmi2_main_loop:
	MOVQ    (SP), R12

	// Fill bitreader to have enough for the offset and match length.
	// Fast path (at least 8 input bytes left): step back by the number of
	// whole bytes consumed and reload all 64 bits at once.
	CMPQ    BX, $0x08
	JL      sequenceDecs_decodeSync_safe_bmi2_fill_byte_by_byte
	MOVQ    DX, CX
	SHRQ    $0x03, CX
	SUBQ    CX, R12
	MOVQ    (R12), AX
	SUBQ    CX, BX
	ANDQ    $0x07, DX
	JMP     sequenceDecs_decodeSync_safe_bmi2_fill_end

sequenceDecs_decodeSync_safe_bmi2_fill_byte_by_byte:
	// Slow path: shift in one byte at a time while input remains and more
	// than 7 bits have been consumed.
	CMPQ    BX, $0x00
	JLE     sequenceDecs_decodeSync_safe_bmi2_fill_check_overread
	CMPQ    DX, $0x07
	JLE     sequenceDecs_decodeSync_safe_bmi2_fill_end
	SHLQ    $0x08, AX
	SUBQ    $0x01, R12
	SUBQ    $0x01, BX
	SUBQ    $0x08, DX
	MOVBQZX (R12), CX
	ORQ     CX, AX
	JMP     sequenceDecs_decodeSync_safe_bmi2_fill_byte_by_byte

sequenceDecs_decodeSync_safe_bmi2_fill_check_overread:
	// No input left: more than 64 consumed bits means we read past the start.
	CMPQ    DX, $0x40
	JA      error_overread

sequenceDecs_decodeSync_safe_bmi2_fill_end:
	// Update offset
	// R13 = bits 8..15 of the state (number of extra bits to read).
	// Rotating the value left by (consumed + R13) puts the wanted bits at the
	// bottom; BZHI keeps only those. The upper 32 bits of the state are then
	// added to the extracted bits.
	MOVQ    $0x00000808, CX
	BEXTRQ  CX, R8, R13
	MOVQ    AX, R14
	LEAQ    (DX)(R13*1), CX
	ROLQ    CL, R14
	BZHIQ   R13, R14, R14
	MOVQ    CX, DX
	MOVQ    R8, CX
	SHRQ    $0x20, CX
	ADDQ    R14, CX
	MOVQ    CX, 8(SP)

	// Update match length
	MOVQ    $0x00000808, CX
	BEXTRQ  CX, DI, R13
	MOVQ    AX, R14
	LEAQ    (DX)(R13*1), CX
	ROLQ    CL, R14
	BZHIQ   R13, R14, R14
	MOVQ    CX, DX
	MOVQ    DI, CX
	SHRQ    $0x20, CX
	ADDQ    R14, CX
	MOVQ    CX, 16(SP)

	// Fill bitreader to have enough for the remaining
	CMPQ    BX, $0x08
	JL      sequenceDecs_decodeSync_safe_bmi2_fill_2_byte_by_byte
	MOVQ    DX, CX
	SHRQ    $0x03, CX
	SUBQ    CX, R12
	MOVQ    (R12), AX
	SUBQ    CX, BX
	ANDQ    $0x07, DX
	JMP     sequenceDecs_decodeSync_safe_bmi2_fill_2_end

sequenceDecs_decodeSync_safe_bmi2_fill_2_byte_by_byte:
	CMPQ    BX, $0x00
	JLE     sequenceDecs_decodeSync_safe_bmi2_fill_2_check_overread
	CMPQ    DX, $0x07
	JLE     sequenceDecs_decodeSync_safe_bmi2_fill_2_end
	SHLQ    $0x08, AX
	SUBQ    $0x01, R12
	SUBQ    $0x01, BX
	SUBQ    $0x08, DX
	MOVBQZX (R12), CX
	ORQ     CX, AX
	JMP     sequenceDecs_decodeSync_safe_bmi2_fill_2_byte_by_byte

sequenceDecs_decodeSync_safe_bmi2_fill_2_check_overread:
	CMPQ    DX, $0x40
	JA      error_overread

sequenceDecs_decodeSync_safe_bmi2_fill_2_end:
	// Update literal length
	MOVQ    $0x00000808, CX
	BEXTRQ  CX, SI, R13
	MOVQ    AX, R14
	LEAQ    (DX)(R13*1), CX
	ROLQ    CL, R14
	BZHIQ   R13, R14, R14
	MOVQ    CX, DX
	MOVQ    SI, CX
	SHRQ    $0x20, CX
	ADDQ    R14, CX
	MOVQ    CX, 24(SP)

	// Fill bitreader for state updates
	// Save the input pointer, and keep bits 8..15 of the *current* offset
	// state in R12: "Adjust offset" below needs it after R8 is replaced.
	MOVQ    R12, (SP)
	MOVQ    $0x00000808, CX
	BEXTRQ  CX, R8, R12

	// Skip the state update when the counter at 96(ctx) is zero.
	MOVQ    ctx+16(FP), CX
	CMPQ    96(CX), $0x00
	JZ      sequenceDecs_decodeSync_safe_bmi2_skip_update

	// R13 = low byte of (SI + DI + R8): the combined bit count of the three
	// state transitions. Read that many bits in one go into R14.
	LEAQ    (SI)(DI*1), R13
	ADDQ    R8, R13
	MOVBQZX R13, R13
	LEAQ    (DX)(R13*1), CX
	MOVQ    AX, R14
	MOVQ    CX, DX
	ROLQ    CL, R14
	BZHIQ   R13, R14, R14

	// Update Offset State
	// Take this state's bits from the bottom of R14 (BZHI/SHRX use the low
	// bits of R8 as the count), add bits 16..31 of the state, and use the
	// sum as the table index.
	BZHIQ   R8, R14, CX
	SHRXQ   R8, R14, R14
	MOVQ    $0x00001010, R13
	BEXTRQ  R13, R8, R8
	ADDQ    CX, R8

	// Load ctx.ofTable
	MOVQ    ctx+16(FP), CX
	MOVQ    48(CX), CX
	MOVQ    (CX)(R8*8), R8

	// Update Match Length State
	BZHIQ   DI, R14, CX
	SHRXQ   DI, R14, R14
	MOVQ    $0x00001010, R13
	BEXTRQ  R13, DI, DI
	ADDQ    CX, DI

	// Load ctx.mlTable
	MOVQ    ctx+16(FP), CX
	MOVQ    24(CX), CX
	MOVQ    (CX)(DI*8), DI

	// Update Literal Length State
	// Last consumer of R14, so no shift is needed afterwards.
	BZHIQ   SI, R14, CX
	MOVQ    $0x00001010, R13
	BEXTRQ  R13, SI, SI
	ADDQ    CX, SI

	// Load ctx.llTable
	MOVQ    ctx+16(FP), CX
	MOVQ    (CX), CX
	MOVQ    (CX)(SI*8), SI

sequenceDecs_decodeSync_safe_bmi2_skip_update:
	// Adjust offset
	// R13 = decoded offset, R12 = extra-bit count of the offset code.
	// 144(s), 152(s), 160(s) hold the three most recent offsets.
	MOVQ    s+0(FP), CX
	MOVQ    8(SP), R13
	CMPQ    R12, $0x01
	JBE     sequenceDecs_decodeSync_safe_bmi2_adjust_offsetB_1_or_0

	// More than one extra bit: a new offset. Push it to the front and shift
	// the previous first two entries down by one slot.
	MOVUPS  144(CX), X0
	MOVQ    R13, 144(CX)
	MOVUPS  X0, 152(CX)
	JMP     sequenceDecs_decodeSync_safe_bmi2_after_adjust

sequenceDecs_decodeSync_safe_bmi2_adjust_offsetB_1_or_0:
	// Repeat-offset code: with a zero literal length the index is bumped by one.
	CMPQ    24(SP), $0x00000000
	JNE     sequenceDecs_decodeSync_safe_bmi2_adjust_offset_maybezero
	INCQ    R13
	JMP     sequenceDecs_decodeSync_safe_bmi2_adjust_offset_nonzero

sequenceDecs_decodeSync_safe_bmi2_adjust_offset_maybezero:
	// Index 0: reuse the most recent offset; the list is left untouched.
	TESTQ   R13, R13
	JNZ     sequenceDecs_decodeSync_safe_bmi2_adjust_offset_nonzero
	MOVQ    144(CX), R13
	JMP     sequenceDecs_decodeSync_safe_bmi2_after_adjust

sequenceDecs_decodeSync_safe_bmi2_adjust_offset_nonzero:
	// temp = recent[index], except index 3 which means recent[0]-1.
	// A resulting temp of 0 is replaced by 1.
	MOVQ    R13, R12
	XORQ    R14, R14
	MOVQ    $-1, R15
	CMPQ    R13, $0x03
	CMOVQEQ R14, R12
	CMOVQEQ R15, R14
	ADDQ    144(CX)(R12*8), R14
	JNZ     sequenceDecs_decodeSync_safe_bmi2_adjust_temp_valid
	MOVQ    $0x00000001, R14

sequenceDecs_decodeSync_safe_bmi2_adjust_temp_valid:
	// For any index other than 1, recent[2] = recent[1].
	CMPQ    R13, $0x01
	JZ      sequenceDecs_decodeSync_safe_bmi2_adjust_skip
	MOVQ    152(CX), R12
	MOVQ    R12, 160(CX)

sequenceDecs_decodeSync_safe_bmi2_adjust_skip:
	// recent[1] = recent[0]; recent[0] = temp; offset = temp.
	MOVQ    144(CX), R12
	MOVQ    R12, 152(CX)
	MOVQ    R14, 144(CX)
	MOVQ    R14, R13

sequenceDecs_decodeSync_safe_bmi2_after_adjust:
	MOVQ    R13, 8(SP)

	// Check values
	// CX = match length, R12 = literal length, R13 = match offset.
	MOVQ    16(SP), CX
	MOVQ    24(SP), R12

	// Accumulate ml+ll into the counter at 256(s).
	LEAQ    (CX)(R12*1), R14
	MOVQ    s+0(FP), R15
	ADDQ    R14, 256(R15)

	// Deduct the literal length from 104(ctx); going negative is an error.
	MOVQ    ctx+16(FP), R14
	SUBQ    R12, 104(R14)
	JS      error_not_enough_literals
	CMPQ    CX, $0x00020002
	JA      sequenceDecs_decodeSync_safe_bmi2_error_match_len_too_big

	// A zero offset is only acceptable together with a zero match length.
	TESTQ   R13, R13
	JNZ     sequenceDecs_decodeSync_safe_bmi2_match_len_ofs_ok
	TESTQ   CX, CX
	JNZ     sequenceDecs_decodeSync_safe_bmi2_error_match_len_ofs_mismatch

sequenceDecs_decodeSync_safe_bmi2_match_len_ofs_ok:
	// From here on: CX = literal length, R12 = match offset, R13 = match length.
	MOVQ    24(SP), CX
	MOVQ    8(SP), R12
	MOVQ    16(SP), R13

	// Check if we have enough space in s.out
	LEAQ    (CX)(R13*1), R14
	ADDQ    R9, R14
	CMPQ    R14, 32(SP)
	JA      error_not_enough_space

	// Copy literals
	TESTQ   CX, CX
	JZ      check_offset
	MOVQ    CX, R14
	SUBQ    $0x10, R14
	JB      copy_1_small

copy_1_loop:
	// 16 bytes per iteration while at least 16 remain.
	MOVUPS  (R10), X0
	MOVUPS  X0, (R9)
	ADDQ    $0x10, R10
	ADDQ    $0x10, R9
	SUBQ    $0x10, R14
	JAE     copy_1_loop

	// R14 is now in [-16,-1]: advance both pointers to the exact end and
	// copy the final 16 bytes again so that the copy ends exactly there.
	LEAQ    16(R10)(R14*1), R10
	LEAQ    16(R9)(R14*1), R9
	MOVUPS  -16(R10), X0
	MOVUPS  X0, -16(R9)
	JMP     copy_1_end

copy_1_small:
	// Fewer than 16 bytes: dispatch on the length; each case uses two
	// possibly overlapping moves that cover exactly CX bytes.
	CMPQ    CX, $0x03
	JE      copy_1_move_3
	JB      copy_1_move_1or2
	CMPQ    CX, $0x08
	JB      copy_1_move_4through7
	JMP     copy_1_move_8through16

copy_1_move_1or2:
	MOVB    (R10), R14
	MOVB    -1(R10)(CX*1), R15
	MOVB    R14, (R9)
	MOVB    R15, -1(R9)(CX*1)
	ADDQ    CX, R10
	ADDQ    CX, R9
	JMP     copy_1_end

copy_1_move_3:
	MOVW    (R10), R14
	MOVB    2(R10), R15
	MOVW    R14, (R9)
	MOVB    R15, 2(R9)
	ADDQ    CX, R10
	ADDQ    CX, R9
	JMP     copy_1_end

copy_1_move_4through7:
	MOVL    (R10), R14
	MOVL    -4(R10)(CX*1), R15
	MOVL    R14, (R9)
	MOVL    R15, -4(R9)(CX*1)
	ADDQ    CX, R10
	ADDQ    CX, R9
	JMP     copy_1_end

copy_1_move_8through16:
	MOVQ    (R10), R14
	MOVQ    -8(R10)(CX*1), R15
	MOVQ    R14, (R9)
	MOVQ    R15, -8(R9)(CX*1)
	ADDQ    CX, R10
	ADDQ    CX, R9

copy_1_end:
	// outPosition += literal length
	ADDQ    CX, R11

	// Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize
check_offset:
	MOVQ    R11, CX
	ADDQ    40(SP), CX
	CMPQ    R12, CX
	JG      error_match_off_too_big
	CMPQ    R12, 56(SP)
	JG      error_match_off_too_big

	// Copy match from history
	// CX = offset - outPosition. If the offset does not exceed the output
	// position (unsigned), the whole match lies in the current output.
	MOVQ    R12, CX
	SUBQ    R11, CX
	JLS     copy_match

	// R14 = source pointer CX bytes before the value held in 48(SP).
	MOVQ    48(SP), R14
	SUBQ    CX, R14

	// A match longer than CX takes CX bytes from history and the remainder
	// from the current output.
	CMPQ    R13, CX
	JG      copy_all_from_history

	// The whole match (R13 bytes) comes from history.
	MOVQ    R13, CX
	SUBQ    $0x10, CX
	JB      copy_4_small

copy_4_loop:
	MOVUPS  (R14), X0
	MOVUPS  X0, (R9)
	ADDQ    $0x10, R14
	ADDQ    $0x10, R9
	SUBQ    $0x10, CX
	JAE     copy_4_loop
	LEAQ    16(R14)(CX*1), R14
	LEAQ    16(R9)(CX*1), R9
	MOVUPS  -16(R14), X0
	MOVUPS  X0, -16(R9)
	JMP     copy_4_end

copy_4_small:
	// No 1-or-2 byte case is provided for match lengths here.
	CMPQ    R13, $0x03
	JE      copy_4_move_3
	CMPQ    R13, $0x08
	JB      copy_4_move_4through7
	JMP     copy_4_move_8through16

copy_4_move_3:
	MOVW    (R14), CX
	MOVB    2(R14), R12
	MOVW    CX, (R9)
	MOVB    R12, 2(R9)
	ADDQ    R13, R14
	ADDQ    R13, R9
	JMP     copy_4_end

copy_4_move_4through7:
	MOVL    (R14), CX
	MOVL    -4(R14)(R13*1), R12
	MOVL    CX, (R9)
	MOVL    R12, -4(R9)(R13*1)
	ADDQ    R13, R14
	ADDQ    R13, R9
	JMP     copy_4_end

copy_4_move_8through16:
	MOVQ    (R14), CX
	MOVQ    -8(R14)(R13*1), R12
	MOVQ    CX, (R9)
	MOVQ    R12, -8(R9)(R13*1)
	ADDQ    R13, R14
	ADDQ    R13, R9

copy_4_end:
	// outPosition += match length
	ADDQ    R13, R11
	JMP     handle_loop

	// Unreachable (follows an unconditional JMP); kept so the instruction
	// stream and the reference to loop_finished stay as originally emitted.
	JMP     loop_finished

copy_all_from_history:
	// Copy the CX bytes available in history; BP serves as an extra scratch
	// register in the small cases.
	MOVQ    CX, R15
	SUBQ    $0x10, R15
	JB      copy_5_small

copy_5_loop:
	MOVUPS  (R14), X0
	MOVUPS  X0, (R9)
	ADDQ    $0x10, R14
	ADDQ    $0x10, R9
	SUBQ    $0x10, R15
	JAE     copy_5_loop
	LEAQ    16(R14)(R15*1), R14
	LEAQ    16(R9)(R15*1), R9
	MOVUPS  -16(R14), X0
	MOVUPS  X0, -16(R9)
	JMP     copy_5_end

copy_5_small:
	CMPQ    CX, $0x03
	JE      copy_5_move_3
	JB      copy_5_move_1or2
	CMPQ    CX, $0x08
	JB      copy_5_move_4through7
	JMP     copy_5_move_8through16

copy_5_move_1or2:
	MOVB    (R14), R15
	MOVB    -1(R14)(CX*1), BP
	MOVB    R15, (R9)
	MOVB    BP, -1(R9)(CX*1)
	ADDQ    CX, R14
	ADDQ    CX, R9
	JMP     copy_5_end

copy_5_move_3:
	MOVW    (R14), R15
	MOVB    2(R14), BP
	MOVW    R15, (R9)
	MOVB    BP, 2(R9)
	ADDQ    CX, R14
	ADDQ    CX, R9
	JMP     copy_5_end

copy_5_move_4through7:
	MOVL    (R14), R15
	MOVL    -4(R14)(CX*1), BP
	MOVL    R15, (R9)
	MOVL    BP, -4(R9)(CX*1)
	ADDQ    CX, R14
	ADDQ    CX, R9
	JMP     copy_5_end

copy_5_move_8through16:
	MOVQ    (R14), R15
	MOVQ    -8(R14)(CX*1), BP
	MOVQ    R15, (R9)
	MOVQ    BP, -8(R9)(CX*1)
	ADDQ    CX, R14
	ADDQ    CX, R9

copy_5_end:
	// Account for the history part; R13 keeps the length still to copy.
	ADDQ    CX, R11
	SUBQ    CX, R13

	// Copy match from the current buffer
copy_match:
	// CX = source pointer = write pointer - offset.
	MOVQ    R9, CX
	SUBQ    R12, CX

	// ml <= mo
	CMPQ    R13, R12
	JA      copy_overlapping_match

	// Copy non-overlapping match
	ADDQ    R13, R11
	MOVQ    R13, R12
	SUBQ    $0x10, R12
	JB      copy_2_small

copy_2_loop:
	MOVUPS  (CX), X0
	MOVUPS  X0, (R9)
	ADDQ    $0x10, CX
	ADDQ    $0x10, R9
	SUBQ    $0x10, R12
	JAE     copy_2_loop
	LEAQ    16(CX)(R12*1), CX
	LEAQ    16(R9)(R12*1), R9
	MOVUPS  -16(CX), X0
	MOVUPS  X0, -16(R9)
	JMP     copy_2_end

copy_2_small:
	CMPQ    R13, $0x03
	JE      copy_2_move_3
	JB      copy_2_move_1or2
	CMPQ    R13, $0x08
	JB      copy_2_move_4through7
	JMP     copy_2_move_8through16

copy_2_move_1or2:
	MOVB    (CX), R12
	MOVB    -1(CX)(R13*1), R14
	MOVB    R12, (R9)
	MOVB    R14, -1(R9)(R13*1)
	ADDQ    R13, CX
	ADDQ    R13, R9
	JMP     copy_2_end

copy_2_move_3:
	MOVW    (CX), R12
	MOVB    2(CX), R14
	MOVW    R12, (R9)
	MOVB    R14, 2(R9)
	ADDQ    R13, CX
	ADDQ    R13, R9
	JMP     copy_2_end

copy_2_move_4through7:
	MOVL    (CX), R12
	MOVL    -4(CX)(R13*1), R14
	MOVL    R12, (R9)
	MOVL    R14, -4(R9)(R13*1)
	ADDQ    R13, CX
	ADDQ    R13, R9
	JMP     copy_2_end

copy_2_move_8through16:
	MOVQ    (CX), R12
	MOVQ    -8(CX)(R13*1), R14
	MOVQ    R12, (R9)
	MOVQ    R14, -8(R9)(R13*1)
	ADDQ    R13, CX
	ADDQ    R13, R9

copy_2_end:
	JMP     handle_loop

	// Copy overlapping match
copy_overlapping_match:
	ADDQ    R13, R11

copy_slow_3:
	// Byte-by-byte so that bytes written earlier in this loop can be read
	// back as source when the match is longer than its offset.
	MOVB    (CX), R12
	MOVB    R12, (R9)
	INCQ    CX
	INCQ    R9
	DECQ    R13
	JNZ     copy_slow_3

handle_loop:
	// Decrement the counter at 96(ctx); keep looping while it is non-negative.
	MOVQ    ctx+16(FP), CX
	DECQ    96(CX)
	JNS     sequenceDecs_decodeSync_safe_bmi2_main_loop

loop_finished:
	// Write the bit reader state back.
	MOVQ    br+8(FP), CX
	MOVQ    AX, 24(CX)
	MOVB    DL, 32(CX)
	MOVQ    BX, 8(CX)

	// Update the context
	// 136(ctx) = output position; 168(ctx) = literals consumed, i.e. the
	// literals read pointer minus the base pointer at 144(ctx).
	MOVQ    ctx+16(FP), AX
	MOVQ    R11, 136(AX)
	MOVQ    144(AX), CX
	SUBQ    CX, R10
	MOVQ    R10, 168(AX)

	// Return success
	MOVQ    $0x00000000, ret+24(FP)
	RET

	// Return with match length error
sequenceDecs_decodeSync_safe_bmi2_error_match_len_ofs_mismatch:
	// Report the match length in 216(ctx).
	MOVQ    16(SP), AX
	MOVQ    ctx+16(FP), CX
	MOVQ    AX, 216(CX)
	MOVQ    $0x00000001, ret+24(FP)
	RET

	// Return with match too long error
sequenceDecs_decodeSync_safe_bmi2_error_match_len_too_big:
	// Report the match length in 216(ctx).
	MOVQ    ctx+16(FP), AX
	MOVQ    16(SP), CX
	MOVQ    CX, 216(AX)
	MOVQ    $0x00000002, ret+24(FP)
	RET

	// Return with match offset too long error
error_match_off_too_big:
	// Report the match offset in 224(ctx) and the output position in 136(ctx).
	MOVQ    ctx+16(FP), AX
	MOVQ    8(SP), CX
	MOVQ    CX, 224(AX)
	MOVQ    R11, 136(AX)
	MOVQ    $0x00000003, ret+24(FP)
	RET

	// Return with not enough literals error
error_not_enough_literals:
	// Report the literal length in 208(ctx).
	MOVQ    ctx+16(FP), AX
	MOVQ    24(SP), CX
	MOVQ    CX, 208(AX)
	MOVQ    $0x00000004, ret+24(FP)
	RET

	// Return with overread error
error_overread:
	MOVQ    $0x00000006, ret+24(FP)
	RET

	// Return with not enough output space error
error_not_enough_space:
	// Report literal length (208), match length (216) and output position (136).
	MOVQ    ctx+16(FP), AX
	MOVQ    24(SP), CX
	MOVQ    CX, 208(AX)
	MOVQ    16(SP), CX
	MOVQ    CX, 216(AX)
	MOVQ    R11, 136(AX)
	MOVQ    $0x00000005, ret+24(FP)
	RET