// Code generated by command: go run gen.go -out ../seqdec_amd64.s -pkg=zstd. DO NOT EDIT.

//go:build !appengine && !noasm && gc
// +build !appengine,!noasm,gc

// func sequenceDecs_decode_amd64(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
// Requires: CMOV
//
// Decodes sequences (literal length, match length, offset) from the bit
// stream into ctx.seqs, updating the three FSE states after each sequence,
// until the iteration counter at ctx+96 goes negative.
//
// Register allocation:
//   DX   = bit buffer (br.value)       BX  = bits consumed (br.bitsRead)
//   SI   = br.off (bytes remaining)    (SP) = current read pointer into br.in
//   DI   = literal-length state        R8  = match-length state
//   R9   = offset state                R10 = pointer into ctx.seqs (24-byte records: ll, ml, mo)
//   R11/R12/R13 = previous offsets 1/2/3 (loaded from s+144/152/160)
//
// Each packed state word is used as: byte0 = state-update bit count,
// byte1 = addBits, bytes2-3 = next-state base, bytes4-7 = baseline
// (inferred from the shifts below — confirm against the Go decSymbol layout).
// Returns 0 on success or a non-zero error code (see the tails at the end).
TEXT ·sequenceDecs_decode_amd64(SB), $8-32
	MOVQ    br+8(FP), AX
	MOVQ    32(AX), DX
	MOVBQZX 40(AX), BX
	MOVQ    24(AX), SI
	MOVQ    (AX), AX
	ADDQ    SI, AX
	MOVQ    AX, (SP)
	MOVQ    ctx+16(FP), AX
	MOVQ    72(AX), DI
	MOVQ    80(AX), R8
	MOVQ    88(AX), R9
	MOVQ    104(AX), R10
	MOVQ    s+0(FP), AX
	MOVQ    144(AX), R11
	MOVQ    152(AX), R12
	MOVQ    160(AX), R13

sequenceDecs_decode_amd64_main_loop:
	MOVQ (SP), R14

	// Fill bitreader to have enough for the offset and match length.
	CMPQ SI, $0x08
	JL   sequenceDecs_decode_amd64_fill_byte_by_byte
	MOVQ BX, AX
	SHRQ $0x03, AX
	SUBQ AX, R14
	MOVQ (R14), DX
	SUBQ AX, SI
	ANDQ $0x07, BX
	JMP  sequenceDecs_decode_amd64_fill_end

sequenceDecs_decode_amd64_fill_byte_by_byte:
	// Slow path near the start of the input: refill one byte at a time
	// while at least one byte remains and at least 8 bits are consumed.
	CMPQ    SI, $0x00
	JLE     sequenceDecs_decode_amd64_fill_end
	CMPQ    BX, $0x07
	JLE     sequenceDecs_decode_amd64_fill_end
	SHLQ    $0x08, DX
	SUBQ    $0x01, R14
	SUBQ    $0x01, SI
	SUBQ    $0x08, BX
	MOVBQZX (R14), AX
	ORQ     AX, DX
	JMP     sequenceDecs_decode_amd64_fill_byte_by_byte

sequenceDecs_decode_amd64_fill_end:
	// Update offset
	// addBits = byte 1 of the state word; the value is the top addBits bits
	// of the bit buffer shifted left by bitsRead.  NEGL makes CL = 64-addBits
	// (mod 64); the TEST/CMOV pair forces the extracted value to 0 when
	// addBits == 0, since a 64-bit shift by 64 would leave R15 unchanged.
	MOVQ    R9, AX
	MOVQ    BX, CX
	MOVQ    DX, R15
	SHLQ    CL, R15
	MOVB    AH, CL
	ADDQ    CX, BX
	NEGL    CX
	SHRQ    CL, R15
	SHRQ    $0x20, AX
	TESTQ   CX, CX
	CMOVQEQ CX, R15
	ADDQ    R15, AX
	MOVQ    AX, 16(R10)

	// Update match length
	MOVQ    R8, AX
	MOVQ    BX, CX
	MOVQ    DX, R15
	SHLQ    CL, R15
	MOVB    AH, CL
	ADDQ    CX, BX
	NEGL    CX
	SHRQ    CL, R15
	SHRQ    $0x20, AX
	TESTQ   CX, CX
	CMOVQEQ CX, R15
	ADDQ    R15, AX
	MOVQ    AX, 8(R10)

	// Fill bitreader to have enough for the remaining
	CMPQ SI, $0x08
	JL   sequenceDecs_decode_amd64_fill_2_byte_by_byte
	MOVQ BX, AX
	SHRQ $0x03, AX
	SUBQ AX, R14
	MOVQ (R14), DX
	SUBQ AX, SI
	ANDQ $0x07, BX
	JMP  sequenceDecs_decode_amd64_fill_2_end

sequenceDecs_decode_amd64_fill_2_byte_by_byte:
	CMPQ    SI, $0x00
	JLE     sequenceDecs_decode_amd64_fill_2_end
	CMPQ    BX, $0x07
	JLE     sequenceDecs_decode_amd64_fill_2_end
	SHLQ    $0x08, DX
	SUBQ    $0x01, R14
	SUBQ    $0x01, SI
	SUBQ    $0x08, BX
	MOVBQZX (R14), AX
	ORQ     AX, DX
	JMP     sequenceDecs_decode_amd64_fill_2_byte_by_byte

sequenceDecs_decode_amd64_fill_2_end:
	// Update literal length
	MOVQ    DI, AX
	MOVQ    BX, CX
	MOVQ    DX, R15
	SHLQ    CL, R15
	MOVB    AH, CL
	ADDQ    CX, BX
	NEGL    CX
	SHRQ    CL, R15
	SHRQ    $0x20, AX
	TESTQ   CX, CX
	CMOVQEQ CX, R15
	ADDQ    R15, AX
	MOVQ    AX, (R10)

	// Fill bitreader for state updates
	// AX is reloaded with the offset code's addBits (byte 1 of the offset
	// state) — it drives the repeat-offset logic after skip_update.
	MOVQ    R14, (SP)
	MOVQ    R9, AX
	SHRQ    $0x08, AX
	MOVBQZX AL, AX
	MOVQ    ctx+16(FP), CX
	CMPQ    96(CX), $0x00
	JZ      sequenceDecs_decode_amd64_skip_update

	// Update Literal Length State
	MOVBQZX DI, R14
	SHRQ    $0x10, DI
	MOVWQZX DI, DI
	CMPQ    R14, $0x00
	JZ      sequenceDecs_decode_amd64_llState_updateState_skip_zero
	MOVQ    BX, CX
	ADDQ    R14, BX
	MOVQ    DX, R15
	SHLQ    CL, R15
	MOVQ    R14, CX
	NEGQ    CX
	SHRQ    CL, R15
	ADDQ    R15, DI

sequenceDecs_decode_amd64_llState_updateState_skip_zero:
	// Load ctx.llTable
	MOVQ ctx+16(FP), CX
	MOVQ (CX), CX
	MOVQ (CX)(DI*8), DI

	// Update Match Length State
	MOVBQZX R8, R14
	SHRQ    $0x10, R8
	MOVWQZX R8, R8
	CMPQ    R14, $0x00
	JZ      sequenceDecs_decode_amd64_mlState_updateState_skip_zero
	MOVQ    BX, CX
	ADDQ    R14, BX
	MOVQ    DX, R15
	SHLQ    CL, R15
	MOVQ    R14, CX
	NEGQ    CX
	SHRQ    CL, R15
	ADDQ    R15, R8

sequenceDecs_decode_amd64_mlState_updateState_skip_zero:
	// Load ctx.mlTable
	MOVQ ctx+16(FP), CX
	MOVQ 24(CX), CX
	MOVQ (CX)(R8*8), R8

	// Update Offset State
	MOVBQZX R9, R14
	SHRQ    $0x10, R9
	MOVWQZX R9, R9
	CMPQ    R14, $0x00
	JZ      sequenceDecs_decode_amd64_ofState_updateState_skip_zero
	MOVQ    BX, CX
	ADDQ    R14, BX
	MOVQ    DX, R15
	SHLQ    CL, R15
	MOVQ    R14, CX
	NEGQ    CX
	SHRQ    CL, R15
	ADDQ    R15, R9

sequenceDecs_decode_amd64_ofState_updateState_skip_zero:
	// Load ctx.ofTable
	MOVQ ctx+16(FP), CX
	MOVQ 48(CX), CX
	MOVQ (CX)(R9*8), R9

sequenceDecs_decode_amd64_skip_update:
	// Adjust offset
	// zstd repeat-offset handling: when the offset code used more than one
	// bit, the value is a new absolute offset and the history shifts down.
	// Otherwise values 1-3 select/derive from the three previous offsets,
	// with an off-by-one twist when the literal length is zero.
	MOVQ 16(R10), CX
	CMPQ AX, $0x01
	JBE  sequenceDecs_decode_amd64_adjust_offsetB_1_or_0
	MOVQ R12, R13
	MOVQ R11, R12
	MOVQ CX, R11
	JMP  sequenceDecs_decode_amd64_adjust_end

sequenceDecs_decode_amd64_adjust_offsetB_1_or_0:
	CMPQ (R10), $0x00000000
	JNE  sequenceDecs_decode_amd64_adjust_offset_maybezero
	INCQ CX
	JMP  sequenceDecs_decode_amd64_adjust_offset_nonzero

sequenceDecs_decode_amd64_adjust_offset_maybezero:
	TESTQ CX, CX
	JNZ   sequenceDecs_decode_amd64_adjust_offset_nonzero
	MOVQ  R11, CX
	JMP   sequenceDecs_decode_amd64_adjust_end

sequenceDecs_decode_amd64_adjust_offset_nonzero:
	CMPQ CX, $0x01
	JB   sequenceDecs_decode_amd64_adjust_zero
	JEQ  sequenceDecs_decode_amd64_adjust_one
	CMPQ CX, $0x02
	JA   sequenceDecs_decode_amd64_adjust_three
	JMP  sequenceDecs_decode_amd64_adjust_two

sequenceDecs_decode_amd64_adjust_zero:
	MOVQ R11, AX
	JMP  sequenceDecs_decode_amd64_adjust_test_temp_valid

sequenceDecs_decode_amd64_adjust_one:
	MOVQ R12, AX
	JMP  sequenceDecs_decode_amd64_adjust_test_temp_valid

sequenceDecs_decode_amd64_adjust_two:
	MOVQ R13, AX
	JMP  sequenceDecs_decode_amd64_adjust_test_temp_valid

sequenceDecs_decode_amd64_adjust_three:
	LEAQ -1(R11), AX

sequenceDecs_decode_amd64_adjust_test_temp_valid:
	// A derived offset of zero is clamped to 1.
	TESTQ AX, AX
	JNZ   sequenceDecs_decode_amd64_adjust_temp_valid
	MOVQ  $0x00000001, AX

sequenceDecs_decode_amd64_adjust_temp_valid:
	CMPQ    CX, $0x01
	CMOVQNE R12, R13
	MOVQ    R11, R12
	MOVQ    AX, R11
	MOVQ    AX, CX

sequenceDecs_decode_amd64_adjust_end:
	MOVQ CX, 16(R10)

	// Check values
	// Accumulate ll+ml into the running total at s+256 (presumably
	// s.seqSize), charge ll against the literal budget at ctx+128, and fail
	// if literals run out, ml exceeds the 0x20002 cap, or a zero offset
	// pairs with a non-zero match length.
	MOVQ  8(R10), AX
	MOVQ  (R10), R14
	LEAQ  (AX)(R14*1), R15
	MOVQ  s+0(FP), BP
	ADDQ  R15, 256(BP)
	MOVQ  ctx+16(FP), R15
	SUBQ  R14, 128(R15)
	JS    error_not_enough_literals
	CMPQ  AX, $0x00020002
	JA    sequenceDecs_decode_amd64_error_match_len_too_big
	TESTQ CX, CX
	JNZ   sequenceDecs_decode_amd64_match_len_ofs_ok
	TESTQ AX, AX
	JNZ   sequenceDecs_decode_amd64_error_match_len_ofs_mismatch

sequenceDecs_decode_amd64_match_len_ofs_ok:
	ADDQ $0x18, R10
	MOVQ ctx+16(FP), AX
	DECQ 96(AX)
	JNS  sequenceDecs_decode_amd64_main_loop
	// Flush the previous-offset history back to s and the bit-reader
	// position back to br before returning.
	MOVQ s+0(FP), AX
	MOVQ R11, 144(AX)
	MOVQ R12, 152(AX)
	MOVQ R13, 160(AX)
	MOVQ br+8(FP), AX
	MOVQ DX, 32(AX)
	MOVB BL, 40(AX)
	MOVQ SI, 24(AX)

	// Return success
	MOVQ $0x00000000, ret+24(FP)
	RET

	// Return with match length error
sequenceDecs_decode_amd64_error_match_len_ofs_mismatch:
	MOVQ $0x00000001, ret+24(FP)
	RET

	// Return with match too long error
sequenceDecs_decode_amd64_error_match_len_too_big:
	MOVQ $0x00000002, ret+24(FP)
	RET

	// Return with match offset too long error
	// (unreachable: emitted by the generator for an error code this
	// function never produces — no label precedes it)
	MOVQ $0x00000003, ret+24(FP)
	RET

	// Return with not enough literals error
error_not_enough_literals:
	MOVQ $0x00000004, ret+24(FP)
	RET

	// Return with not enough output space error
	// (unreachable: no label precedes it)
	MOVQ $0x00000005, ret+24(FP)
	RET

// func sequenceDecs_decode_56_amd64(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
// Requires: CMOV
//
// Variant of sequenceDecs_decode_amd64 that performs a single bit-buffer
// refill per sequence: the offset, match-length and literal-length reads all
// come out of one fill (usable when their combined bit count fits in 56
// bits).  Register allocation is identical to the non-56 variant:
//   DX = bit buffer, BX = bits consumed, SI = bytes remaining, (SP) = read
//   pointer; DI/R8/R9 = ll/ml/of states; R10 = ctx.seqs pointer;
//   R11/R12/R13 = previous offsets 1/2/3.
// Returns 0 on success or a non-zero error code.
TEXT ·sequenceDecs_decode_56_amd64(SB), $8-32
	MOVQ    br+8(FP), AX
	MOVQ    32(AX), DX
	MOVBQZX 40(AX), BX
	MOVQ    24(AX), SI
	MOVQ    (AX), AX
	ADDQ    SI, AX
	MOVQ    AX, (SP)
	MOVQ    ctx+16(FP), AX
	MOVQ    72(AX), DI
	MOVQ    80(AX), R8
	MOVQ    88(AX), R9
	MOVQ    104(AX), R10
	MOVQ    s+0(FP), AX
	MOVQ    144(AX), R11
	MOVQ    152(AX), R12
	MOVQ    160(AX), R13

sequenceDecs_decode_56_amd64_main_loop:
	MOVQ (SP), R14

	// Fill bitreader to have enough for the offset and match length.
	CMPQ SI, $0x08
	JL   sequenceDecs_decode_56_amd64_fill_byte_by_byte
	MOVQ BX, AX
	SHRQ $0x03, AX
	SUBQ AX, R14
	MOVQ (R14), DX
	SUBQ AX, SI
	ANDQ $0x07, BX
	JMP  sequenceDecs_decode_56_amd64_fill_end

sequenceDecs_decode_56_amd64_fill_byte_by_byte:
	// Slow path near the start of the input: refill one byte at a time.
	CMPQ    SI, $0x00
	JLE     sequenceDecs_decode_56_amd64_fill_end
	CMPQ    BX, $0x07
	JLE     sequenceDecs_decode_56_amd64_fill_end
	SHLQ    $0x08, DX
	SUBQ    $0x01, R14
	SUBQ    $0x01, SI
	SUBQ    $0x08, BX
	MOVBQZX (R14), AX
	ORQ     AX, DX
	JMP     sequenceDecs_decode_56_amd64_fill_byte_by_byte

sequenceDecs_decode_56_amd64_fill_end:
	// Update offset
	// addBits = byte 1 of the state word; NEGL makes CL = 64-addBits
	// (mod 64) and the TEST/CMOV pair forces the value to 0 when
	// addBits == 0 (a shift by 64 would be a no-op).
	MOVQ    R9, AX
	MOVQ    BX, CX
	MOVQ    DX, R15
	SHLQ    CL, R15
	MOVB    AH, CL
	ADDQ    CX, BX
	NEGL    CX
	SHRQ    CL, R15
	SHRQ    $0x20, AX
	TESTQ   CX, CX
	CMOVQEQ CX, R15
	ADDQ    R15, AX
	MOVQ    AX, 16(R10)

	// Update match length
	MOVQ    R8, AX
	MOVQ    BX, CX
	MOVQ    DX, R15
	SHLQ    CL, R15
	MOVB    AH, CL
	ADDQ    CX, BX
	NEGL    CX
	SHRQ    CL, R15
	SHRQ    $0x20, AX
	TESTQ   CX, CX
	CMOVQEQ CX, R15
	ADDQ    R15, AX
	MOVQ    AX, 8(R10)

	// Update literal length
	MOVQ    DI, AX
	MOVQ    BX, CX
	MOVQ    DX, R15
	SHLQ    CL, R15
	MOVB    AH, CL
	ADDQ    CX, BX
	NEGL    CX
	SHRQ    CL, R15
	SHRQ    $0x20, AX
	TESTQ   CX, CX
	CMOVQEQ CX, R15
	ADDQ    R15, AX
	MOVQ    AX, (R10)

	// Fill bitreader for state updates
	// AX is reloaded with the offset code's addBits; it drives the
	// repeat-offset logic after skip_update.
	MOVQ    R14, (SP)
	MOVQ    R9, AX
	SHRQ    $0x08, AX
	MOVBQZX AL, AX
	MOVQ    ctx+16(FP), CX
	CMPQ    96(CX), $0x00
	JZ      sequenceDecs_decode_56_amd64_skip_update

	// Update Literal Length State
	MOVBQZX DI, R14
	SHRQ    $0x10, DI
	MOVWQZX DI, DI
	CMPQ    R14, $0x00
	JZ      sequenceDecs_decode_56_amd64_llState_updateState_skip_zero
	MOVQ    BX, CX
	ADDQ    R14, BX
	MOVQ    DX, R15
	SHLQ    CL, R15
	MOVQ    R14, CX
	NEGQ    CX
	SHRQ    CL, R15
	ADDQ    R15, DI

sequenceDecs_decode_56_amd64_llState_updateState_skip_zero:
	// Load ctx.llTable
	MOVQ ctx+16(FP), CX
	MOVQ (CX), CX
	MOVQ (CX)(DI*8), DI

	// Update Match Length State
	MOVBQZX R8, R14
	SHRQ    $0x10, R8
	MOVWQZX R8, R8
	CMPQ    R14, $0x00
	JZ      sequenceDecs_decode_56_amd64_mlState_updateState_skip_zero
	MOVQ    BX, CX
	ADDQ    R14, BX
	MOVQ    DX, R15
	SHLQ    CL, R15
	MOVQ    R14, CX
	NEGQ    CX
	SHRQ    CL, R15
	ADDQ    R15, R8

sequenceDecs_decode_56_amd64_mlState_updateState_skip_zero:
	// Load ctx.mlTable
	MOVQ ctx+16(FP), CX
	MOVQ 24(CX), CX
	MOVQ (CX)(R8*8), R8

	// Update Offset State
	MOVBQZX R9, R14
	SHRQ    $0x10, R9
	MOVWQZX R9, R9
	CMPQ    R14, $0x00
	JZ      sequenceDecs_decode_56_amd64_ofState_updateState_skip_zero
	MOVQ    BX, CX
	ADDQ    R14, BX
	MOVQ    DX, R15
	SHLQ    CL, R15
	MOVQ    R14, CX
	NEGQ    CX
	SHRQ    CL, R15
	ADDQ    R15, R9

sequenceDecs_decode_56_amd64_ofState_updateState_skip_zero:
	// Load ctx.ofTable
	MOVQ ctx+16(FP), CX
	MOVQ 48(CX), CX
	MOVQ (CX)(R9*8), R9

sequenceDecs_decode_56_amd64_skip_update:
	// Adjust offset
	// zstd repeat-offset handling: offsetB > 1 yields a new absolute
	// offset; otherwise select/derive from the three previous offsets,
	// shifted by one when the literal length is zero.
	MOVQ 16(R10), CX
	CMPQ AX, $0x01
	JBE  sequenceDecs_decode_56_amd64_adjust_offsetB_1_or_0
	MOVQ R12, R13
	MOVQ R11, R12
	MOVQ CX, R11
	JMP  sequenceDecs_decode_56_amd64_adjust_end

sequenceDecs_decode_56_amd64_adjust_offsetB_1_or_0:
	CMPQ (R10), $0x00000000
	JNE  sequenceDecs_decode_56_amd64_adjust_offset_maybezero
	INCQ CX
	JMP  sequenceDecs_decode_56_amd64_adjust_offset_nonzero

sequenceDecs_decode_56_amd64_adjust_offset_maybezero:
	TESTQ CX, CX
	JNZ   sequenceDecs_decode_56_amd64_adjust_offset_nonzero
	MOVQ  R11, CX
	JMP   sequenceDecs_decode_56_amd64_adjust_end

sequenceDecs_decode_56_amd64_adjust_offset_nonzero:
	CMPQ CX, $0x01
	JB   sequenceDecs_decode_56_amd64_adjust_zero
	JEQ  sequenceDecs_decode_56_amd64_adjust_one
	CMPQ CX, $0x02
	JA   sequenceDecs_decode_56_amd64_adjust_three
	JMP  sequenceDecs_decode_56_amd64_adjust_two

sequenceDecs_decode_56_amd64_adjust_zero:
	MOVQ R11, AX
	JMP  sequenceDecs_decode_56_amd64_adjust_test_temp_valid

sequenceDecs_decode_56_amd64_adjust_one:
	MOVQ R12, AX
	JMP  sequenceDecs_decode_56_amd64_adjust_test_temp_valid

sequenceDecs_decode_56_amd64_adjust_two:
	MOVQ R13, AX
	JMP  sequenceDecs_decode_56_amd64_adjust_test_temp_valid

sequenceDecs_decode_56_amd64_adjust_three:
	LEAQ -1(R11), AX

sequenceDecs_decode_56_amd64_adjust_test_temp_valid:
	// A derived offset of zero is clamped to 1.
	TESTQ AX, AX
	JNZ   sequenceDecs_decode_56_amd64_adjust_temp_valid
	MOVQ  $0x00000001, AX

sequenceDecs_decode_56_amd64_adjust_temp_valid:
	CMPQ    CX, $0x01
	CMOVQNE R12, R13
	MOVQ    R11, R12
	MOVQ    AX, R11
	MOVQ    AX, CX

sequenceDecs_decode_56_amd64_adjust_end:
	MOVQ CX, 16(R10)

	// Check values
	// Accumulate ll+ml at s+256, charge ll against the literal budget at
	// ctx+128; fail on exhausted literals, ml above the 0x20002 cap, or a
	// zero offset with non-zero match length.
	MOVQ  8(R10), AX
	MOVQ  (R10), R14
	LEAQ  (AX)(R14*1), R15
	MOVQ  s+0(FP), BP
	ADDQ  R15, 256(BP)
	MOVQ  ctx+16(FP), R15
	SUBQ  R14, 128(R15)
	JS    error_not_enough_literals
	CMPQ  AX, $0x00020002
	JA    sequenceDecs_decode_56_amd64_error_match_len_too_big
	TESTQ CX, CX
	JNZ   sequenceDecs_decode_56_amd64_match_len_ofs_ok
	TESTQ AX, AX
	JNZ   sequenceDecs_decode_56_amd64_error_match_len_ofs_mismatch

sequenceDecs_decode_56_amd64_match_len_ofs_ok:
	ADDQ $0x18, R10
	MOVQ ctx+16(FP), AX
	DECQ 96(AX)
	JNS  sequenceDecs_decode_56_amd64_main_loop
	// Flush prev-offset history and bit-reader state back to s and br.
	MOVQ s+0(FP), AX
	MOVQ R11, 144(AX)
	MOVQ R12, 152(AX)
	MOVQ R13, 160(AX)
	MOVQ br+8(FP), AX
	MOVQ DX, 32(AX)
	MOVB BL, 40(AX)
	MOVQ SI, 24(AX)

	// Return success
	MOVQ $0x00000000, ret+24(FP)
	RET

	// Return with match length error
sequenceDecs_decode_56_amd64_error_match_len_ofs_mismatch:
	MOVQ $0x00000001, ret+24(FP)
	RET

	// Return with match too long error
sequenceDecs_decode_56_amd64_error_match_len_too_big:
	MOVQ $0x00000002, ret+24(FP)
	RET

	// Return with match offset too long error
	// (unreachable: no label precedes it)
	MOVQ $0x00000003, ret+24(FP)
	RET

	// Return with not enough literals error
error_not_enough_literals:
	MOVQ $0x00000004, ret+24(FP)
	RET

	// Return with not enough output space error
	// (unreachable: no label precedes it)
	MOVQ $0x00000005, ret+24(FP)
	RET

// func sequenceDecs_decode_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
// Requires: BMI, BMI2, CMOV
//
// BMI2 version of sequenceDecs_decode_amd64: bit extraction uses
// BEXTR/BZHI/SHRX/ROLQ instead of the shift+CMOV idiom.  Register
// allocation differs from the non-BMI2 variant:
//   AX = bit buffer (br.value)       DX = bits consumed (br.bitsRead)
//   BX = br.off (bytes remaining)    (SP) = current read pointer
//   SI/DI/R8 = ll/ml/of states       R9  = ctx.seqs pointer (24-byte records)
//   R10/R11/R12 = previous offsets 1/2/3
// BEXTR with control 0x0808 extracts 8 bits starting at bit 8 (addBits);
// control 0x1010 extracts 16 bits at bit 16 (next-state base).
// Returns 0 on success or a non-zero error code.
TEXT ·sequenceDecs_decode_bmi2(SB), $8-32
	MOVQ    br+8(FP), CX
	MOVQ    32(CX), AX
	MOVBQZX 40(CX), DX
	MOVQ    24(CX), BX
	MOVQ    (CX), CX
	ADDQ    BX, CX
	MOVQ    CX, (SP)
	MOVQ    ctx+16(FP), CX
	MOVQ    72(CX), SI
	MOVQ    80(CX), DI
	MOVQ    88(CX), R8
	MOVQ    104(CX), R9
	MOVQ    s+0(FP), CX
	MOVQ    144(CX), R10
	MOVQ    152(CX), R11
	MOVQ    160(CX), R12

sequenceDecs_decode_bmi2_main_loop:
	MOVQ (SP), R13

	// Fill bitreader to have enough for the offset and match length.
	CMPQ BX, $0x08
	JL   sequenceDecs_decode_bmi2_fill_byte_by_byte
	MOVQ DX, CX
	SHRQ $0x03, CX
	SUBQ CX, R13
	MOVQ (R13), AX
	SUBQ CX, BX
	ANDQ $0x07, DX
	JMP  sequenceDecs_decode_bmi2_fill_end

sequenceDecs_decode_bmi2_fill_byte_by_byte:
	// Slow path near the start of the input: refill one byte at a time.
	CMPQ    BX, $0x00
	JLE     sequenceDecs_decode_bmi2_fill_end
	CMPQ    DX, $0x07
	JLE     sequenceDecs_decode_bmi2_fill_end
	SHLQ    $0x08, AX
	SUBQ    $0x01, R13
	SUBQ    $0x01, BX
	SUBQ    $0x08, DX
	MOVBQZX (R13), CX
	ORQ     CX, AX
	JMP     sequenceDecs_decode_bmi2_fill_byte_by_byte

sequenceDecs_decode_bmi2_fill_end:
	// Update offset
	// R14 = addBits; ROLQ by bitsRead+addBits brings the wanted bits to the
	// bottom, BZHI masks them off; the new bit count is kept in DX.
	MOVQ   $0x00000808, CX
	BEXTRQ CX, R8, R14
	MOVQ   AX, R15
	LEAQ   (DX)(R14*1), CX
	ROLQ   CL, R15
	BZHIQ  R14, R15, R15
	MOVQ   CX, DX
	MOVQ   R8, CX
	SHRQ   $0x20, CX
	ADDQ   R15, CX
	MOVQ   CX, 16(R9)

	// Update match length
	MOVQ   $0x00000808, CX
	BEXTRQ CX, DI, R14
	MOVQ   AX, R15
	LEAQ   (DX)(R14*1), CX
	ROLQ   CL, R15
	BZHIQ  R14, R15, R15
	MOVQ   CX, DX
	MOVQ   DI, CX
	SHRQ   $0x20, CX
	ADDQ   R15, CX
	MOVQ   CX, 8(R9)

	// Fill bitreader to have enough for the remaining
	CMPQ BX, $0x08
	JL   sequenceDecs_decode_bmi2_fill_2_byte_by_byte
	MOVQ DX, CX
	SHRQ $0x03, CX
	SUBQ CX, R13
	MOVQ (R13), AX
	SUBQ CX, BX
	ANDQ $0x07, DX
	JMP  sequenceDecs_decode_bmi2_fill_2_end

sequenceDecs_decode_bmi2_fill_2_byte_by_byte:
	CMPQ    BX, $0x00
	JLE     sequenceDecs_decode_bmi2_fill_2_end
	CMPQ    DX, $0x07
	JLE     sequenceDecs_decode_bmi2_fill_2_end
	SHLQ    $0x08, AX
	SUBQ    $0x01, R13
	SUBQ    $0x01, BX
	SUBQ    $0x08, DX
	MOVBQZX (R13), CX
	ORQ     CX, AX
	JMP     sequenceDecs_decode_bmi2_fill_2_byte_by_byte

sequenceDecs_decode_bmi2_fill_2_end:
	// Update literal length
	MOVQ   $0x00000808, CX
	BEXTRQ CX, SI, R14
	MOVQ   AX, R15
	LEAQ   (DX)(R14*1), CX
	ROLQ   CL, R15
	BZHIQ  R14, R15, R15
	MOVQ   CX, DX
	MOVQ   SI, CX
	SHRQ   $0x20, CX
	ADDQ   R15, CX
	MOVQ   CX, (R9)

	// Fill bitreader for state updates
	// R13 = offset code's addBits; it drives the repeat-offset logic after
	// skip_update.  The three state updates below share one extraction of
	// (llBits+mlBits+ofBits) bits, split with BZHI/SHRX.
	MOVQ    R13, (SP)
	MOVQ    $0x00000808, CX
	BEXTRQ  CX, R8, R13
	MOVQ    ctx+16(FP), CX
	CMPQ    96(CX), $0x00
	JZ      sequenceDecs_decode_bmi2_skip_update
	LEAQ    (SI)(DI*1), R14
	ADDQ    R8, R14
	MOVBQZX R14, R14
	LEAQ    (DX)(R14*1), CX
	MOVQ    AX, R15
	MOVQ    CX, DX
	ROLQ    CL, R15
	BZHIQ   R14, R15, R15

	// Update Offset State
	BZHIQ  R8, R15, CX
	SHRXQ  R8, R15, R15
	MOVQ   $0x00001010, R14
	BEXTRQ R14, R8, R8
	ADDQ   CX, R8

	// Load ctx.ofTable
	MOVQ ctx+16(FP), CX
	MOVQ 48(CX), CX
	MOVQ (CX)(R8*8), R8

	// Update Match Length State
	BZHIQ  DI, R15, CX
	SHRXQ  DI, R15, R15
	MOVQ   $0x00001010, R14
	BEXTRQ R14, DI, DI
	ADDQ   CX, DI

	// Load ctx.mlTable
	MOVQ ctx+16(FP), CX
	MOVQ 24(CX), CX
	MOVQ (CX)(DI*8), DI

	// Update Literal Length State
	BZHIQ  SI, R15, CX
	MOVQ   $0x00001010, R14
	BEXTRQ R14, SI, SI
	ADDQ   CX, SI

	// Load ctx.llTable
	MOVQ ctx+16(FP), CX
	MOVQ (CX), CX
	MOVQ (CX)(SI*8), SI

sequenceDecs_decode_bmi2_skip_update:
	// Adjust offset
	// zstd repeat-offset handling: offsetB > 1 yields a new absolute
	// offset; otherwise select/derive from the three previous offsets,
	// shifted by one when the literal length is zero.
	MOVQ 16(R9), CX
	CMPQ R13, $0x01
	JBE  sequenceDecs_decode_bmi2_adjust_offsetB_1_or_0
	MOVQ R11, R12
	MOVQ R10, R11
	MOVQ CX, R10
	JMP  sequenceDecs_decode_bmi2_adjust_end

sequenceDecs_decode_bmi2_adjust_offsetB_1_or_0:
	CMPQ (R9), $0x00000000
	JNE  sequenceDecs_decode_bmi2_adjust_offset_maybezero
	INCQ CX
	JMP  sequenceDecs_decode_bmi2_adjust_offset_nonzero

sequenceDecs_decode_bmi2_adjust_offset_maybezero:
	TESTQ CX, CX
	JNZ   sequenceDecs_decode_bmi2_adjust_offset_nonzero
	MOVQ  R10, CX
	JMP   sequenceDecs_decode_bmi2_adjust_end

sequenceDecs_decode_bmi2_adjust_offset_nonzero:
	CMPQ CX, $0x01
	JB   sequenceDecs_decode_bmi2_adjust_zero
	JEQ  sequenceDecs_decode_bmi2_adjust_one
	CMPQ CX, $0x02
	JA   sequenceDecs_decode_bmi2_adjust_three
	JMP  sequenceDecs_decode_bmi2_adjust_two

sequenceDecs_decode_bmi2_adjust_zero:
	MOVQ R10, R13
	JMP  sequenceDecs_decode_bmi2_adjust_test_temp_valid

sequenceDecs_decode_bmi2_adjust_one:
	MOVQ R11, R13
	JMP  sequenceDecs_decode_bmi2_adjust_test_temp_valid

sequenceDecs_decode_bmi2_adjust_two:
	MOVQ R12, R13
	JMP  sequenceDecs_decode_bmi2_adjust_test_temp_valid

sequenceDecs_decode_bmi2_adjust_three:
	LEAQ -1(R10), R13

sequenceDecs_decode_bmi2_adjust_test_temp_valid:
	// A derived offset of zero is clamped to 1.
	TESTQ R13, R13
	JNZ   sequenceDecs_decode_bmi2_adjust_temp_valid
	MOVQ  $0x00000001, R13

sequenceDecs_decode_bmi2_adjust_temp_valid:
	CMPQ    CX, $0x01
	CMOVQNE R11, R12
	MOVQ    R10, R11
	MOVQ    R13, R10
	MOVQ    R13, CX

sequenceDecs_decode_bmi2_adjust_end:
	MOVQ CX, 16(R9)

	// Check values
	// Accumulate ll+ml at s+256, charge ll against the literal budget at
	// ctx+128; fail on exhausted literals, ml above the 0x20002 cap, or a
	// zero offset with non-zero match length.
	MOVQ  8(R9), R13
	MOVQ  (R9), R14
	LEAQ  (R13)(R14*1), R15
	MOVQ  s+0(FP), BP
	ADDQ  R15, 256(BP)
	MOVQ  ctx+16(FP), R15
	SUBQ  R14, 128(R15)
	JS    error_not_enough_literals
	CMPQ  R13, $0x00020002
	JA    sequenceDecs_decode_bmi2_error_match_len_too_big
	TESTQ CX, CX
	JNZ   sequenceDecs_decode_bmi2_match_len_ofs_ok
	TESTQ R13, R13
	JNZ   sequenceDecs_decode_bmi2_error_match_len_ofs_mismatch

sequenceDecs_decode_bmi2_match_len_ofs_ok:
	ADDQ $0x18, R9
	MOVQ ctx+16(FP), CX
	DECQ 96(CX)
	JNS  sequenceDecs_decode_bmi2_main_loop
	// Flush prev-offset history and bit-reader state back to s and br.
	MOVQ s+0(FP), CX
	MOVQ R10, 144(CX)
	MOVQ R11, 152(CX)
	MOVQ R12, 160(CX)
	MOVQ br+8(FP), CX
	MOVQ AX, 32(CX)
	MOVB DL, 40(CX)
	MOVQ BX, 24(CX)

	// Return success
	MOVQ $0x00000000, ret+24(FP)
	RET

	// Return with match length error
sequenceDecs_decode_bmi2_error_match_len_ofs_mismatch:
	MOVQ $0x00000001, ret+24(FP)
	RET

	// Return with match too long error
sequenceDecs_decode_bmi2_error_match_len_too_big:
	MOVQ $0x00000002, ret+24(FP)
	RET

	// Return with match offset too long error
	// (unreachable: no label precedes it)
	MOVQ $0x00000003, ret+24(FP)
	RET

	// Return with not enough literals error
error_not_enough_literals:
	MOVQ $0x00000004, ret+24(FP)
	RET

	// Return with not enough output space error
	// (unreachable: no label precedes it)
	MOVQ $0x00000005, ret+24(FP)
	RET

// func sequenceDecs_decode_56_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
// Requires: BMI, BMI2, CMOV
//
// BMI2 variant with a single bit-buffer refill per sequence (usable when
// the combined ll+ml+of bit counts fit in 56 bits).  Register allocation
// matches sequenceDecs_decode_bmi2:
//   AX = bit buffer, DX = bits consumed, BX = bytes remaining, (SP) = read
//   pointer; SI/DI/R8 = ll/ml/of states; R9 = ctx.seqs pointer;
//   R10/R11/R12 = previous offsets 1/2/3.
// Returns 0 on success or a non-zero error code.
TEXT ·sequenceDecs_decode_56_bmi2(SB), $8-32
	MOVQ    br+8(FP), CX
	MOVQ    32(CX), AX
	MOVBQZX 40(CX), DX
	MOVQ    24(CX), BX
	MOVQ    (CX), CX
	ADDQ    BX, CX
	MOVQ    CX, (SP)
	MOVQ    ctx+16(FP), CX
	MOVQ    72(CX), SI
	MOVQ    80(CX), DI
	MOVQ    88(CX), R8
	MOVQ    104(CX), R9
	MOVQ    s+0(FP), CX
	MOVQ    144(CX), R10
	MOVQ    152(CX), R11
	MOVQ    160(CX), R12

sequenceDecs_decode_56_bmi2_main_loop:
	MOVQ (SP), R13

	// Fill bitreader to have enough for the offset and match length.
	CMPQ BX, $0x08
	JL   sequenceDecs_decode_56_bmi2_fill_byte_by_byte
	MOVQ DX, CX
	SHRQ $0x03, CX
	SUBQ CX, R13
	MOVQ (R13), AX
	SUBQ CX, BX
	ANDQ $0x07, DX
	JMP  sequenceDecs_decode_56_bmi2_fill_end

sequenceDecs_decode_56_bmi2_fill_byte_by_byte:
	// Slow path near the start of the input: refill one byte at a time.
	CMPQ    BX, $0x00
	JLE     sequenceDecs_decode_56_bmi2_fill_end
	CMPQ    DX, $0x07
	JLE     sequenceDecs_decode_56_bmi2_fill_end
	SHLQ    $0x08, AX
	SUBQ    $0x01, R13
	SUBQ    $0x01, BX
	SUBQ    $0x08, DX
	MOVBQZX (R13), CX
	ORQ     CX, AX
	JMP     sequenceDecs_decode_56_bmi2_fill_byte_by_byte

sequenceDecs_decode_56_bmi2_fill_end:
	// Update offset
	// BEXTR 0x0808 = addBits (8 bits at bit 8); ROLQ by bitsRead+addBits
	// brings the wanted bits to the bottom, BZHI masks them off.
	MOVQ   $0x00000808, CX
	BEXTRQ CX, R8, R14
	MOVQ   AX, R15
	LEAQ   (DX)(R14*1), CX
	ROLQ   CL, R15
	BZHIQ  R14, R15, R15
	MOVQ   CX, DX
	MOVQ   R8, CX
	SHRQ   $0x20, CX
	ADDQ   R15, CX
	MOVQ   CX, 16(R9)

	// Update match length
	MOVQ   $0x00000808, CX
	BEXTRQ CX, DI, R14
	MOVQ   AX, R15
	LEAQ   (DX)(R14*1), CX
	ROLQ   CL, R15
	BZHIQ  R14, R15, R15
	MOVQ   CX, DX
	MOVQ   DI, CX
	SHRQ   $0x20, CX
	ADDQ   R15, CX
	MOVQ   CX, 8(R9)

	// Update literal length
	MOVQ   $0x00000808, CX
	BEXTRQ CX, SI, R14
	MOVQ   AX, R15
	LEAQ   (DX)(R14*1), CX
	ROLQ   CL, R15
	BZHIQ  R14, R15, R15
	MOVQ   CX, DX
	MOVQ   SI, CX
	SHRQ   $0x20, CX
	ADDQ   R15, CX
	MOVQ   CX, (R9)

	// Fill bitreader for state updates
	// R13 = offset code's addBits; the three state updates share one
	// extraction of (llBits+mlBits+ofBits) bits, split with BZHI/SHRX.
	MOVQ    R13, (SP)
	MOVQ    $0x00000808, CX
	BEXTRQ  CX, R8, R13
	MOVQ    ctx+16(FP), CX
	CMPQ    96(CX), $0x00
	JZ      sequenceDecs_decode_56_bmi2_skip_update
	LEAQ    (SI)(DI*1), R14
	ADDQ    R8, R14
	MOVBQZX R14, R14
	LEAQ    (DX)(R14*1), CX
	MOVQ    AX, R15
	MOVQ    CX, DX
	ROLQ    CL, R15
	BZHIQ   R14, R15, R15

	// Update Offset State
	BZHIQ  R8, R15, CX
	SHRXQ  R8, R15, R15
	MOVQ   $0x00001010, R14
	BEXTRQ R14, R8, R8
	ADDQ   CX, R8

	// Load ctx.ofTable
	MOVQ ctx+16(FP), CX
	MOVQ 48(CX), CX
	MOVQ (CX)(R8*8), R8

	// Update Match Length State
	BZHIQ  DI, R15, CX
	SHRXQ  DI, R15, R15
	MOVQ   $0x00001010, R14
	BEXTRQ R14, DI, DI
	ADDQ   CX, DI

	// Load ctx.mlTable
	MOVQ ctx+16(FP), CX
	MOVQ 24(CX), CX
	MOVQ (CX)(DI*8), DI

	// Update Literal Length State
	BZHIQ  SI, R15, CX
	MOVQ   $0x00001010, R14
	BEXTRQ R14, SI, SI
	ADDQ   CX, SI

	// Load ctx.llTable
	MOVQ ctx+16(FP), CX
	MOVQ (CX), CX
	MOVQ (CX)(SI*8), SI

sequenceDecs_decode_56_bmi2_skip_update:
	// Adjust offset
	// zstd repeat-offset handling: offsetB > 1 yields a new absolute
	// offset; otherwise select/derive from the three previous offsets,
	// shifted by one when the literal length is zero.
	MOVQ 16(R9), CX
	CMPQ R13, $0x01
	JBE  sequenceDecs_decode_56_bmi2_adjust_offsetB_1_or_0
	MOVQ R11, R12
	MOVQ R10, R11
	MOVQ CX, R10
	JMP  sequenceDecs_decode_56_bmi2_adjust_end

sequenceDecs_decode_56_bmi2_adjust_offsetB_1_or_0:
	CMPQ (R9), $0x00000000
	JNE  sequenceDecs_decode_56_bmi2_adjust_offset_maybezero
	INCQ CX
	JMP  sequenceDecs_decode_56_bmi2_adjust_offset_nonzero

sequenceDecs_decode_56_bmi2_adjust_offset_maybezero:
	TESTQ CX, CX
	JNZ   sequenceDecs_decode_56_bmi2_adjust_offset_nonzero
	MOVQ  R10, CX
	JMP   sequenceDecs_decode_56_bmi2_adjust_end

sequenceDecs_decode_56_bmi2_adjust_offset_nonzero:
	CMPQ CX, $0x01
	JB   sequenceDecs_decode_56_bmi2_adjust_zero
	JEQ  sequenceDecs_decode_56_bmi2_adjust_one
	CMPQ CX, $0x02
	JA   sequenceDecs_decode_56_bmi2_adjust_three
	JMP  sequenceDecs_decode_56_bmi2_adjust_two

sequenceDecs_decode_56_bmi2_adjust_zero:
	MOVQ R10, R13
	JMP  sequenceDecs_decode_56_bmi2_adjust_test_temp_valid

sequenceDecs_decode_56_bmi2_adjust_one:
	MOVQ R11, R13
	JMP  sequenceDecs_decode_56_bmi2_adjust_test_temp_valid

sequenceDecs_decode_56_bmi2_adjust_two:
	MOVQ R12, R13
	JMP  sequenceDecs_decode_56_bmi2_adjust_test_temp_valid

sequenceDecs_decode_56_bmi2_adjust_three:
	LEAQ -1(R10), R13

sequenceDecs_decode_56_bmi2_adjust_test_temp_valid:
	// A derived offset of zero is clamped to 1.
	TESTQ R13, R13
	JNZ   sequenceDecs_decode_56_bmi2_adjust_temp_valid
	MOVQ  $0x00000001, R13

sequenceDecs_decode_56_bmi2_adjust_temp_valid:
	CMPQ    CX, $0x01
	CMOVQNE R11, R12
	MOVQ    R10, R11
	MOVQ    R13, R10
	MOVQ    R13, CX

sequenceDecs_decode_56_bmi2_adjust_end:
	MOVQ CX, 16(R9)

	// Check values
	// Accumulate ll+ml at s+256, charge ll against the literal budget at
	// ctx+128; fail on exhausted literals, ml above the 0x20002 cap, or a
	// zero offset with non-zero match length.
	MOVQ  8(R9), R13
	MOVQ  (R9), R14
	LEAQ  (R13)(R14*1), R15
	MOVQ  s+0(FP), BP
	ADDQ  R15, 256(BP)
	MOVQ  ctx+16(FP), R15
	SUBQ  R14, 128(R15)
	JS    error_not_enough_literals
	CMPQ  R13, $0x00020002
	JA    sequenceDecs_decode_56_bmi2_error_match_len_too_big
	TESTQ CX, CX
	JNZ   sequenceDecs_decode_56_bmi2_match_len_ofs_ok
	TESTQ R13, R13
	JNZ   sequenceDecs_decode_56_bmi2_error_match_len_ofs_mismatch

sequenceDecs_decode_56_bmi2_match_len_ofs_ok:
	ADDQ $0x18, R9
	MOVQ ctx+16(FP), CX
	DECQ 96(CX)
	JNS  sequenceDecs_decode_56_bmi2_main_loop
	// Flush prev-offset history and bit-reader state back to s and br.
	MOVQ s+0(FP), CX
	MOVQ R10, 144(CX)
	MOVQ R11, 152(CX)
	MOVQ R12, 160(CX)
	MOVQ br+8(FP), CX
	MOVQ AX, 32(CX)
	MOVB DL, 40(CX)
	MOVQ BX, 24(CX)

	// Return success
	MOVQ $0x00000000, ret+24(FP)
	RET

	// Return with match length error
sequenceDecs_decode_56_bmi2_error_match_len_ofs_mismatch:
	MOVQ $0x00000001, ret+24(FP)
	RET

	// Return with match too long error
sequenceDecs_decode_56_bmi2_error_match_len_too_big:
	MOVQ $0x00000002, ret+24(FP)
	RET

	// Return with match offset too long error
	// (unreachable: no label precedes it)
	MOVQ $0x00000003, ret+24(FP)
	RET

	// Return with not enough literals error
error_not_enough_literals:
	MOVQ $0x00000004, ret+24(FP)
	RET

	// Return with not enough output space error
	// (unreachable: no label precedes it)
	MOVQ $0x00000005, ret+24(FP)
	RET

// func sequenceDecs_executeSimple_amd64(ctx *executeAsmContext) bool
// Requires: SSE
//
// Executes decoded sequences: for each one, copies the literal run into the
// output and then the match, either from the history buffer or from output
// already written.  Register allocation inside the loop:
//   AX = current seq record (24 bytes: ll, ml, mo)   CX = seqCount
//   DX = seqIndex        BX = output write pointer (outBase+outPosition)
//   SI = literals read pointer                       DI = outPosition
//   R8 = windowSize      R9 = end of history (histBase+histLen)
//   R10 = histLen        R11/R12/R13 = seq.ll / seq.mo / seq.ml
// Returns true on success, false when a match offset exceeds the window or
// the data written so far plus history.
TEXT ·sequenceDecs_executeSimple_amd64(SB), $8-9
	MOVQ  ctx+0(FP), R10
	MOVQ  8(R10), CX
	TESTQ CX, CX
	JZ    empty_seqs
	MOVQ  (R10), AX
	MOVQ  24(R10), DX
	MOVQ  32(R10), BX
	MOVQ  80(R10), SI
	MOVQ  104(R10), DI
	MOVQ  120(R10), R8
	MOVQ  56(R10), R9
	MOVQ  64(R10), R10
	ADDQ  R10, R9

	// seqsBase += 24 * seqIndex
	LEAQ (DX)(DX*2), R11
	SHLQ $0x03, R11
	ADDQ R11, AX

	// outBase += outPosition
	ADDQ DI, BX

main_loop:
	MOVQ (AX), R11
	MOVQ 16(AX), R12
	MOVQ 8(AX), R13

	// Copy literals
	TESTQ R11, R11
	JZ    check_offset
	XORQ  R14, R14

copy_1:
	// NOTE(review): copies in 16-byte chunks and may write up to 15 bytes
	// past the literal length — presumably the output buffer is
	// over-allocated; the _safe_ variant below avoids this.
	MOVUPS (SI)(R14*1), X0
	MOVUPS X0, (BX)(R14*1)
	ADDQ   $0x10, R14
	CMPQ   R14, R11
	JB     copy_1
	ADDQ   R11, SI
	ADDQ   R11, BX
	ADDQ   R11, DI

	// Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize)
check_offset:
	LEAQ (DI)(R10*1), R11
	CMPQ R12, R11
	JG   error_match_off_too_big
	CMPQ R12, R8
	JG   error_match_off_too_big

	// Copy match from history
	// R11 = mo - outPosition; positive means the match starts inside the
	// history buffer.  R14 then points at the match start within history.
	MOVQ  R12, R11
	SUBQ  DI, R11
	JLS   copy_match
	MOVQ  R9, R14
	SUBQ  R11, R14
	CMPQ  R13, R11
	JGE   copy_all_from_history
	XORQ  R11, R11
	TESTQ $0x00000001, R13
	JZ    copy_4_word
	MOVB  (R14)(R11*1), R12
	MOVB  R12, (BX)(R11*1)
	ADDQ  $0x01, R11

copy_4_word:
	TESTQ $0x00000002, R13
	JZ    copy_4_dword
	MOVW  (R14)(R11*1), R12
	MOVW  R12, (BX)(R11*1)
	ADDQ  $0x02, R11

copy_4_dword:
	TESTQ $0x00000004, R13
	JZ    copy_4_qword
	MOVL  (R14)(R11*1), R12
	MOVL  R12, (BX)(R11*1)
	ADDQ  $0x04, R11

copy_4_qword:
	TESTQ $0x00000008, R13
	JZ    copy_4_test
	MOVQ  (R14)(R11*1), R12
	MOVQ  R12, (BX)(R11*1)
	ADDQ  $0x08, R11
	JMP   copy_4_test

copy_4:
	MOVUPS (R14)(R11*1), X0
	MOVUPS X0, (BX)(R11*1)
	ADDQ   $0x10, R11

copy_4_test:
	CMPQ R11, R13
	JB   copy_4
	ADDQ R13, DI
	ADDQ R13, BX
	ADDQ $0x18, AX
	INCQ DX
	CMPQ DX, CX
	JB   main_loop
	JMP  loop_finished

copy_all_from_history:
	// The match straddles the history/output boundary: copy the R11 bytes
	// that lie in history, then fall through to copy_match for the rest.
	XORQ  R15, R15
	TESTQ $0x00000001, R11
	JZ    copy_5_word
	MOVB  (R14)(R15*1), BP
	MOVB  BP, (BX)(R15*1)
	ADDQ  $0x01, R15

copy_5_word:
	TESTQ $0x00000002, R11
	JZ    copy_5_dword
	MOVW  (R14)(R15*1), BP
	MOVW  BP, (BX)(R15*1)
	ADDQ  $0x02, R15

copy_5_dword:
	TESTQ $0x00000004, R11
	JZ    copy_5_qword
	MOVL  (R14)(R15*1), BP
	MOVL  BP, (BX)(R15*1)
	ADDQ  $0x04, R15

copy_5_qword:
	TESTQ $0x00000008, R11
	JZ    copy_5_test
	MOVQ  (R14)(R15*1), BP
	MOVQ  BP, (BX)(R15*1)
	ADDQ  $0x08, R15
	JMP   copy_5_test

copy_5:
	MOVUPS (R14)(R15*1), X0
	MOVUPS X0, (BX)(R15*1)
	ADDQ   $0x10, R15

copy_5_test:
	CMPQ R15, R11
	JB   copy_5
	ADDQ R11, BX
	ADDQ R11, DI
	SUBQ R11, R13

	// Copy match from the current buffer
copy_match:
	TESTQ R13, R13
	JZ    handle_loop
	MOVQ  BX, R11
	SUBQ  R12, R11

	// ml <= mo
	CMPQ R13, R12
	JA   copy_overlapping_match

	// Copy non-overlapping match
	ADDQ R13, DI
	MOVQ BX, R12
	ADDQ R13, BX

copy_2:
	MOVUPS (R11), X0
	MOVUPS X0, (R12)
	ADDQ   $0x10, R11
	ADDQ   $0x10, R12
	SUBQ   $0x10, R13
	JHI    copy_2
	JMP    handle_loop

	// Copy overlapping match
copy_overlapping_match:
	// Byte-at-a-time so each source byte may be one just written.
	ADDQ R13, DI

copy_slow_3:
	MOVB (R11), R12
	MOVB R12, (BX)
	INCQ R11
	INCQ BX
	DECQ R13
	JNZ  copy_slow_3

handle_loop:
	ADDQ $0x18, AX
	INCQ DX
	CMPQ DX, CX
	JB   main_loop

loop_finished:
	// Return value
	MOVB $0x01, ret+8(FP)

	// Update the context
	MOVQ ctx+0(FP), AX
	MOVQ DX, 24(AX)
	MOVQ DI, 104(AX)
	MOVQ 80(AX), CX
	SUBQ CX, SI
	MOVQ SI, 112(AX)
	RET

error_match_off_too_big:
	// Return value
	MOVB $0x00, ret+8(FP)

	// Update the context
	MOVQ ctx+0(FP), AX
	MOVQ DX, 24(AX)
	MOVQ DI, 104(AX)
	MOVQ 80(AX), CX
	SUBQ CX, SI
	MOVQ SI, 112(AX)
	RET

empty_seqs:
	// Return value
	MOVB $0x01, ret+8(FP)
	RET

// func sequenceDecs_executeSimple_safe_amd64(ctx *executeAsmContext) bool
// Requires: SSE
//
// Bounds-exact variant of sequenceDecs_executeSimple_amd64: every copy
// first handles the 1/2/4/8-byte tail via the length's low bits and only
// then runs the 16-byte MOVUPS loop, so no write goes past the requested
// length.  Register allocation inside the loop:
//   AX = current seq record (24 bytes: ll, ml, mo)   CX = seqCount
//   DX = seqIndex        BX = output write pointer (outBase+outPosition)
//   SI = literals read pointer                       DI = outPosition
//   R8 = windowSize      R9 = end of history (histBase+histLen)
//   R10 = histLen        R11/R12/R13 = seq.ll / seq.mo / seq.ml
// Returns true on success, false when a match offset exceeds the window or
// the data written so far plus history.
TEXT ·sequenceDecs_executeSimple_safe_amd64(SB), $8-9
	MOVQ  ctx+0(FP), R10
	MOVQ  8(R10), CX
	TESTQ CX, CX
	JZ    empty_seqs
	MOVQ  (R10), AX
	MOVQ  24(R10), DX
	MOVQ  32(R10), BX
	MOVQ  80(R10), SI
	MOVQ  104(R10), DI
	MOVQ  120(R10), R8
	MOVQ  56(R10), R9
	MOVQ  64(R10), R10
	ADDQ  R10, R9

	// seqsBase += 24 * seqIndex
	LEAQ (DX)(DX*2), R11
	SHLQ $0x03, R11
	ADDQ R11, AX

	// outBase += outPosition
	ADDQ DI, BX

main_loop:
	MOVQ (AX), R11
	MOVQ 16(AX), R12
	MOVQ 8(AX), R13

	// Copy literals
	// Tail-first copy: the low bits of the length select 1/2/4/8-byte
	// moves, then whole 16-byte chunks finish exactly on the boundary.
	TESTQ R11, R11
	JZ    check_offset
	XORQ  R14, R14
	TESTQ $0x00000001, R11
	JZ    copy_1_word
	MOVB  (SI)(R14*1), R15
	MOVB  R15, (BX)(R14*1)
	ADDQ  $0x01, R14

copy_1_word:
	TESTQ $0x00000002, R11
	JZ    copy_1_dword
	MOVW  (SI)(R14*1), R15
	MOVW  R15, (BX)(R14*1)
	ADDQ  $0x02, R14

copy_1_dword:
	TESTQ $0x00000004, R11
	JZ    copy_1_qword
	MOVL  (SI)(R14*1), R15
	MOVL  R15, (BX)(R14*1)
	ADDQ  $0x04, R14

copy_1_qword:
	TESTQ $0x00000008, R11
	JZ    copy_1_test
	MOVQ  (SI)(R14*1), R15
	MOVQ  R15, (BX)(R14*1)
	ADDQ  $0x08, R14
	JMP   copy_1_test

copy_1:
	MOVUPS (SI)(R14*1), X0
	MOVUPS X0, (BX)(R14*1)
	ADDQ   $0x10, R14

copy_1_test:
	CMPQ R14, R11
	JB   copy_1
	ADDQ R11, SI
	ADDQ R11, BX
	ADDQ R11, DI

	// Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize)
check_offset:
	LEAQ (DI)(R10*1), R11
	CMPQ R12, R11
	JG   error_match_off_too_big
	CMPQ R12, R8
	JG   error_match_off_too_big

	// Copy match from history
	// R11 = mo - outPosition; positive means the match starts inside the
	// history buffer.  R14 then points at the match start within history.
	MOVQ  R12, R11
	SUBQ  DI, R11
	JLS   copy_match
	MOVQ  R9, R14
	SUBQ  R11, R14
	CMPQ  R13, R11
	JGE   copy_all_from_history
	XORQ  R11, R11
	TESTQ $0x00000001, R13
	JZ    copy_4_word
	MOVB  (R14)(R11*1), R12
	MOVB  R12, (BX)(R11*1)
	ADDQ  $0x01, R11

copy_4_word:
	TESTQ $0x00000002, R13
	JZ    copy_4_dword
	MOVW  (R14)(R11*1), R12
	MOVW  R12, (BX)(R11*1)
	ADDQ  $0x02, R11

copy_4_dword:
	TESTQ $0x00000004, R13
	JZ    copy_4_qword
	MOVL  (R14)(R11*1), R12
	MOVL  R12, (BX)(R11*1)
	ADDQ  $0x04, R11

copy_4_qword:
	TESTQ $0x00000008, R13
	JZ    copy_4_test
	MOVQ  (R14)(R11*1), R12
	MOVQ  R12, (BX)(R11*1)
	ADDQ  $0x08, R11
	JMP   copy_4_test

copy_4:
	MOVUPS (R14)(R11*1), X0
	MOVUPS X0, (BX)(R11*1)
	ADDQ   $0x10, R11

copy_4_test:
	CMPQ R11, R13
	JB   copy_4
	ADDQ R13, DI
	ADDQ R13, BX
	ADDQ $0x18, AX
	INCQ DX
	CMPQ DX, CX
	JB   main_loop
	JMP  loop_finished

copy_all_from_history:
	// The match straddles the history/output boundary: copy the R11 bytes
	// that lie in history, then fall through to copy_match for the rest.
	XORQ  R15, R15
	TESTQ $0x00000001, R11
	JZ    copy_5_word
	MOVB  (R14)(R15*1), BP
	MOVB  BP, (BX)(R15*1)
	ADDQ  $0x01, R15

copy_5_word:
	TESTQ $0x00000002, R11
	JZ    copy_5_dword
	MOVW  (R14)(R15*1), BP
	MOVW  BP, (BX)(R15*1)
	ADDQ  $0x02, R15

copy_5_dword:
	TESTQ $0x00000004, R11
	JZ    copy_5_qword
	MOVL  (R14)(R15*1), BP
	MOVL  BP, (BX)(R15*1)
	ADDQ  $0x04, R15

copy_5_qword:
	TESTQ $0x00000008, R11
	JZ    copy_5_test
	MOVQ  (R14)(R15*1), BP
	MOVQ  BP, (BX)(R15*1)
	ADDQ  $0x08, R15
	JMP   copy_5_test

copy_5:
	MOVUPS (R14)(R15*1), X0
	MOVUPS X0, (BX)(R15*1)
	ADDQ   $0x10, R15

copy_5_test:
	CMPQ R15, R11
	JB   copy_5
	ADDQ R11, BX
	ADDQ R11, DI
	SUBQ R11, R13

	// Copy match from the current buffer
copy_match:
	TESTQ R13, R13
	JZ    handle_loop
	MOVQ  BX, R11
	SUBQ  R12, R11

	// ml <= mo
	CMPQ R13, R12
	JA   copy_overlapping_match

	// Copy non-overlapping match
	ADDQ  R13, DI
	XORQ  R12, R12
	TESTQ $0x00000001, R13
	JZ    copy_2_word
	MOVB  (R11)(R12*1), R14
	MOVB  R14, (BX)(R12*1)
	ADDQ  $0x01, R12

copy_2_word:
	TESTQ $0x00000002, R13
	JZ    copy_2_dword
	MOVW  (R11)(R12*1), R14
	MOVW  R14, (BX)(R12*1)
	ADDQ  $0x02, R12

copy_2_dword:
	TESTQ $0x00000004, R13
	JZ    copy_2_qword
	MOVL  (R11)(R12*1), R14
	MOVL  R14, (BX)(R12*1)
	ADDQ  $0x04, R12

copy_2_qword:
	TESTQ $0x00000008, R13
	JZ    copy_2_test
	MOVQ  (R11)(R12*1), R14
	MOVQ  R14, (BX)(R12*1)
	ADDQ  $0x08, R12
	JMP   copy_2_test

copy_2:
	MOVUPS (R11)(R12*1), X0
	MOVUPS X0, (BX)(R12*1)
	ADDQ   $0x10, R12

copy_2_test:
	CMPQ R12, R13
	JB   copy_2
	ADDQ R13, BX
	JMP  handle_loop

	// Copy overlapping match
copy_overlapping_match:
	// Byte-at-a-time so each source byte may be one just written.
	ADDQ R13, DI

copy_slow_3:
	MOVB (R11), R12
	MOVB R12, (BX)
	INCQ R11
	INCQ BX
	DECQ R13
	JNZ  copy_slow_3

handle_loop:
	ADDQ $0x18, AX
	INCQ DX
	CMPQ DX, CX
	JB   main_loop

loop_finished:
	// Return value
	MOVB $0x01, ret+8(FP)

	// Update the context
	MOVQ ctx+0(FP), AX
	MOVQ DX, 24(AX)
	MOVQ DI, 104(AX)
	MOVQ 80(AX), CX
	SUBQ CX, SI
	MOVQ SI, 112(AX)
	RET

error_match_off_too_big:
	// Return value
	MOVB $0x00, ret+8(FP)

	// Update the context
	MOVQ ctx+0(FP), AX
	MOVQ DX, 24(AX)
	MOVQ DI, 104(AX)
	MOVQ 80(AX), CX
	SUBQ CX, SI
	MOVQ SI, 112(AX)
	RET

empty_seqs:
	// Return value
	MOVB $0x01, ret+8(FP)
	RET

// func sequenceDecs_decodeSync_amd64(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
// Requires: CMOV, SSE
//
// Decodes sequences and executes them (copies literals and matches into the
// output buffer) in a single pass, returning a status code in ret+24(FP)
// (0 = success; 1..5 = the error cases at the bottom of this function).
//
// NOTE(review): register/stack roles, as established by the loads below
// (struct field names inferred from offsets — confirm against the Go side):
//   DX     = bit buffer (br+32), BX = bits consumed (br+40), SI = bytes left (br+24)
//   (SP)   = read pointer into the compressed stream (br+0 plus br+24)
//   DI     = literal-length state (ctx+72), R8 = match-length state (ctx+80),
//   R9     = offset state (ctx+88)
//   R10    = current output write pointer, R11 = literals pointer, R12 = output position
//   8(SP)  = seq.mo, 16(SP) = seq.ml, 24(SP) = seq.ll (one decoded sequence)
//   32(SP) = past-end pointer of s.out, 40(SP) = history length (ctx+184),
//   48(SP) = past-end pointer of the history buffer, 56(SP) = window size (ctx+200)
TEXT ·sequenceDecs_decodeSync_amd64(SB), $64-32
	MOVQ    br+8(FP), AX
	MOVQ    32(AX), DX
	MOVBQZX 40(AX), BX
	MOVQ    24(AX), SI
	MOVQ    (AX), AX
	ADDQ    SI, AX
	MOVQ    AX, (SP)
	MOVQ    ctx+16(FP), AX
	MOVQ    72(AX), DI
	MOVQ    80(AX), R8
	MOVQ    88(AX), R9
	MOVQ    112(AX), R10
	MOVQ    128(AX), CX
	MOVQ    CX, 32(SP)
	MOVQ    144(AX), R11
	MOVQ    136(AX), R12
	MOVQ    200(AX), CX
	MOVQ    CX, 56(SP)
	MOVQ    176(AX), CX
	MOVQ    CX, 48(SP)
	MOVQ    184(AX), AX
	MOVQ    AX, 40(SP)
	MOVQ    40(SP), AX
	ADDQ    AX, 48(SP)

	// Calculate pointer to s.out[cap(s.out)] (a past-end pointer)
	ADDQ R10, 32(SP)

	// outBase += outPosition
	ADDQ R12, R10

sequenceDecs_decodeSync_amd64_main_loop:
	MOVQ (SP), R13

	// Fill bitreader to have enough for the offset and match length.
	CMPQ SI, $0x08
	JL   sequenceDecs_decodeSync_amd64_fill_byte_by_byte
	MOVQ BX, AX
	SHRQ $0x03, AX
	SUBQ AX, R13
	MOVQ (R13), DX
	SUBQ AX, SI
	ANDQ $0x07, BX
	JMP  sequenceDecs_decodeSync_amd64_fill_end

sequenceDecs_decodeSync_amd64_fill_byte_by_byte:
	// Slow path near the start of the stream: refill one byte at a time
	// while at least 8 bits have been consumed and input remains.
	CMPQ    SI, $0x00
	JLE     sequenceDecs_decodeSync_amd64_fill_end
	CMPQ    BX, $0x07
	JLE     sequenceDecs_decodeSync_amd64_fill_end
	SHLQ    $0x08, DX
	SUBQ    $0x01, R13
	SUBQ    $0x01, SI
	SUBQ    $0x08, BX
	MOVBQZX (R13), AX
	ORQ     AX, DX
	JMP     sequenceDecs_decodeSync_amd64_fill_byte_by_byte

sequenceDecs_decodeSync_amd64_fill_end:
	// Update offset
	// Extract state.addBits (AH) bits from the bit buffer and add them to
	// the state's baseline (upper 32 bits of R9). The TESTQ/CMOVQEQ pair
	// forces the extracted value to zero when the bit count is zero, since
	// the variable shift below would otherwise leave the full buffer.
	MOVQ    R9, AX
	MOVQ    BX, CX
	MOVQ    DX, R15
	SHLQ    CL, R15
	MOVB    AH, CL
	ADDQ    CX, BX
	NEGL    CX
	SHRQ    CL, R15
	SHRQ    $0x20, AX
	TESTQ   CX, CX
	CMOVQEQ CX, R15
	ADDQ    R15, AX
	MOVQ    AX, 8(SP)

	// Update match length
	MOVQ    R8, AX
	MOVQ    BX, CX
	MOVQ    DX, R15
	SHLQ    CL, R15
	MOVB    AH, CL
	ADDQ    CX, BX
	NEGL    CX
	SHRQ    CL, R15
	SHRQ    $0x20, AX
	TESTQ   CX, CX
	CMOVQEQ CX, R15
	ADDQ    R15, AX
	MOVQ    AX, 16(SP)

	// Fill bitreader to have enough for the remaining
	CMPQ SI, $0x08
	JL   sequenceDecs_decodeSync_amd64_fill_2_byte_by_byte
	MOVQ BX, AX
	SHRQ $0x03, AX
	SUBQ AX, R13
	MOVQ (R13), DX
	SUBQ AX, SI
	ANDQ $0x07, BX
	JMP  sequenceDecs_decodeSync_amd64_fill_2_end

sequenceDecs_decodeSync_amd64_fill_2_byte_by_byte:
	CMPQ    SI, $0x00
	JLE     sequenceDecs_decodeSync_amd64_fill_2_end
	CMPQ    BX, $0x07
	JLE     sequenceDecs_decodeSync_amd64_fill_2_end
	SHLQ    $0x08, DX
	SUBQ    $0x01, R13
	SUBQ    $0x01, SI
	SUBQ    $0x08, BX
	MOVBQZX (R13), AX
	ORQ     AX, DX
	JMP     sequenceDecs_decodeSync_amd64_fill_2_byte_by_byte

sequenceDecs_decodeSync_amd64_fill_2_end:
	// Update literal length
	MOVQ    DI, AX
	MOVQ    BX, CX
	MOVQ    DX, R14
	SHLQ    CL, R14
	MOVB    AH, CL
	ADDQ    CX, BX
	NEGL    CX
	SHRQ    CL, R14
	SHRQ    $0x20, AX
	TESTQ   CX, CX
	CMOVQEQ CX, R14
	ADDQ    R14, AX
	MOVQ    AX, 24(SP)

	// Fill bitreader for state updates
	MOVQ    R13, (SP)
	MOVQ    R9, AX
	SHRQ    $0x08, AX
	MOVBQZX AL, AX
	MOVQ    ctx+16(FP), CX
	CMPQ    96(CX), $0x00
	JZ      sequenceDecs_decodeSync_amd64_skip_update

	// Update Literal Length State
	MOVBQZX DI, R13
	SHRQ    $0x10, DI
	MOVWQZX DI, DI
	CMPQ    R13, $0x00
	JZ      sequenceDecs_decodeSync_amd64_llState_updateState_skip_zero
	MOVQ    BX, CX
	ADDQ    R13, BX
	MOVQ    DX, R14
	SHLQ    CL, R14
	MOVQ    R13, CX
	NEGQ    CX
	SHRQ    CL, R14
	ADDQ    R14, DI

sequenceDecs_decodeSync_amd64_llState_updateState_skip_zero:
	// Load ctx.llTable
	MOVQ ctx+16(FP), CX
	MOVQ (CX), CX
	MOVQ (CX)(DI*8), DI

	// Update Match Length State
	MOVBQZX R8, R13
	SHRQ    $0x10, R8
	MOVWQZX R8, R8
	CMPQ    R13, $0x00
	JZ      sequenceDecs_decodeSync_amd64_mlState_updateState_skip_zero
	MOVQ    BX, CX
	ADDQ    R13, BX
	MOVQ    DX, R14
	SHLQ    CL, R14
	MOVQ    R13, CX
	NEGQ    CX
	SHRQ    CL, R14
	ADDQ    R14, R8

sequenceDecs_decodeSync_amd64_mlState_updateState_skip_zero:
	// Load ctx.mlTable
	MOVQ ctx+16(FP), CX
	MOVQ 24(CX), CX
	MOVQ (CX)(R8*8), R8

	// Update Offset State
	MOVBQZX R9, R13
	SHRQ    $0x10, R9
	MOVWQZX R9, R9
	CMPQ    R13, $0x00
	JZ      sequenceDecs_decodeSync_amd64_ofState_updateState_skip_zero
	MOVQ    BX, CX
	ADDQ    R13, BX
	MOVQ    DX, R14
	SHLQ    CL, R14
	MOVQ    R13, CX
	NEGQ    CX
	SHRQ    CL, R14
	ADDQ    R14, R9

sequenceDecs_decodeSync_amd64_ofState_updateState_skip_zero:
	// Load ctx.ofTable
	MOVQ ctx+16(FP), CX
	MOVQ 48(CX), CX
	MOVQ (CX)(R9*8), R9

sequenceDecs_decodeSync_amd64_skip_update:
	// Adjust offset
	// AX holds the offset-code bit count. For offsetB > 1 the raw value is
	// the new offset and the repeat-offset history (s+144..s+160) is shifted
	// down; otherwise the zstd repeat-offset rules below apply.
	MOVQ   s+0(FP), CX
	MOVQ   8(SP), R13
	CMPQ   AX, $0x01
	JBE    sequenceDecs_decodeSync_amd64_adjust_offsetB_1_or_0
	MOVUPS 144(CX), X0
	MOVQ   R13, 144(CX)
	MOVUPS X0, 152(CX)
	JMP    sequenceDecs_decodeSync_amd64_adjust_end

sequenceDecs_decodeSync_amd64_adjust_offsetB_1_or_0:
	CMPQ 24(SP), $0x00000000
	JNE  sequenceDecs_decodeSync_amd64_adjust_offset_maybezero
	INCQ R13
	JMP  sequenceDecs_decodeSync_amd64_adjust_offset_nonzero

sequenceDecs_decodeSync_amd64_adjust_offset_maybezero:
	TESTQ R13, R13
	JNZ   sequenceDecs_decodeSync_amd64_adjust_offset_nonzero
	MOVQ  144(CX), R13
	JMP   sequenceDecs_decodeSync_amd64_adjust_end

sequenceDecs_decodeSync_amd64_adjust_offset_nonzero:
	// Branchless selection of prevOffset[0]-1 (when R13 == 3) or
	// prevOffset[R13] via CMOVQEQ; a zero result falls back to 1.
	MOVQ    R13, AX
	XORQ    R14, R14
	MOVQ    $-1, R15
	CMPQ    R13, $0x03
	CMOVQEQ R14, AX
	CMOVQEQ R15, R14
	LEAQ    144(CX), R15
	ADDQ    (R15)(AX*8), R14
	JNZ     sequenceDecs_decodeSync_amd64_adjust_temp_valid
	MOVQ    $0x00000001, R14

sequenceDecs_decodeSync_amd64_adjust_temp_valid:
	CMPQ R13, $0x01
	JZ   sequenceDecs_decodeSync_amd64_adjust_skip
	MOVQ 152(CX), AX
	MOVQ AX, 160(CX)

sequenceDecs_decodeSync_amd64_adjust_skip:
	MOVQ 144(CX), AX
	MOVQ AX, 152(CX)
	MOVQ R14, 144(CX)
	MOVQ R14, R13

sequenceDecs_decodeSync_amd64_adjust_end:
	MOVQ R13, 8(SP)

	// Check values
	MOVQ  16(SP), AX
	MOVQ  24(SP), CX
	LEAQ  (AX)(CX*1), R14
	MOVQ  s+0(FP), R15
	ADDQ  R14, 256(R15)
	MOVQ  ctx+16(FP), R14
	SUBQ  CX, 104(R14)
	JS    error_not_enough_literals
	CMPQ  AX, $0x00020002
	JA    sequenceDecs_decodeSync_amd64_error_match_len_too_big
	TESTQ R13, R13
	JNZ   sequenceDecs_decodeSync_amd64_match_len_ofs_ok
	TESTQ AX, AX
	JNZ   sequenceDecs_decodeSync_amd64_error_match_len_ofs_mismatch

sequenceDecs_decodeSync_amd64_match_len_ofs_ok:
	MOVQ 24(SP), AX
	MOVQ 8(SP), CX
	MOVQ 16(SP), R13

	// Check if we have enough space in s.out
	LEAQ (AX)(R13*1), R14
	ADDQ R10, R14
	CMPQ R14, 32(SP)
	JA   error_not_enough_space

	// Copy literals
	// 16 bytes at a time; may overwrite up to 15 bytes past the literal
	// length, which the space check above accounts for.
	TESTQ AX, AX
	JZ    check_offset
	XORQ  R14, R14

copy_1:
	MOVUPS (R11)(R14*1), X0
	MOVUPS X0, (R10)(R14*1)
	ADDQ   $0x10, R14
	CMPQ   R14, AX
	JB     copy_1
	ADDQ   AX, R11
	ADDQ   AX, R10
	ADDQ   AX, R12

	// Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize
check_offset:
	MOVQ R12, AX
	ADDQ 40(SP), AX
	CMPQ CX, AX
	JG   error_match_off_too_big
	CMPQ CX, 56(SP)
	JG   error_match_off_too_big

	// Copy match from history
	MOVQ  CX, AX
	SUBQ  R12, AX
	JLS   copy_match
	MOVQ  48(SP), R14
	SUBQ  AX, R14
	CMPQ  R13, AX
	JGE   copy_all_from_history
	XORQ  AX, AX
	TESTQ $0x00000001, R13
	JZ    copy_4_word
	MOVB  (R14)(AX*1), CL
	MOVB  CL, (R10)(AX*1)
	ADDQ  $0x01, AX

copy_4_word:
	TESTQ $0x00000002, R13
	JZ    copy_4_dword
	MOVW  (R14)(AX*1), CX
	MOVW  CX, (R10)(AX*1)
	ADDQ  $0x02, AX

copy_4_dword:
	TESTQ $0x00000004, R13
	JZ    copy_4_qword
	MOVL  (R14)(AX*1), CX
	MOVL  CX, (R10)(AX*1)
	ADDQ  $0x04, AX

copy_4_qword:
	TESTQ $0x00000008, R13
	JZ    copy_4_test
	MOVQ  (R14)(AX*1), CX
	MOVQ  CX, (R10)(AX*1)
	ADDQ  $0x08, AX
	JMP   copy_4_test

copy_4:
	MOVUPS (R14)(AX*1), X0
	MOVUPS X0, (R10)(AX*1)
	ADDQ   $0x10, AX

copy_4_test:
	CMPQ AX, R13
	JB   copy_4
	ADDQ R13, R12
	ADDQ R13, R10
	JMP  handle_loop
	JMP loop_finished // NOTE(review): unreachable — preceded by an unconditional JMP (generator artifact)

copy_all_from_history:
	XORQ  R15, R15
	TESTQ $0x00000001, AX
	JZ    copy_5_word
	MOVB  (R14)(R15*1), BP
	MOVB  BP, (R10)(R15*1)
	ADDQ  $0x01, R15

copy_5_word:
	TESTQ $0x00000002, AX
	JZ    copy_5_dword
	MOVW  (R14)(R15*1), BP
	MOVW  BP, (R10)(R15*1)
	ADDQ  $0x02, R15

copy_5_dword:
	TESTQ $0x00000004, AX
	JZ    copy_5_qword
	MOVL  (R14)(R15*1), BP
	MOVL  BP, (R10)(R15*1)
	ADDQ  $0x04, R15

copy_5_qword:
	TESTQ $0x00000008, AX
	JZ    copy_5_test
	MOVQ  (R14)(R15*1), BP
	MOVQ  BP, (R10)(R15*1)
	ADDQ  $0x08, R15
	JMP   copy_5_test

copy_5:
	MOVUPS (R14)(R15*1), X0
	MOVUPS X0, (R10)(R15*1)
	ADDQ   $0x10, R15

copy_5_test:
	CMPQ R15, AX
	JB   copy_5
	ADDQ AX, R10
	ADDQ AX, R12
	SUBQ AX, R13

	// Copy match from the current buffer
copy_match:
	TESTQ R13, R13
	JZ    handle_loop
	MOVQ  R10, AX
	SUBQ  CX, AX

	// ml <= mo
	CMPQ R13, CX
	JA   copy_overlapping_match

	// Copy non-overlapping match
	ADDQ R13, R12
	MOVQ R10, CX
	ADDQ R13, R10

copy_2:
	MOVUPS (AX), X0
	MOVUPS X0, (CX)
	ADDQ   $0x10, AX
	ADDQ   $0x10, CX
	SUBQ   $0x10, R13
	JHI    copy_2
	JMP    handle_loop

	// Copy overlapping match
copy_overlapping_match:
	// Overlap (ml > mo) forces a byte-by-byte copy so repeated runs
	// self-replicate correctly.
	ADDQ R13, R12

copy_slow_3:
	MOVB (AX), CL
	MOVB CL, (R10)
	INCQ AX
	INCQ R10
	DECQ R13
	JNZ  copy_slow_3

handle_loop:
	MOVQ ctx+16(FP), AX
	DECQ 96(AX)
	JNS  sequenceDecs_decodeSync_amd64_main_loop

loop_finished:
	MOVQ br+8(FP), AX
	MOVQ DX, 32(AX)
	MOVB BL, 40(AX)
	MOVQ SI, 24(AX)

	// Update the context
	MOVQ ctx+16(FP), AX
	MOVQ R12, 136(AX)
	MOVQ 144(AX), CX
	SUBQ CX, R11
	MOVQ R11, 168(AX)

	// Return success
	MOVQ $0x00000000, ret+24(FP)
	RET

	// Return with match length error
sequenceDecs_decodeSync_amd64_error_match_len_ofs_mismatch:
	MOVQ 16(SP), AX
	MOVQ ctx+16(FP), CX
	MOVQ AX, 216(CX)
	MOVQ $0x00000001, ret+24(FP)
	RET

	// Return with match too long error
sequenceDecs_decodeSync_amd64_error_match_len_too_big:
	MOVQ ctx+16(FP), AX
	MOVQ 16(SP), CX
	MOVQ CX, 216(AX)
	MOVQ $0x00000002, ret+24(FP)
	RET

	// Return with match offset too long error
error_match_off_too_big:
	MOVQ ctx+16(FP), AX
	MOVQ 8(SP), CX
	MOVQ CX, 224(AX)
	MOVQ R12, 136(AX)
	MOVQ $0x00000003, ret+24(FP)
	RET

	// Return with not enough literals error
error_not_enough_literals:
	MOVQ ctx+16(FP), AX
	MOVQ 24(SP), CX
	MOVQ CX, 208(AX)
	MOVQ $0x00000004, ret+24(FP)
	RET

	// Return with not enough output space error
error_not_enough_space:
	MOVQ ctx+16(FP), AX
	MOVQ 24(SP), CX
	MOVQ CX, 208(AX)
	MOVQ 16(SP), CX
	MOVQ CX, 216(AX)
	MOVQ R12, 136(AX)
	MOVQ $0x00000005, ret+24(FP)
	RET

// func sequenceDecs_decodeSync_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
// Requires: BMI, BMI2, CMOV, SSE
//
// BMI2 variant of sequenceDecs_decodeSync_amd64: bit extraction uses
// ROLQ+BZHIQ/BEXTRQ/SHRXQ instead of variable SHL/SHR pairs, and all three
// FSE states are refilled with a single combined read. Behavior and the
// status codes written to ret+24(FP) match the plain amd64 version.
//
// NOTE(review): register roles differ from the amd64 variant
// (field names inferred from offsets — confirm against the Go side):
//   AX     = bit buffer (br+32), DX = bits consumed (br+40), BX = bytes left (br+24)
//   (SP)   = read pointer into the compressed stream
//   SI     = literal-length state (ctx+72), DI = match-length state (ctx+80),
//   R8     = offset state (ctx+88)
//   R9     = current output write pointer, R10 = literals pointer, R11 = output position
//   8(SP)  = seq.mo, 16(SP) = seq.ml, 24(SP) = seq.ll
//   32(SP) = past-end pointer of s.out, 40(SP) = history length,
//   48(SP) = past-end pointer of the history buffer, 56(SP) = window size
TEXT ·sequenceDecs_decodeSync_bmi2(SB), $64-32
	MOVQ    br+8(FP), CX
	MOVQ    32(CX), AX
	MOVBQZX 40(CX), DX
	MOVQ    24(CX), BX
	MOVQ    (CX), CX
	ADDQ    BX, CX
	MOVQ    CX, (SP)
	MOVQ    ctx+16(FP), CX
	MOVQ    72(CX), SI
	MOVQ    80(CX), DI
	MOVQ    88(CX), R8
	MOVQ    112(CX), R9
	MOVQ    128(CX), R10
	MOVQ    R10, 32(SP)
	MOVQ    144(CX), R10
	MOVQ    136(CX), R11
	MOVQ    200(CX), R12
	MOVQ    R12, 56(SP)
	MOVQ    176(CX), R12
	MOVQ    R12, 48(SP)
	MOVQ    184(CX), CX
	MOVQ    CX, 40(SP)
	MOVQ    40(SP), CX
	ADDQ    CX, 48(SP)

	// Calculate pointer to s.out[cap(s.out)] (a past-end pointer)
	ADDQ R9, 32(SP)

	// outBase += outPosition
	ADDQ R11, R9

sequenceDecs_decodeSync_bmi2_main_loop:
	MOVQ (SP), R12

	// Fill bitreader to have enough for the offset and match length.
	CMPQ BX, $0x08
	JL   sequenceDecs_decodeSync_bmi2_fill_byte_by_byte
	MOVQ DX, CX
	SHRQ $0x03, CX
	SUBQ CX, R12
	MOVQ (R12), AX
	SUBQ CX, BX
	ANDQ $0x07, DX
	JMP  sequenceDecs_decodeSync_bmi2_fill_end

sequenceDecs_decodeSync_bmi2_fill_byte_by_byte:
	// Slow path near the start of the stream: refill one byte at a time
	// while at least 8 bits have been consumed and input remains.
	CMPQ    BX, $0x00
	JLE     sequenceDecs_decodeSync_bmi2_fill_end
	CMPQ    DX, $0x07
	JLE     sequenceDecs_decodeSync_bmi2_fill_end
	SHLQ    $0x08, AX
	SUBQ    $0x01, R12
	SUBQ    $0x01, BX
	SUBQ    $0x08, DX
	MOVBQZX (R12), CX
	ORQ     CX, AX
	JMP     sequenceDecs_decodeSync_bmi2_fill_byte_by_byte

sequenceDecs_decodeSync_bmi2_fill_end:
	// Update offset
	// BEXTRQ $0x0808 extracts bits [8..16) of the state (the addBits count);
	// ROLQ+BZHIQ then pulls that many bits from the bit buffer, and the
	// result is added to the state's baseline (upper 32 bits).
	MOVQ   $0x00000808, CX
	BEXTRQ CX, R8, R13
	MOVQ   AX, R14
	LEAQ   (DX)(R13*1), CX
	ROLQ   CL, R14
	BZHIQ  R13, R14, R14
	MOVQ   CX, DX
	MOVQ   R8, CX
	SHRQ   $0x20, CX
	ADDQ   R14, CX
	MOVQ   CX, 8(SP)

	// Update match length
	MOVQ   $0x00000808, CX
	BEXTRQ CX, DI, R13
	MOVQ   AX, R14
	LEAQ   (DX)(R13*1), CX
	ROLQ   CL, R14
	BZHIQ  R13, R14, R14
	MOVQ   CX, DX
	MOVQ   DI, CX
	SHRQ   $0x20, CX
	ADDQ   R14, CX
	MOVQ   CX, 16(SP)

	// Fill bitreader to have enough for the remaining
	CMPQ BX, $0x08
	JL   sequenceDecs_decodeSync_bmi2_fill_2_byte_by_byte
	MOVQ DX, CX
	SHRQ $0x03, CX
	SUBQ CX, R12
	MOVQ (R12), AX
	SUBQ CX, BX
	ANDQ $0x07, DX
	JMP  sequenceDecs_decodeSync_bmi2_fill_2_end

sequenceDecs_decodeSync_bmi2_fill_2_byte_by_byte:
	CMPQ    BX, $0x00
	JLE     sequenceDecs_decodeSync_bmi2_fill_2_end
	CMPQ    DX, $0x07
	JLE     sequenceDecs_decodeSync_bmi2_fill_2_end
	SHLQ    $0x08, AX
	SUBQ    $0x01, R12
	SUBQ    $0x01, BX
	SUBQ    $0x08, DX
	MOVBQZX (R12), CX
	ORQ     CX, AX
	JMP     sequenceDecs_decodeSync_bmi2_fill_2_byte_by_byte

sequenceDecs_decodeSync_bmi2_fill_2_end:
	// Update literal length
	MOVQ   $0x00000808, CX
	BEXTRQ CX, SI, R13
	MOVQ   AX, R14
	LEAQ   (DX)(R13*1), CX
	ROLQ   CL, R14
	BZHIQ  R13, R14, R14
	MOVQ   CX, DX
	MOVQ   SI, CX
	SHRQ   $0x20, CX
	ADDQ   R14, CX
	MOVQ   CX, 24(SP)

	// Fill bitreader for state updates
	MOVQ    R12, (SP)
	MOVQ    $0x00000808, CX
	BEXTRQ  CX, R8, R12
	MOVQ    ctx+16(FP), CX
	CMPQ    96(CX), $0x00
	JZ      sequenceDecs_decodeSync_bmi2_skip_update
	// Read the combined table bits for all three states in one go,
	// then split them with BZHIQ/SHRXQ below.
	LEAQ    (SI)(DI*1), R13
	ADDQ    R8, R13
	MOVBQZX R13, R13
	LEAQ    (DX)(R13*1), CX
	MOVQ    AX, R14
	MOVQ    CX, DX
	ROLQ    CL, R14
	BZHIQ   R13, R14, R14

	// Update Offset State
	BZHIQ  R8, R14, CX
	SHRXQ  R8, R14, R14
	MOVQ   $0x00001010, R13
	BEXTRQ R13, R8, R8
	ADDQ   CX, R8

	// Load ctx.ofTable
	MOVQ ctx+16(FP), CX
	MOVQ 48(CX), CX
	MOVQ (CX)(R8*8), R8

	// Update Match Length State
	BZHIQ  DI, R14, CX
	SHRXQ  DI, R14, R14
	MOVQ   $0x00001010, R13
	BEXTRQ R13, DI, DI
	ADDQ   CX, DI

	// Load ctx.mlTable
	MOVQ ctx+16(FP), CX
	MOVQ 24(CX), CX
	MOVQ (CX)(DI*8), DI

	// Update Literal Length State
	BZHIQ  SI, R14, CX
	MOVQ   $0x00001010, R13
	BEXTRQ R13, SI, SI
	ADDQ   CX, SI

	// Load ctx.llTable
	MOVQ ctx+16(FP), CX
	MOVQ (CX), CX
	MOVQ (CX)(SI*8), SI

sequenceDecs_decodeSync_bmi2_skip_update:
	// Adjust offset
	// R12 holds the offset-code bit count. For offsetB > 1 the raw value is
	// the new offset and the repeat-offset history (s+144..s+160) is shifted
	// down; otherwise the zstd repeat-offset rules below apply.
	MOVQ   s+0(FP), CX
	MOVQ   8(SP), R13
	CMPQ   R12, $0x01
	JBE    sequenceDecs_decodeSync_bmi2_adjust_offsetB_1_or_0
	MOVUPS 144(CX), X0
	MOVQ   R13, 144(CX)
	MOVUPS X0, 152(CX)
	JMP    sequenceDecs_decodeSync_bmi2_adjust_end

sequenceDecs_decodeSync_bmi2_adjust_offsetB_1_or_0:
	CMPQ 24(SP), $0x00000000
	JNE  sequenceDecs_decodeSync_bmi2_adjust_offset_maybezero
	INCQ R13
	JMP  sequenceDecs_decodeSync_bmi2_adjust_offset_nonzero

sequenceDecs_decodeSync_bmi2_adjust_offset_maybezero:
	TESTQ R13, R13
	JNZ   sequenceDecs_decodeSync_bmi2_adjust_offset_nonzero
	MOVQ  144(CX), R13
	JMP   sequenceDecs_decodeSync_bmi2_adjust_end

sequenceDecs_decodeSync_bmi2_adjust_offset_nonzero:
	// Branchless selection of prevOffset[0]-1 (when R13 == 3) or
	// prevOffset[R13] via CMOVQEQ; a zero result falls back to 1.
	MOVQ    R13, R12
	XORQ    R14, R14
	MOVQ    $-1, R15
	CMPQ    R13, $0x03
	CMOVQEQ R14, R12
	CMOVQEQ R15, R14
	LEAQ    144(CX), R15
	ADDQ    (R15)(R12*8), R14
	JNZ     sequenceDecs_decodeSync_bmi2_adjust_temp_valid
	MOVQ    $0x00000001, R14

sequenceDecs_decodeSync_bmi2_adjust_temp_valid:
	CMPQ R13, $0x01
	JZ   sequenceDecs_decodeSync_bmi2_adjust_skip
	MOVQ 152(CX), R12
	MOVQ R12, 160(CX)

sequenceDecs_decodeSync_bmi2_adjust_skip:
	MOVQ 144(CX), R12
	MOVQ R12, 152(CX)
	MOVQ R14, 144(CX)
	MOVQ R14, R13

sequenceDecs_decodeSync_bmi2_adjust_end:
	MOVQ R13, 8(SP)

	// Check values
	MOVQ  16(SP), CX
	MOVQ  24(SP), R12
	LEAQ  (CX)(R12*1), R14
	MOVQ  s+0(FP), R15
	ADDQ  R14, 256(R15)
	MOVQ  ctx+16(FP), R14
	SUBQ  R12, 104(R14)
	JS    error_not_enough_literals
	CMPQ  CX, $0x00020002
	JA    sequenceDecs_decodeSync_bmi2_error_match_len_too_big
	TESTQ R13, R13
	JNZ   sequenceDecs_decodeSync_bmi2_match_len_ofs_ok
	TESTQ CX, CX
	JNZ   sequenceDecs_decodeSync_bmi2_error_match_len_ofs_mismatch

sequenceDecs_decodeSync_bmi2_match_len_ofs_ok:
	MOVQ 24(SP), CX
	MOVQ 8(SP), R12
	MOVQ 16(SP), R13

	// Check if we have enough space in s.out
	LEAQ (CX)(R13*1), R14
	ADDQ R9, R14
	CMPQ R14, 32(SP)
	JA   error_not_enough_space

	// Copy literals
	// 16 bytes at a time; may overwrite up to 15 bytes past the literal
	// length, which the space check above accounts for.
	TESTQ CX, CX
	JZ    check_offset
	XORQ  R14, R14

copy_1:
	MOVUPS (R10)(R14*1), X0
	MOVUPS X0, (R9)(R14*1)
	ADDQ   $0x10, R14
	CMPQ   R14, CX
	JB     copy_1
	ADDQ   CX, R10
	ADDQ   CX, R9
	ADDQ   CX, R11

	// Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize
check_offset:
	MOVQ R11, CX
	ADDQ 40(SP), CX
	CMPQ R12, CX
	JG   error_match_off_too_big
	CMPQ R12, 56(SP)
	JG   error_match_off_too_big

	// Copy match from history
	MOVQ  R12, CX
	SUBQ  R11, CX
	JLS   copy_match
	MOVQ  48(SP), R14
	SUBQ  CX, R14
	CMPQ  R13, CX
	JGE   copy_all_from_history
	XORQ  CX, CX
	TESTQ $0x00000001, R13
	JZ    copy_4_word
	MOVB  (R14)(CX*1), R12
	MOVB  R12, (R9)(CX*1)
	ADDQ  $0x01, CX

copy_4_word:
	TESTQ $0x00000002, R13
	JZ    copy_4_dword
	MOVW  (R14)(CX*1), R12
	MOVW  R12, (R9)(CX*1)
	ADDQ  $0x02, CX

copy_4_dword:
	TESTQ $0x00000004, R13
	JZ    copy_4_qword
	MOVL  (R14)(CX*1), R12
	MOVL  R12, (R9)(CX*1)
	ADDQ  $0x04, CX

copy_4_qword:
	TESTQ $0x00000008, R13
	JZ    copy_4_test
	MOVQ  (R14)(CX*1), R12
	MOVQ  R12, (R9)(CX*1)
	ADDQ  $0x08, CX
	JMP   copy_4_test

copy_4:
	MOVUPS (R14)(CX*1), X0
	MOVUPS X0, (R9)(CX*1)
	ADDQ   $0x10, CX

copy_4_test:
	CMPQ CX, R13
	JB   copy_4
	ADDQ R13, R11
	ADDQ R13, R9
	JMP  handle_loop
	JMP loop_finished // NOTE(review): unreachable — preceded by an unconditional JMP (generator artifact)

copy_all_from_history:
	XORQ  R15, R15
	TESTQ $0x00000001, CX
	JZ    copy_5_word
	MOVB  (R14)(R15*1), BP
	MOVB  BP, (R9)(R15*1)
	ADDQ  $0x01, R15

copy_5_word:
	TESTQ $0x00000002, CX
	JZ    copy_5_dword
	MOVW  (R14)(R15*1), BP
	MOVW  BP, (R9)(R15*1)
	ADDQ  $0x02, R15

copy_5_dword:
	TESTQ $0x00000004, CX
	JZ    copy_5_qword
	MOVL  (R14)(R15*1), BP
	MOVL  BP, (R9)(R15*1)
	ADDQ  $0x04, R15

copy_5_qword:
	TESTQ $0x00000008, CX
	JZ    copy_5_test
	MOVQ  (R14)(R15*1), BP
	MOVQ  BP, (R9)(R15*1)
	ADDQ  $0x08, R15
	JMP   copy_5_test

copy_5:
	MOVUPS (R14)(R15*1), X0
	MOVUPS X0, (R9)(R15*1)
	ADDQ   $0x10, R15

copy_5_test:
	CMPQ R15, CX
	JB   copy_5
	ADDQ CX, R9
	ADDQ CX, R11
	SUBQ CX, R13

	// Copy match from the current buffer
copy_match:
	TESTQ R13, R13
	JZ    handle_loop
	MOVQ  R9, CX
	SUBQ  R12, CX

	// ml <= mo
	CMPQ R13, R12
	JA   copy_overlapping_match

	// Copy non-overlapping match
	ADDQ R13, R11
	MOVQ R9, R12
	ADDQ R13, R9

copy_2:
	MOVUPS (CX), X0
	MOVUPS X0, (R12)
	ADDQ   $0x10, CX
	ADDQ   $0x10, R12
	SUBQ   $0x10, R13
	JHI    copy_2
	JMP    handle_loop

	// Copy overlapping match
copy_overlapping_match:
	// Overlap (ml > mo) forces a byte-by-byte copy so repeated runs
	// self-replicate correctly.
	ADDQ R13, R11

copy_slow_3:
	MOVB (CX), R12
	MOVB R12, (R9)
	INCQ CX
	INCQ R9
	DECQ R13
	JNZ  copy_slow_3

handle_loop:
	MOVQ ctx+16(FP), CX
	DECQ 96(CX)
	JNS  sequenceDecs_decodeSync_bmi2_main_loop

loop_finished:
	MOVQ br+8(FP), CX
	MOVQ AX, 32(CX)
	MOVB DL, 40(CX)
	MOVQ BX, 24(CX)

	// Update the context
	MOVQ ctx+16(FP), AX
	MOVQ R11, 136(AX)
	MOVQ 144(AX), CX
	SUBQ CX, R10
	MOVQ R10, 168(AX)

	// Return success
	MOVQ $0x00000000, ret+24(FP)
	RET

	// Return with match length error
sequenceDecs_decodeSync_bmi2_error_match_len_ofs_mismatch:
	MOVQ 16(SP), AX
	MOVQ ctx+16(FP), CX
	MOVQ AX, 216(CX)
	MOVQ $0x00000001, ret+24(FP)
	RET

	// Return with match too long error
sequenceDecs_decodeSync_bmi2_error_match_len_too_big:
	MOVQ ctx+16(FP), AX
	MOVQ 16(SP), CX
	MOVQ CX, 216(AX)
	MOVQ $0x00000002, ret+24(FP)
	RET

	// Return with match offset too long error
error_match_off_too_big:
	MOVQ ctx+16(FP), AX
	MOVQ 8(SP), CX
	MOVQ CX, 224(AX)
	MOVQ R11, 136(AX)
	MOVQ $0x00000003, ret+24(FP)
	RET

	// Return with not enough literals error
error_not_enough_literals:
	MOVQ ctx+16(FP), AX
	MOVQ 24(SP), CX
	MOVQ CX, 208(AX)
	MOVQ $0x00000004, ret+24(FP)
	RET

	// Return with not enough output space error
error_not_enough_space:
	MOVQ ctx+16(FP), AX
	MOVQ 24(SP), CX
	MOVQ CX, 208(AX)
	MOVQ 16(SP), CX
	MOVQ CX, 216(AX)
	MOVQ R11, 136(AX)
	MOVQ $0x00000005, ret+24(FP)
	RET

// func sequenceDecs_decodeSync_safe_amd64(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
// Requires: CMOV, SSE
//
// "Safe" variant of sequenceDecs_decodeSync_amd64: literal and match copies
// use exact byte/word/dword/qword tail handling instead of 16-byte blocks
// that may over-write, so no bytes past the requested length are touched.
// Decoding logic, register layout and status codes are otherwise identical
// to the plain amd64 version.
//
// NOTE(review): register/stack roles, as established by the loads below
// (struct field names inferred from offsets — confirm against the Go side):
//   DX     = bit buffer (br+32), BX = bits consumed (br+40), SI = bytes left (br+24)
//   (SP)   = read pointer into the compressed stream
//   DI/R8/R9 = ll/ml/of decoding states (ctx+72/ctx+80/ctx+88)
//   R10    = current output write pointer, R11 = literals pointer, R12 = output position
//   8(SP)  = seq.mo, 16(SP) = seq.ml, 24(SP) = seq.ll
//   32(SP) = past-end pointer of s.out, 40(SP) = history length,
//   48(SP) = past-end pointer of the history buffer, 56(SP) = window size
TEXT ·sequenceDecs_decodeSync_safe_amd64(SB), $64-32
	MOVQ    br+8(FP), AX
	MOVQ    32(AX), DX
	MOVBQZX 40(AX), BX
	MOVQ    24(AX), SI
	MOVQ    (AX), AX
	ADDQ    SI, AX
	MOVQ    AX, (SP)
	MOVQ    ctx+16(FP), AX
	MOVQ    72(AX), DI
	MOVQ    80(AX), R8
	MOVQ    88(AX), R9
	MOVQ    112(AX), R10
	MOVQ    128(AX), CX
	MOVQ    CX, 32(SP)
	MOVQ    144(AX), R11
	MOVQ    136(AX), R12
	MOVQ    200(AX), CX
	MOVQ    CX, 56(SP)
	MOVQ    176(AX), CX
	MOVQ    CX, 48(SP)
	MOVQ    184(AX), AX
	MOVQ    AX, 40(SP)
	MOVQ    40(SP), AX
	ADDQ    AX, 48(SP)

	// Calculate pointer to s.out[cap(s.out)] (a past-end pointer)
	ADDQ R10, 32(SP)

	// outBase += outPosition
	ADDQ R12, R10

sequenceDecs_decodeSync_safe_amd64_main_loop:
	MOVQ (SP), R13

	// Fill bitreader to have enough for the offset and match length.
	CMPQ SI, $0x08
	JL   sequenceDecs_decodeSync_safe_amd64_fill_byte_by_byte
	MOVQ BX, AX
	SHRQ $0x03, AX
	SUBQ AX, R13
	MOVQ (R13), DX
	SUBQ AX, SI
	ANDQ $0x07, BX
	JMP  sequenceDecs_decodeSync_safe_amd64_fill_end

sequenceDecs_decodeSync_safe_amd64_fill_byte_by_byte:
	// Slow path near the start of the stream: refill one byte at a time
	// while at least 8 bits have been consumed and input remains.
	CMPQ    SI, $0x00
	JLE     sequenceDecs_decodeSync_safe_amd64_fill_end
	CMPQ    BX, $0x07
	JLE     sequenceDecs_decodeSync_safe_amd64_fill_end
	SHLQ    $0x08, DX
	SUBQ    $0x01, R13
	SUBQ    $0x01, SI
	SUBQ    $0x08, BX
	MOVBQZX (R13), AX
	ORQ     AX, DX
	JMP     sequenceDecs_decodeSync_safe_amd64_fill_byte_by_byte

sequenceDecs_decodeSync_safe_amd64_fill_end:
	// Update offset
	// Extract state.addBits (AH) bits from the bit buffer and add them to
	// the state's baseline (upper 32 bits of R9); the TESTQ/CMOVQEQ pair
	// forces zero when the bit count is zero.
	MOVQ    R9, AX
	MOVQ    BX, CX
	MOVQ    DX, R14
	SHLQ    CL, R14
	MOVB    AH, CL
	ADDQ    CX, BX
	NEGL    CX
	SHRQ    CL, R14
	SHRQ    $0x20, AX
	TESTQ   CX, CX
	CMOVQEQ CX, R14
	ADDQ    R14, AX
	MOVQ    AX, 8(SP)

	// Update match length
	MOVQ    R8, AX
	MOVQ    BX, CX
	MOVQ    DX, R14
	SHLQ    CL, R14
	MOVB    AH, CL
	ADDQ    CX, BX
	NEGL    CX
	SHRQ    CL, R14
	SHRQ    $0x20, AX
	TESTQ   CX, CX
	CMOVQEQ CX, R14
	ADDQ    R14, AX
	MOVQ    AX, 16(SP)

	// Fill bitreader to have enough for the remaining
	CMPQ SI, $0x08
	JL   sequenceDecs_decodeSync_safe_amd64_fill_2_byte_by_byte
	MOVQ BX, AX
	SHRQ $0x03, AX
	SUBQ AX, R13
	MOVQ (R13), DX
	SUBQ AX, SI
	ANDQ $0x07, BX
	JMP  sequenceDecs_decodeSync_safe_amd64_fill_2_end

sequenceDecs_decodeSync_safe_amd64_fill_2_byte_by_byte:
	CMPQ    SI, $0x00
	JLE     sequenceDecs_decodeSync_safe_amd64_fill_2_end
	CMPQ    BX, $0x07
	JLE     sequenceDecs_decodeSync_safe_amd64_fill_2_end
	SHLQ    $0x08, DX
	SUBQ    $0x01, R13
	SUBQ    $0x01, SI
	SUBQ    $0x08, BX
	MOVBQZX (R13), AX
	ORQ     AX, DX
	JMP     sequenceDecs_decodeSync_safe_amd64_fill_2_byte_by_byte

sequenceDecs_decodeSync_safe_amd64_fill_2_end:
	// Update literal length
	MOVQ    DI, AX
	MOVQ    BX, CX
	MOVQ    DX, R14
	SHLQ    CL, R14
	MOVB    AH, CL
	ADDQ    CX, BX
	NEGL    CX
	SHRQ    CL, R14
	SHRQ    $0x20, AX
	TESTQ   CX, CX
	CMOVQEQ CX, R14
	ADDQ    R14, AX
	MOVQ    AX, 24(SP)

	// Fill bitreader for state updates
	MOVQ    R13, (SP)
	MOVQ    R9, AX
	SHRQ    $0x08, AX
	MOVBQZX AL, AX
	MOVQ    ctx+16(FP), CX
	CMPQ    96(CX), $0x00
	JZ      sequenceDecs_decodeSync_safe_amd64_skip_update

	// Update Literal Length State
	MOVBQZX DI, R13
	SHRQ    $0x10, DI
	MOVWQZX DI, DI
	CMPQ    R13, $0x00
	JZ      sequenceDecs_decodeSync_safe_amd64_llState_updateState_skip_zero
	MOVQ    BX, CX
	ADDQ    R13, BX
	MOVQ    DX, R14
	SHLQ    CL, R14
	MOVQ    R13, CX
	NEGQ    CX
	SHRQ    CL, R14
	ADDQ    R14, DI

sequenceDecs_decodeSync_safe_amd64_llState_updateState_skip_zero:
	// Load ctx.llTable
	MOVQ ctx+16(FP), CX
	MOVQ (CX), CX
	MOVQ (CX)(DI*8), DI

	// Update Match Length State
	MOVBQZX R8, R13
	SHRQ    $0x10, R8
	MOVWQZX R8, R8
	CMPQ    R13, $0x00
	JZ      sequenceDecs_decodeSync_safe_amd64_mlState_updateState_skip_zero
	MOVQ    BX, CX
	ADDQ    R13, BX
	MOVQ    DX, R14
	SHLQ    CL, R14
	MOVQ    R13, CX
	NEGQ    CX
	SHRQ    CL, R14
	ADDQ    R14, R8

sequenceDecs_decodeSync_safe_amd64_mlState_updateState_skip_zero:
	// Load ctx.mlTable
	MOVQ ctx+16(FP), CX
	MOVQ 24(CX), CX
	MOVQ (CX)(R8*8), R8

	// Update Offset State
	MOVBQZX R9, R13
	SHRQ    $0x10, R9
	MOVWQZX R9, R9
	CMPQ    R13, $0x00
	JZ      sequenceDecs_decodeSync_safe_amd64_ofState_updateState_skip_zero
	MOVQ    BX, CX
	ADDQ    R13, BX
	MOVQ    DX, R14
	SHLQ    CL, R14
	MOVQ    R13, CX
	NEGQ    CX
	SHRQ    CL, R14
	ADDQ    R14, R9

sequenceDecs_decodeSync_safe_amd64_ofState_updateState_skip_zero:
	// Load ctx.ofTable
	MOVQ ctx+16(FP), CX
	MOVQ 48(CX), CX
	MOVQ (CX)(R9*8), R9

sequenceDecs_decodeSync_safe_amd64_skip_update:
	// Adjust offset
	// AX holds the offset-code bit count. For offsetB > 1 the raw value is
	// the new offset and the repeat-offset history (s+144..s+160) is shifted
	// down; otherwise the zstd repeat-offset rules below apply.
	MOVQ   s+0(FP), CX
	MOVQ   8(SP), R13
	CMPQ   AX, $0x01
	JBE    sequenceDecs_decodeSync_safe_amd64_adjust_offsetB_1_or_0
	MOVUPS 144(CX), X0
	MOVQ   R13, 144(CX)
	MOVUPS X0, 152(CX)
	JMP    sequenceDecs_decodeSync_safe_amd64_adjust_end

sequenceDecs_decodeSync_safe_amd64_adjust_offsetB_1_or_0:
	CMPQ 24(SP), $0x00000000
	JNE  sequenceDecs_decodeSync_safe_amd64_adjust_offset_maybezero
	INCQ R13
	JMP  sequenceDecs_decodeSync_safe_amd64_adjust_offset_nonzero

sequenceDecs_decodeSync_safe_amd64_adjust_offset_maybezero:
	TESTQ R13, R13
	JNZ   sequenceDecs_decodeSync_safe_amd64_adjust_offset_nonzero
	MOVQ  144(CX), R13
	JMP   sequenceDecs_decodeSync_safe_amd64_adjust_end

sequenceDecs_decodeSync_safe_amd64_adjust_offset_nonzero:
	// Branchless selection of prevOffset[0]-1 (when R13 == 3) or
	// prevOffset[R13] via CMOVQEQ; a zero result falls back to 1.
	MOVQ    R13, AX
	XORQ    R14, R14
	MOVQ    $-1, R15
	CMPQ    R13, $0x03
	CMOVQEQ R14, AX
	CMOVQEQ R15, R14
	LEAQ    144(CX), R15
	ADDQ    (R15)(AX*8), R14
	JNZ     sequenceDecs_decodeSync_safe_amd64_adjust_temp_valid
	MOVQ    $0x00000001, R14

sequenceDecs_decodeSync_safe_amd64_adjust_temp_valid:
	CMPQ R13, $0x01
	JZ   sequenceDecs_decodeSync_safe_amd64_adjust_skip
	MOVQ 152(CX), AX
	MOVQ AX, 160(CX)

sequenceDecs_decodeSync_safe_amd64_adjust_skip:
	MOVQ 144(CX), AX
	MOVQ AX, 152(CX)
	MOVQ R14, 144(CX)
	MOVQ R14, R13

sequenceDecs_decodeSync_safe_amd64_adjust_end:
	MOVQ R13, 8(SP)

	// Check values
	MOVQ  16(SP), AX
	MOVQ  24(SP), CX
	LEAQ  (AX)(CX*1), R14
	MOVQ  s+0(FP), R15
	ADDQ  R14, 256(R15)
	MOVQ  ctx+16(FP), R14
	SUBQ  CX, 104(R14)
	JS    error_not_enough_literals
	CMPQ  AX, $0x00020002
	JA    sequenceDecs_decodeSync_safe_amd64_error_match_len_too_big
	TESTQ R13, R13
	JNZ   sequenceDecs_decodeSync_safe_amd64_match_len_ofs_ok
	TESTQ AX, AX
	JNZ   sequenceDecs_decodeSync_safe_amd64_error_match_len_ofs_mismatch

sequenceDecs_decodeSync_safe_amd64_match_len_ofs_ok:
	MOVQ 24(SP), AX
	MOVQ 8(SP), CX
	MOVQ 16(SP), R13

	// Check if we have enough space in s.out
	LEAQ (AX)(R13*1), R14
	ADDQ R10, R14
	CMPQ R14, 32(SP)
	JA   error_not_enough_space

	// Copy literals
	// Exact-length copy: the 1/2/4/8-byte tail is handled first, then the
	// remainder in 16-byte blocks — never writes past the literal length.
	TESTQ AX, AX
	JZ    check_offset
	XORQ  R14, R14
	TESTQ $0x00000001, AX
	JZ    copy_1_word
	MOVB  (R11)(R14*1), R15
	MOVB  R15, (R10)(R14*1)
	ADDQ  $0x01, R14

copy_1_word:
	TESTQ $0x00000002, AX
	JZ    copy_1_dword
	MOVW  (R11)(R14*1), R15
	MOVW  R15, (R10)(R14*1)
	ADDQ  $0x02, R14

copy_1_dword:
	TESTQ $0x00000004, AX
	JZ    copy_1_qword
	MOVL  (R11)(R14*1), R15
	MOVL  R15, (R10)(R14*1)
	ADDQ  $0x04, R14

copy_1_qword:
	TESTQ $0x00000008, AX
	JZ    copy_1_test
	MOVQ  (R11)(R14*1), R15
	MOVQ  R15, (R10)(R14*1)
	ADDQ  $0x08, R14
	JMP   copy_1_test

copy_1:
	MOVUPS (R11)(R14*1), X0
	MOVUPS X0, (R10)(R14*1)
	ADDQ   $0x10, R14

copy_1_test:
	CMPQ R14, AX
	JB   copy_1
	ADDQ AX, R11
	ADDQ AX, R10
	ADDQ AX, R12

	// Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize
check_offset:
	MOVQ R12, AX
	ADDQ 40(SP), AX
	CMPQ CX, AX
	JG   error_match_off_too_big
	CMPQ CX, 56(SP)
	JG   error_match_off_too_big

	// Copy match from history
	MOVQ  CX, AX
	SUBQ  R12, AX
	JLS   copy_match
	MOVQ  48(SP), R14
	SUBQ  AX, R14
	CMPQ  R13, AX
	JGE   copy_all_from_history
	XORQ  AX, AX
	TESTQ $0x00000001, R13
	JZ    copy_4_word
	MOVB  (R14)(AX*1), CL
	MOVB  CL, (R10)(AX*1)
	ADDQ  $0x01, AX

copy_4_word:
	TESTQ $0x00000002, R13
	JZ    copy_4_dword
	MOVW  (R14)(AX*1), CX
	MOVW  CX, (R10)(AX*1)
	ADDQ  $0x02, AX

copy_4_dword:
	TESTQ $0x00000004, R13
	JZ    copy_4_qword
	MOVL  (R14)(AX*1), CX
	MOVL  CX, (R10)(AX*1)
	ADDQ  $0x04, AX

copy_4_qword:
	TESTQ $0x00000008, R13
	JZ    copy_4_test
	MOVQ  (R14)(AX*1), CX
	MOVQ  CX, (R10)(AX*1)
	ADDQ  $0x08, AX
	JMP   copy_4_test

copy_4:
	MOVUPS (R14)(AX*1), X0
	MOVUPS X0, (R10)(AX*1)
	ADDQ   $0x10, AX

copy_4_test:
	CMPQ AX, R13
	JB   copy_4
	ADDQ R13, R12
	ADDQ R13, R10
	JMP  handle_loop
	JMP loop_finished // NOTE(review): unreachable — preceded by an unconditional JMP (generator artifact)

copy_all_from_history:
	XORQ  R15, R15
	TESTQ $0x00000001, AX
	JZ    copy_5_word
	MOVB  (R14)(R15*1), BP
	MOVB  BP, (R10)(R15*1)
	ADDQ  $0x01, R15

copy_5_word:
	TESTQ $0x00000002, AX
	JZ    copy_5_dword
	MOVW  (R14)(R15*1), BP
	MOVW  BP, (R10)(R15*1)
	ADDQ  $0x02, R15

copy_5_dword:
	TESTQ $0x00000004, AX
	JZ    copy_5_qword
	MOVL  (R14)(R15*1), BP
	MOVL  BP, (R10)(R15*1)
	ADDQ  $0x04, R15

copy_5_qword:
	TESTQ $0x00000008, AX
	JZ    copy_5_test
	MOVQ  (R14)(R15*1), BP
	MOVQ  BP, (R10)(R15*1)
	ADDQ  $0x08, R15
	JMP   copy_5_test

copy_5:
	MOVUPS (R14)(R15*1), X0
	MOVUPS X0, (R10)(R15*1)
	ADDQ   $0x10, R15

copy_5_test:
	CMPQ R15, AX
	JB   copy_5
	ADDQ AX, R10
	ADDQ AX, R12
	SUBQ AX, R13

	// Copy match from the current buffer
copy_match:
	TESTQ R13, R13
	JZ    handle_loop
	MOVQ  R10, AX
	SUBQ  CX, AX

	// ml <= mo
	CMPQ R13, CX
	JA   copy_overlapping_match

	// Copy non-overlapping match
	// Exact-length copy, same tail-first scheme as the literal copy above.
	ADDQ  R13, R12
	XORQ  CX, CX
	TESTQ $0x00000001, R13
	JZ    copy_2_word
	MOVB  (AX)(CX*1), R14
	MOVB  R14, (R10)(CX*1)
	ADDQ  $0x01, CX

copy_2_word:
	TESTQ $0x00000002, R13
	JZ    copy_2_dword
	MOVW  (AX)(CX*1), R14
	MOVW  R14, (R10)(CX*1)
	ADDQ  $0x02, CX

copy_2_dword:
	TESTQ $0x00000004, R13
	JZ    copy_2_qword
	MOVL  (AX)(CX*1), R14
	MOVL  R14, (R10)(CX*1)
	ADDQ  $0x04, CX

copy_2_qword:
	TESTQ $0x00000008, R13
	JZ    copy_2_test
	MOVQ  (AX)(CX*1), R14
	MOVQ  R14, (R10)(CX*1)
	ADDQ  $0x08, CX
	JMP   copy_2_test

copy_2:
	MOVUPS (AX)(CX*1), X0
	MOVUPS X0, (R10)(CX*1)
	ADDQ   $0x10, CX

copy_2_test:
	CMPQ CX, R13
	JB   copy_2
	ADDQ R13, R10
	JMP  handle_loop

	// Copy overlapping match
copy_overlapping_match:
	// Overlap (ml > mo) forces a byte-by-byte copy so repeated runs
	// self-replicate correctly.
	ADDQ R13, R12

copy_slow_3:
	MOVB (AX), CL
	MOVB CL, (R10)
	INCQ AX
	INCQ R10
	DECQ R13
	JNZ  copy_slow_3

handle_loop:
	MOVQ ctx+16(FP), AX
	DECQ 96(AX)
	JNS  sequenceDecs_decodeSync_safe_amd64_main_loop

loop_finished:
	MOVQ br+8(FP), AX
	MOVQ DX, 32(AX)
	MOVB BL, 40(AX)
	MOVQ SI, 24(AX)

	// Update the context
	MOVQ ctx+16(FP), AX
	MOVQ R12, 136(AX)
	MOVQ 144(AX), CX
	SUBQ CX, R11
	MOVQ R11, 168(AX)

	// Return success
	MOVQ $0x00000000, ret+24(FP)
	RET

	// Return with match length error
sequenceDecs_decodeSync_safe_amd64_error_match_len_ofs_mismatch:
	MOVQ 16(SP), AX
	MOVQ ctx+16(FP), CX
	MOVQ AX, 216(CX)
	MOVQ $0x00000001, ret+24(FP)
	RET

	// Return with match too long error
sequenceDecs_decodeSync_safe_amd64_error_match_len_too_big:
	MOVQ ctx+16(FP), AX
	MOVQ 16(SP), CX
	MOVQ CX, 216(AX)
	MOVQ $0x00000002, ret+24(FP)
	RET

	// Return with match offset too long error
error_match_off_too_big:
	MOVQ ctx+16(FP), AX
	MOVQ 8(SP), CX
	MOVQ CX, 224(AX)
	MOVQ R12, 136(AX)
	MOVQ $0x00000003, ret+24(FP)
	RET

	// Return with not enough literals error
error_not_enough_literals:
	MOVQ ctx+16(FP), AX
	MOVQ 24(SP), CX
	MOVQ CX, 208(AX)
	MOVQ $0x00000004, ret+24(FP)
	RET

	// Return with not enough output space error
error_not_enough_space:
	MOVQ ctx+16(FP), AX
	MOVQ 24(SP), CX
	MOVQ CX, 208(AX)
	MOVQ 16(SP), CX
	MOVQ CX, 216(AX)
	MOVQ R12, 136(AX)
	MOVQ $0x00000005, ret+24(FP)
	RET

// func sequenceDecs_decodeSync_safe_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
// Requires: BMI, BMI2, CMOV, SSE
//
// "safe" variant: copies are decomposed into byte/word/dword/qword pieces so
// they never write past the requested length, unlike the fast variant that
// over-copies in 16-byte chunks.
//
// Register / stack-slot assignments established by the prologue below:
//   AX  = br.value        DX = br.bitsRead     BX = br.off (bytes left)
//   SI  = llState         DI = mlState         R8 = ofState
//   R9  = output write pointer (outBase + outPosition)
//   R10 = literals read pointer                R11 = outPosition
//   (SP)   = input read pointer (br.in + br.off)
//   8(SP)  = seq offset (mo)    16(SP) = match length (ml)    24(SP) = literal length (ll)
//   32(SP) = past-end-of-output pointer        40(SP) = len(history)
//   48(SP) = past-end-of-history pointer       56(SP) = s.windowSize
// (slot meanings inferred from the loads/uses below — generated code)
TEXT ·sequenceDecs_decodeSync_safe_bmi2(SB), $64-32
	MOVQ    br+8(FP), CX
	MOVQ    32(CX), AX
	MOVBQZX 40(CX), DX
	MOVQ    24(CX), BX
	MOVQ    (CX), CX
	ADDQ    BX, CX
	MOVQ    CX, (SP)
	MOVQ    ctx+16(FP), CX
	MOVQ    72(CX), SI
	MOVQ    80(CX), DI
	MOVQ    88(CX), R8
	MOVQ    112(CX), R9
	MOVQ    128(CX), R10
	MOVQ    R10, 32(SP)
	MOVQ    144(CX), R10
	MOVQ    136(CX), R11
	MOVQ    200(CX), R12
	MOVQ    R12, 56(SP)
	MOVQ    176(CX), R12
	MOVQ    R12, 48(SP)
	MOVQ    184(CX), CX
	MOVQ    CX, 40(SP)
	MOVQ    40(SP), CX
	ADDQ    CX, 48(SP)

	// Calculate pointer to s.out[cap(s.out)] (a past-end pointer)
	ADDQ R9, 32(SP)

	// outBase += outPosition
	ADDQ R11, R9

sequenceDecs_decodeSync_safe_bmi2_main_loop:
	MOVQ (SP), R12

	// Fill bitreader to have enough for the offset and match length.
	// Fast path: at least 8 input bytes remain, so reload a full qword
	// aligned to the consumed whole bytes; otherwise fall back to bytes.
	CMPQ BX, $0x08
	JL   sequenceDecs_decodeSync_safe_bmi2_fill_byte_by_byte
	MOVQ DX, CX
	SHRQ $0x03, CX
	SUBQ CX, R12
	MOVQ (R12), AX
	SUBQ CX, BX
	ANDQ $0x07, DX
	JMP  sequenceDecs_decodeSync_safe_bmi2_fill_end

sequenceDecs_decodeSync_safe_bmi2_fill_byte_by_byte:
	// Pull one byte at a time while input remains and >= 8 bits are consumed.
	CMPQ    BX, $0x00
	JLE     sequenceDecs_decodeSync_safe_bmi2_fill_end
	CMPQ    DX, $0x07
	JLE     sequenceDecs_decodeSync_safe_bmi2_fill_end
	SHLQ    $0x08, AX
	SUBQ    $0x01, R12
	SUBQ    $0x01, BX
	SUBQ    $0x08, DX
	MOVBQZX (R12), CX
	ORQ     CX, AX
	JMP     sequenceDecs_decodeSync_safe_bmi2_fill_byte_by_byte

sequenceDecs_decodeSync_safe_bmi2_fill_end:
	// Update offset
	// BEXTR extracts bits 8..15 of the state = number of extra bits to read.
	// ROLQ+BZHI pull those bits from br.value; the high 32 bits of the state
	// hold the baseline which the extra bits are added to.
	MOVQ   $0x00000808, CX
	BEXTRQ CX, R8, R13
	MOVQ   AX, R14
	LEAQ   (DX)(R13*1), CX
	ROLQ   CL, R14
	BZHIQ  R13, R14, R14
	MOVQ   CX, DX
	MOVQ   R8, CX
	SHRQ   $0x20, CX
	ADDQ   R14, CX
	MOVQ   CX, 8(SP)

	// Update match length
	// Same extraction pattern as above, using the ML state (DI).
	MOVQ   $0x00000808, CX
	BEXTRQ CX, DI, R13
	MOVQ   AX, R14
	LEAQ   (DX)(R13*1), CX
	ROLQ   CL, R14
	BZHIQ  R13, R14, R14
	MOVQ   CX, DX
	MOVQ   DI, CX
	SHRQ   $0x20, CX
	ADDQ   R14, CX
	MOVQ   CX, 16(SP)

	// Fill bitreader to have enough for the remaining
	CMPQ BX, $0x08
	JL   sequenceDecs_decodeSync_safe_bmi2_fill_2_byte_by_byte
	MOVQ DX, CX
	SHRQ $0x03, CX
	SUBQ CX, R12
	MOVQ (R12), AX
	SUBQ CX, BX
	ANDQ $0x07, DX
	JMP  sequenceDecs_decodeSync_safe_bmi2_fill_2_end

sequenceDecs_decodeSync_safe_bmi2_fill_2_byte_by_byte:
	CMPQ    BX, $0x00
	JLE     sequenceDecs_decodeSync_safe_bmi2_fill_2_end
	CMPQ    DX, $0x07
	JLE     sequenceDecs_decodeSync_safe_bmi2_fill_2_end
	SHLQ    $0x08, AX
	SUBQ    $0x01, R12
	SUBQ    $0x01, BX
	SUBQ    $0x08, DX
	MOVBQZX (R12), CX
	ORQ     CX, AX
	JMP     sequenceDecs_decodeSync_safe_bmi2_fill_2_byte_by_byte

sequenceDecs_decodeSync_safe_bmi2_fill_2_end:
	// Update literal length
	MOVQ   $0x00000808, CX
	BEXTRQ CX, SI, R13
	MOVQ   AX, R14
	LEAQ   (DX)(R13*1), CX
	ROLQ   CL, R14
	BZHIQ  R13, R14, R14
	MOVQ   CX, DX
	MOVQ   SI, CX
	SHRQ   $0x20, CX
	ADDQ   R14, CX
	MOVQ   CX, 24(SP)

	// Fill bitreader for state updates
	// R12 keeps the OF state's extra-bit count for the offset-adjust step.
	// If this is the last sequence (counter at 96(ctx) == 0), the FSE state
	// transitions are skipped entirely.
	MOVQ    R12, (SP)
	MOVQ    $0x00000808, CX
	BEXTRQ  CX, R8, R12
	MOVQ    ctx+16(FP), CX
	CMPQ    96(CX), $0x00
	JZ      sequenceDecs_decodeSync_safe_bmi2_skip_update
	// Total bits needed = sum of the low bytes (per-state bit counts) of
	// the three states; read them from the stream in one go.
	LEAQ    (SI)(DI*1), R13
	ADDQ    R8, R13
	MOVBQZX R13, R13
	LEAQ    (DX)(R13*1), CX
	MOVQ    AX, R14
	MOVQ    CX, DX
	ROLQ    CL, R14
	BZHIQ   R13, R14, R14

	// Update Offset State
	// BZHI takes this state's share of the read bits; BEXTR $0x1010 pulls
	// bits 16..31 of the state = next-state base; their sum indexes the table.
	BZHIQ  R8, R14, CX
	SHRXQ  R8, R14, R14
	MOVQ   $0x00001010, R13
	BEXTRQ R13, R8, R8
	ADDQ   CX, R8

	// Load ctx.ofTable
	MOVQ ctx+16(FP), CX
	MOVQ 48(CX), CX
	MOVQ (CX)(R8*8), R8

	// Update Match Length State
	BZHIQ  DI, R14, CX
	SHRXQ  DI, R14, R14
	MOVQ   $0x00001010, R13
	BEXTRQ R13, DI, DI
	ADDQ   CX, DI

	// Load ctx.mlTable
	MOVQ ctx+16(FP), CX
	MOVQ 24(CX), CX
	MOVQ (CX)(DI*8), DI

	// Update Literal Length State
	BZHIQ  SI, R14, CX
	MOVQ   $0x00001010, R13
	BEXTRQ R13, SI, SI
	ADDQ   CX, SI

	// Load ctx.llTable
	MOVQ ctx+16(FP), CX
	MOVQ (CX), CX
	MOVQ (CX)(SI*8), SI

sequenceDecs_decodeSync_safe_bmi2_skip_update:
	// Adjust offset
	// zstd repeat-offset handling: if the offset code used more than one
	// extra bit, it is a literal offset — shift prevOffset[0..1] down and
	// store the new one. Otherwise resolve against the repeat-offset history.
	MOVQ   s+0(FP), CX
	MOVQ   8(SP), R13
	CMPQ   R12, $0x01
	JBE    sequenceDecs_decodeSync_safe_bmi2_adjust_offsetB_1_or_0
	MOVUPS 144(CX), X0
	MOVQ   R13, 144(CX)
	MOVUPS X0, 152(CX)
	JMP    sequenceDecs_decodeSync_safe_bmi2_adjust_end

sequenceDecs_decodeSync_safe_bmi2_adjust_offsetB_1_or_0:
	// With literal length == 0 the repeat-offset index is shifted by one.
	CMPQ 24(SP), $0x00000000
	JNE  sequenceDecs_decodeSync_safe_bmi2_adjust_offset_maybezero
	INCQ R13
	JMP  sequenceDecs_decodeSync_safe_bmi2_adjust_offset_nonzero

sequenceDecs_decodeSync_safe_bmi2_adjust_offset_maybezero:
	// Offset value 0 means "repeat offset 1" unchanged.
	TESTQ R13, R13
	JNZ   sequenceDecs_decodeSync_safe_bmi2_adjust_offset_nonzero
	MOVQ  144(CX), R13
	JMP   sequenceDecs_decodeSync_safe_bmi2_adjust_end

sequenceDecs_decodeSync_safe_bmi2_adjust_offset_nonzero:
	// Branchless select: index 3 means prevOffset[0]-1 (CMOVs pick slot 0
	// and a -1 adjustment); a zero result is clamped to 1 below.
	MOVQ    R13, R12
	XORQ    R14, R14
	MOVQ    $-1, R15
	CMPQ    R13, $0x03
	CMOVQEQ R14, R12
	CMOVQEQ R15, R14
	LEAQ    144(CX), R15
	ADDQ    (R15)(R12*8), R14
	JNZ     sequenceDecs_decodeSync_safe_bmi2_adjust_temp_valid
	MOVQ    $0x00000001, R14

sequenceDecs_decodeSync_safe_bmi2_adjust_temp_valid:
	// Rotate the repeat-offset history: for index != 1, prevOffset[2] gets
	// prevOffset[1]; then prevOffset[1] = prevOffset[0], prevOffset[0] = new.
	CMPQ R13, $0x01
	JZ   sequenceDecs_decodeSync_safe_bmi2_adjust_skip
	MOVQ 152(CX), R12
	MOVQ R12, 160(CX)

sequenceDecs_decodeSync_safe_bmi2_adjust_skip:
	MOVQ 144(CX), R12
	MOVQ R12, 152(CX)
	MOVQ R14, 144(CX)
	MOVQ R14, R13

sequenceDecs_decodeSync_safe_bmi2_adjust_end:
	MOVQ R13, 8(SP)

	// Check values
	// Accumulate ll+ml into the sequence-size total at 256(s); subtract ll
	// from the remaining-literals counter; cap ml at 0x20002 (maxMatchLen);
	// ml != 0 with mo == 0 is a malformed sequence.
	MOVQ  16(SP), CX
	MOVQ  24(SP), R12
	LEAQ  (CX)(R12*1), R14
	MOVQ  s+0(FP), R15
	ADDQ  R14, 256(R15)
	MOVQ  ctx+16(FP), R14
	SUBQ  R12, 104(R14)
	JS    error_not_enough_literals
	CMPQ  CX, $0x00020002
	JA    sequenceDecs_decodeSync_safe_bmi2_error_match_len_too_big
	TESTQ R13, R13
	JNZ   sequenceDecs_decodeSync_safe_bmi2_match_len_ofs_ok
	TESTQ CX, CX
	JNZ   sequenceDecs_decodeSync_safe_bmi2_error_match_len_ofs_mismatch

sequenceDecs_decodeSync_safe_bmi2_match_len_ofs_ok:
	// CX = ll, R12 = mo, R13 = ml for the copy phase below.
	MOVQ 24(SP), CX
	MOVQ 8(SP), R12
	MOVQ 16(SP), R13

	// Check if we have enough space in s.out
	LEAQ (CX)(R13*1), R14
	ADDQ R9, R14
	CMPQ R14, 32(SP)
	JA   error_not_enough_space

	// Copy literals
	// Length-safe copy of CX bytes from the literals buffer (R10) to the
	// output (R9): 1/2/4/8-byte pieces for the low bits, then 16-byte chunks.
	TESTQ CX, CX
	JZ    check_offset
	XORQ  R14, R14
	TESTQ $0x00000001, CX
	JZ    copy_1_word
	MOVB  (R10)(R14*1), R15
	MOVB  R15, (R9)(R14*1)
	ADDQ  $0x01, R14

copy_1_word:
	TESTQ $0x00000002, CX
	JZ    copy_1_dword
	MOVW  (R10)(R14*1), R15
	MOVW  R15, (R9)(R14*1)
	ADDQ  $0x02, R14

copy_1_dword:
	TESTQ $0x00000004, CX
	JZ    copy_1_qword
	MOVL  (R10)(R14*1), R15
	MOVL  R15, (R9)(R14*1)
	ADDQ  $0x04, R14

copy_1_qword:
	TESTQ $0x00000008, CX
	JZ    copy_1_test
	MOVQ  (R10)(R14*1), R15
	MOVQ  R15, (R9)(R14*1)
	ADDQ  $0x08, R14
	JMP   copy_1_test

copy_1:
	MOVUPS (R10)(R14*1), X0
	MOVUPS X0, (R9)(R14*1)
	ADDQ   $0x10, R14

copy_1_test:
	CMPQ R14, CX
	JB   copy_1
	ADDQ CX, R10
	ADDQ CX, R9
	ADDQ CX, R11

	// Malformed input if (seq.mo > t+len(hist) || seq.mo > s.windowSize)
check_offset:
	MOVQ R11, CX
	ADDQ 40(SP), CX
	CMPQ R12, CX
	JG   error_match_off_too_big
	CMPQ R12, 56(SP)
	JG   error_match_off_too_big

	// Copy match from history
	// CX = mo - outPosition: bytes that must come from the history buffer.
	// If <= 0 the whole match lies in the current output (copy_match).
	MOVQ  R12, CX
	SUBQ  R11, CX
	JLS   copy_match
	MOVQ  48(SP), R14
	SUBQ  CX, R14
	CMPQ  R13, CX
	JGE   copy_all_from_history
	// Match fits entirely inside history: copy ml bytes and finish.
	XORQ  CX, CX
	TESTQ $0x00000001, R13
	JZ    copy_4_word
	MOVB  (R14)(CX*1), R12
	MOVB  R12, (R9)(CX*1)
	ADDQ  $0x01, CX

copy_4_word:
	TESTQ $0x00000002, R13
	JZ    copy_4_dword
	MOVW  (R14)(CX*1), R12
	MOVW  R12, (R9)(CX*1)
	ADDQ  $0x02, CX

copy_4_dword:
	TESTQ $0x00000004, R13
	JZ    copy_4_qword
	MOVL  (R14)(CX*1), R12
	MOVL  R12, (R9)(CX*1)
	ADDQ  $0x04, CX

copy_4_qword:
	TESTQ $0x00000008, R13
	JZ    copy_4_test
	MOVQ  (R14)(CX*1), R12
	MOVQ  R12, (R9)(CX*1)
	ADDQ  $0x08, CX
	JMP   copy_4_test

copy_4:
	MOVUPS (R14)(CX*1), X0
	MOVUPS X0, (R9)(CX*1)
	ADDQ   $0x10, CX

copy_4_test:
	CMPQ CX, R13
	JB   copy_4
	ADDQ R13, R11
	ADDQ R13, R9
	JMP  handle_loop
	// NOTE(review): the JMP below is unreachable (the JMP above always
	// transfers control); artifact of the code generator.
	JMP loop_finished

copy_all_from_history:
	// Copy the CX history bytes, then fall through to copy the remaining
	// ml-CX bytes from the current output buffer.
	XORQ  R15, R15
	TESTQ $0x00000001, CX
	JZ    copy_5_word
	MOVB  (R14)(R15*1), BP
	MOVB  BP, (R9)(R15*1)
	ADDQ  $0x01, R15

copy_5_word:
	TESTQ $0x00000002, CX
	JZ    copy_5_dword
	MOVW  (R14)(R15*1), BP
	MOVW  BP, (R9)(R15*1)
	ADDQ  $0x02, R15

copy_5_dword:
	TESTQ $0x00000004, CX
	JZ    copy_5_qword
	MOVL  (R14)(R15*1), BP
	MOVL  BP, (R9)(R15*1)
	ADDQ  $0x04, R15

copy_5_qword:
	TESTQ $0x00000008, CX
	JZ    copy_5_test
	MOVQ  (R14)(R15*1), BP
	MOVQ  BP, (R9)(R15*1)
	ADDQ  $0x08, R15
	JMP   copy_5_test

copy_5:
	MOVUPS (R14)(R15*1), X0
	MOVUPS X0, (R9)(R15*1)
	ADDQ   $0x10, R15

copy_5_test:
	CMPQ R15, CX
	JB   copy_5
	ADDQ CX, R9
	ADDQ CX, R11
	SUBQ CX, R13

	// Copy match from the current buffer
copy_match:
	TESTQ R13, R13
	JZ    handle_loop
	MOVQ  R9, CX
	SUBQ  R12, CX

	// ml <= mo
	// Overlap test: if ml > mo the source and destination ranges overlap
	// and a byte-by-byte copy is required.
	CMPQ R13, R12
	JA   copy_overlapping_match

	// Copy non-overlapping match
	ADDQ  R13, R11
	XORQ  R12, R12
	TESTQ $0x00000001, R13
	JZ    copy_2_word
	MOVB  (CX)(R12*1), R14
	MOVB  R14, (R9)(R12*1)
	ADDQ  $0x01, R12

copy_2_word:
	TESTQ $0x00000002, R13
	JZ    copy_2_dword
	MOVW  (CX)(R12*1), R14
	MOVW  R14, (R9)(R12*1)
	ADDQ  $0x02, R12

copy_2_dword:
	TESTQ $0x00000004, R13
	JZ    copy_2_qword
	MOVL  (CX)(R12*1), R14
	MOVL  R14, (R9)(R12*1)
	ADDQ  $0x04, R12

copy_2_qword:
	TESTQ $0x00000008, R13
	JZ    copy_2_test
	MOVQ  (CX)(R12*1), R14
	MOVQ  R14, (R9)(R12*1)
	ADDQ  $0x08, R12
	JMP   copy_2_test

copy_2:
	MOVUPS (CX)(R12*1), X0
	MOVUPS X0, (R9)(R12*1)
	ADDQ   $0x10, R12

copy_2_test:
	CMPQ R12, R13
	JB   copy_2
	ADDQ R13, R9
	JMP  handle_loop

	// Copy overlapping match
	// Byte-by-byte because the source range overlaps the destination.
copy_overlapping_match:
	ADDQ R13, R11

copy_slow_3:
	MOVB (CX), R12
	MOVB R12, (R9)
	INCQ CX
	INCQ R9
	DECQ R13
	JNZ  copy_slow_3

	// Decrement remaining-sequences counter; loop while it stays >= 0.
handle_loop:
	MOVQ ctx+16(FP), CX
	DECQ 96(CX)
	JNS  sequenceDecs_decodeSync_safe_bmi2_main_loop

	// Flush the bitreader registers back into *br, then record progress
	// (output position and consumed-literals count) in the context.
loop_finished:
	MOVQ br+8(FP), CX
	MOVQ AX, 32(CX)
	MOVB DL, 40(CX)
	MOVQ BX, 24(CX)

	// Update the context
	MOVQ ctx+16(FP), AX
	MOVQ R11, 136(AX)
	MOVQ 144(AX), CX
	SUBQ CX, R10
	MOVQ R10, 168(AX)

	// Return success
	MOVQ $0x00000000, ret+24(FP)
	RET

	// Return with match length error
	// Reached when match length != 0 while the decoded offset is 0.
sequenceDecs_decodeSync_safe_bmi2_error_match_len_ofs_mismatch:
	MOVQ 16(SP), AX
	MOVQ ctx+16(FP), CX
	MOVQ AX, 216(CX)
	MOVQ $0x00000001, ret+24(FP)
	RET

	// Return with match too long error
	// Stores the offending match length into the context before returning.
sequenceDecs_decodeSync_safe_bmi2_error_match_len_too_big:
	MOVQ ctx+16(FP), AX
	MOVQ 16(SP), CX
	MOVQ CX, 216(AX)
	MOVQ $0x00000002, ret+24(FP)
	RET

	// Return with match offset too long error
	// Stores the offending offset and the current output position.
error_match_off_too_big:
	MOVQ ctx+16(FP), AX
	MOVQ 8(SP), CX
	MOVQ CX, 224(AX)
	MOVQ R11, 136(AX)
	MOVQ $0x00000003, ret+24(FP)
	RET

	// Return with not enough literals error
	// Stores the literal length that overran the remaining literals.
error_not_enough_literals:
	MOVQ ctx+16(FP), AX
	MOVQ 24(SP), CX
	MOVQ CX, 208(AX)
	MOVQ $0x00000004, ret+24(FP)
	RET

	// Return with not enough output space error
	// Stores literal length, match length and output position for the caller.
error_not_enough_space:
	MOVQ ctx+16(FP), AX
	MOVQ 24(SP), CX
	MOVQ CX, 208(AX)
	MOVQ 16(SP), CX
	MOVQ CX, 216(AX)
	MOVQ R11, 136(AX)
	MOVQ $0x00000005, ret+24(FP)
	RET