VictoriaMetrics/vendor/github.com/rivo/uniseg/wordrules.go

package uniseg

import "unicode/utf8"

// Word break parser states. A state summarizes what the parser has seen most
// recently and thereby selects the rules that may apply to the next code
// point. All values are small enough to leave the wbZWJBit bit free.
const (
	// No rule-specific context.
	wbAny = iota

	// Line break states (used by WB3, WB3a, and WB3b).
	wbCR
	wbLF
	wbNewline

	// Inside a run of WSegSpace code points (WB3d).
	wbWSegSpace

	// Letter states (WB5 and the rules building on it).
	wbHebrewLetter
	wbALetter

	// Intermediate states which transitionWordBreakState() enters after a
	// successful look-ahead for WB6 (wbWB7) or WB7b (wbWB7c).
	wbWB7
	wbWB7c

	// Inside a run of digits (WB8).
	wbNumeric

	// Intermediate state which transitionWordBreakState() enters after a
	// successful look-ahead for WB12.
	wbWB11

	// Katakana (WB13) and ExtendNumLet (WB13a, WB13b) states.
	wbKatakana
	wbExtendNumLet

	// An odd or even number of consecutive Regional Indicators has been seen
	// (WB15 and WB16).
	wbOddRI
	wbEvenRI
)

// wbZWJBit is set for any states followed by at least one zero-width joiner
// (see WB4 and WB3c).
const wbZWJBit = 16

// wbTransitions is the transition table of the word break parser. For the
// given parser state and the word break property of the next code point, it
// returns the state to switch to, whether there is a word boundary before that
// code point, and a number identifying the rule which produced the entry
// (e.g. 30 for WB3, 31 for WB3a, 131 for WB13a, 9990 for WB999). If the table
// contains no entry for the combination, newState and rule are -1.
//
// It is analogous to [grTransitions], see the comments there for details.
//
// Unicode version 15.0.0.
func wbTransitions(state, prop int) (newState int, wordBreak bool, rule int) {
	// Combine both inputs into a single lookup key, with the property shifted
	// into the upper 32 bits.
	key := uint64(state) | uint64(prop)<<32

	switch key {
	// WB3b.
	case wbAny | prNewline<<32:
		return wbNewline, true, 32
	case wbAny | prCR<<32:
		return wbCR, true, 32
	case wbAny | prLF<<32:
		return wbLF, true, 32

	// WB3a.
	case wbNewline | prAny<<32,
		wbCR | prAny<<32,
		wbLF | prAny<<32:
		return wbAny, true, 31

	// WB3.
	case wbCR | prLF<<32:
		return wbLF, false, 30

	// WB3d.
	case wbAny | prWSegSpace<<32:
		return wbWSegSpace, true, 9990
	case wbWSegSpace | prWSegSpace<<32:
		return wbWSegSpace, false, 34

	// WB5.
	case wbAny | prALetter<<32:
		return wbALetter, true, 9990
	case wbAny | prHebrewLetter<<32:
		return wbHebrewLetter, true, 9990
	case wbALetter | prALetter<<32,
		wbHebrewLetter | prALetter<<32:
		return wbALetter, false, 50
	case wbALetter | prHebrewLetter<<32,
		wbHebrewLetter | prHebrewLetter<<32:
		return wbHebrewLetter, false, 50

	// WB7. The parser gets into wbWB7 via transitionWordBreakState().
	case wbWB7 | prALetter<<32:
		return wbALetter, false, 70
	case wbWB7 | prHebrewLetter<<32:
		return wbHebrewLetter, false, 70

	// WB7a.
	case wbHebrewLetter | prSingleQuote<<32:
		return wbAny, false, 71

	// WB7c. The parser gets into wbWB7c via transitionWordBreakState().
	case wbWB7c | prHebrewLetter<<32:
		return wbHebrewLetter, false, 73

	// WB8.
	case wbAny | prNumeric<<32:
		return wbNumeric, true, 9990
	case wbNumeric | prNumeric<<32:
		return wbNumeric, false, 80

	// WB9.
	case wbALetter | prNumeric<<32,
		wbHebrewLetter | prNumeric<<32:
		return wbNumeric, false, 90

	// WB10.
	case wbNumeric | prALetter<<32:
		return wbALetter, false, 100
	case wbNumeric | prHebrewLetter<<32:
		return wbHebrewLetter, false, 100

	// WB11. The parser gets into wbWB11 via transitionWordBreakState().
	case wbWB11 | prNumeric<<32:
		return wbNumeric, false, 110

	// WB13.
	case wbAny | prKatakana<<32:
		return wbKatakana, true, 9990
	case wbKatakana | prKatakana<<32:
		return wbKatakana, false, 130

	// WB13a.
	case wbAny | prExtendNumLet<<32:
		return wbExtendNumLet, true, 9990
	case wbALetter | prExtendNumLet<<32,
		wbHebrewLetter | prExtendNumLet<<32,
		wbNumeric | prExtendNumLet<<32,
		wbKatakana | prExtendNumLet<<32,
		wbExtendNumLet | prExtendNumLet<<32:
		return wbExtendNumLet, false, 131

	// WB13b.
	case wbExtendNumLet | prALetter<<32:
		return wbALetter, false, 132
	case wbExtendNumLet | prHebrewLetter<<32:
		return wbHebrewLetter, false, 132
	case wbExtendNumLet | prNumeric<<32:
		return wbNumeric, false, 132
	case wbExtendNumLet | prKatakana<<32:
		return wbKatakana, false, 132
	}

	// The table has no entry for this combination.
	return -1, false, -1
}

// transitionWordBreakState advances the word break parser by one code point.
// Given the current state and the next rune "r", it returns the parser's new
// state and whether a word boundary lies in front of "r".
//
// Some rules (WB6, WB7b, WB12) depend on the code point following "r". For
// these, the text starting after "r" is consulted: the byte slice "b" if it is
// not nil, the string "str" otherwise.
func transitionWordBreakState(state int, r rune, b []byte, str string) (newState int, wordBreak bool) {
	// Look up the word break property of the next character.
	nextProperty := property(workBreakCodePoints, r)

	// WB3a must win over WB4, so the ignore rules below need to know whether we
	// are coming from a line break.
	afterLineBreak := state == wbNewline || state == wbCR || state == wbLF

	// "Replacing Ignore Rules".
	switch {
	case nextProperty == prZWJ:
		// WB4 (for zero-width joiners). The joiner is remembered in the state
		// so WB3c can be applied to the following code point.
		switch {
		case afterLineBreak:
			return wbAny | wbZWJBit, true // WB3a, not WB4.
		case state < 0:
			return wbAny | wbZWJBit, false
		}
		return state | wbZWJBit, false

	case nextProperty == prExtend || nextProperty == prFormat:
		// WB4 (for Extend and Format).
		switch {
		case afterLineBreak:
			return wbAny, true // WB3a, not WB4.
		case state == wbWSegSpace, state == wbAny|wbZWJBit, state < 0:
			// No break here, but WB3d or WB3c must not apply afterwards
			// either. Unset states are treated like wbAny.
			return wbAny, false
		}
		return state, false

	case nextProperty == prExtendedPictographic && state >= 0 && state&wbZWJBit != 0:
		// WB3c.
		return wbAny, false
	}

	// From here on, a preceding zero-width joiner is irrelevant.
	if state >= 0 {
		state &^= wbZWJBit
	}

	// Consult the transition table, first for the exact combination.
	newState, wordBreak, rule := wbTransitions(state, nextProperty)
	if newState < 0 {
		// No exact entry. Fall back to the entries which ignore either the
		// property or the state.
		stateOnly, stateOnlyBreak, stateOnlyRule := wbTransitions(state, prAny)
		propOnly, propOnlyBreak, propOnlyRule := wbTransitions(wbAny, nextProperty)
		switch {
		case stateOnly >= 0 && propOnly >= 0:
			// Both exist. The new state comes from the property's entry while
			// the break decision comes from whichever entry has the lower rule
			// number (see comments for grTransitions).
			newState, wordBreak, rule = propOnly, propOnlyBreak, propOnlyRule
			if stateOnlyRule < propOnlyRule {
				wordBreak, rule = stateOnlyBreak, stateOnlyRule
			}
		case stateOnly >= 0:
			// Only the state has a catch-all entry, e.g. a line break state
			// followed by a property without a wbAny entry in the table.
			newState, wordBreak, rule = stateOnly, stateOnlyBreak, stateOnlyRule
		case propOnly >= 0:
			// Only the property has an entry for the wbAny state.
			newState, wordBreak, rule = propOnly, propOnlyBreak, propOnlyRule
		default:
			// Nothing matches. WB999: Any ÷ Any.
			newState, wordBreak, rule = wbAny, true, 9990
		}
	}

	// Conditions shared by the look-ahead rules below.
	letterState := state == wbALetter || state == wbHebrewLetter
	midLetterOrQuote := nextProperty == prMidLetter || nextProperty == prMidNumLet || nextProperty == prSingleQuote
	midNumOrQuote := nextProperty == prMidNum || nextProperty == prMidNumLet || nextProperty == prSingleQuote

	// WB6, WB7b, and WB12 require the property of the code point following
	// "r", not counting Format, Extend, and ZWJ code points (which WB4 tells us
	// to skip). farProperty remains -1 if none of these rules can apply or if
	// no such code point can be decoded (end of text or invalid encoding).
	farProperty := -1
	if rule > 60 && (letterState || state == wbNumeric) &&
		(midLetterOrQuote || nextProperty == prDoubleQuote || nextProperty == prMidNum) {
		for {
			var (
				next rune
				size int
			)
			if b != nil { // Byte slice version.
				next, size = utf8.DecodeRune(b)
				b = b[size:]
			} else { // String version.
				next, size = utf8.DecodeRuneInString(str)
				str = str[size:]
			}
			if next == utf8.RuneError {
				break
			}
			if p := property(workBreakCodePoints, next); p != prExtend && p != prFormat && p != prZWJ {
				farProperty = p
				break
			}
		}
	}

	switch {
	case rule > 60 && letterState && midLetterOrQuote &&
		(farProperty == prALetter || farProperty == prHebrewLetter):
		// WB6.
		return wbWB7, false
	case rule > 72 && state == wbHebrewLetter && nextProperty == prDoubleQuote &&
		farProperty == prHebrewLetter:
		// WB7b.
		return wbWB7c, false
	case rule > 120 && state == wbNumeric && midNumOrQuote &&
		farProperty == prNumeric:
		// WB12.
		return wbWB11, false
	}

	// WB15 and WB16.
	if newState == wbAny && nextProperty == prRegionalIndicator {
		if state == wbOddRI {
			// This completes a pair of Regional Indicators. Don't break it.
			return wbEvenRI, false
		}
		// This is the first Regional Indicator of a (potential) pair, either
		// following a complete pair or something else (including an unset
		// state). We can break before it.
		return wbOddRI, true
	}

	return newState, wordBreak
}
vendor: `make vendor-update` 2022-08-02 08:19:38 +02:00			`package uniseg`

			`import "unicode/utf8"`

			`// The states of the word break parser.`
			`const (`
			`wbAny = iota`
			`wbCR`
			`wbLF`
			`wbNewline`
			`wbWSegSpace`
			`wbHebrewLetter`
			`wbALetter`
			`wbWB7`
			`wbWB7c`
			`wbNumeric`
			`wbWB11`
			`wbKatakana`
			`wbExtendNumLet`
			`wbOddRI`
			`wbEvenRI`
			`wbZWJBit = 16 // This bit is set for any states followed by at least one zero-width joiner (see WB4 and WB3c).`
			`)`

vendor: run `make vendor-update` 2024-01-30 17:47:01 +01:00			`// wbTransitions implements the word break parser's state transitions. It's`
			`// anologous to [grTransitions], see comments there for details.`
			`//`
			`// Unicode version 15.0.0.`
			`func wbTransitions(state, prop int) (newState int, wordBreak bool, rule int) {`
			`switch uint64(state) \| uint64(prop)<<32 {`
vendor: `make vendor-update` 2022-08-02 08:19:38 +02:00			`// WB3b.`
vendor: run `make vendor-update` 2024-01-30 17:47:01 +01:00			`case wbAny \| prNewline<<32:`
			`return wbNewline, true, 32`
			`case wbAny \| prCR<<32:`
			`return wbCR, true, 32`
			`case wbAny \| prLF<<32:`
			`return wbLF, true, 32`
vendor: `make vendor-update` 2022-08-02 08:19:38 +02:00
			`// WB3a.`
vendor: run `make vendor-update` 2024-01-30 17:47:01 +01:00			`case wbNewline \| prAny<<32:`
			`return wbAny, true, 31`
			`case wbCR \| prAny<<32:`
			`return wbAny, true, 31`
			`case wbLF \| prAny<<32:`
			`return wbAny, true, 31`
vendor: `make vendor-update` 2022-08-02 08:19:38 +02:00
			`// WB3.`
vendor: run `make vendor-update` 2024-01-30 17:47:01 +01:00			`case wbCR \| prLF<<32:`
			`return wbLF, false, 30`
vendor: `make vendor-update` 2022-08-02 08:19:38 +02:00
			`// WB3d.`
vendor: run `make vendor-update` 2024-01-30 17:47:01 +01:00			`case wbAny \| prWSegSpace<<32:`
			`return wbWSegSpace, true, 9990`
			`case wbWSegSpace \| prWSegSpace<<32:`
			`return wbWSegSpace, false, 34`
vendor: `make vendor-update` 2022-08-02 08:19:38 +02:00
			`// WB5.`
vendor: run `make vendor-update` 2024-01-30 17:47:01 +01:00			`case wbAny \| prALetter<<32:`
			`return wbALetter, true, 9990`
			`case wbAny \| prHebrewLetter<<32:`
			`return wbHebrewLetter, true, 9990`
			`case wbALetter \| prALetter<<32:`
			`return wbALetter, false, 50`
			`case wbALetter \| prHebrewLetter<<32:`
			`return wbHebrewLetter, false, 50`
			`case wbHebrewLetter \| prALetter<<32:`
			`return wbALetter, false, 50`
			`case wbHebrewLetter \| prHebrewLetter<<32:`
			`return wbHebrewLetter, false, 50`
vendor: `make vendor-update` 2022-08-02 08:19:38 +02:00
			`// WB7. Transitions to wbWB7 handled by transitionWordBreakState().`
vendor: run `make vendor-update` 2024-01-30 17:47:01 +01:00			`case wbWB7 \| prALetter<<32:`
			`return wbALetter, false, 70`
			`case wbWB7 \| prHebrewLetter<<32:`
			`return wbHebrewLetter, false, 70`
vendor: `make vendor-update` 2022-08-02 08:19:38 +02:00
			`// WB7a.`
vendor: run `make vendor-update` 2024-01-30 17:47:01 +01:00			`case wbHebrewLetter \| prSingleQuote<<32:`
			`return wbAny, false, 71`
vendor: `make vendor-update` 2022-08-02 08:19:38 +02:00
			`// WB7c. Transitions to wbWB7c handled by transitionWordBreakState().`
vendor: run `make vendor-update` 2024-01-30 17:47:01 +01:00			`case wbWB7c \| prHebrewLetter<<32:`
			`return wbHebrewLetter, false, 73`
vendor: `make vendor-update` 2022-08-02 08:19:38 +02:00
			`// WB8.`
vendor: run `make vendor-update` 2024-01-30 17:47:01 +01:00			`case wbAny \| prNumeric<<32:`
			`return wbNumeric, true, 9990`
			`case wbNumeric \| prNumeric<<32:`
			`return wbNumeric, false, 80`
vendor: `make vendor-update` 2022-08-02 08:19:38 +02:00
			`// WB9.`
vendor: run `make vendor-update` 2024-01-30 17:47:01 +01:00			`case wbALetter \| prNumeric<<32:`
			`return wbNumeric, false, 90`
			`case wbHebrewLetter \| prNumeric<<32:`
			`return wbNumeric, false, 90`
vendor: `make vendor-update` 2022-08-02 08:19:38 +02:00
			`// WB10.`
vendor: run `make vendor-update` 2024-01-30 17:47:01 +01:00			`case wbNumeric \| prALetter<<32:`
			`return wbALetter, false, 100`
			`case wbNumeric \| prHebrewLetter<<32:`
			`return wbHebrewLetter, false, 100`
vendor: `make vendor-update` 2022-08-02 08:19:38 +02:00
			`// WB11. Transitions to wbWB11 handled by transitionWordBreakState().`
vendor: run `make vendor-update` 2024-01-30 17:47:01 +01:00			`case wbWB11 \| prNumeric<<32:`
			`return wbNumeric, false, 110`
vendor: `make vendor-update` 2022-08-02 08:19:38 +02:00
			`// WB13.`
vendor: run `make vendor-update` 2024-01-30 17:47:01 +01:00			`case wbAny \| prKatakana<<32:`
			`return wbKatakana, true, 9990`
			`case wbKatakana \| prKatakana<<32:`
			`return wbKatakana, false, 130`
vendor: `make vendor-update` 2022-08-02 08:19:38 +02:00
			`// WB13a.`
vendor: run `make vendor-update` 2024-01-30 17:47:01 +01:00			`case wbAny \| prExtendNumLet<<32:`
			`return wbExtendNumLet, true, 9990`
			`case wbALetter \| prExtendNumLet<<32:`
			`return wbExtendNumLet, false, 131`
			`case wbHebrewLetter \| prExtendNumLet<<32:`
			`return wbExtendNumLet, false, 131`
			`case wbNumeric \| prExtendNumLet<<32:`
			`return wbExtendNumLet, false, 131`
			`case wbKatakana \| prExtendNumLet<<32:`
			`return wbExtendNumLet, false, 131`
			`case wbExtendNumLet \| prExtendNumLet<<32:`
			`return wbExtendNumLet, false, 131`
vendor: `make vendor-update` 2022-08-02 08:19:38 +02:00
			`// WB13b.`
vendor: run `make vendor-update` 2024-01-30 17:47:01 +01:00			`case wbExtendNumLet \| prALetter<<32:`
			`return wbALetter, false, 132`
			`case wbExtendNumLet \| prHebrewLetter<<32:`
			`return wbHebrewLetter, false, 132`
			`case wbExtendNumLet \| prNumeric<<32:`
			`return wbNumeric, false, 132`
			`case wbExtendNumLet \| prKatakana<<32:`
			`return wbKatakana, false, 132`

			`default:`
			`return -1, false, -1`
			`}`
vendor: `make vendor-update` 2022-08-02 08:19:38 +02:00			`}`

			`// transitionWordBreakState determines the new state of the word break parser`
			`// given the current state and the next code point. It also returns whether a`
			`// word boundary was detected. If more than one code point is needed to`
			`// determine the new state, the byte slice or the string starting after rune "r"`
			`// can be used (whichever is not nil or empty) for further lookups.`
			`func transitionWordBreakState(state int, r rune, b []byte, str string) (newState int, wordBreak bool) {`
			`// Determine the property of the next character.`
			`nextProperty := property(workBreakCodePoints, r)`

			`// "Replacing Ignore Rules".`
			`if nextProperty == prZWJ {`
			`// WB4 (for zero-width joiners).`
			`if state == wbNewline \|\| state == wbCR \|\| state == wbLF {`
			`return wbAny \| wbZWJBit, true // Make sure we don't apply WB4 to WB3a.`
			`}`
			`if state < 0 {`
			`return wbAny \| wbZWJBit, false`
			`}`
			`return state \| wbZWJBit, false`
			`} else if nextProperty == prExtend \|\| nextProperty == prFormat {`
			`// WB4 (for Extend and Format).`
			`if state == wbNewline \|\| state == wbCR \|\| state == wbLF {`
			`return wbAny, true // Make sure we don't apply WB4 to WB3a.`
			`}`
			`if state == wbWSegSpace \|\| state == wbAny\|wbZWJBit {`
			`return wbAny, false // We don't break but this is also not WB3d or WB3c.`
			`}`
			`if state < 0 {`
			`return wbAny, false`
			`}`
			`return state, false`
			`} else if nextProperty == prExtendedPictographic && state >= 0 && state&wbZWJBit != 0 {`
			`// WB3c.`
			`return wbAny, false`
			`}`
			`if state >= 0 {`
			`state = state &^ wbZWJBit`
			`}`

			`// Find the applicable transition in the table.`
			`var rule int`
vendor: run `make vendor-update` 2024-01-30 17:47:01 +01:00			`newState, wordBreak, rule = wbTransitions(state, nextProperty)`
			`if newState < 0 {`
vendor: `make vendor-update` 2022-08-02 08:19:38 +02:00			`// No specific transition found. Try the less specific ones.`
vendor: run `make vendor-update` 2024-01-30 17:47:01 +01:00			`anyPropState, anyPropWordBreak, anyPropRule := wbTransitions(state, prAny)`
			`anyStateState, anyStateWordBreak, anyStateRule := wbTransitions(wbAny, nextProperty)`
			`if anyPropState >= 0 && anyStateState >= 0 {`
vendor: `make vendor-update` 2022-08-02 08:19:38 +02:00			`// Both apply. We'll use a mix (see comments for grTransitions).`
vendor: run `make vendor-update` 2024-01-30 17:47:01 +01:00			`newState, wordBreak, rule = anyStateState, anyStateWordBreak, anyStateRule`
			`if anyPropRule < anyStateRule {`
			`wordBreak, rule = anyPropWordBreak, anyPropRule`
vendor: `make vendor-update` 2022-08-02 08:19:38 +02:00			`}`
vendor: run `make vendor-update` 2024-01-30 17:47:01 +01:00			`} else if anyPropState >= 0 {`
vendor: `make vendor-update` 2022-08-02 08:19:38 +02:00			`// We only have a specific state.`
vendor: run `make vendor-update` 2024-01-30 17:47:01 +01:00			`newState, wordBreak, rule = anyPropState, anyPropWordBreak, anyPropRule`
vendor: `make vendor-update` 2022-08-02 08:19:38 +02:00			`// This branch will probably never be reached because okAnyState will`
			`// always be true given the current transition map. But we keep it here`
			`// for future modifications to the transition map where this may not be`
			`// true anymore.`
vendor: run `make vendor-update` 2024-01-30 17:47:01 +01:00			`} else if anyStateState >= 0 {`
vendor: `make vendor-update` 2022-08-02 08:19:38 +02:00			`// We only have a specific property.`
vendor: run `make vendor-update` 2024-01-30 17:47:01 +01:00			`newState, wordBreak, rule = anyStateState, anyStateWordBreak, anyStateRule`
vendor: `make vendor-update` 2022-08-02 08:19:38 +02:00			`} else {`
			`// No known transition. WB999: Any ÷ Any.`
			`newState, wordBreak, rule = wbAny, true, 9990`
			`}`
			`}`

			`// For those rules that need to look up runes further in the string, we`
			`// determine the property after nextProperty, skipping over Format, Extend,`
			`// and ZWJ (according to WB4). It's -1 if not needed, if such a rune cannot`
			`// be determined (because the text ends or the rune is faulty).`
			`farProperty := -1`
			`if rule > 60 &&`
			`(state == wbALetter \|\| state == wbHebrewLetter \|\| state == wbNumeric) &&`
			`(nextProperty == prMidLetter \|\| nextProperty == prMidNumLet \|\| nextProperty == prSingleQuote \|\| // WB6.`
			`nextProperty == prDoubleQuote \|\| // WB7b.`
			`nextProperty == prMidNum) { // WB12.`
			`for {`
			`var (`
			`r rune`
			`length int`
			`)`
			`if b != nil { // Byte slice version.`
			`r, length = utf8.DecodeRune(b)`
			`b = b[length:]`
			`} else { // String version.`
			`r, length = utf8.DecodeRuneInString(str)`
			`str = str[length:]`
			`}`
			`if r == utf8.RuneError {`
			`break`
			`}`
			`prop := property(workBreakCodePoints, r)`
			`if prop == prExtend \|\| prop == prFormat \|\| prop == prZWJ {`
			`continue`
			`}`
			`farProperty = prop`
			`break`
			`}`
			`}`

			`// WB6.`
			`if rule > 60 &&`
			`(state == wbALetter \|\| state == wbHebrewLetter) &&`
			`(nextProperty == prMidLetter \|\| nextProperty == prMidNumLet \|\| nextProperty == prSingleQuote) &&`
			`(farProperty == prALetter \|\| farProperty == prHebrewLetter) {`
			`return wbWB7, false`
			`}`

			`// WB7b.`
			`if rule > 72 &&`
			`state == wbHebrewLetter &&`
			`nextProperty == prDoubleQuote &&`
			`farProperty == prHebrewLetter {`
			`return wbWB7c, false`
			`}`

			`// WB12.`
			`if rule > 120 &&`
			`state == wbNumeric &&`
			`(nextProperty == prMidNum \|\| nextProperty == prMidNumLet \|\| nextProperty == prSingleQuote) &&`
			`farProperty == prNumeric {`
			`return wbWB11, false`
			`}`

			`// WB15 and WB16.`
			`if newState == wbAny && nextProperty == prRegionalIndicator {`
			`if state != wbOddRI && state != wbEvenRI { // Includes state == -1.`
			`// Transition into the first RI.`
			`return wbOddRI, true`
			`}`
			`if state == wbOddRI {`
			`// Don't break pairs of Regional Indicators.`
			`return wbEvenRI, false`
			`}`
			`return wbOddRI, true // We can break after a pair.`
			`}`

			`return`
			`}`