VictoriaMetrics/vendor/github.com/rivo/uniseg/graphemerules.go

177 lines
5.0 KiB
Go
Raw Normal View History

2022-08-02 08:19:38 +02:00
package uniseg
// Parser states of the grapheme cluster state machine. A state summarizes what
// the parser has to remember about the code points consumed so far;
// grTransitions maps a (state, property) pair to the follow-up state.
const (
	// grAny is the neutral state. It also serves as the "any state" wildcard
	// when looking up transitions.
	grAny = iota
	// grCR is entered on a CR.
	grCR
	// grControlLF is entered on an LF or Control code point, including an LF
	// that directly follows a CR.
	grControlLF
	// grL is entered on a Hangul L.
	grL
	// grLVV is entered on a Hangul LV or V.
	grLVV
	// grLVTT is entered on a Hangul LVT or T.
	grLVTT
	// grPrepend is entered on a Prepend code point.
	grPrepend
	// grExtendedPictographic is entered on an Extended_Pictographic code point
	// and is kept while Extend code points follow.
	grExtendedPictographic
	// grExtendedPictographicZWJ is entered on a ZWJ that follows the
	// grExtendedPictographic state.
	grExtendedPictographicZWJ
	// grRIOdd is entered after an odd number of consecutive regional
	// indicators.
	grRIOdd
	// grRIEven is entered after an even number of consecutive regional
	// indicators.
	grRIEven
)
// Breaking instructions produced by the grapheme cluster parser. An
// instruction refers to the position between the last and the next code point.
const (
	// grNoBoundary means the cluster continues across that position.
	grNoBoundary = iota
	// grBoundary means a cluster boundary lies at that position.
	grBoundary
)
2024-01-30 17:47:01 +01:00
// grTransitions implements the grapheme cluster parser's state transitions.
// Maps state and property to a new state, a breaking instruction, and rule
// number. The breaking instruction always refers to the boundary between the
// last and next code point. Returns negative values if no transition is found.
2022-08-02 08:19:38 +02:00
//
2024-01-30 17:47:01 +01:00
// This function is used as follows:
2022-08-02 08:19:38 +02:00
//
2022-09-13 15:44:44 +02:00
// 1. Find specific state + specific property. Stop if found.
// 2. Find specific state + any property.
// 3. Find any state + specific property.
// 4. If only (2) or (3) (but not both) was found, stop.
// 5. If both (2) and (3) were found, use state from (3) and breaking instruction
// from the transition with the lower rule number, prefer (3) if rule numbers
// are equal. Stop.
// 6. Assume grAny and grBoundary.
2022-08-02 08:19:38 +02:00
//
2024-01-30 17:47:01 +01:00
// Unicode version 15.0.0.
func grTransitions(state, prop int) (newState int, newProp int, boundary int) {
// It turns out that using a big switch statement is much faster than using
// a map.
switch uint64(state) | uint64(prop)<<32 {
2022-08-02 08:19:38 +02:00
// GB5
2024-01-30 17:47:01 +01:00
case grAny | prCR<<32:
return grCR, grBoundary, 50
case grAny | prLF<<32:
return grControlLF, grBoundary, 50
case grAny | prControl<<32:
return grControlLF, grBoundary, 50
2022-08-02 08:19:38 +02:00
// GB4
2024-01-30 17:47:01 +01:00
case grCR | prAny<<32:
return grAny, grBoundary, 40
case grControlLF | prAny<<32:
return grAny, grBoundary, 40
// GB3
case grCR | prLF<<32:
return grControlLF, grNoBoundary, 30
// GB6
case grAny | prL<<32:
return grL, grBoundary, 9990
case grL | prL<<32:
return grL, grNoBoundary, 60
case grL | prV<<32:
return grLVV, grNoBoundary, 60
case grL | prLV<<32:
return grLVV, grNoBoundary, 60
case grL | prLVT<<32:
return grLVTT, grNoBoundary, 60
// GB7
case grAny | prLV<<32:
return grLVV, grBoundary, 9990
case grAny | prV<<32:
return grLVV, grBoundary, 9990
case grLVV | prV<<32:
return grLVV, grNoBoundary, 70
case grLVV | prT<<32:
return grLVTT, grNoBoundary, 70
// GB8
case grAny | prLVT<<32:
return grLVTT, grBoundary, 9990
case grAny | prT<<32:
return grLVTT, grBoundary, 9990
case grLVTT | prT<<32:
return grLVTT, grNoBoundary, 80
// GB9
case grAny | prExtend<<32:
return grAny, grNoBoundary, 90
case grAny | prZWJ<<32:
return grAny, grNoBoundary, 90
// GB9a
case grAny | prSpacingMark<<32:
return grAny, grNoBoundary, 91
// GB9b
case grAny | prPrepend<<32:
return grPrepend, grBoundary, 9990
case grPrepend | prAny<<32:
return grAny, grNoBoundary, 92
// GB11
case grAny | prExtendedPictographic<<32:
return grExtendedPictographic, grBoundary, 9990
case grExtendedPictographic | prExtend<<32:
return grExtendedPictographic, grNoBoundary, 110
case grExtendedPictographic | prZWJ<<32:
return grExtendedPictographicZWJ, grNoBoundary, 110
case grExtendedPictographicZWJ | prExtendedPictographic<<32:
return grExtendedPictographic, grNoBoundary, 110
// GB12 / GB13
case grAny | prRegionalIndicator<<32:
return grRIOdd, grBoundary, 9990
case grRIOdd | prRegionalIndicator<<32:
return grRIEven, grNoBoundary, 120
case grRIEven | prRegionalIndicator<<32:
return grRIOdd, grBoundary, 120
default:
return -1, -1, -1
}
2022-08-02 08:19:38 +02:00
}
// transitionGraphemeState determines the new state of the grapheme cluster
2022-09-13 15:44:44 +02:00
// parser given the current state and the next code point. It also returns the
// code point's grapheme property (the value mapped by the [graphemeCodePoints]
// table) and whether a cluster boundary was detected.
func transitionGraphemeState(state int, r rune) (newState, prop int, boundary bool) {
2022-08-02 08:19:38 +02:00
// Determine the property of the next character.
2024-01-30 17:47:01 +01:00
prop = propertyGraphemes(r)
2022-08-02 08:19:38 +02:00
// Find the applicable transition.
2024-01-30 17:47:01 +01:00
nextState, nextProp, _ := grTransitions(state, prop)
if nextState >= 0 {
2022-08-02 08:19:38 +02:00
// We have a specific transition. We'll use it.
2024-01-30 17:47:01 +01:00
return nextState, prop, nextProp == grBoundary
2022-08-02 08:19:38 +02:00
}
// No specific transition found. Try the less specific ones.
2024-01-30 17:47:01 +01:00
anyPropState, anyPropProp, anyPropRule := grTransitions(state, prAny)
anyStateState, anyStateProp, anyStateRule := grTransitions(grAny, prop)
if anyPropState >= 0 && anyStateState >= 0 {
2022-08-02 08:19:38 +02:00
// Both apply. We'll use a mix (see comments for grTransitions).
2024-01-30 17:47:01 +01:00
newState = anyStateState
boundary = anyStateProp == grBoundary
if anyPropRule < anyStateRule {
boundary = anyPropProp == grBoundary
2022-08-02 08:19:38 +02:00
}
return
}
2024-01-30 17:47:01 +01:00
if anyPropState >= 0 {
2022-08-02 08:19:38 +02:00
// We only have a specific state.
2024-01-30 17:47:01 +01:00
return anyPropState, prop, anyPropProp == grBoundary
2022-08-02 08:19:38 +02:00
// This branch will probably never be reached because okAnyState will
// always be true given the current transition map. But we keep it here
// for future modifications to the transition map where this may not be
// true anymore.
}
2024-01-30 17:47:01 +01:00
if anyStateState >= 0 {
2022-08-02 08:19:38 +02:00
// We only have a specific property.
2024-01-30 17:47:01 +01:00
return anyStateState, prop, anyStateProp == grBoundary
2022-08-02 08:19:38 +02:00
}
// No known transition. GB999: Any ÷ Any.
2022-09-13 15:44:44 +02:00
return grAny, prop, true
2022-08-02 08:19:38 +02:00
}