mirror of
https://github.com/VictoriaMetrics/VictoriaMetrics.git
synced 2024-12-22 08:26:19 +01:00
206 lines
7.5 KiB
Go
206 lines
7.5 KiB
Go
|
package uniseg
|
|||
|
|
|||
|
import "unicode/utf8"
|
|||
|
|
|||
|
// The states of the sentence break parser. Each state summarizes the part of
// the text to the left of the current position that is still relevant to the
// sentence boundary rules.
const (
	sbAny       = iota // No special context; also the wildcard state in sbTransitions keys.
	sbCR               // After a CR that may still be followed by LF (SB3).
	sbParaSep          // After a paragraph separator; SB4 always breaks after it.
	sbATerm            // After ATerm.
	sbUpper            // After Upper.
	sbLower            // After Lower.
	sbSB7              // After (Upper | Lower) ATerm (SB7).
	sbSB8Close         // After ATerm Close+.
	sbSB8Sp            // After ATerm Close* Sp+.
	sbSTerm            // After STerm.
	sbSB8aClose        // After STerm Close+.
	sbSB8aSp           // After STerm Close* Sp+.
)
|
|||
|
|
|||
|
// The sentence break parser's breaking instructions. They are stored as the
// second element of each sbTransitions value.
const (
	sbDontBreak = iota // The transition does not report a sentence boundary.
	sbBreak            // The transition reports a sentence boundary.
)
|
|||
|
|
|||
|
// The sentence break parser's state transitions. It's anologous to
|
|||
|
// grTransitions, see comments there for details. Unicode version 14.0.0.
|
|||
|
var sbTransitions = map[[2]int][3]int{
|
|||
|
// SB3.
|
|||
|
{sbAny, prCR}: {sbCR, sbDontBreak, 9990},
|
|||
|
{sbCR, prLF}: {sbParaSep, sbDontBreak, 30},
|
|||
|
|
|||
|
// SB4.
|
|||
|
{sbAny, prSep}: {sbParaSep, sbDontBreak, 9990},
|
|||
|
{sbAny, prLF}: {sbParaSep, sbDontBreak, 9990},
|
|||
|
{sbParaSep, prAny}: {sbAny, sbBreak, 40},
|
|||
|
{sbCR, prAny}: {sbAny, sbBreak, 40},
|
|||
|
|
|||
|
// SB6.
|
|||
|
{sbAny, prATerm}: {sbATerm, sbDontBreak, 9990},
|
|||
|
{sbATerm, prNumeric}: {sbAny, sbDontBreak, 60},
|
|||
|
{sbSB7, prNumeric}: {sbAny, sbDontBreak, 60}, // Because ATerm also appears in SB7.
|
|||
|
|
|||
|
// SB7.
|
|||
|
{sbAny, prUpper}: {sbUpper, sbDontBreak, 9990},
|
|||
|
{sbAny, prLower}: {sbLower, sbDontBreak, 9990},
|
|||
|
{sbUpper, prATerm}: {sbSB7, sbDontBreak, 70},
|
|||
|
{sbLower, prATerm}: {sbSB7, sbDontBreak, 70},
|
|||
|
{sbSB7, prUpper}: {sbUpper, sbDontBreak, 70},
|
|||
|
|
|||
|
// SB8a.
|
|||
|
{sbAny, prSTerm}: {sbSTerm, sbDontBreak, 9990},
|
|||
|
{sbATerm, prSContinue}: {sbAny, sbDontBreak, 81},
|
|||
|
{sbATerm, prATerm}: {sbATerm, sbDontBreak, 81},
|
|||
|
{sbATerm, prSTerm}: {sbSTerm, sbDontBreak, 81},
|
|||
|
{sbSB7, prSContinue}: {sbAny, sbDontBreak, 81},
|
|||
|
{sbSB7, prATerm}: {sbATerm, sbDontBreak, 81},
|
|||
|
{sbSB7, prSTerm}: {sbSTerm, sbDontBreak, 81},
|
|||
|
{sbSB8Close, prSContinue}: {sbAny, sbDontBreak, 81},
|
|||
|
{sbSB8Close, prATerm}: {sbATerm, sbDontBreak, 81},
|
|||
|
{sbSB8Close, prSTerm}: {sbSTerm, sbDontBreak, 81},
|
|||
|
{sbSB8Sp, prSContinue}: {sbAny, sbDontBreak, 81},
|
|||
|
{sbSB8Sp, prATerm}: {sbATerm, sbDontBreak, 81},
|
|||
|
{sbSB8Sp, prSTerm}: {sbSTerm, sbDontBreak, 81},
|
|||
|
{sbSTerm, prSContinue}: {sbAny, sbDontBreak, 81},
|
|||
|
{sbSTerm, prATerm}: {sbATerm, sbDontBreak, 81},
|
|||
|
{sbSTerm, prSTerm}: {sbSTerm, sbDontBreak, 81},
|
|||
|
{sbSB8aClose, prSContinue}: {sbAny, sbDontBreak, 81},
|
|||
|
{sbSB8aClose, prATerm}: {sbATerm, sbDontBreak, 81},
|
|||
|
{sbSB8aClose, prSTerm}: {sbSTerm, sbDontBreak, 81},
|
|||
|
{sbSB8aSp, prSContinue}: {sbAny, sbDontBreak, 81},
|
|||
|
{sbSB8aSp, prATerm}: {sbATerm, sbDontBreak, 81},
|
|||
|
{sbSB8aSp, prSTerm}: {sbSTerm, sbDontBreak, 81},
|
|||
|
|
|||
|
// SB9.
|
|||
|
{sbATerm, prClose}: {sbSB8Close, sbDontBreak, 90},
|
|||
|
{sbSB7, prClose}: {sbSB8Close, sbDontBreak, 90},
|
|||
|
{sbSB8Close, prClose}: {sbSB8Close, sbDontBreak, 90},
|
|||
|
{sbATerm, prSp}: {sbSB8Sp, sbDontBreak, 90},
|
|||
|
{sbSB7, prSp}: {sbSB8Sp, sbDontBreak, 90},
|
|||
|
{sbSB8Close, prSp}: {sbSB8Sp, sbDontBreak, 90},
|
|||
|
{sbSTerm, prClose}: {sbSB8aClose, sbDontBreak, 90},
|
|||
|
{sbSB8aClose, prClose}: {sbSB8aClose, sbDontBreak, 90},
|
|||
|
{sbSTerm, prSp}: {sbSB8aSp, sbDontBreak, 90},
|
|||
|
{sbSB8aClose, prSp}: {sbSB8aSp, sbDontBreak, 90},
|
|||
|
{sbATerm, prSep}: {sbParaSep, sbDontBreak, 90},
|
|||
|
{sbATerm, prCR}: {sbParaSep, sbDontBreak, 90},
|
|||
|
{sbATerm, prLF}: {sbParaSep, sbDontBreak, 90},
|
|||
|
{sbSB7, prSep}: {sbParaSep, sbDontBreak, 90},
|
|||
|
{sbSB7, prCR}: {sbParaSep, sbDontBreak, 90},
|
|||
|
{sbSB7, prLF}: {sbParaSep, sbDontBreak, 90},
|
|||
|
{sbSB8Close, prSep}: {sbParaSep, sbDontBreak, 90},
|
|||
|
{sbSB8Close, prCR}: {sbParaSep, sbDontBreak, 90},
|
|||
|
{sbSB8Close, prLF}: {sbParaSep, sbDontBreak, 90},
|
|||
|
{sbSTerm, prSep}: {sbParaSep, sbDontBreak, 90},
|
|||
|
{sbSTerm, prCR}: {sbParaSep, sbDontBreak, 90},
|
|||
|
{sbSTerm, prLF}: {sbParaSep, sbDontBreak, 90},
|
|||
|
{sbSB8aClose, prSep}: {sbParaSep, sbDontBreak, 90},
|
|||
|
{sbSB8aClose, prCR}: {sbParaSep, sbDontBreak, 90},
|
|||
|
{sbSB8aClose, prLF}: {sbParaSep, sbDontBreak, 90},
|
|||
|
|
|||
|
// SB10.
|
|||
|
{sbSB8Sp, prSp}: {sbSB8Sp, sbDontBreak, 100},
|
|||
|
{sbSB8aSp, prSp}: {sbSB8aSp, sbDontBreak, 100},
|
|||
|
{sbSB8Sp, prSep}: {sbParaSep, sbDontBreak, 100},
|
|||
|
{sbSB8Sp, prCR}: {sbParaSep, sbDontBreak, 100},
|
|||
|
{sbSB8Sp, prLF}: {sbParaSep, sbDontBreak, 100},
|
|||
|
|
|||
|
// SB11.
|
|||
|
{sbATerm, prAny}: {sbAny, sbBreak, 110},
|
|||
|
{sbSB7, prAny}: {sbAny, sbBreak, 110},
|
|||
|
{sbSB8Close, prAny}: {sbAny, sbBreak, 110},
|
|||
|
{sbSB8Sp, prAny}: {sbAny, sbBreak, 110},
|
|||
|
{sbSTerm, prAny}: {sbAny, sbBreak, 110},
|
|||
|
{sbSB8aClose, prAny}: {sbAny, sbBreak, 110},
|
|||
|
{sbSB8aSp, prAny}: {sbAny, sbBreak, 110},
|
|||
|
// We'll always break after ParaSep due to SB4.
|
|||
|
}
|
|||
|
|
|||
|
// transitionSentenceBreakState determines the new state of the sentence break
|
|||
|
// parser given the current state and the next code point. It also returns
|
|||
|
// whether a sentence boundary was detected. If more than one code point is
|
|||
|
// needed to determine the new state, the byte slice or the string starting
|
|||
|
// after rune "r" can be used (whichever is not nil or empty) for further
|
|||
|
// lookups.
|
|||
|
func transitionSentenceBreakState(state int, r rune, b []byte, str string) (newState int, sentenceBreak bool) {
|
|||
|
// Determine the property of the next character.
|
|||
|
nextProperty := property(sentenceBreakCodePoints, r)
|
|||
|
|
|||
|
// SB5 (Replacing Ignore Rules).
|
|||
|
if nextProperty == prExtend || nextProperty == prFormat {
|
|||
|
if state == sbParaSep || state == sbCR {
|
|||
|
return sbAny, true // Make sure we don't apply SB5 to SB3 or SB4.
|
|||
|
}
|
|||
|
if state < 0 {
|
|||
|
return sbAny, true // SB1.
|
|||
|
}
|
|||
|
return state, false
|
|||
|
}
|
|||
|
|
|||
|
// Find the applicable transition in the table.
|
|||
|
var rule int
|
|||
|
transition, ok := sbTransitions[[2]int{state, nextProperty}]
|
|||
|
if ok {
|
|||
|
// We have a specific transition. We'll use it.
|
|||
|
newState, sentenceBreak, rule = transition[0], transition[1] == sbBreak, transition[2]
|
|||
|
} else {
|
|||
|
// No specific transition found. Try the less specific ones.
|
|||
|
transAnyProp, okAnyProp := sbTransitions[[2]int{state, prAny}]
|
|||
|
transAnyState, okAnyState := sbTransitions[[2]int{sbAny, nextProperty}]
|
|||
|
if okAnyProp && okAnyState {
|
|||
|
// Both apply. We'll use a mix (see comments for grTransitions).
|
|||
|
newState, sentenceBreak, rule = transAnyState[0], transAnyState[1] == sbBreak, transAnyState[2]
|
|||
|
if transAnyProp[2] < transAnyState[2] {
|
|||
|
sentenceBreak, rule = transAnyProp[1] == sbBreak, transAnyProp[2]
|
|||
|
}
|
|||
|
} else if okAnyProp {
|
|||
|
// We only have a specific state.
|
|||
|
newState, sentenceBreak, rule = transAnyProp[0], transAnyProp[1] == sbBreak, transAnyProp[2]
|
|||
|
// This branch will probably never be reached because okAnyState will
|
|||
|
// always be true given the current transition map. But we keep it here
|
|||
|
// for future modifications to the transition map where this may not be
|
|||
|
// true anymore.
|
|||
|
} else if okAnyState {
|
|||
|
// We only have a specific property.
|
|||
|
newState, sentenceBreak, rule = transAnyState[0], transAnyState[1] == sbBreak, transAnyState[2]
|
|||
|
} else {
|
|||
|
// No known transition. SB999: Any × Any.
|
|||
|
newState, sentenceBreak, rule = sbAny, false, 9990
|
|||
|
}
|
|||
|
}
|
|||
|
|
|||
|
// SB8.
|
|||
|
if rule > 80 && (state == sbATerm || state == sbSB8Close || state == sbSB8Sp || state == sbSB7) {
|
|||
|
// Check the right side of the rule.
|
|||
|
var length int
|
|||
|
for nextProperty != prOLetter &&
|
|||
|
nextProperty != prUpper &&
|
|||
|
nextProperty != prLower &&
|
|||
|
nextProperty != prSep &&
|
|||
|
nextProperty != prCR &&
|
|||
|
nextProperty != prLF &&
|
|||
|
nextProperty != prATerm &&
|
|||
|
nextProperty != prSTerm {
|
|||
|
// Move on to the next rune.
|
|||
|
if b != nil { // Byte slice version.
|
|||
|
r, length = utf8.DecodeRune(b)
|
|||
|
b = b[length:]
|
|||
|
} else { // String version.
|
|||
|
r, length = utf8.DecodeRuneInString(str)
|
|||
|
str = str[length:]
|
|||
|
}
|
|||
|
if r == utf8.RuneError {
|
|||
|
break
|
|||
|
}
|
|||
|
nextProperty = property(sentenceBreakCodePoints, r)
|
|||
|
}
|
|||
|
if nextProperty == prLower {
|
|||
|
return sbLower, false
|
|||
|
}
|
|||
|
}
|
|||
|
|
|||
|
return
|
|||
|
}
|