2022-08-02 08:19:38 +02:00
|
|
|
|
package uniseg
|
|
|
|
|
|
|
|
|
|
import "unicode/utf8"
|
|
|
|
|
|
|
|
|
|
// The states of the sentence break parser.
|
|
|
|
|
const (
|
|
|
|
|
sbAny = iota
|
|
|
|
|
sbCR
|
|
|
|
|
sbParaSep
|
|
|
|
|
sbATerm
|
|
|
|
|
sbUpper
|
|
|
|
|
sbLower
|
|
|
|
|
sbSB7
|
|
|
|
|
sbSB8Close
|
|
|
|
|
sbSB8Sp
|
|
|
|
|
sbSTerm
|
|
|
|
|
sbSB8aClose
|
|
|
|
|
sbSB8aSp
|
|
|
|
|
)
|
|
|
|
|
|
2024-01-30 17:47:01 +01:00
|
|
|
|
// sbTransitions implements the sentence break parser's state transitions. It's
|
|
|
|
|
// anologous to [grTransitions], see comments there for details.
|
|
|
|
|
//
|
|
|
|
|
// Unicode version 15.0.0.
|
|
|
|
|
func sbTransitions(state, prop int) (newState int, sentenceBreak bool, rule int) {
|
|
|
|
|
switch uint64(state) | uint64(prop)<<32 {
|
2022-08-02 08:19:38 +02:00
|
|
|
|
// SB3.
|
2024-01-30 17:47:01 +01:00
|
|
|
|
case sbAny | prCR<<32:
|
|
|
|
|
return sbCR, false, 9990
|
|
|
|
|
case sbCR | prLF<<32:
|
|
|
|
|
return sbParaSep, false, 30
|
2022-08-02 08:19:38 +02:00
|
|
|
|
|
|
|
|
|
// SB4.
|
2024-01-30 17:47:01 +01:00
|
|
|
|
case sbAny | prSep<<32:
|
|
|
|
|
return sbParaSep, false, 9990
|
|
|
|
|
case sbAny | prLF<<32:
|
|
|
|
|
return sbParaSep, false, 9990
|
|
|
|
|
case sbParaSep | prAny<<32:
|
|
|
|
|
return sbAny, true, 40
|
|
|
|
|
case sbCR | prAny<<32:
|
|
|
|
|
return sbAny, true, 40
|
2022-08-02 08:19:38 +02:00
|
|
|
|
|
|
|
|
|
// SB6.
|
2024-01-30 17:47:01 +01:00
|
|
|
|
case sbAny | prATerm<<32:
|
|
|
|
|
return sbATerm, false, 9990
|
|
|
|
|
case sbATerm | prNumeric<<32:
|
|
|
|
|
return sbAny, false, 60
|
|
|
|
|
case sbSB7 | prNumeric<<32:
|
|
|
|
|
return sbAny, false, 60 // Because ATerm also appears in SB7.
|
2022-08-02 08:19:38 +02:00
|
|
|
|
|
|
|
|
|
// SB7.
|
2024-01-30 17:47:01 +01:00
|
|
|
|
case sbAny | prUpper<<32:
|
|
|
|
|
return sbUpper, false, 9990
|
|
|
|
|
case sbAny | prLower<<32:
|
|
|
|
|
return sbLower, false, 9990
|
|
|
|
|
case sbUpper | prATerm<<32:
|
|
|
|
|
return sbSB7, false, 70
|
|
|
|
|
case sbLower | prATerm<<32:
|
|
|
|
|
return sbSB7, false, 70
|
|
|
|
|
case sbSB7 | prUpper<<32:
|
|
|
|
|
return sbUpper, false, 70
|
2022-08-02 08:19:38 +02:00
|
|
|
|
|
|
|
|
|
// SB8a.
|
2024-01-30 17:47:01 +01:00
|
|
|
|
case sbAny | prSTerm<<32:
|
|
|
|
|
return sbSTerm, false, 9990
|
|
|
|
|
case sbATerm | prSContinue<<32:
|
|
|
|
|
return sbAny, false, 81
|
|
|
|
|
case sbATerm | prATerm<<32:
|
|
|
|
|
return sbATerm, false, 81
|
|
|
|
|
case sbATerm | prSTerm<<32:
|
|
|
|
|
return sbSTerm, false, 81
|
|
|
|
|
case sbSB7 | prSContinue<<32:
|
|
|
|
|
return sbAny, false, 81
|
|
|
|
|
case sbSB7 | prATerm<<32:
|
|
|
|
|
return sbATerm, false, 81
|
|
|
|
|
case sbSB7 | prSTerm<<32:
|
|
|
|
|
return sbSTerm, false, 81
|
|
|
|
|
case sbSB8Close | prSContinue<<32:
|
|
|
|
|
return sbAny, false, 81
|
|
|
|
|
case sbSB8Close | prATerm<<32:
|
|
|
|
|
return sbATerm, false, 81
|
|
|
|
|
case sbSB8Close | prSTerm<<32:
|
|
|
|
|
return sbSTerm, false, 81
|
|
|
|
|
case sbSB8Sp | prSContinue<<32:
|
|
|
|
|
return sbAny, false, 81
|
|
|
|
|
case sbSB8Sp | prATerm<<32:
|
|
|
|
|
return sbATerm, false, 81
|
|
|
|
|
case sbSB8Sp | prSTerm<<32:
|
|
|
|
|
return sbSTerm, false, 81
|
|
|
|
|
case sbSTerm | prSContinue<<32:
|
|
|
|
|
return sbAny, false, 81
|
|
|
|
|
case sbSTerm | prATerm<<32:
|
|
|
|
|
return sbATerm, false, 81
|
|
|
|
|
case sbSTerm | prSTerm<<32:
|
|
|
|
|
return sbSTerm, false, 81
|
|
|
|
|
case sbSB8aClose | prSContinue<<32:
|
|
|
|
|
return sbAny, false, 81
|
|
|
|
|
case sbSB8aClose | prATerm<<32:
|
|
|
|
|
return sbATerm, false, 81
|
|
|
|
|
case sbSB8aClose | prSTerm<<32:
|
|
|
|
|
return sbSTerm, false, 81
|
|
|
|
|
case sbSB8aSp | prSContinue<<32:
|
|
|
|
|
return sbAny, false, 81
|
|
|
|
|
case sbSB8aSp | prATerm<<32:
|
|
|
|
|
return sbATerm, false, 81
|
|
|
|
|
case sbSB8aSp | prSTerm<<32:
|
|
|
|
|
return sbSTerm, false, 81
|
2022-08-02 08:19:38 +02:00
|
|
|
|
|
|
|
|
|
// SB9.
|
2024-01-30 17:47:01 +01:00
|
|
|
|
case sbATerm | prClose<<32:
|
|
|
|
|
return sbSB8Close, false, 90
|
|
|
|
|
case sbSB7 | prClose<<32:
|
|
|
|
|
return sbSB8Close, false, 90
|
|
|
|
|
case sbSB8Close | prClose<<32:
|
|
|
|
|
return sbSB8Close, false, 90
|
|
|
|
|
case sbATerm | prSp<<32:
|
|
|
|
|
return sbSB8Sp, false, 90
|
|
|
|
|
case sbSB7 | prSp<<32:
|
|
|
|
|
return sbSB8Sp, false, 90
|
|
|
|
|
case sbSB8Close | prSp<<32:
|
|
|
|
|
return sbSB8Sp, false, 90
|
|
|
|
|
case sbSTerm | prClose<<32:
|
|
|
|
|
return sbSB8aClose, false, 90
|
|
|
|
|
case sbSB8aClose | prClose<<32:
|
|
|
|
|
return sbSB8aClose, false, 90
|
|
|
|
|
case sbSTerm | prSp<<32:
|
|
|
|
|
return sbSB8aSp, false, 90
|
|
|
|
|
case sbSB8aClose | prSp<<32:
|
|
|
|
|
return sbSB8aSp, false, 90
|
|
|
|
|
case sbATerm | prSep<<32:
|
|
|
|
|
return sbParaSep, false, 90
|
|
|
|
|
case sbATerm | prCR<<32:
|
|
|
|
|
return sbParaSep, false, 90
|
|
|
|
|
case sbATerm | prLF<<32:
|
|
|
|
|
return sbParaSep, false, 90
|
|
|
|
|
case sbSB7 | prSep<<32:
|
|
|
|
|
return sbParaSep, false, 90
|
|
|
|
|
case sbSB7 | prCR<<32:
|
|
|
|
|
return sbParaSep, false, 90
|
|
|
|
|
case sbSB7 | prLF<<32:
|
|
|
|
|
return sbParaSep, false, 90
|
|
|
|
|
case sbSB8Close | prSep<<32:
|
|
|
|
|
return sbParaSep, false, 90
|
|
|
|
|
case sbSB8Close | prCR<<32:
|
|
|
|
|
return sbParaSep, false, 90
|
|
|
|
|
case sbSB8Close | prLF<<32:
|
|
|
|
|
return sbParaSep, false, 90
|
|
|
|
|
case sbSTerm | prSep<<32:
|
|
|
|
|
return sbParaSep, false, 90
|
|
|
|
|
case sbSTerm | prCR<<32:
|
|
|
|
|
return sbParaSep, false, 90
|
|
|
|
|
case sbSTerm | prLF<<32:
|
|
|
|
|
return sbParaSep, false, 90
|
|
|
|
|
case sbSB8aClose | prSep<<32:
|
|
|
|
|
return sbParaSep, false, 90
|
|
|
|
|
case sbSB8aClose | prCR<<32:
|
|
|
|
|
return sbParaSep, false, 90
|
|
|
|
|
case sbSB8aClose | prLF<<32:
|
|
|
|
|
return sbParaSep, false, 90
|
2022-08-02 08:19:38 +02:00
|
|
|
|
|
|
|
|
|
// SB10.
|
2024-01-30 17:47:01 +01:00
|
|
|
|
case sbSB8Sp | prSp<<32:
|
|
|
|
|
return sbSB8Sp, false, 100
|
|
|
|
|
case sbSB8aSp | prSp<<32:
|
|
|
|
|
return sbSB8aSp, false, 100
|
|
|
|
|
case sbSB8Sp | prSep<<32:
|
|
|
|
|
return sbParaSep, false, 100
|
|
|
|
|
case sbSB8Sp | prCR<<32:
|
|
|
|
|
return sbParaSep, false, 100
|
|
|
|
|
case sbSB8Sp | prLF<<32:
|
|
|
|
|
return sbParaSep, false, 100
|
2022-08-02 08:19:38 +02:00
|
|
|
|
|
|
|
|
|
// SB11.
|
2024-01-30 17:47:01 +01:00
|
|
|
|
case sbATerm | prAny<<32:
|
|
|
|
|
return sbAny, true, 110
|
|
|
|
|
case sbSB7 | prAny<<32:
|
|
|
|
|
return sbAny, true, 110
|
|
|
|
|
case sbSB8Close | prAny<<32:
|
|
|
|
|
return sbAny, true, 110
|
|
|
|
|
case sbSB8Sp | prAny<<32:
|
|
|
|
|
return sbAny, true, 110
|
|
|
|
|
case sbSTerm | prAny<<32:
|
|
|
|
|
return sbAny, true, 110
|
|
|
|
|
case sbSB8aClose | prAny<<32:
|
|
|
|
|
return sbAny, true, 110
|
|
|
|
|
case sbSB8aSp | prAny<<32:
|
|
|
|
|
return sbAny, true, 110
|
2022-08-02 08:19:38 +02:00
|
|
|
|
// We'll always break after ParaSep due to SB4.
|
2024-01-30 17:47:01 +01:00
|
|
|
|
|
|
|
|
|
default:
|
|
|
|
|
return -1, false, -1
|
|
|
|
|
}
|
2022-08-02 08:19:38 +02:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// transitionSentenceBreakState determines the new state of the sentence break
|
|
|
|
|
// parser given the current state and the next code point. It also returns
|
|
|
|
|
// whether a sentence boundary was detected. If more than one code point is
|
|
|
|
|
// needed to determine the new state, the byte slice or the string starting
|
|
|
|
|
// after rune "r" can be used (whichever is not nil or empty) for further
|
|
|
|
|
// lookups.
|
|
|
|
|
func transitionSentenceBreakState(state int, r rune, b []byte, str string) (newState int, sentenceBreak bool) {
|
|
|
|
|
// Determine the property of the next character.
|
|
|
|
|
nextProperty := property(sentenceBreakCodePoints, r)
|
|
|
|
|
|
|
|
|
|
// SB5 (Replacing Ignore Rules).
|
|
|
|
|
if nextProperty == prExtend || nextProperty == prFormat {
|
|
|
|
|
if state == sbParaSep || state == sbCR {
|
|
|
|
|
return sbAny, true // Make sure we don't apply SB5 to SB3 or SB4.
|
|
|
|
|
}
|
|
|
|
|
if state < 0 {
|
|
|
|
|
return sbAny, true // SB1.
|
|
|
|
|
}
|
|
|
|
|
return state, false
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Find the applicable transition in the table.
|
|
|
|
|
var rule int
|
2024-01-30 17:47:01 +01:00
|
|
|
|
newState, sentenceBreak, rule = sbTransitions(state, nextProperty)
|
|
|
|
|
if newState < 0 {
|
2022-08-02 08:19:38 +02:00
|
|
|
|
// No specific transition found. Try the less specific ones.
|
2024-01-30 17:47:01 +01:00
|
|
|
|
anyPropState, anyPropProp, anyPropRule := sbTransitions(state, prAny)
|
|
|
|
|
anyStateState, anyStateProp, anyStateRule := sbTransitions(sbAny, nextProperty)
|
|
|
|
|
if anyPropState >= 0 && anyStateState >= 0 {
|
2022-08-02 08:19:38 +02:00
|
|
|
|
// Both apply. We'll use a mix (see comments for grTransitions).
|
2024-01-30 17:47:01 +01:00
|
|
|
|
newState, sentenceBreak, rule = anyStateState, anyStateProp, anyStateRule
|
|
|
|
|
if anyPropRule < anyStateRule {
|
|
|
|
|
sentenceBreak, rule = anyPropProp, anyPropRule
|
2022-08-02 08:19:38 +02:00
|
|
|
|
}
|
2024-01-30 17:47:01 +01:00
|
|
|
|
} else if anyPropState >= 0 {
|
2022-08-02 08:19:38 +02:00
|
|
|
|
// We only have a specific state.
|
2024-01-30 17:47:01 +01:00
|
|
|
|
newState, sentenceBreak, rule = anyPropState, anyPropProp, anyPropRule
|
2022-08-02 08:19:38 +02:00
|
|
|
|
// This branch will probably never be reached because okAnyState will
|
|
|
|
|
// always be true given the current transition map. But we keep it here
|
|
|
|
|
// for future modifications to the transition map where this may not be
|
|
|
|
|
// true anymore.
|
2024-01-30 17:47:01 +01:00
|
|
|
|
} else if anyStateState >= 0 {
|
2022-08-02 08:19:38 +02:00
|
|
|
|
// We only have a specific property.
|
2024-01-30 17:47:01 +01:00
|
|
|
|
newState, sentenceBreak, rule = anyStateState, anyStateProp, anyStateRule
|
2022-08-02 08:19:38 +02:00
|
|
|
|
} else {
|
|
|
|
|
// No known transition. SB999: Any × Any.
|
|
|
|
|
newState, sentenceBreak, rule = sbAny, false, 9990
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// SB8.
|
|
|
|
|
if rule > 80 && (state == sbATerm || state == sbSB8Close || state == sbSB8Sp || state == sbSB7) {
|
|
|
|
|
// Check the right side of the rule.
|
|
|
|
|
var length int
|
|
|
|
|
for nextProperty != prOLetter &&
|
|
|
|
|
nextProperty != prUpper &&
|
|
|
|
|
nextProperty != prLower &&
|
|
|
|
|
nextProperty != prSep &&
|
|
|
|
|
nextProperty != prCR &&
|
|
|
|
|
nextProperty != prLF &&
|
|
|
|
|
nextProperty != prATerm &&
|
|
|
|
|
nextProperty != prSTerm {
|
|
|
|
|
// Move on to the next rune.
|
|
|
|
|
if b != nil { // Byte slice version.
|
|
|
|
|
r, length = utf8.DecodeRune(b)
|
|
|
|
|
b = b[length:]
|
|
|
|
|
} else { // String version.
|
|
|
|
|
r, length = utf8.DecodeRuneInString(str)
|
|
|
|
|
str = str[length:]
|
|
|
|
|
}
|
|
|
|
|
if r == utf8.RuneError {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
nextProperty = property(sentenceBreakCodePoints, r)
|
|
|
|
|
}
|
|
|
|
|
if nextProperty == prLower {
|
|
|
|
|
return sbLower, false
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return
|
|
|
|
|
}
|