VictoriaMetrics/vendor/github.com/antzucaro/matchr/metaphone.go
2022-05-20 14:48:23 +03:00

722 lines
18 KiB
Go

package matchr
import (
"bytes"
"strings"
)
// metaphoneresult accumulates the primary and alternate encodings while an
// input string is being processed. Codes are appended through add() and read
// back, truncated to maxLength, through result().
type metaphoneresult struct {
	// the maximum number of code values to calculate
	maxLength int
	// whether to calculate an alternate
	calcAlternate bool
	// no direct modifications - only through add()
	primary   bytes.Buffer
	alternate bytes.Buffer
	// length of the private buffers
	PrimaryLength   int
	AlternateLength int
}

// newMetaphoneresult returns an empty result that keeps at most maxLength
// code characters per encoding. The alternate encoding is only collected
// when calcAlternate is true.
func newMetaphoneresult(maxLength int, calcAlternate bool) *metaphoneresult {
	return &metaphoneresult{maxLength: maxLength, calcAlternate: calcAlternate}
}

// add appends c1 to the primary encoding and, when an alternate is being
// calculated, c2 to the alternate encoding. Empty strings are ignored, which
// lets a caller extend only one of the two encodings.
func (r *metaphoneresult) add(c1 string, c2 string) {
	if c1 != "" {
		r.primary.WriteString(c1)
		r.PrimaryLength += len(c1)
	}
	if c2 != "" && r.calcAlternate {
		r.alternate.WriteString(c2)
		r.AlternateLength += len(c2)
	}
}

// isComplete reports whether every encoding being calculated has reached
// maxLength, i.e. whether further input can no longer change result().
func (r *metaphoneresult) isComplete() bool {
	if r.PrimaryLength < r.maxLength {
		return false
	}
	// Without an alternate the alternate buffer never grows, so it must not
	// hold completion back.
	return !r.calcAlternate || r.AlternateLength >= r.maxLength
}

// result returns the primary and alternate encodings, each cut down to at
// most maxLength characters. A negative maxLength is treated as zero.
func (r *metaphoneresult) result() (primary string, alternate string) {
	limit := r.maxLength
	if limit < 0 {
		// Slicing with a negative bound would panic.
		limit = 0
	}
	primary = r.primary.String()
	if len(primary) > limit {
		primary = primary[:limit]
	}
	alternate = r.alternate.String()
	if len(alternate) > limit {
		alternate = alternate[:limit]
	}
	return primary, alternate
}
// utility functions for checking things within a string
// isSlavoGermanic reports whether value contains any of the substrings
// "W", "K", "CZ" or "WITZ". The comparison is case-sensitive.
func isSlavoGermanic(value string) bool {
	for _, marker := range [...]string{"W", "K", "CZ", "WITZ"} {
		if strings.Contains(value, marker) {
			return true
		}
	}
	return false
}
// isSilentStart reports whether the first two characters of input are one of
// GN, KN, PN, WR or PS.
func isSilentStart(input runestring) bool {
	switch input.SafeSubstr(0, 2) {
	case "GN", "KN", "PN", "WR", "PS":
		return true
	}
	return false
}
// handleVowel encodes a vowel as "A" in both encodings, but only when it is
// the first character (index 0); anywhere else it emits nothing. It always
// returns index + 1.
func handleVowel(result *metaphoneresult, index int) int {
	next := index + 1
	if index != 0 {
		return next
	}
	result.add("A", "A")
	return next
}
/******************************************************************************
* Entry handlers for letters.
*****************************************************************************/
// handleC encodes the 'C' at index and returns the index at which processing
// should continue. The cases are tried in order; the first match wins.
func handleC(input runestring, result *metaphoneresult, index int) int {
	switch {
	case conditionC0(input, index):
		result.add("K", "K")
		return index + 2
	case index == 0 && input.Contains(index, 6, "CAESAR"):
		result.add("S", "S")
		return index + 2
	case input.Contains(index, 2, "CH"):
		return handleCH(input, result, index)
	case input.Contains(index, 2, "CZ") && !input.Contains(index-2, 4, "WICZ"):
		result.add("S", "X")
		return index + 2
	case input.Contains(index+1, 3, "CIA"):
		result.add("X", "X")
		return index + 3
	case input.Contains(index, 2, "CC") && !(index == 1 && input.SafeAt(0) == 'M'):
		// "CC", except directly after a leading 'M'
		return handleCC(input, result, index)
	case input.Contains(index, 2, "CK", "CG", "CQ"):
		result.add("K", "K")
		return index + 2
	case input.Contains(index, 2, "CI", "CE", "CY"):
		if input.Contains(index, 3, "CIO", "CIE", "CIA") {
			result.add("S", "X")
		} else {
			result.add("S", "S")
		}
		return index + 2
	}

	// Everything else is a hard C; only the number of characters consumed
	// depends on what follows.
	result.add("K", "K")
	switch {
	case input.Contains(index+1, 2, " C", " Q", " G"):
		return index + 3
	case input.Contains(index+1, 1, "C", "K", "Q") &&
		!input.Contains(index+1, 2, "CE", "CI"):
		return index + 2
	default:
		return index + 1
	}
}
// handleCC encodes a "CC" pair starting at index and returns the index at
// which processing should continue.
func handleCC(input runestring, result *metaphoneresult, index int) int {
	// "CC" followed by I, E or H (but not "HU") is not a plain K.
	special := input.Contains(index+2, 1, "I", "E", "H") &&
		!input.Contains(index+2, 2, "HU")
	if !special {
		result.add("K", "K")
		return index + 2
	}

	if (index == 1 && input.SafeAt(0) == 'A') ||
		input.Contains(index-1, 5, "UCCEE", "UCCES") {
		result.add("KS", "KS")
	} else {
		result.add("X", "X")
	}
	return index + 3
}
// handleCH encodes a "CH" pair starting at index. Every case consumes two
// characters, so the function always returns index + 2.
func handleCH(input runestring, result *metaphoneresult, index int) int {
	switch {
	case index > 0 && input.Contains(index, 4, "CHAE"):
		result.add("K", "X")
	case conditionCH0(input, index) || conditionCH1(input, index):
		result.add("K", "K")
	case index > 0 && input.Contains(0, 2, "MC"):
		result.add("K", "K")
	case index > 0:
		result.add("X", "K")
	default:
		result.add("X", "X")
	}
	return index + 2
}
// handleD encodes the 'D' at index and returns the index at which processing
// should continue.
func handleD(input runestring, result *metaphoneresult, index int) int {
	if !input.Contains(index, 2, "DG") {
		result.add("T", "T")
		// "DT" and "DD" are consumed as a single sound
		if input.Contains(index, 2, "DT", "DD") {
			return index + 2
		}
		return index + 1
	}

	if input.Contains(index+2, 1, "I", "E", "Y") {
		result.add("J", "J")
		return index + 3
	}
	result.add("TK", "TK")
	return index + 2
}
// handleG encodes the 'G' at index and returns the index at which processing
// should continue. slavoGermanic switches off several of the special cases.
// The cases are tried in order; the first match wins.
func handleG(input runestring, result *metaphoneresult, index int, slavoGermanic bool) int {
	next := input.SafeAt(index + 1)

	switch {
	case next == 'H':
		return handleGH(input, result, index)

	case next == 'N':
		switch {
		case index == 1 && isVowel(input.SafeAt(0)) && !slavoGermanic:
			result.add("KN", "N")
		// NOTE: next is 'N' in this branch, so the 'Y' test is always true.
		case !input.Contains(index+2, 2, "EY") && next != 'Y' && !slavoGermanic:
			result.add("N", "KN")
		default:
			result.add("KN", "KN")
		}
		return index + 2

	case input.Contains(index+1, 2, "LI") && !slavoGermanic:
		result.add("KL", "L")
		return index + 2

	case index == 0 && (next == 'Y' ||
		input.Contains(index+1, 2, "ES", "EP", "EB", "EL", "EY", "IB", "IL", "IN", "IE", "EI", "ER")):
		result.add("K", "J")
		return index + 2

	case (input.Contains(index+1, 2, "ER") || next == 'Y') &&
		!input.Contains(0, 6, "DANGER", "RANGER", "MANGER") &&
		!input.Contains(index-1, 1, "E", "I") &&
		!input.Contains(index-1, 3, "RGY", "OGY"):
		result.add("K", "J")
		return index + 2

	case input.Contains(index+1, 1, "E", "I", "Y") ||
		input.Contains(index-1, 4, "AGGI", "OGGI"):
		switch {
		case input.Contains(0, 4, "VAN ", "VON ") ||
			input.Contains(0, 3, "SCH") ||
			input.Contains(index+1, 2, "ET"):
			result.add("K", "K")
		case input.Contains(index+1, 3, "IER"):
			result.add("J", "J")
		default:
			result.add("J", "K")
		}
		return index + 2

	case next == 'G':
		result.add("K", "K")
		return index + 2
	}

	result.add("K", "K")
	return index + 1
}
// handleGH encodes a "GH" pair starting at index. Every case consumes two
// characters, so the function always returns index + 2; some cases emit no
// code at all.
func handleGH(input runestring, result *metaphoneresult, index int) int {
	switch {
	case index > 0 && !isVowel(input.SafeAt(index-1)):
		result.add("K", "K")

	case index == 0:
		if input.SafeAt(index+2) == 'I' {
			result.add("J", "J")
		} else {
			result.add("K", "K")
		}

	case (index > 1 && input.Contains(index-2, 1, "B", "H", "D")) ||
		(index > 2 && input.Contains(index-3, 1, "B", "H", "D")) ||
		(index > 3 && input.Contains(index-4, 1, "B", "H")):
		// nothing is emitted for this case

	case index > 2 && input.SafeAt(index-1) == 'U' &&
		input.Contains(index-3, 1, "C", "G", "L", "R", "T"):
		result.add("F", "F")

	case index > 0 && input.SafeAt(index-1) != 'I':
		result.add("K", "K")
	}
	return index + 2
}
// handleH encodes the 'H' at index. "H" is emitted only when the H is
// followed by a vowel and is either the first character or preceded by a
// vowel; in that case two characters are consumed, otherwise one.
func handleH(input runestring, result *metaphoneresult, index int) int {
	leadOK := index == 0 || isVowel(input.SafeAt(index-1))
	if !leadOK || !isVowel(input.SafeAt(index+1)) {
		return index + 1
	}
	result.add("H", "H")
	return index + 2
}
// handleJ encodes the 'J' at index and returns the index at which processing
// should continue. slavoGermanic disables the J/H alternate between vowels.
func handleJ(input runestring, result *metaphoneresult, index int, slavoGermanic bool) int {
	leadingSan := input.Contains(0, 4, "SAN ")

	if input.Contains(index, 4, "JOSE") || leadingSan {
		wordAtStart := index == 0 && input.SafeAt(index+4) == ' '
		if wordAtStart || len(input) == 4 || leadingSan {
			result.add("H", "H")
		} else {
			result.add("J", "H")
		}
		return index + 1
	}

	switch {
	case index == 0:
		// "JOSE" at this index was already ruled out above
		result.add("J", "A")
	case isVowel(input.SafeAt(index-1)) && !slavoGermanic &&
		(input.SafeAt(index+1) == 'A' || input.SafeAt(index+1) == 'O'):
		result.add("J", "H")
	case index == len(input)-1:
		result.add("J", " ")
	case !input.Contains(index+1, 1, "L", "T", "K", "S", "N", "M", "B", "Z") &&
		!input.Contains(index-1, 1, "S", "K", "L"):
		result.add("J", "J")
	}

	// a doubled J is consumed as one
	if input.SafeAt(index+1) == 'J' {
		return index + 2
	}
	return index + 1
}
// handleL encodes the 'L' at index. A single L yields "L" in both encodings;
// for "LL" the alternate is left empty when conditionL0 holds. It returns
// the index at which processing should continue.
func handleL(input runestring, result *metaphoneresult, index int) int {
	if input.SafeAt(index+1) != 'L' {
		result.add("L", "L")
		return index + 1
	}

	alternate := "L"
	if conditionL0(input, index) {
		alternate = ""
	}
	result.add("L", alternate)
	return index + 2
}
// handleP encodes the 'P' at index: "PH" becomes "F", anything else "P".
// A following P or B is consumed together with the P. It returns the index
// at which processing should continue.
func handleP(input runestring, result *metaphoneresult, index int) int {
	if input.SafeAt(index+1) == 'H' {
		result.add("F", "F")
		return index + 2
	}

	result.add("P", "P")
	if input.Contains(index+1, 1, "P", "B") {
		return index + 2
	}
	return index + 1
}
// handleR encodes the 'R' at index. The alternate always receives "R"; the
// primary receives nothing for a final R preceded by "IE" (unless that "IE"
// follows "ME" or "MA") in non-Slavo-Germanic input. A doubled R is consumed
// as one. It returns the index at which processing should continue.
func handleR(input runestring, result *metaphoneresult, index int, slavoGermanic bool) int {
	dropPrimary := index == len(input)-1 &&
		!slavoGermanic &&
		input.Contains(index-2, 2, "IE") &&
		!input.Contains(index-4, 2, "ME", "MA")

	primary := "R"
	if dropPrimary {
		primary = ""
	}
	result.add(primary, "R")

	step := 1
	if input.SafeAt(index+1) == 'R' {
		step = 2
	}
	return index + step
}
// handleS encodes the 'S' at index and returns the index at which processing
// should continue. The cases are tried in order; the first match wins.
func handleS(input runestring, result *metaphoneresult, index int, slavoGermanic bool) int {
	switch {
	case input.Contains(index-1, 3, "ISL", "YSL"):
		// nothing is emitted for the S in "ISL" / "YSL"
		return index + 1

	case index == 0 && input.Contains(index, 5, "SUGAR"):
		result.add("X", "S")
		return index + 1

	case input.Contains(index, 2, "SH"):
		if input.Contains(index+1, 4, "HEIM", "HOEK", "HOLM", "HOLZ") {
			result.add("S", "S")
		} else {
			result.add("X", "X")
		}
		return index + 2

	case input.Contains(index, 3, "SIO", "SIA") || input.Contains(index, 4, "SIAN"):
		if slavoGermanic {
			result.add("S", "S")
		} else {
			result.add("S", "X")
		}
		return index + 3

	case (index == 0 && input.Contains(index+1, 1, "M", "N", "L", "W")) ||
		input.Contains(index+1, 1, "Z"):
		result.add("S", "X")
		if input.Contains(index+1, 1, "Z") {
			return index + 2
		}
		return index + 1

	case input.Contains(index, 2, "SC"):
		return handleSC(input, result, index)
	}

	// A final S after "AI" or "OI" only reaches the alternate encoding.
	if index == len(input)-1 && input.Contains(index-2, 2, "AI", "OI") {
		result.add("", "S")
	} else {
		result.add("S", "S")
	}
	if input.Contains(index+1, 1, "S", "Z") {
		return index + 2
	}
	return index + 1
}
// handleSC encodes an "SC" pair starting at index. Every case consumes three
// characters, so the function always returns index + 3.
func handleSC(input runestring, result *metaphoneresult, index int) int {
	switch {
	case input.SafeAt(index+2) != 'H':
		if input.Contains(index+2, 1, "I", "E", "Y") {
			result.add("S", "S")
		} else {
			result.add("SK", "SK")
		}

	// From here on the input is "SCH" followed by something.
	case input.Contains(index+3, 2, "ER", "EN"):
		result.add("X", "SK")
	case input.Contains(index+3, 2, "OO", "UY", "ED", "EM"):
		result.add("SK", "SK")
	case index == 0 && !isVowel(input.SafeAt(3)) && input.SafeAt(3) != 'W':
		result.add("X", "S")
	default:
		result.add("X", "X")
	}
	return index + 3
}
// handleT encodes the 'T' at index and returns the index at which processing
// should continue.
func handleT(input runestring, result *metaphoneresult, index int) int {
	switch {
	case input.Contains(index, 4, "TION") || input.Contains(index, 3, "TIA", "TCH"):
		result.add("X", "X")
		return index + 3

	case input.Contains(index, 2, "TH") || input.Contains(index, 3, "TTH"):
		plainT := input.Contains(index+2, 2, "OM", "AM") ||
			input.Contains(0, 4, "VAN ", "VON ") ||
			input.Contains(0, 3, "SCH")
		if plainT {
			result.add("T", "T")
		} else {
			result.add("0", "T")
		}
		return index + 2
	}

	result.add("T", "T")
	// a following T or D is consumed together with the T
	if input.Contains(index+1, 1, "T", "D") {
		return index + 2
	}
	return index + 1
}
// handleW encodes the 'W' at index and returns the index at which processing
// should continue. A W that matches none of the cases emits nothing.
func handleW(input runestring, result *metaphoneresult, index int) int {
	if input.Contains(index, 2, "WR") {
		result.add("R", "R")
		return index + 2
	}

	if index == 0 {
		if isVowel(input.SafeAt(index + 1)) {
			result.add("A", "F")
			return index + 1
		}
		if input.Contains(index, 2, "WH") {
			result.add("A", "A")
			return index + 1
		}
	}

	switch {
	case (index == len(input)-1 && isVowel(input.SafeAt(index-1))) ||
		input.Contains(index-1, 5, "EWSKI", "EWSKY", "OWSKI", "OWSKY") ||
		input.Contains(0, 3, "SCH"):
		// only the alternate encoding gets a code
		result.add("", "F")
		return index + 1
	case input.Contains(index, 4, "WICZ", "WITZ"):
		result.add("TS", "FX")
		return index + 4
	}
	return index + 1
}
// handleX encodes the 'X' at index: "S" at the start of the input, otherwise
// "KS" unless it is a final X after "IAU", "EAU", "AU" or "OU". A following
// C or X is consumed together with the X. It returns the index at which
// processing should continue.
func handleX(input runestring, result *metaphoneresult, index int) int {
	if index == 0 {
		result.add("S", "S")
		return index + 1
	}

	silentEnding := index == len(input)-1 &&
		(input.Contains(index-3, 3, "IAU", "EAU") ||
			input.Contains(index-2, 2, "AU", "OU"))
	if !silentEnding {
		result.add("KS", "KS")
	}

	if input.Contains(index+1, 1, "C", "X") {
		return index + 2
	}
	return index + 1
}
// handleZ encodes the 'Z' at index and returns the index at which processing
// should continue. A doubled Z is consumed as one; in every other case,
// including "ZH", only the Z itself is consumed.
func handleZ(input runestring, result *metaphoneresult, index int, slavoGermanic bool) int {
	next := input.SafeAt(index + 1)

	switch {
	case next == 'H':
		result.add("J", "J")
	case input.Contains(index+1, 2, "ZO", "ZI", "ZA") ||
		(slavoGermanic && index > 0 && input.SafeAt(index-1) != 'T'):
		result.add("S", "TS")
	default:
		result.add("S", "S")
	}

	if next == 'Z' {
		return index + 2
	}
	return index + 1
}
/******************************************************************************
* Complex conditional handlers for letters
*****************************************************************************/
// conditionC0 is the first test applied by handleC. It is true for "CHIA" at
// index, and for an "ACH" around index that is at least two characters into
// the input, is not preceded by a vowel, and is either not followed by I or
// E or is part of "BACHER" / "MACHER".
func conditionC0(input runestring, index int) bool {
	if input.Contains(index, 4, "CHIA") {
		return true
	}
	if index <= 1 ||
		isVowel(input.SafeAt(index-2)) ||
		!input.Contains(index-1, 3, "ACH") {
		return false
	}

	following := input.SafeAt(index + 2)
	if following != 'I' && following != 'E' {
		return true
	}
	return input.Contains(index-2, 6, "BACHER", "MACHER")
}
// conditionCH0 is true only at index 0, when the "C" is followed by "HARAC",
// "HARIS", "HOR", "HYM", "HIA" or "HEM" and the input does not start with
// "CHORE".
func conditionCH0(input runestring, index int) bool {
	if index != 0 {
		return false
	}
	listed := input.Contains(index+1, 5, "HARAC", "HARIS") ||
		input.Contains(index+1, 3, "HOR", "HYM", "HIA", "HEM")
	return listed && !input.Contains(0, 5, "CHORE")
}
// conditionCH1 is the second K-sound test handleCH applies to a "CH" at
// index. Any one of the checks below is sufficient.
func conditionCH1(input runestring, index int) bool {
	// prefixes of the whole input
	if input.Contains(0, 4, "VAN ", "VON ") || input.Contains(0, 3, "SCH") {
		return true
	}
	// specific sequences around the CH
	if input.Contains(index-2, 6, "ORCHES", "ARCHIT", "ORCHID") {
		return true
	}
	// CH directly followed by T or S
	if input.Contains(index+2, 1, "T", "S") {
		return true
	}

	// The last check needs the CH to start the input or follow A, O, U or E ...
	if index != 0 && !input.Contains(index-1, 1, "A", "O", "U", "E") {
		return false
	}
	// ... and to be followed by one of these characters or to end the input.
	return input.Contains(index+2, 1, "L", "R", "N", "M", "B", "H", "F", "V", "W", " ") ||
		index+1 == len(input)-1
}
// conditionL0 decides whether handleL leaves the alternate encoding empty for
// an "LL" at index. It is true for "ILLO", "ILLA" or "ALLE" ending the input,
// and for "ALLE" anywhere when the input ends in "AS", "OS", "A" or "O".
func conditionL0(input runestring, index int) bool {
	if index == len(input)-3 &&
		input.Contains(index-1, 4, "ILLO", "ILLA", "ALLE") {
		return true
	}

	size := len(input)
	listedEnding := input.Contains(size-2, 2, "AS", "OS") ||
		input.Contains(size-1, 1, "A", "O")
	return listedEnding && input.Contains(index-1, 4, "ALLE")
}
// conditionM0 reports whether the character after the 'M' at index should be
// consumed together with it: a second M, or the B of "UMB" when that B ends
// the input or is followed by "ER".
func conditionM0(input runestring, index int) bool {
	switch {
	case input.SafeAt(index+1) == 'M':
		return true
	case !input.Contains(index-1, 3, "UMB"):
		return false
	}
	return index+1 == len(input)-1 || input.Contains(index+2, 2, "ER")
}
// DoubleMetaphone returns the primary and alternate Double-Metaphone codes
// of s1, each at most four characters long. The codes are a phonetic
// representation of how the string sounds, with affordances for many
// different language dialects. The algorithm was originally developed by
// Lawrence Phillips in the 1990s.
//
// More information about this algorithm can be found on Wikipedia at
// http://en.wikipedia.org/wiki/Metaphone.
func DoubleMetaphone(s1 string) (string, string) {
	// normalise the input before looking at it
	s1 = cleanInput(s1)

	// traverse the string by code point, not by byte
	input := runestring(s1)

	slavoGermanic := isSlavoGermanic(s1)

	// current position in input; a silent first letter is skipped
	index := 0
	if isSilentStart(input) {
		index = 1
	}

	result := newMetaphoneresult(4, true)

	// emit records the same code in both encodings and returns the position
	// to continue from, stepping over an immediately repeated letter.
	emit := func(code string, doubled rune) int {
		result.add(code, code)
		if rune(input.SafeAt(index+1)) == doubled {
			return index + 2
		}
		return index + 1
	}

	for index < len(input) && !result.isComplete() {
		switch rune(input.SafeAt(index)) {
		case 'A', 'E', 'I', 'O', 'U', 'Y':
			index = handleVowel(result, index)

		// letters with one fixed code
		case 'B':
			index = emit("P", 'B')
		case 'F':
			index = emit("F", 'F')
		case 'K':
			index = emit("K", 'K')
		case 'N':
			index = emit("N", 'N')
		case 'Q':
			index = emit("K", 'Q')
		case 'V':
			index = emit("F", 'V')
		case 'Ç':
			result.add("S", "S")
			index++
		case 'Ñ':
			result.add("N", "N")
			index++
		case 'M':
			result.add("M", "M")
			if conditionM0(input, index) {
				index += 2
			} else {
				index++
			}

		// letters whose code depends on their surroundings
		case 'C':
			index = handleC(input, result, index)
		case 'D':
			index = handleD(input, result, index)
		case 'G':
			index = handleG(input, result, index, slavoGermanic)
		case 'H':
			index = handleH(input, result, index)
		case 'J':
			index = handleJ(input, result, index, slavoGermanic)
		case 'L':
			index = handleL(input, result, index)
		case 'P':
			index = handleP(input, result, index)
		case 'R':
			index = handleR(input, result, index, slavoGermanic)
		case 'S':
			index = handleS(input, result, index, slavoGermanic)
		case 'T':
			index = handleT(input, result, index)
		case 'W':
			index = handleW(input, result, index)
		case 'X':
			index = handleX(input, result, index)
		case 'Z':
			index = handleZ(input, result, index, slavoGermanic)

		default:
			// any other character contributes nothing
			index++
		}
	}

	return result.result()
}