mirror of
https://github.com/VictoriaMetrics/VictoriaMetrics.git
synced 2024-12-22 16:36:27 +01:00
154 lines
2.4 KiB
Go
154 lines
2.4 KiB
Go
|
package logstorage
|
||
|
|
||
|
import (
|
||
|
"sort"
|
||
|
"sync"
|
||
|
"unicode"
|
||
|
)
|
||
|
|
||
|
// tokenizeStrings extracts word tokens from a, appends them to dst and returns the result.
|
||
|
func tokenizeStrings(dst, a []string) []string {
|
||
|
t := getTokenizer()
|
||
|
m := t.m
|
||
|
for i, s := range a {
|
||
|
if i > 0 && s == a[i-1] {
|
||
|
// This string has been already tokenized
|
||
|
continue
|
||
|
}
|
||
|
tokenizeString(m, s)
|
||
|
}
|
||
|
dstLen := len(dst)
|
||
|
for k := range t.m {
|
||
|
dst = append(dst, k)
|
||
|
}
|
||
|
putTokenizer(t)
|
||
|
|
||
|
// Sort tokens with zero memory allocations
|
||
|
ss := getStringsSorter(dst[dstLen:])
|
||
|
sort.Sort(ss)
|
||
|
putStringsSorter(ss)
|
||
|
|
||
|
return dst
|
||
|
}
|
||
|
|
||
|
// tokenizer accumulates a set of unique tokens.
type tokenizer struct {
	// m holds the collected tokens as keys; the values carry no data.
	m map[string]struct{}
}

// reset removes every token from t.m while keeping the map itself,
// so the tokenizer can be filled again.
func (t *tokenizer) reset() {
	for token := range t.m {
		delete(t.m, token)
	}
}
|
||
|
|
||
|
func tokenizeString(dst map[string]struct{}, s string) {
|
||
|
for len(s) > 0 {
|
||
|
// Search for the next token.
|
||
|
nextIdx := len(s)
|
||
|
for i, c := range s {
|
||
|
if isTokenRune(c) {
|
||
|
nextIdx = i
|
||
|
break
|
||
|
}
|
||
|
}
|
||
|
s = s[nextIdx:]
|
||
|
// Search for the end of the token
|
||
|
nextIdx = len(s)
|
||
|
for i, c := range s {
|
||
|
if !isTokenRune(c) {
|
||
|
nextIdx = i
|
||
|
break
|
||
|
}
|
||
|
}
|
||
|
token := s[:nextIdx]
|
||
|
if len(token) > 0 {
|
||
|
dst[token] = struct{}{}
|
||
|
}
|
||
|
s = s[nextIdx:]
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// isTokenRune reports whether c may be a part of a token:
// an underscore, a Unicode letter or a Unicode decimal digit.
func isTokenRune(c rune) bool {
	switch {
	case unicode.IsLetter(c):
		return true
	case unicode.IsDigit(c):
		return true
	default:
		return c == '_'
	}
}
|
||
|
|
||
|
func getTokenizer() *tokenizer {
|
||
|
v := tokenizerPool.Get()
|
||
|
if v == nil {
|
||
|
return &tokenizer{
|
||
|
m: make(map[string]struct{}),
|
||
|
}
|
||
|
}
|
||
|
return v.(*tokenizer)
|
||
|
}
|
||
|
|
||
|
func putTokenizer(t *tokenizer) {
|
||
|
t.reset()
|
||
|
tokenizerPool.Put(t)
|
||
|
}
|
||
|
|
||
|
// tokenizerPool holds the *tokenizer values stored by putTokenizer
// until they are picked up again by getTokenizer.
var tokenizerPool sync.Pool
|
||
|
|
||
|
// stringsSorter wraps a slice of strings and provides the Len, Swap and Less
// methods needed for sorting it in ascending order with sort.Sort.
type stringsSorter struct {
	// a is the slice being sorted in place.
	a []string
}

// Len returns the number of strings in the wrapped slice.
func (ss *stringsSorter) Len() int {
	return len(ss.a)
}

// Swap exchanges the strings at positions i and j.
func (ss *stringsSorter) Swap(i, j int) {
	ss.a[i], ss.a[j] = ss.a[j], ss.a[i]
}

// Less reports whether the string at position i must be sorted
// before the string at position j.
func (ss *stringsSorter) Less(i, j int) bool {
	return ss.a[i] < ss.a[j]
}
|
||
|
|
||
|
func getStringsSorter(a []string) *stringsSorter {
|
||
|
v := stringsSorterPool.Get()
|
||
|
if v == nil {
|
||
|
return &stringsSorter{
|
||
|
a: a,
|
||
|
}
|
||
|
}
|
||
|
ss := v.(*stringsSorter)
|
||
|
ss.a = a
|
||
|
return ss
|
||
|
}
|
||
|
|
||
|
func putStringsSorter(ss *stringsSorter) {
|
||
|
ss.a = nil
|
||
|
stringsSorterPool.Put(ss)
|
||
|
}
|
||
|
|
||
|
// stringsSorterPool holds the *stringsSorter values stored by putStringsSorter
// until they are picked up again by getStringsSorter.
var stringsSorterPool sync.Pool
|
||
|
|
||
|
// tokensBuf is a reusable buffer of strings.
type tokensBuf struct {
	// A holds the buffered strings.
	A []string
}

// reset truncates tb.A to zero length while keeping its backing array.
//
// Every element is overwritten with an empty string first - presumably
// so that the retained array does not keep the old strings reachable.
func (tb *tokensBuf) reset() {
	for i := 0; i < len(tb.A); i++ {
		tb.A[i] = ""
	}
	tb.A = tb.A[:0]
}
|
||
|
|
||
|
func getTokensBuf() *tokensBuf {
|
||
|
v := tokensBufPool.Get()
|
||
|
if v == nil {
|
||
|
return &tokensBuf{}
|
||
|
}
|
||
|
return v.(*tokensBuf)
|
||
|
}
|
||
|
|
||
|
func putTokensBuf(tb *tokensBuf) {
|
||
|
tb.reset()
|
||
|
tokensBufPool.Put(tb)
|
||
|
}
|
||
|
|
||
|
// tokensBufPool holds the *tokensBuf values stored by putTokensBuf
// until they are picked up again by getTokensBuf.
var tokensBufPool sync.Pool
|