VictoriaMetrics/vendor/github.com/rivo/uniseg/gen_properties.go

262 lines
7.7 KiB
Go
Raw Normal View History

2022-08-02 08:19:38 +02:00
//go:build generate
// This program generates a property file in Go file from Unicode Character
// Database auxiliary data files. The command line arguments are as follows:
//
2022-09-13 15:44:44 +02:00
// 1. The name of the Unicode data file (just the filename, without extension).
// Can be "-" (to skip) if the emoji flag is included.
// 2. The name of the locally generated Go file.
// 3. The name of the slice mapping code points to properties.
// 4. The name of the generator, for logging purposes.
// 5. (Optional) Flags, comma-separated. The following flags are available:
// - "emojis=<property>": include the specified emoji properties (e.g.
// "Extended_Pictographic").
// - "gencat": include general category properties.
2022-08-02 08:19:38 +02:00
//
2022-09-13 15:44:44 +02:00
//go:generate go run gen_properties.go auxiliary/GraphemeBreakProperty graphemeproperties.go graphemeCodePoints graphemes emojis=Extended_Pictographic
//go:generate go run gen_properties.go auxiliary/WordBreakProperty wordproperties.go workBreakCodePoints words emojis=Extended_Pictographic
2022-08-02 08:19:38 +02:00
//go:generate go run gen_properties.go auxiliary/SentenceBreakProperty sentenceproperties.go sentenceBreakCodePoints sentences
//go:generate go run gen_properties.go LineBreak lineproperties.go lineBreakCodePoints lines gencat
//go:generate go run gen_properties.go EastAsianWidth eastasianwidth.go eastAsianWidth eastasianwidth
2022-09-13 15:44:44 +02:00
//go:generate go run gen_properties.go - emojipresentation.go emojiPresentation emojipresentation emojis=Emoji_Presentation
2022-08-02 08:19:38 +02:00
package main
import (
"bufio"
"bytes"
"errors"
"fmt"
"go/format"
"io/ioutil"
"log"
"net/http"
"os"
"regexp"
"sort"
"strconv"
"strings"
"time"
)
// We want to test against a specific version rather than the latest. When the
// package is upgraded to a new version, change these to generate new tests.
const (
2024-01-30 17:47:30 +01:00
propertyURL = `https://www.unicode.org/Public/15.0.0/ucd/%s.txt`
emojiURL = `https://unicode.org/Public/15.0.0/ucd/emoji/emoji-data.txt`
2022-08-02 08:19:38 +02:00
)
// The regular expression for a line containing a code point range property.
var propertyPattern = regexp.MustCompile(`^([0-9A-F]{4,6})(\.\.([0-9A-F]{4,6}))?\s*;\s*([A-Za-z0-9_]+)\s*#\s(.+)$`)
func main() {
if len(os.Args) < 5 {
fmt.Println("Not enough arguments, see code for details")
os.Exit(1)
}
log.SetPrefix("gen_properties (" + os.Args[4] + "): ")
log.SetFlags(0)
// Parse flags.
2022-09-13 15:44:44 +02:00
flags := make(map[string]string)
2022-08-02 08:19:38 +02:00
if len(os.Args) >= 6 {
for _, flag := range strings.Split(os.Args[5], ",") {
2022-09-13 15:44:44 +02:00
flagFields := strings.Split(flag, "=")
if len(flagFields) == 1 {
flags[flagFields[0]] = "yes"
} else {
flags[flagFields[0]] = flagFields[1]
}
2022-08-02 08:19:38 +02:00
}
}
// Parse the text file and generate Go source code from it.
_, includeGeneralCategory := flags["gencat"]
2022-09-13 15:44:44 +02:00
var mainURL string
if os.Args[1] != "-" {
mainURL = fmt.Sprintf(propertyURL, os.Args[1])
}
src, err := parse(mainURL, flags["emojis"], includeGeneralCategory)
2022-08-02 08:19:38 +02:00
if err != nil {
log.Fatal(err)
}
// Format the Go code.
formatted, err := format.Source([]byte(src))
if err != nil {
log.Fatal("gofmt:", err)
}
// Save it to the (local) target file.
log.Print("Writing to ", os.Args[2])
if err := ioutil.WriteFile(os.Args[2], formatted, 0644); err != nil {
log.Fatal(err)
}
}
// parse parses the Unicode Properties text files located at the given URLs and
// returns their equivalent Go source code to be used in the uniseg package. If
2022-09-13 15:44:44 +02:00
// "emojiProperty" is not an empty string, emoji code points for that emoji
// property (e.g. "Extended_Pictographic") will be included. In those cases, you
// may pass an empty "propertyURL" to skip parsing the main properties file. If
2022-08-02 08:19:38 +02:00
// "includeGeneralCategory" is true, the Unicode General Category property will
// be extracted from the comments and included in the output.
2022-09-13 15:44:44 +02:00
func parse(propertyURL, emojiProperty string, includeGeneralCategory bool) (string, error) {
if propertyURL == "" && emojiProperty == "" {
return "", errors.New("no properties to parse")
}
2022-08-02 08:19:38 +02:00
// Temporary buffer to hold properties.
var properties [][4]string
// Open the first URL.
2022-09-13 15:44:44 +02:00
if propertyURL != "" {
log.Printf("Parsing %s", propertyURL)
res, err := http.Get(propertyURL)
if err != nil {
return "", err
2022-08-02 08:19:38 +02:00
}
2022-09-13 15:44:44 +02:00
in1 := res.Body
defer in1.Close()
2022-08-02 08:19:38 +02:00
2022-09-13 15:44:44 +02:00
// Parse it.
scanner := bufio.NewScanner(in1)
num := 0
for scanner.Scan() {
num++
line := strings.TrimSpace(scanner.Text())
// Skip comments and empty lines.
if strings.HasPrefix(line, "#") || line == "" {
continue
}
// Everything else must be a code point range, a property and a comment.
from, to, property, comment, err := parseProperty(line)
if err != nil {
return "", fmt.Errorf("%s line %d: %v", os.Args[4], num, err)
}
properties = append(properties, [4]string{from, to, property, comment})
}
if err := scanner.Err(); err != nil {
return "", err
2022-08-02 08:19:38 +02:00
}
}
// Open the second URL.
2022-09-13 15:44:44 +02:00
if emojiProperty != "" {
2022-08-02 08:19:38 +02:00
log.Printf("Parsing %s", emojiURL)
2022-09-13 15:44:44 +02:00
res, err := http.Get(emojiURL)
2022-08-02 08:19:38 +02:00
if err != nil {
return "", err
}
in2 := res.Body
defer in2.Close()
// Parse it.
2022-09-13 15:44:44 +02:00
scanner := bufio.NewScanner(in2)
num := 0
2022-08-02 08:19:38 +02:00
for scanner.Scan() {
num++
line := scanner.Text()
// Skip comments, empty lines, and everything not containing
// "Extended_Pictographic".
2022-09-13 15:44:44 +02:00
if strings.HasPrefix(line, "#") || line == "" || !strings.Contains(line, emojiProperty) {
2022-08-02 08:19:38 +02:00
continue
}
// Everything else must be a code point range, a property and a comment.
from, to, property, comment, err := parseProperty(line)
if err != nil {
return "", fmt.Errorf("emojis line %d: %v", num, err)
}
properties = append(properties, [4]string{from, to, property, comment})
}
if err := scanner.Err(); err != nil {
return "", err
}
}
2024-01-30 17:47:30 +01:00
// Avoid overflow during binary search.
if len(properties) >= 1<<31 {
return "", errors.New("too many properties")
}
2022-08-02 08:19:38 +02:00
// Sort properties.
sort.Slice(properties, func(i, j int) bool {
left, _ := strconv.ParseUint(properties[i][0], 16, 64)
right, _ := strconv.ParseUint(properties[j][0], 16, 64)
return left < right
})
// Header.
var (
buf bytes.Buffer
emojiComment string
)
columns := 3
if includeGeneralCategory {
columns = 4
}
if emojiURL != "" {
emojiComment = `
// and
// ` + emojiURL + `
// ("Extended_Pictographic" only)`
}
2024-01-30 17:47:30 +01:00
buf.WriteString(`// Code generated via go generate from gen_properties.go. DO NOT EDIT.
2022-08-02 08:19:38 +02:00
2024-01-30 17:47:30 +01:00
package uniseg
2022-08-02 08:19:38 +02:00
// ` + os.Args[3] + ` are taken from
2022-09-13 15:44:44 +02:00
// ` + propertyURL + emojiComment + `
2022-08-02 08:19:38 +02:00
// on ` + time.Now().Format("January 2, 2006") + `. See https://www.unicode.org/license.html for the Unicode
// license agreement.
var ` + os.Args[3] + ` = [][` + strconv.Itoa(columns) + `]int{
`)
// Properties.
for _, prop := range properties {
if includeGeneralCategory {
generalCategory := "gc" + prop[3][:2]
if generalCategory == "gcL&" {
generalCategory = "gcLC"
}
prop[3] = prop[3][3:]
fmt.Fprintf(&buf, "{0x%s,0x%s,%s,%s}, // %s\n", prop[0], prop[1], translateProperty("pr", prop[2]), generalCategory, prop[3])
} else {
fmt.Fprintf(&buf, "{0x%s,0x%s,%s}, // %s\n", prop[0], prop[1], translateProperty("pr", prop[2]), prop[3])
}
}
// Tail.
buf.WriteString("}")
return buf.String(), nil
}
// parseProperty parses a line of the Unicode properties text file containing a
// property for a code point range and returns it along with its comment.
func parseProperty(line string) (from, to, property, comment string, err error) {
fields := propertyPattern.FindStringSubmatch(line)
if fields == nil {
err = errors.New("no property found")
return
}
from = fields[1]
to = fields[3]
if to == "" {
to = from
}
property = fields[4]
comment = fields[5]
return
}
// translateProperty translates a property name as used in the Unicode data file
// to a variable used in the Go code.
func translateProperty(prefix, property string) string {
return prefix + strings.ReplaceAll(property, "_", "")
}