src

Go monorepo.
git clone git://code.dwrz.net/src
Log | Files | Refs

gen_properties.go (7932B)


      1 //go:build generate
      2 
      3 // This program generates a Go source file containing a code point property
      4 // table from Unicode Character Database data files. The command line arguments are as follows:
      5 //
      6 //  1. The path of the Unicode data file relative to the UCD directory, without
      7 //     the ".txt" extension. Can be "-" (to skip) if the "emojis" flag is included.
      8 //  2. The name of the locally generated Go file.
      9 //  3. The name of the slice mapping code points to properties.
     10 //  4. The name of the generator, for logging purposes.
     11 //  5. (Optional) Flags, comma-separated. The following flags are available:
     12 //     - "emojis=<property>": include the specified emoji properties (e.g.
     13 //     "Extended_Pictographic").
     14 //     - "gencat": include general category properties.
     15 //
     16 //go:generate go run gen_properties.go auxiliary/GraphemeBreakProperty graphemeproperties.go graphemeCodePoints graphemes emojis=Extended_Pictographic
     17 //go:generate go run gen_properties.go auxiliary/WordBreakProperty wordproperties.go workBreakCodePoints words emojis=Extended_Pictographic
     18 //go:generate go run gen_properties.go auxiliary/SentenceBreakProperty sentenceproperties.go sentenceBreakCodePoints sentences
     19 //go:generate go run gen_properties.go LineBreak lineproperties.go lineBreakCodePoints lines gencat
     20 //go:generate go run gen_properties.go EastAsianWidth eastasianwidth.go eastAsianWidth eastasianwidth
     21 //go:generate go run gen_properties.go - emojipresentation.go emojiPresentation emojipresentation emojis=Emoji_Presentation
     22 package main
     23 
     24 import (
     25 	"bufio"
     26 	"bytes"
     27 	"errors"
     28 	"fmt"
     29 	"go/format"
     30 	"io/ioutil"
     31 	"log"
     32 	"net/http"
     33 	"os"
     34 	"regexp"
     35 	"sort"
     36 	"strconv"
     37 	"strings"
     38 	"time"
     39 )
     40 
     41 // Pin a specific Unicode version rather than the latest. When the package is
     42 // upgraded to a new Unicode version, change these URLs and regenerate the files.
     43 const (
     44 	propertyURL = `https://www.unicode.org/Public/15.0.0/ucd/%s.txt` // %s is replaced with command line argument 1.
     45 	emojiURL    = `https://unicode.org/Public/15.0.0/ucd/emoji/emoji-data.txt` // Read only when the "emojis" flag is set.
     46 )
     47 
     48 // propertyPattern matches a line containing a code point range property: first code point (group 1), optional range end (group 3), property name (group 4), and the comment after "#" (group 5).
     49 var propertyPattern = regexp.MustCompile(`^([0-9A-F]{4,6})(\.\.([0-9A-F]{4,6}))?\s*;\s*([A-Za-z0-9_]+)\s*#\s(.+)$`)
     50 
     51 func main() {
     52 	if len(os.Args) < 5 {
     53 		fmt.Println("Not enough arguments, see code for details")
     54 		os.Exit(1)
     55 	}
     56 
     57 	log.SetPrefix("gen_properties (" + os.Args[4] + "): ")
     58 	log.SetFlags(0)
     59 
     60 	// Parse flags.
     61 	flags := make(map[string]string)
     62 	if len(os.Args) >= 6 {
     63 		for _, flag := range strings.Split(os.Args[5], ",") {
     64 			flagFields := strings.Split(flag, "=")
     65 			if len(flagFields) == 1 {
     66 				flags[flagFields[0]] = "yes"
     67 			} else {
     68 				flags[flagFields[0]] = flagFields[1]
     69 			}
     70 		}
     71 	}
     72 
     73 	// Parse the text file and generate Go source code from it.
     74 	_, includeGeneralCategory := flags["gencat"]
     75 	var mainURL string
     76 	if os.Args[1] != "-" {
     77 		mainURL = fmt.Sprintf(propertyURL, os.Args[1])
     78 	}
     79 	src, err := parse(mainURL, flags["emojis"], includeGeneralCategory)
     80 	if err != nil {
     81 		log.Fatal(err)
     82 	}
     83 
     84 	// Format the Go code.
     85 	formatted, err := format.Source([]byte(src))
     86 	if err != nil {
     87 		log.Fatal("gofmt:", err)
     88 	}
     89 
     90 	// Save it to the (local) target file.
     91 	log.Print("Writing to ", os.Args[2])
     92 	if err := ioutil.WriteFile(os.Args[2], formatted, 0644); err != nil {
     93 		log.Fatal(err)
     94 	}
     95 }
     96 
     97 // parse parses the Unicode Properties text files located at the given URLs and
     98 // returns their equivalent Go source code to be used in the uniseg package. If
     99 // "emojiProperty" is not an empty string, emoji code points for that emoji
    100 // property (e.g. "Extended_Pictographic") will be included. In those cases, you
    101 // may pass an empty "propertyURL" to skip parsing the main properties file. If
    102 // "includeGeneralCategory" is true, the Unicode General Category property will
    103 // be extracted from the comments and included in the output.
    104 func parse(propertyURL, emojiProperty string, includeGeneralCategory bool) (string, error) {
    105 	if propertyURL == "" && emojiProperty == "" {
    106 		return "", errors.New("no properties to parse")
    107 	}
    108 
    109 	// Temporary buffer to hold properties.
    110 	var properties [][4]string
    111 
    112 	// Open the first URL.
    113 	if propertyURL != "" {
    114 		log.Printf("Parsing %s", propertyURL)
    115 		res, err := http.Get(propertyURL)
    116 		if err != nil {
    117 			return "", err
    118 		}
    119 		in1 := res.Body
    120 		defer in1.Close()
    121 
    122 		// Parse it.
    123 		scanner := bufio.NewScanner(in1)
    124 		num := 0
    125 		for scanner.Scan() {
    126 			num++
    127 			line := strings.TrimSpace(scanner.Text())
    128 
    129 			// Skip comments and empty lines.
    130 			if strings.HasPrefix(line, "#") || line == "" {
    131 				continue
    132 			}
    133 
    134 			// Everything else must be a code point range, a property and a comment.
    135 			from, to, property, comment, err := parseProperty(line)
    136 			if err != nil {
    137 				return "", fmt.Errorf("%s line %d: %v", os.Args[4], num, err)
    138 			}
    139 			properties = append(properties, [4]string{from, to, property, comment})
    140 		}
    141 		if err := scanner.Err(); err != nil {
    142 			return "", err
    143 		}
    144 	}
    145 
    146 	// Open the second URL.
    147 	if emojiProperty != "" {
    148 		log.Printf("Parsing %s", emojiURL)
    149 		res, err := http.Get(emojiURL)
    150 		if err != nil {
    151 			return "", err
    152 		}
    153 		in2 := res.Body
    154 		defer in2.Close()
    155 
    156 		// Parse it.
    157 		scanner := bufio.NewScanner(in2)
    158 		num := 0
    159 		for scanner.Scan() {
    160 			num++
    161 			line := scanner.Text()
    162 
    163 			// Skip comments, empty lines, and everything not containing
    164 			// "Extended_Pictographic".
    165 			if strings.HasPrefix(line, "#") || line == "" || !strings.Contains(line, emojiProperty) {
    166 				continue
    167 			}
    168 
    169 			// Everything else must be a code point range, a property and a comment.
    170 			from, to, property, comment, err := parseProperty(line)
    171 			if err != nil {
    172 				return "", fmt.Errorf("emojis line %d: %v", num, err)
    173 			}
    174 			properties = append(properties, [4]string{from, to, property, comment})
    175 		}
    176 		if err := scanner.Err(); err != nil {
    177 			return "", err
    178 		}
    179 	}
    180 
    181 	// Avoid overflow during binary search.
    182 	if len(properties) >= 1<<31 {
    183 		return "", errors.New("too many properties")
    184 	}
    185 
    186 	// Sort properties.
    187 	sort.Slice(properties, func(i, j int) bool {
    188 		left, _ := strconv.ParseUint(properties[i][0], 16, 64)
    189 		right, _ := strconv.ParseUint(properties[j][0], 16, 64)
    190 		return left < right
    191 	})
    192 
    193 	// Header.
    194 	var (
    195 		buf          bytes.Buffer
    196 		emojiComment string
    197 	)
    198 	columns := 3
    199 	if includeGeneralCategory {
    200 		columns = 4
    201 	}
    202 	if emojiURL != "" {
    203 		emojiComment = `
    204 // and
    205 // ` + emojiURL + `
    206 // ("Extended_Pictographic" only)`
    207 	}
    208 	buf.WriteString(`// Code generated via go generate from gen_properties.go. DO NOT EDIT.
    209 
    210 package uniseg
    211 
    212 // ` + os.Args[3] + ` are taken from
    213 // ` + propertyURL + emojiComment + `
    214 // on ` + time.Now().Format("January 2, 2006") + `. See https://www.unicode.org/license.html for the Unicode
    215 // license agreement.
    216 var ` + os.Args[3] + ` = [][` + strconv.Itoa(columns) + `]int{
    217 	`)
    218 
    219 	// Properties.
    220 	for _, prop := range properties {
    221 		if includeGeneralCategory {
    222 			generalCategory := "gc" + prop[3][:2]
    223 			if generalCategory == "gcL&" {
    224 				generalCategory = "gcLC"
    225 			}
    226 			prop[3] = prop[3][3:]
    227 			fmt.Fprintf(&buf, "{0x%s,0x%s,%s,%s}, // %s\n", prop[0], prop[1], translateProperty("pr", prop[2]), generalCategory, prop[3])
    228 		} else {
    229 			fmt.Fprintf(&buf, "{0x%s,0x%s,%s}, // %s\n", prop[0], prop[1], translateProperty("pr", prop[2]), prop[3])
    230 		}
    231 	}
    232 
    233 	// Tail.
    234 	buf.WriteString("}")
    235 
    236 	return buf.String(), nil
    237 }
    238 
    239 // parseProperty parses a line of the Unicode properties text file containing a
    240 // property for a code point range and returns it along with its comment.
    241 func parseProperty(line string) (from, to, property, comment string, err error) {
    242 	fields := propertyPattern.FindStringSubmatch(line)
    243 	if fields == nil {
    244 		err = errors.New("no property found")
    245 		return
    246 	}
    247 	from = fields[1]
    248 	to = fields[3]
    249 	if to == "" {
    250 		to = from
    251 	}
    252 	property = fields[4]
    253 	comment = fields[5]
    254 	return
    255 }
    256 
    257 // translateProperty translates a property name as used in the Unicode data file
    258 // to a variable used in the Go code.
    259 func translateProperty(prefix, property string) string {
    260 	return prefix + strings.ReplaceAll(property, "_", "")
    261 }