src

Go monorepo.
git clone git://code.dwrz.net/src
Log | Files | Refs

gen_breaktest.go (5925B)


      1 //go:build generate
      2 
      3 // This program generates a Go file containing a slice of test cases based on the
      4 // Unicode Character Database auxiliary data files. The command line arguments
      5 // are as follows:
      6 //
      7 //   1. The name of the Unicode data file (just the filename, without extension).
      8 //   2. The name of the locally generated Go file.
      9 //   3. The name of the slice containing the test cases.
     10 //   4. The name of the generator, for logging purposes.
     11 //
     12 //go:generate go run gen_breaktest.go GraphemeBreakTest graphemebreak_test.go graphemeBreakTestCases graphemes
     13 //go:generate go run gen_breaktest.go WordBreakTest wordbreak_test.go wordBreakTestCases words
     14 //go:generate go run gen_breaktest.go SentenceBreakTest sentencebreak_test.go sentenceBreakTestCases sentences
     15 //go:generate go run gen_breaktest.go LineBreakTest linebreak_test.go lineBreakTestCases lines
     16 
     17 package main
     18 
     19 import (
     20 	"bufio"
     21 	"bytes"
     22 	"errors"
     23 	"fmt"
     24 	"go/format"
     25 	"io/ioutil"
     26 	"log"
     27 	"net/http"
     28 	"os"
     29 	"time"
     30 )
     31 
     32 // We want to test against a specific version rather than the latest. When the
     33 // package is upgraded to a new version, change these to generate new tests.
     34 const (
     35 	testCaseURL = `https://www.unicode.org/Public/15.0.0/ucd/auxiliary/%s.txt`
     36 )
     37 
     38 func main() {
     39 	if len(os.Args) < 5 {
     40 		fmt.Println("Not enough arguments, see code for details")
     41 		os.Exit(1)
     42 	}
     43 
     44 	log.SetPrefix("gen_breaktest (" + os.Args[4] + "): ")
     45 	log.SetFlags(0)
     46 
     47 	// Read text of testcases and parse into Go source code.
     48 	src, err := parse(fmt.Sprintf(testCaseURL, os.Args[1]))
     49 	if err != nil {
     50 		log.Fatal(err)
     51 	}
     52 
     53 	// Format the Go code.
     54 	formatted, err := format.Source(src)
     55 	if err != nil {
     56 		log.Fatalln("gofmt:", err)
     57 	}
     58 
     59 	// Write it out.
     60 	log.Print("Writing to ", os.Args[2])
     61 	if err := ioutil.WriteFile(os.Args[2], formatted, 0644); err != nil {
     62 		log.Fatal(err)
     63 	}
     64 }
     65 
     66 // parse reads a break text file, either from a local file or from a URL. It
     67 // parses the file data into Go source code representing the test cases.
     68 func parse(url string) ([]byte, error) {
     69 	log.Printf("Parsing %s", url)
     70 	res, err := http.Get(url)
     71 	if err != nil {
     72 		return nil, err
     73 	}
     74 	body := res.Body
     75 	defer body.Close()
     76 
     77 	buf := new(bytes.Buffer)
     78 	buf.Grow(120 << 10)
     79 	buf.WriteString(`// Code generated via go generate from gen_breaktest.go. DO NOT EDIT.
     80 
     81 package uniseg
     82 
     83 // ` + os.Args[3] + ` are Grapheme testcases taken from
     84 // ` + url + `
     85 // on ` + time.Now().Format("January 2, 2006") + `. See
     86 // https://www.unicode.org/license.html for the Unicode license agreement.
     87 var ` + os.Args[3] + ` = []testCase {
     88 `)
     89 
     90 	sc := bufio.NewScanner(body)
     91 	num := 1
     92 	var line []byte
     93 	original := make([]byte, 0, 64)
     94 	expected := make([]byte, 0, 64)
     95 	for sc.Scan() {
     96 		num++
     97 		line = sc.Bytes()
     98 		if len(line) == 0 || line[0] == '#' {
     99 			continue
    100 		}
    101 		var comment []byte
    102 		if i := bytes.IndexByte(line, '#'); i >= 0 {
    103 			comment = bytes.TrimSpace(line[i+1:])
    104 			line = bytes.TrimSpace(line[:i])
    105 		}
    106 		original, expected, err := parseRuneSequence(line, original[:0], expected[:0])
    107 		if err != nil {
    108 			return nil, fmt.Errorf(`line %d: %v: %q`, num, err, line)
    109 		}
    110 		fmt.Fprintf(buf, "\t{original: \"%s\", expected: %s}, // %s\n", original, expected, comment)
    111 	}
    112 	if err := sc.Err(); err != nil {
    113 		return nil, err
    114 	}
    115 
    116 	// Check for final "# EOF", useful check if we're streaming via HTTP
    117 	if !bytes.Equal(line, []byte("# EOF")) {
    118 		return nil, fmt.Errorf(`line %d: exected "# EOF" as final line, got %q`, num, line)
    119 	}
    120 	buf.WriteString("}\n")
    121 	return buf.Bytes(), nil
    122 }
    123 
    124 // Used by parseRuneSequence to match input via bytes.HasPrefix.
    125 var (
    126 	prefixBreak     = []byte("÷ ")
    127 	prefixDontBreak = []byte("× ")
    128 	breakOk         = []byte("÷")
    129 	breakNo         = []byte("×")
    130 )
    131 
    132 // parseRuneSequence parses a rune + breaking opportunity sequence from b
    133 // and appends the Go code for testcase.original to orig
    134 // and appends the Go code for testcase.expected to exp.
    135 // It retuns the new orig and exp slices.
    136 //
    137 // E.g. for the input b="÷ 0020 × 0308 ÷ 1F1E6 ÷"
    138 // it will append
    139 //
    140 //	"\u0020\u0308\U0001F1E6"
    141 //
    142 // and "[][]rune{{0x0020,0x0308},{0x1F1E6},}"
    143 // to orig and exp respectively.
    144 //
    145 // The formatting of exp is expected to be cleaned up by gofmt or format.Source.
    146 // Note we explicitly require the sequence to start with ÷ and we implicitly
    147 // require it to end with ÷.
    148 func parseRuneSequence(b, orig, exp []byte) ([]byte, []byte, error) {
    149 	// Check for and remove first ÷ or ×.
    150 	if !bytes.HasPrefix(b, prefixBreak) && !bytes.HasPrefix(b, prefixDontBreak) {
    151 		return nil, nil, errors.New("expected ÷ or × as first character")
    152 	}
    153 	if bytes.HasPrefix(b, prefixBreak) {
    154 		b = b[len(prefixBreak):]
    155 	} else {
    156 		b = b[len(prefixDontBreak):]
    157 	}
    158 
    159 	boundary := true
    160 	exp = append(exp, "[][]rune{"...)
    161 	for len(b) > 0 {
    162 		if boundary {
    163 			exp = append(exp, '{')
    164 		}
    165 		exp = append(exp, "0x"...)
    166 		// Find end of hex digits.
    167 		var i int
    168 		for i = 0; i < len(b) && b[i] != ' '; i++ {
    169 			if d := b[i]; ('0' <= d || d <= '9') ||
    170 				('A' <= d || d <= 'F') ||
    171 				('a' <= d || d <= 'f') {
    172 				continue
    173 			}
    174 			return nil, nil, errors.New("bad hex digit")
    175 		}
    176 		switch i {
    177 		case 4:
    178 			orig = append(orig, "\\u"...)
    179 		case 5:
    180 			orig = append(orig, "\\U000"...)
    181 		default:
    182 			return nil, nil, errors.New("unsupport code point hex length")
    183 		}
    184 		orig = append(orig, b[:i]...)
    185 		exp = append(exp, b[:i]...)
    186 		b = b[i:]
    187 
    188 		// Check for space between hex and ÷ or ×.
    189 		if len(b) < 1 || b[0] != ' ' {
    190 			return nil, nil, errors.New("bad input")
    191 		}
    192 		b = b[1:]
    193 
    194 		// Check for next boundary.
    195 		switch {
    196 		case bytes.HasPrefix(b, breakOk):
    197 			boundary = true
    198 			b = b[len(breakOk):]
    199 		case bytes.HasPrefix(b, breakNo):
    200 			boundary = false
    201 			b = b[len(breakNo):]
    202 		default:
    203 			return nil, nil, errors.New("missing ÷ or ×")
    204 		}
    205 		if boundary {
    206 			exp = append(exp, '}')
    207 		}
    208 		exp = append(exp, ',')
    209 		if len(b) > 0 && b[0] == ' ' {
    210 			b = b[1:]
    211 		}
    212 	}
    213 	exp = append(exp, '}')
    214 	return orig, exp, nil
    215 }