gen_breaktest.go (5925B)
1 //go:build generate 2 3 // This program generates a Go containing a slice of test cases based on the 4 // Unicode Character Database auxiliary data files. The command line arguments 5 // are as follows: 6 // 7 // 1. The name of the Unicode data file (just the filename, without extension). 8 // 2. The name of the locally generated Go file. 9 // 3. The name of the slice containing the test cases. 10 // 4. The name of the generator, for logging purposes. 11 // 12 //go:generate go run gen_breaktest.go GraphemeBreakTest graphemebreak_test.go graphemeBreakTestCases graphemes 13 //go:generate go run gen_breaktest.go WordBreakTest wordbreak_test.go wordBreakTestCases words 14 //go:generate go run gen_breaktest.go SentenceBreakTest sentencebreak_test.go sentenceBreakTestCases sentences 15 //go:generate go run gen_breaktest.go LineBreakTest linebreak_test.go lineBreakTestCases lines 16 17 package main 18 19 import ( 20 "bufio" 21 "bytes" 22 "errors" 23 "fmt" 24 "go/format" 25 "io/ioutil" 26 "log" 27 "net/http" 28 "os" 29 "time" 30 ) 31 32 // We want to test against a specific version rather than the latest. When the 33 // package is upgraded to a new version, change these to generate new tests. 34 const ( 35 testCaseURL = `https://www.unicode.org/Public/15.0.0/ucd/auxiliary/%s.txt` 36 ) 37 38 func main() { 39 if len(os.Args) < 5 { 40 fmt.Println("Not enough arguments, see code for details") 41 os.Exit(1) 42 } 43 44 log.SetPrefix("gen_breaktest (" + os.Args[4] + "): ") 45 log.SetFlags(0) 46 47 // Read text of testcases and parse into Go source code. 48 src, err := parse(fmt.Sprintf(testCaseURL, os.Args[1])) 49 if err != nil { 50 log.Fatal(err) 51 } 52 53 // Format the Go code. 54 formatted, err := format.Source(src) 55 if err != nil { 56 log.Fatalln("gofmt:", err) 57 } 58 59 // Write it out. 
60 log.Print("Writing to ", os.Args[2]) 61 if err := ioutil.WriteFile(os.Args[2], formatted, 0644); err != nil { 62 log.Fatal(err) 63 } 64 } 65 66 // parse reads a break text file, either from a local file or from a URL. It 67 // parses the file data into Go source code representing the test cases. 68 func parse(url string) ([]byte, error) { 69 log.Printf("Parsing %s", url) 70 res, err := http.Get(url) 71 if err != nil { 72 return nil, err 73 } 74 body := res.Body 75 defer body.Close() 76 77 buf := new(bytes.Buffer) 78 buf.Grow(120 << 10) 79 buf.WriteString(`// Code generated via go generate from gen_breaktest.go. DO NOT EDIT. 80 81 package uniseg 82 83 // ` + os.Args[3] + ` are Grapheme testcases taken from 84 // ` + url + ` 85 // on ` + time.Now().Format("January 2, 2006") + `. See 86 // https://www.unicode.org/license.html for the Unicode license agreement. 87 var ` + os.Args[3] + ` = []testCase { 88 `) 89 90 sc := bufio.NewScanner(body) 91 num := 1 92 var line []byte 93 original := make([]byte, 0, 64) 94 expected := make([]byte, 0, 64) 95 for sc.Scan() { 96 num++ 97 line = sc.Bytes() 98 if len(line) == 0 || line[0] == '#' { 99 continue 100 } 101 var comment []byte 102 if i := bytes.IndexByte(line, '#'); i >= 0 { 103 comment = bytes.TrimSpace(line[i+1:]) 104 line = bytes.TrimSpace(line[:i]) 105 } 106 original, expected, err := parseRuneSequence(line, original[:0], expected[:0]) 107 if err != nil { 108 return nil, fmt.Errorf(`line %d: %v: %q`, num, err, line) 109 } 110 fmt.Fprintf(buf, "\t{original: \"%s\", expected: %s}, // %s\n", original, expected, comment) 111 } 112 if err := sc.Err(); err != nil { 113 return nil, err 114 } 115 116 // Check for final "# EOF", useful check if we're streaming via HTTP 117 if !bytes.Equal(line, []byte("# EOF")) { 118 return nil, fmt.Errorf(`line %d: exected "# EOF" as final line, got %q`, num, line) 119 } 120 buf.WriteString("}\n") 121 return buf.Bytes(), nil 122 } 123 124 // Used by parseRuneSequence to match input via 
// bytes.HasPrefix.
var (
	prefixBreak     = []byte("÷ ")
	prefixDontBreak = []byte("× ")
	breakOk         = []byte("÷")
	breakNo         = []byte("×")
)

// parseRuneSequence parses a rune + breaking opportunity sequence from b
// and appends the Go code for testcase.original to orig
// and appends the Go code for testcase.expected to exp.
// It returns the new orig and exp slices.
//
// E.g. for the input b="÷ 0020 × 0308 ÷ 1F1E6 ÷"
// it will append
//
//	"\u0020\u0308\U0001F1E6"
//
// and "[][]rune{{0x0020,0x0308},{0x1F1E6},}"
// to orig and exp respectively.
//
// The formatting of exp is expected to be cleaned up by gofmt or format.Source.
//
// The sequence must start with "÷ " or "× " (the first group is opened in
// either case) and must end with ÷. Code points must be written with 4, 5 or
// 6 hex digits and be separated from the following ÷ or × by a single space.
// On any violation the returned slices are nil and the error is non-nil.
func parseRuneSequence(b, orig, exp []byte) ([]byte, []byte, error) {
	// Check for and remove first ÷ or ×.
	switch {
	case bytes.HasPrefix(b, prefixBreak):
		b = b[len(prefixBreak):]
	case bytes.HasPrefix(b, prefixDontBreak):
		b = b[len(prefixDontBreak):]
	default:
		return nil, nil, errors.New("expected ÷ or × as first character")
	}

	boundary := true
	exp = append(exp, "[][]rune{"...)
	for len(b) > 0 {
		if boundary {
			exp = append(exp, '{')
		}
		exp = append(exp, "0x"...)

		// Find end of hex digits.
		i := 0
		for ; i < len(b) && b[i] != ' '; i++ {
			if !isHexDigit(b[i]) {
				return nil, nil, errors.New("bad hex digit")
			}
		}

		// \u takes exactly 4 hex digits and \U exactly 8, so longer code
		// points are left-padded with zeros.
		switch i {
		case 4:
			orig = append(orig, "\\u"...)
		case 5:
			orig = append(orig, "\\U000"...)
		case 6:
			orig = append(orig, "\\U00"...)
		default:
			return nil, nil, errors.New("unsupported code point hex length")
		}
		orig = append(orig, b[:i]...)
		exp = append(exp, b[:i]...)
		b = b[i:]

		// Check for space between hex and ÷ or ×.
		if len(b) < 1 || b[0] != ' ' {
			return nil, nil, errors.New("bad input")
		}
		b = b[1:]

		// Check for next boundary.
		switch {
		case bytes.HasPrefix(b, breakOk):
			boundary = true
			b = b[len(breakOk):]
		case bytes.HasPrefix(b, breakNo):
			boundary = false
			b = b[len(breakNo):]
		default:
			return nil, nil, errors.New("missing ÷ or ×")
		}
		if boundary {
			exp = append(exp, '}')
		}
		exp = append(exp, ',')
		if len(b) > 0 && b[0] == ' ' {
			b = b[1:]
		}
	}

	// A trailing × would leave the last group unclosed and yield Go code with
	// unbalanced braces, so reject it here with a clear message.
	if !boundary {
		return nil, nil, errors.New("expected ÷ as last character")
	}
	exp = append(exp, '}')
	return orig, exp, nil
}

// isHexDigit reports whether d is an ASCII hexadecimal digit.
func isHexDigit(d byte) bool {
	return ('0' <= d && d <= '9') ||
		('A' <= d && d <= 'F') ||
		('a' <= d && d <= 'f')
}