gen_properties.go (7932B)
//go:build generate

// This program generates a Go source file containing a property table from
// Unicode Character Database auxiliary data files. The command line arguments
// are as follows:
//
//  1. The name of the Unicode data file (just the filename, without extension).
//     Can be "-" (to skip) if the emoji flag is included.
//  2. The name of the locally generated Go file.
//  3. The name of the slice mapping code points to properties.
//  4. The name of the generator, for logging purposes.
//  5. (Optional) Flags, comma-separated. The following flags are available:
//     - "emojis=<property>": include the specified emoji properties (e.g.
//     "Extended_Pictographic").
//     - "gencat": include general category properties.
//
//go:generate go run gen_properties.go auxiliary/GraphemeBreakProperty graphemeproperties.go graphemeCodePoints graphemes emojis=Extended_Pictographic
//go:generate go run gen_properties.go auxiliary/WordBreakProperty wordproperties.go workBreakCodePoints words emojis=Extended_Pictographic
//go:generate go run gen_properties.go auxiliary/SentenceBreakProperty sentenceproperties.go sentenceBreakCodePoints sentences
//go:generate go run gen_properties.go LineBreak lineproperties.go lineBreakCodePoints lines gencat
//go:generate go run gen_properties.go EastAsianWidth eastasianwidth.go eastAsianWidth eastasianwidth
//go:generate go run gen_properties.go - emojipresentation.go emojiPresentation emojipresentation emojis=Emoji_Presentation
package main

import (
	"bufio"
	"bytes"
	"errors"
	"fmt"
	"go/format"
	"io/ioutil"
	"log"
	"net/http"
	"os"
	"regexp"
	"sort"
	"strconv"
	"strings"
	"time"
)

// We generate from a specific Unicode version rather than the latest. When the
// package is upgraded to a new version, change these URLs and regenerate the
// property files.
43 const ( 44 propertyURL = `https://www.unicode.org/Public/15.0.0/ucd/%s.txt` 45 emojiURL = `https://unicode.org/Public/15.0.0/ucd/emoji/emoji-data.txt` 46 ) 47 48 // The regular expression for a line containing a code point range property. 49 var propertyPattern = regexp.MustCompile(`^([0-9A-F]{4,6})(\.\.([0-9A-F]{4,6}))?\s*;\s*([A-Za-z0-9_]+)\s*#\s(.+)$`) 50 51 func main() { 52 if len(os.Args) < 5 { 53 fmt.Println("Not enough arguments, see code for details") 54 os.Exit(1) 55 } 56 57 log.SetPrefix("gen_properties (" + os.Args[4] + "): ") 58 log.SetFlags(0) 59 60 // Parse flags. 61 flags := make(map[string]string) 62 if len(os.Args) >= 6 { 63 for _, flag := range strings.Split(os.Args[5], ",") { 64 flagFields := strings.Split(flag, "=") 65 if len(flagFields) == 1 { 66 flags[flagFields[0]] = "yes" 67 } else { 68 flags[flagFields[0]] = flagFields[1] 69 } 70 } 71 } 72 73 // Parse the text file and generate Go source code from it. 74 _, includeGeneralCategory := flags["gencat"] 75 var mainURL string 76 if os.Args[1] != "-" { 77 mainURL = fmt.Sprintf(propertyURL, os.Args[1]) 78 } 79 src, err := parse(mainURL, flags["emojis"], includeGeneralCategory) 80 if err != nil { 81 log.Fatal(err) 82 } 83 84 // Format the Go code. 85 formatted, err := format.Source([]byte(src)) 86 if err != nil { 87 log.Fatal("gofmt:", err) 88 } 89 90 // Save it to the (local) target file. 91 log.Print("Writing to ", os.Args[2]) 92 if err := ioutil.WriteFile(os.Args[2], formatted, 0644); err != nil { 93 log.Fatal(err) 94 } 95 } 96 97 // parse parses the Unicode Properties text files located at the given URLs and 98 // returns their equivalent Go source code to be used in the uniseg package. If 99 // "emojiProperty" is not an empty string, emoji code points for that emoji 100 // property (e.g. "Extended_Pictographic") will be included. In those cases, you 101 // may pass an empty "propertyURL" to skip parsing the main properties file. 
If 102 // "includeGeneralCategory" is true, the Unicode General Category property will 103 // be extracted from the comments and included in the output. 104 func parse(propertyURL, emojiProperty string, includeGeneralCategory bool) (string, error) { 105 if propertyURL == "" && emojiProperty == "" { 106 return "", errors.New("no properties to parse") 107 } 108 109 // Temporary buffer to hold properties. 110 var properties [][4]string 111 112 // Open the first URL. 113 if propertyURL != "" { 114 log.Printf("Parsing %s", propertyURL) 115 res, err := http.Get(propertyURL) 116 if err != nil { 117 return "", err 118 } 119 in1 := res.Body 120 defer in1.Close() 121 122 // Parse it. 123 scanner := bufio.NewScanner(in1) 124 num := 0 125 for scanner.Scan() { 126 num++ 127 line := strings.TrimSpace(scanner.Text()) 128 129 // Skip comments and empty lines. 130 if strings.HasPrefix(line, "#") || line == "" { 131 continue 132 } 133 134 // Everything else must be a code point range, a property and a comment. 135 from, to, property, comment, err := parseProperty(line) 136 if err != nil { 137 return "", fmt.Errorf("%s line %d: %v", os.Args[4], num, err) 138 } 139 properties = append(properties, [4]string{from, to, property, comment}) 140 } 141 if err := scanner.Err(); err != nil { 142 return "", err 143 } 144 } 145 146 // Open the second URL. 147 if emojiProperty != "" { 148 log.Printf("Parsing %s", emojiURL) 149 res, err := http.Get(emojiURL) 150 if err != nil { 151 return "", err 152 } 153 in2 := res.Body 154 defer in2.Close() 155 156 // Parse it. 157 scanner := bufio.NewScanner(in2) 158 num := 0 159 for scanner.Scan() { 160 num++ 161 line := scanner.Text() 162 163 // Skip comments, empty lines, and everything not containing 164 // "Extended_Pictographic". 165 if strings.HasPrefix(line, "#") || line == "" || !strings.Contains(line, emojiProperty) { 166 continue 167 } 168 169 // Everything else must be a code point range, a property and a comment. 
170 from, to, property, comment, err := parseProperty(line) 171 if err != nil { 172 return "", fmt.Errorf("emojis line %d: %v", num, err) 173 } 174 properties = append(properties, [4]string{from, to, property, comment}) 175 } 176 if err := scanner.Err(); err != nil { 177 return "", err 178 } 179 } 180 181 // Avoid overflow during binary search. 182 if len(properties) >= 1<<31 { 183 return "", errors.New("too many properties") 184 } 185 186 // Sort properties. 187 sort.Slice(properties, func(i, j int) bool { 188 left, _ := strconv.ParseUint(properties[i][0], 16, 64) 189 right, _ := strconv.ParseUint(properties[j][0], 16, 64) 190 return left < right 191 }) 192 193 // Header. 194 var ( 195 buf bytes.Buffer 196 emojiComment string 197 ) 198 columns := 3 199 if includeGeneralCategory { 200 columns = 4 201 } 202 if emojiURL != "" { 203 emojiComment = ` 204 // and 205 // ` + emojiURL + ` 206 // ("Extended_Pictographic" only)` 207 } 208 buf.WriteString(`// Code generated via go generate from gen_properties.go. DO NOT EDIT. 209 210 package uniseg 211 212 // ` + os.Args[3] + ` are taken from 213 // ` + propertyURL + emojiComment + ` 214 // on ` + time.Now().Format("January 2, 2006") + `. See https://www.unicode.org/license.html for the Unicode 215 // license agreement. 216 var ` + os.Args[3] + ` = [][` + strconv.Itoa(columns) + `]int{ 217 `) 218 219 // Properties. 220 for _, prop := range properties { 221 if includeGeneralCategory { 222 generalCategory := "gc" + prop[3][:2] 223 if generalCategory == "gcL&" { 224 generalCategory = "gcLC" 225 } 226 prop[3] = prop[3][3:] 227 fmt.Fprintf(&buf, "{0x%s,0x%s,%s,%s}, // %s\n", prop[0], prop[1], translateProperty("pr", prop[2]), generalCategory, prop[3]) 228 } else { 229 fmt.Fprintf(&buf, "{0x%s,0x%s,%s}, // %s\n", prop[0], prop[1], translateProperty("pr", prop[2]), prop[3]) 230 } 231 } 232 233 // Tail. 
234 buf.WriteString("}") 235 236 return buf.String(), nil 237 } 238 239 // parseProperty parses a line of the Unicode properties text file containing a 240 // property for a code point range and returns it along with its comment. 241 func parseProperty(line string) (from, to, property, comment string, err error) { 242 fields := propertyPattern.FindStringSubmatch(line) 243 if fields == nil { 244 err = errors.New("no property found") 245 return 246 } 247 from = fields[1] 248 to = fields[3] 249 if to == "" { 250 to = from 251 } 252 property = fields[4] 253 comment = fields[5] 254 return 255 } 256 257 // translateProperty translates a property name as used in the Unicode data file 258 // to a variable used in the Go code. 259 func translateProperty(prefix, property string) string { 260 return prefix + strings.ReplaceAll(property, "_", "") 261 }