properties.go (3509B)
package uniseg

// The Unicode properties as used in the various parsers. Only the ones needed
// in the context of this package are included.
//
// Apart from prXX, every value in this block is assigned positionally through
// iota, so the order of the entries is significant: inserting, removing, or
// reordering a name changes the numeric value of every name that follows it.
const (
	prXX = 0 // Documented as being the same as prAny (but see the note below).

	// NOTE(review): the original comment on the next line states that prAny
	// must be 0. However, iota is the zero-based index of the constant
	// specification within the block and prXX already occupies index 0, so as
	// written prAny evaluates to 1 (and prPrepend to 2). Confirm whether any
	// code relies on prAny == prXX before changing either value.
	prAny = iota // prAny must be 0.

	prPrepend // Grapheme properties must come first, to reduce the number of bits stored in the state vector.
	prCR
	prLF
	prControl
	prExtend
	prRegionalIndicator
	prSpacingMark
	prL
	prV
	prT
	prLV
	prLVT
	prZWJ
	prExtendedPictographic
	prNewline
	prWSegSpace
	prDoubleQuote
	prSingleQuote
	prMidNumLet
	prNumeric
	prMidLetter
	prMidNum
	prExtendNumLet
	prALetter
	prFormat
	prHebrewLetter
	prKatakana
	prSp
	prSTerm
	prClose
	prSContinue
	prATerm
	prUpper
	prLower
	prSep
	prOLetter
	prCM
	prBA
	prBK
	prSP
	prEX
	prQU
	prAL
	prPR
	prPO
	prOP
	prCP
	prIS
	prHY
	prSY
	prNU
	prCL
	prNL
	prGL
	prAI
	prBB
	prHL
	prSA
	prJL
	prJV
	prJT
	prNS
	prZW
	prB2
	prIN
	prWJ
	prID
	prEB
	prCJ
	prH2
	prH3
	prSG
	prCB
	prRI
	prEM
	prN
	prNa
	prA
	prW
	prH
	prF
	prEmojiPresentation
)

// Unicode General Categories. Only the ones needed in the context of this
// package are included.
//
// Values are assigned positionally through iota, starting at 0 for gcNone, so
// the order of the entries is significant.
const (
	gcNone = iota // gcNone must be 0.
	gcCc
	gcZs
	gcPo
	gcSc
	gcPs
	gcPe
	gcSm
	gcPd
	gcNd
	gcLu
	gcSk
	gcPc
	gcLl
	gcSo
	gcLo
	gcPi
	gcCf
	gcNo
	gcPf
	gcLC
	gcLm
	gcMn
	gcMe
	gcMc
	gcNl
	gcZl
	gcZp
	gcCn
	gcCs
	gcCo
)

// Special code points.
129 const ( 130 vs15 = 0xfe0e // Variation Selector-15 (text presentation) 131 vs16 = 0xfe0f // Variation Selector-16 (emoji presentation) 132 ) 133 134 // propertySearch performs a binary search on a property slice and returns the 135 // entry whose range (start = first array element, end = second array element) 136 // includes r, or an array of 0's if no such entry was found. 137 func propertySearch[E interface{ [3]int | [4]int }](dictionary []E, r rune) (result E) { 138 // Run a binary search. 139 from := 0 140 to := len(dictionary) 141 for to > from { 142 middle := (from + to) / 2 143 cpRange := dictionary[middle] 144 if int(r) < cpRange[0] { 145 to = middle 146 continue 147 } 148 if int(r) > cpRange[1] { 149 from = middle + 1 150 continue 151 } 152 return cpRange 153 } 154 return 155 } 156 157 // property returns the Unicode property value (see constants above) of the 158 // given code point. 159 func property(dictionary [][3]int, r rune) int { 160 return propertySearch(dictionary, r)[2] 161 } 162 163 // propertyLineBreak returns the Unicode property value and General Category 164 // (see constants above) of the given code point, as listed in the line break 165 // code points table, while fast tracking ASCII digits and letters. 166 func propertyLineBreak(r rune) (property, generalCategory int) { 167 if r >= 'a' && r <= 'z' { 168 return prAL, gcLl 169 } 170 if r >= 'A' && r <= 'Z' { 171 return prAL, gcLu 172 } 173 if r >= '0' && r <= '9' { 174 return prNU, gcNd 175 } 176 entry := propertySearch(lineBreakCodePoints, r) 177 return entry[2], entry[3] 178 } 179 180 // propertyGraphemes returns the Unicode grapheme cluster property value of the 181 // given code point while fast tracking ASCII characters. 
182 func propertyGraphemes(r rune) int { 183 if r >= 0x20 && r <= 0x7e { 184 return prAny 185 } 186 if r == 0x0a { 187 return prLF 188 } 189 if r == 0x0d { 190 return prCR 191 } 192 if r >= 0 && r <= 0x1f || r == 0x7f { 193 return prControl 194 } 195 return property(graphemeCodePoints, r) 196 } 197 198 // propertyEastAsianWidth returns the Unicode East Asian Width property value of 199 // the given code point while fast tracking ASCII characters. 200 func propertyEastAsianWidth(r rune) int { 201 if r >= 0x20 && r <= 0x7e { 202 return prNa 203 } 204 if r >= 0 && r <= 0x1f || r == 0x7f { 205 return prN 206 } 207 return property(eastAsianWidth, r) 208 }