wordrules.go (8395B)
1 package uniseg 2 3 import "unicode/utf8" 4 5 // The states of the word break parser. 6 const ( 7 wbAny = iota 8 wbCR 9 wbLF 10 wbNewline 11 wbWSegSpace 12 wbHebrewLetter 13 wbALetter 14 wbWB7 15 wbWB7c 16 wbNumeric 17 wbWB11 18 wbKatakana 19 wbExtendNumLet 20 wbOddRI 21 wbEvenRI 22 wbZWJBit = 16 // This bit is set for any states followed by at least one zero-width joiner (see WB4 and WB3c). 23 ) 24 25 // wbTransitions implements the word break parser's state transitions. It's 26 // anologous to [grTransitions], see comments there for details. 27 // 28 // Unicode version 15.0.0. 29 func wbTransitions(state, prop int) (newState int, wordBreak bool, rule int) { 30 switch uint64(state) | uint64(prop)<<32 { 31 // WB3b. 32 case wbAny | prNewline<<32: 33 return wbNewline, true, 32 34 case wbAny | prCR<<32: 35 return wbCR, true, 32 36 case wbAny | prLF<<32: 37 return wbLF, true, 32 38 39 // WB3a. 40 case wbNewline | prAny<<32: 41 return wbAny, true, 31 42 case wbCR | prAny<<32: 43 return wbAny, true, 31 44 case wbLF | prAny<<32: 45 return wbAny, true, 31 46 47 // WB3. 48 case wbCR | prLF<<32: 49 return wbLF, false, 30 50 51 // WB3d. 52 case wbAny | prWSegSpace<<32: 53 return wbWSegSpace, true, 9990 54 case wbWSegSpace | prWSegSpace<<32: 55 return wbWSegSpace, false, 34 56 57 // WB5. 58 case wbAny | prALetter<<32: 59 return wbALetter, true, 9990 60 case wbAny | prHebrewLetter<<32: 61 return wbHebrewLetter, true, 9990 62 case wbALetter | prALetter<<32: 63 return wbALetter, false, 50 64 case wbALetter | prHebrewLetter<<32: 65 return wbHebrewLetter, false, 50 66 case wbHebrewLetter | prALetter<<32: 67 return wbALetter, false, 50 68 case wbHebrewLetter | prHebrewLetter<<32: 69 return wbHebrewLetter, false, 50 70 71 // WB7. Transitions to wbWB7 handled by transitionWordBreakState(). 72 case wbWB7 | prALetter<<32: 73 return wbALetter, false, 70 74 case wbWB7 | prHebrewLetter<<32: 75 return wbHebrewLetter, false, 70 76 77 // WB7a. 78 case wbHebrewLetter | prSingleQuote<<32: 79 return wbAny, false, 71 80 81 // WB7c. Transitions to wbWB7c handled by transitionWordBreakState(). 82 case wbWB7c | prHebrewLetter<<32: 83 return wbHebrewLetter, false, 73 84 85 // WB8. 86 case wbAny | prNumeric<<32: 87 return wbNumeric, true, 9990 88 case wbNumeric | prNumeric<<32: 89 return wbNumeric, false, 80 90 91 // WB9. 92 case wbALetter | prNumeric<<32: 93 return wbNumeric, false, 90 94 case wbHebrewLetter | prNumeric<<32: 95 return wbNumeric, false, 90 96 97 // WB10. 98 case wbNumeric | prALetter<<32: 99 return wbALetter, false, 100 100 case wbNumeric | prHebrewLetter<<32: 101 return wbHebrewLetter, false, 100 102 103 // WB11. Transitions to wbWB11 handled by transitionWordBreakState(). 104 case wbWB11 | prNumeric<<32: 105 return wbNumeric, false, 110 106 107 // WB13. 108 case wbAny | prKatakana<<32: 109 return wbKatakana, true, 9990 110 case wbKatakana | prKatakana<<32: 111 return wbKatakana, false, 130 112 113 // WB13a. 114 case wbAny | prExtendNumLet<<32: 115 return wbExtendNumLet, true, 9990 116 case wbALetter | prExtendNumLet<<32: 117 return wbExtendNumLet, false, 131 118 case wbHebrewLetter | prExtendNumLet<<32: 119 return wbExtendNumLet, false, 131 120 case wbNumeric | prExtendNumLet<<32: 121 return wbExtendNumLet, false, 131 122 case wbKatakana | prExtendNumLet<<32: 123 return wbExtendNumLet, false, 131 124 case wbExtendNumLet | prExtendNumLet<<32: 125 return wbExtendNumLet, false, 131 126 127 // WB13b. 128 case wbExtendNumLet | prALetter<<32: 129 return wbALetter, false, 132 130 case wbExtendNumLet | prHebrewLetter<<32: 131 return wbHebrewLetter, false, 132 132 case wbExtendNumLet | prNumeric<<32: 133 return wbNumeric, false, 132 134 case wbExtendNumLet | prKatakana<<32: 135 return wbKatakana, false, 132 136 137 default: 138 return -1, false, -1 139 } 140 } 141 142 // transitionWordBreakState determines the new state of the word break parser 143 // given the current state and the next code point. It also returns whether a 144 // word boundary was detected. If more than one code point is needed to 145 // determine the new state, the byte slice or the string starting after rune "r" 146 // can be used (whichever is not nil or empty) for further lookups. 147 func transitionWordBreakState(state int, r rune, b []byte, str string) (newState int, wordBreak bool) { 148 // Determine the property of the next character. 149 nextProperty := property(workBreakCodePoints, r) 150 151 // "Replacing Ignore Rules". 152 if nextProperty == prZWJ { 153 // WB4 (for zero-width joiners). 154 if state == wbNewline || state == wbCR || state == wbLF { 155 return wbAny | wbZWJBit, true // Make sure we don't apply WB4 to WB3a. 156 } 157 if state < 0 { 158 return wbAny | wbZWJBit, false 159 } 160 return state | wbZWJBit, false 161 } else if nextProperty == prExtend || nextProperty == prFormat { 162 // WB4 (for Extend and Format). 163 if state == wbNewline || state == wbCR || state == wbLF { 164 return wbAny, true // Make sure we don't apply WB4 to WB3a. 165 } 166 if state == wbWSegSpace || state == wbAny|wbZWJBit { 167 return wbAny, false // We don't break but this is also not WB3d or WB3c. 168 } 169 if state < 0 { 170 return wbAny, false 171 } 172 return state, false 173 } else if nextProperty == prExtendedPictographic && state >= 0 && state&wbZWJBit != 0 { 174 // WB3c. 175 return wbAny, false 176 } 177 if state >= 0 { 178 state = state &^ wbZWJBit 179 } 180 181 // Find the applicable transition in the table. 182 var rule int 183 newState, wordBreak, rule = wbTransitions(state, nextProperty) 184 if newState < 0 { 185 // No specific transition found. Try the less specific ones. 186 anyPropState, anyPropWordBreak, anyPropRule := wbTransitions(state, prAny) 187 anyStateState, anyStateWordBreak, anyStateRule := wbTransitions(wbAny, nextProperty) 188 if anyPropState >= 0 && anyStateState >= 0 { 189 // Both apply. We'll use a mix (see comments for grTransitions). 190 newState, wordBreak, rule = anyStateState, anyStateWordBreak, anyStateRule 191 if anyPropRule < anyStateRule { 192 wordBreak, rule = anyPropWordBreak, anyPropRule 193 } 194 } else if anyPropState >= 0 { 195 // We only have a specific state. 196 newState, wordBreak, rule = anyPropState, anyPropWordBreak, anyPropRule 197 // This branch will probably never be reached because okAnyState will 198 // always be true given the current transition map. But we keep it here 199 // for future modifications to the transition map where this may not be 200 // true anymore. 201 } else if anyStateState >= 0 { 202 // We only have a specific property. 203 newState, wordBreak, rule = anyStateState, anyStateWordBreak, anyStateRule 204 } else { 205 // No known transition. WB999: Any รท Any. 206 newState, wordBreak, rule = wbAny, true, 9990 207 } 208 } 209 210 // For those rules that need to look up runes further in the string, we 211 // determine the property after nextProperty, skipping over Format, Extend, 212 // and ZWJ (according to WB4). It's -1 if not needed, if such a rune cannot 213 // be determined (because the text ends or the rune is faulty). 214 farProperty := -1 215 if rule > 60 && 216 (state == wbALetter || state == wbHebrewLetter || state == wbNumeric) && 217 (nextProperty == prMidLetter || nextProperty == prMidNumLet || nextProperty == prSingleQuote || // WB6. 218 nextProperty == prDoubleQuote || // WB7b. 219 nextProperty == prMidNum) { // WB12. 220 for { 221 var ( 222 r rune 223 length int 224 ) 225 if b != nil { // Byte slice version. 226 r, length = utf8.DecodeRune(b) 227 b = b[length:] 228 } else { // String version. 229 r, length = utf8.DecodeRuneInString(str) 230 str = str[length:] 231 } 232 if r == utf8.RuneError { 233 break 234 } 235 prop := property(workBreakCodePoints, r) 236 if prop == prExtend || prop == prFormat || prop == prZWJ { 237 continue 238 } 239 farProperty = prop 240 break 241 } 242 } 243 244 // WB6. 245 if rule > 60 && 246 (state == wbALetter || state == wbHebrewLetter) && 247 (nextProperty == prMidLetter || nextProperty == prMidNumLet || nextProperty == prSingleQuote) && 248 (farProperty == prALetter || farProperty == prHebrewLetter) { 249 return wbWB7, false 250 } 251 252 // WB7b. 253 if rule > 72 && 254 state == wbHebrewLetter && 255 nextProperty == prDoubleQuote && 256 farProperty == prHebrewLetter { 257 return wbWB7c, false 258 } 259 260 // WB12. 261 if rule > 120 && 262 state == wbNumeric && 263 (nextProperty == prMidNum || nextProperty == prMidNumLet || nextProperty == prSingleQuote) && 264 farProperty == prNumeric { 265 return wbWB11, false 266 } 267 268 // WB15 and WB16. 269 if newState == wbAny && nextProperty == prRegionalIndicator { 270 if state != wbOddRI && state != wbEvenRI { // Includes state == -1. 271 // Transition into the first RI. 272 return wbOddRI, true 273 } 274 if state == wbOddRI { 275 // Don't break pairs of Regional Indicators. 276 return wbEvenRI, false 277 } 278 return wbOddRI, true // We can break after a pair. 279 } 280 281 return 282 }