sentencerules.go (7887B)
1 package uniseg 2 3 import "unicode/utf8" 4 5 // The states of the sentence break parser. 6 const ( 7 sbAny = iota 8 sbCR 9 sbParaSep 10 sbATerm 11 sbUpper 12 sbLower 13 sbSB7 14 sbSB8Close 15 sbSB8Sp 16 sbSTerm 17 sbSB8aClose 18 sbSB8aSp 19 ) 20 21 // sbTransitions implements the sentence break parser's state transitions. It's 22 // anologous to [grTransitions], see comments there for details. 23 // 24 // Unicode version 15.0.0. 25 func sbTransitions(state, prop int) (newState int, sentenceBreak bool, rule int) { 26 switch uint64(state) | uint64(prop)<<32 { 27 // SB3. 28 case sbAny | prCR<<32: 29 return sbCR, false, 9990 30 case sbCR | prLF<<32: 31 return sbParaSep, false, 30 32 33 // SB4. 34 case sbAny | prSep<<32: 35 return sbParaSep, false, 9990 36 case sbAny | prLF<<32: 37 return sbParaSep, false, 9990 38 case sbParaSep | prAny<<32: 39 return sbAny, true, 40 40 case sbCR | prAny<<32: 41 return sbAny, true, 40 42 43 // SB6. 44 case sbAny | prATerm<<32: 45 return sbATerm, false, 9990 46 case sbATerm | prNumeric<<32: 47 return sbAny, false, 60 48 case sbSB7 | prNumeric<<32: 49 return sbAny, false, 60 // Because ATerm also appears in SB7. 50 51 // SB7. 52 case sbAny | prUpper<<32: 53 return sbUpper, false, 9990 54 case sbAny | prLower<<32: 55 return sbLower, false, 9990 56 case sbUpper | prATerm<<32: 57 return sbSB7, false, 70 58 case sbLower | prATerm<<32: 59 return sbSB7, false, 70 60 case sbSB7 | prUpper<<32: 61 return sbUpper, false, 70 62 63 // SB8a. 64 case sbAny | prSTerm<<32: 65 return sbSTerm, false, 9990 66 case sbATerm | prSContinue<<32: 67 return sbAny, false, 81 68 case sbATerm | prATerm<<32: 69 return sbATerm, false, 81 70 case sbATerm | prSTerm<<32: 71 return sbSTerm, false, 81 72 case sbSB7 | prSContinue<<32: 73 return sbAny, false, 81 74 case sbSB7 | prATerm<<32: 75 return sbATerm, false, 81 76 case sbSB7 | prSTerm<<32: 77 return sbSTerm, false, 81 78 case sbSB8Close | prSContinue<<32: 79 return sbAny, false, 81 80 case sbSB8Close | prATerm<<32: 81 return sbATerm, false, 81 82 case sbSB8Close | prSTerm<<32: 83 return sbSTerm, false, 81 84 case sbSB8Sp | prSContinue<<32: 85 return sbAny, false, 81 86 case sbSB8Sp | prATerm<<32: 87 return sbATerm, false, 81 88 case sbSB8Sp | prSTerm<<32: 89 return sbSTerm, false, 81 90 case sbSTerm | prSContinue<<32: 91 return sbAny, false, 81 92 case sbSTerm | prATerm<<32: 93 return sbATerm, false, 81 94 case sbSTerm | prSTerm<<32: 95 return sbSTerm, false, 81 96 case sbSB8aClose | prSContinue<<32: 97 return sbAny, false, 81 98 case sbSB8aClose | prATerm<<32: 99 return sbATerm, false, 81 100 case sbSB8aClose | prSTerm<<32: 101 return sbSTerm, false, 81 102 case sbSB8aSp | prSContinue<<32: 103 return sbAny, false, 81 104 case sbSB8aSp | prATerm<<32: 105 return sbATerm, false, 81 106 case sbSB8aSp | prSTerm<<32: 107 return sbSTerm, false, 81 108 109 // SB9. 110 case sbATerm | prClose<<32: 111 return sbSB8Close, false, 90 112 case sbSB7 | prClose<<32: 113 return sbSB8Close, false, 90 114 case sbSB8Close | prClose<<32: 115 return sbSB8Close, false, 90 116 case sbATerm | prSp<<32: 117 return sbSB8Sp, false, 90 118 case sbSB7 | prSp<<32: 119 return sbSB8Sp, false, 90 120 case sbSB8Close | prSp<<32: 121 return sbSB8Sp, false, 90 122 case sbSTerm | prClose<<32: 123 return sbSB8aClose, false, 90 124 case sbSB8aClose | prClose<<32: 125 return sbSB8aClose, false, 90 126 case sbSTerm | prSp<<32: 127 return sbSB8aSp, false, 90 128 case sbSB8aClose | prSp<<32: 129 return sbSB8aSp, false, 90 130 case sbATerm | prSep<<32: 131 return sbParaSep, false, 90 132 case sbATerm | prCR<<32: 133 return sbParaSep, false, 90 134 case sbATerm | prLF<<32: 135 return sbParaSep, false, 90 136 case sbSB7 | prSep<<32: 137 return sbParaSep, false, 90 138 case sbSB7 | prCR<<32: 139 return sbParaSep, false, 90 140 case sbSB7 | prLF<<32: 141 return sbParaSep, false, 90 142 case sbSB8Close | prSep<<32: 143 return sbParaSep, false, 90 144 case sbSB8Close | prCR<<32: 145 return sbParaSep, false, 90 146 case sbSB8Close | prLF<<32: 147 return sbParaSep, false, 90 148 case sbSTerm | prSep<<32: 149 return sbParaSep, false, 90 150 case sbSTerm | prCR<<32: 151 return sbParaSep, false, 90 152 case sbSTerm | prLF<<32: 153 return sbParaSep, false, 90 154 case sbSB8aClose | prSep<<32: 155 return sbParaSep, false, 90 156 case sbSB8aClose | prCR<<32: 157 return sbParaSep, false, 90 158 case sbSB8aClose | prLF<<32: 159 return sbParaSep, false, 90 160 161 // SB10. 162 case sbSB8Sp | prSp<<32: 163 return sbSB8Sp, false, 100 164 case sbSB8aSp | prSp<<32: 165 return sbSB8aSp, false, 100 166 case sbSB8Sp | prSep<<32: 167 return sbParaSep, false, 100 168 case sbSB8Sp | prCR<<32: 169 return sbParaSep, false, 100 170 case sbSB8Sp | prLF<<32: 171 return sbParaSep, false, 100 172 173 // SB11. 174 case sbATerm | prAny<<32: 175 return sbAny, true, 110 176 case sbSB7 | prAny<<32: 177 return sbAny, true, 110 178 case sbSB8Close | prAny<<32: 179 return sbAny, true, 110 180 case sbSB8Sp | prAny<<32: 181 return sbAny, true, 110 182 case sbSTerm | prAny<<32: 183 return sbAny, true, 110 184 case sbSB8aClose | prAny<<32: 185 return sbAny, true, 110 186 case sbSB8aSp | prAny<<32: 187 return sbAny, true, 110 188 // We'll always break after ParaSep due to SB4. 189 190 default: 191 return -1, false, -1 192 } 193 } 194 195 // transitionSentenceBreakState determines the new state of the sentence break 196 // parser given the current state and the next code point. It also returns 197 // whether a sentence boundary was detected. If more than one code point is 198 // needed to determine the new state, the byte slice or the string starting 199 // after rune "r" can be used (whichever is not nil or empty) for further 200 // lookups. 201 func transitionSentenceBreakState(state int, r rune, b []byte, str string) (newState int, sentenceBreak bool) { 202 // Determine the property of the next character. 203 nextProperty := property(sentenceBreakCodePoints, r) 204 205 // SB5 (Replacing Ignore Rules). 206 if nextProperty == prExtend || nextProperty == prFormat { 207 if state == sbParaSep || state == sbCR { 208 return sbAny, true // Make sure we don't apply SB5 to SB3 or SB4. 209 } 210 if state < 0 { 211 return sbAny, true // SB1. 212 } 213 return state, false 214 } 215 216 // Find the applicable transition in the table. 217 var rule int 218 newState, sentenceBreak, rule = sbTransitions(state, nextProperty) 219 if newState < 0 { 220 // No specific transition found. Try the less specific ones. 221 anyPropState, anyPropProp, anyPropRule := sbTransitions(state, prAny) 222 anyStateState, anyStateProp, anyStateRule := sbTransitions(sbAny, nextProperty) 223 if anyPropState >= 0 && anyStateState >= 0 { 224 // Both apply. We'll use a mix (see comments for grTransitions). 225 newState, sentenceBreak, rule = anyStateState, anyStateProp, anyStateRule 226 if anyPropRule < anyStateRule { 227 sentenceBreak, rule = anyPropProp, anyPropRule 228 } 229 } else if anyPropState >= 0 { 230 // We only have a specific state. 231 newState, sentenceBreak, rule = anyPropState, anyPropProp, anyPropRule 232 // This branch will probably never be reached because okAnyState will 233 // always be true given the current transition map. But we keep it here 234 // for future modifications to the transition map where this may not be 235 // true anymore. 236 } else if anyStateState >= 0 { 237 // We only have a specific property. 238 newState, sentenceBreak, rule = anyStateState, anyStateProp, anyStateRule 239 } else { 240 // No known transition. SB999: Any × Any. 241 newState, sentenceBreak, rule = sbAny, false, 9990 242 } 243 } 244 245 // SB8. 246 if rule > 80 && (state == sbATerm || state == sbSB8Close || state == sbSB8Sp || state == sbSB7) { 247 // Check the right side of the rule. 248 var length int 249 for nextProperty != prOLetter && 250 nextProperty != prUpper && 251 nextProperty != prLower && 252 nextProperty != prSep && 253 nextProperty != prCR && 254 nextProperty != prLF && 255 nextProperty != prATerm && 256 nextProperty != prSTerm { 257 // Move on to the next rune. 258 if b != nil { // Byte slice version. 259 r, length = utf8.DecodeRune(b) 260 b = b[length:] 261 } else { // String version. 262 r, length = utf8.DecodeRuneInString(str) 263 str = str[length:] 264 } 265 if r == utf8.RuneError { 266 break 267 } 268 nextProperty = property(sentenceBreakCodePoints, r) 269 } 270 if nextProperty == prLower { 271 return sbLower, false 272 } 273 } 274 275 return 276 }