src

Go monorepo.
git clone git://code.dwrz.net/src
Log | Files | Refs

sentencerules.go (7887B)


      1 package uniseg
      2 
      3 import "unicode/utf8"
      4 
      5 // The states of the sentence break parser.
      6 const (
      7 	sbAny = iota
      8 	sbCR
      9 	sbParaSep
     10 	sbATerm
     11 	sbUpper
     12 	sbLower
     13 	sbSB7
     14 	sbSB8Close
     15 	sbSB8Sp
     16 	sbSTerm
     17 	sbSB8aClose
     18 	sbSB8aSp
     19 )
     20 
     21 // sbTransitions implements the sentence break parser's state transitions. It's
     22 // anologous to [grTransitions], see comments there for details.
     23 //
     24 // Unicode version 15.0.0.
     25 func sbTransitions(state, prop int) (newState int, sentenceBreak bool, rule int) {
     26 	switch uint64(state) | uint64(prop)<<32 {
     27 	// SB3.
     28 	case sbAny | prCR<<32:
     29 		return sbCR, false, 9990
     30 	case sbCR | prLF<<32:
     31 		return sbParaSep, false, 30
     32 
     33 	// SB4.
     34 	case sbAny | prSep<<32:
     35 		return sbParaSep, false, 9990
     36 	case sbAny | prLF<<32:
     37 		return sbParaSep, false, 9990
     38 	case sbParaSep | prAny<<32:
     39 		return sbAny, true, 40
     40 	case sbCR | prAny<<32:
     41 		return sbAny, true, 40
     42 
     43 	// SB6.
     44 	case sbAny | prATerm<<32:
     45 		return sbATerm, false, 9990
     46 	case sbATerm | prNumeric<<32:
     47 		return sbAny, false, 60
     48 	case sbSB7 | prNumeric<<32:
     49 		return sbAny, false, 60 // Because ATerm also appears in SB7.
     50 
     51 	// SB7.
     52 	case sbAny | prUpper<<32:
     53 		return sbUpper, false, 9990
     54 	case sbAny | prLower<<32:
     55 		return sbLower, false, 9990
     56 	case sbUpper | prATerm<<32:
     57 		return sbSB7, false, 70
     58 	case sbLower | prATerm<<32:
     59 		return sbSB7, false, 70
     60 	case sbSB7 | prUpper<<32:
     61 		return sbUpper, false, 70
     62 
     63 	// SB8a.
     64 	case sbAny | prSTerm<<32:
     65 		return sbSTerm, false, 9990
     66 	case sbATerm | prSContinue<<32:
     67 		return sbAny, false, 81
     68 	case sbATerm | prATerm<<32:
     69 		return sbATerm, false, 81
     70 	case sbATerm | prSTerm<<32:
     71 		return sbSTerm, false, 81
     72 	case sbSB7 | prSContinue<<32:
     73 		return sbAny, false, 81
     74 	case sbSB7 | prATerm<<32:
     75 		return sbATerm, false, 81
     76 	case sbSB7 | prSTerm<<32:
     77 		return sbSTerm, false, 81
     78 	case sbSB8Close | prSContinue<<32:
     79 		return sbAny, false, 81
     80 	case sbSB8Close | prATerm<<32:
     81 		return sbATerm, false, 81
     82 	case sbSB8Close | prSTerm<<32:
     83 		return sbSTerm, false, 81
     84 	case sbSB8Sp | prSContinue<<32:
     85 		return sbAny, false, 81
     86 	case sbSB8Sp | prATerm<<32:
     87 		return sbATerm, false, 81
     88 	case sbSB8Sp | prSTerm<<32:
     89 		return sbSTerm, false, 81
     90 	case sbSTerm | prSContinue<<32:
     91 		return sbAny, false, 81
     92 	case sbSTerm | prATerm<<32:
     93 		return sbATerm, false, 81
     94 	case sbSTerm | prSTerm<<32:
     95 		return sbSTerm, false, 81
     96 	case sbSB8aClose | prSContinue<<32:
     97 		return sbAny, false, 81
     98 	case sbSB8aClose | prATerm<<32:
     99 		return sbATerm, false, 81
    100 	case sbSB8aClose | prSTerm<<32:
    101 		return sbSTerm, false, 81
    102 	case sbSB8aSp | prSContinue<<32:
    103 		return sbAny, false, 81
    104 	case sbSB8aSp | prATerm<<32:
    105 		return sbATerm, false, 81
    106 	case sbSB8aSp | prSTerm<<32:
    107 		return sbSTerm, false, 81
    108 
    109 	// SB9.
    110 	case sbATerm | prClose<<32:
    111 		return sbSB8Close, false, 90
    112 	case sbSB7 | prClose<<32:
    113 		return sbSB8Close, false, 90
    114 	case sbSB8Close | prClose<<32:
    115 		return sbSB8Close, false, 90
    116 	case sbATerm | prSp<<32:
    117 		return sbSB8Sp, false, 90
    118 	case sbSB7 | prSp<<32:
    119 		return sbSB8Sp, false, 90
    120 	case sbSB8Close | prSp<<32:
    121 		return sbSB8Sp, false, 90
    122 	case sbSTerm | prClose<<32:
    123 		return sbSB8aClose, false, 90
    124 	case sbSB8aClose | prClose<<32:
    125 		return sbSB8aClose, false, 90
    126 	case sbSTerm | prSp<<32:
    127 		return sbSB8aSp, false, 90
    128 	case sbSB8aClose | prSp<<32:
    129 		return sbSB8aSp, false, 90
    130 	case sbATerm | prSep<<32:
    131 		return sbParaSep, false, 90
    132 	case sbATerm | prCR<<32:
    133 		return sbParaSep, false, 90
    134 	case sbATerm | prLF<<32:
    135 		return sbParaSep, false, 90
    136 	case sbSB7 | prSep<<32:
    137 		return sbParaSep, false, 90
    138 	case sbSB7 | prCR<<32:
    139 		return sbParaSep, false, 90
    140 	case sbSB7 | prLF<<32:
    141 		return sbParaSep, false, 90
    142 	case sbSB8Close | prSep<<32:
    143 		return sbParaSep, false, 90
    144 	case sbSB8Close | prCR<<32:
    145 		return sbParaSep, false, 90
    146 	case sbSB8Close | prLF<<32:
    147 		return sbParaSep, false, 90
    148 	case sbSTerm | prSep<<32:
    149 		return sbParaSep, false, 90
    150 	case sbSTerm | prCR<<32:
    151 		return sbParaSep, false, 90
    152 	case sbSTerm | prLF<<32:
    153 		return sbParaSep, false, 90
    154 	case sbSB8aClose | prSep<<32:
    155 		return sbParaSep, false, 90
    156 	case sbSB8aClose | prCR<<32:
    157 		return sbParaSep, false, 90
    158 	case sbSB8aClose | prLF<<32:
    159 		return sbParaSep, false, 90
    160 
    161 	// SB10.
    162 	case sbSB8Sp | prSp<<32:
    163 		return sbSB8Sp, false, 100
    164 	case sbSB8aSp | prSp<<32:
    165 		return sbSB8aSp, false, 100
    166 	case sbSB8Sp | prSep<<32:
    167 		return sbParaSep, false, 100
    168 	case sbSB8Sp | prCR<<32:
    169 		return sbParaSep, false, 100
    170 	case sbSB8Sp | prLF<<32:
    171 		return sbParaSep, false, 100
    172 
    173 	// SB11.
    174 	case sbATerm | prAny<<32:
    175 		return sbAny, true, 110
    176 	case sbSB7 | prAny<<32:
    177 		return sbAny, true, 110
    178 	case sbSB8Close | prAny<<32:
    179 		return sbAny, true, 110
    180 	case sbSB8Sp | prAny<<32:
    181 		return sbAny, true, 110
    182 	case sbSTerm | prAny<<32:
    183 		return sbAny, true, 110
    184 	case sbSB8aClose | prAny<<32:
    185 		return sbAny, true, 110
    186 	case sbSB8aSp | prAny<<32:
    187 		return sbAny, true, 110
    188 	// We'll always break after ParaSep due to SB4.
    189 
    190 	default:
    191 		return -1, false, -1
    192 	}
    193 }
    194 
    195 // transitionSentenceBreakState determines the new state of the sentence break
    196 // parser given the current state and the next code point. It also returns
    197 // whether a sentence boundary was detected. If more than one code point is
    198 // needed to determine the new state, the byte slice or the string starting
    199 // after rune "r" can be used (whichever is not nil or empty) for further
    200 // lookups.
    201 func transitionSentenceBreakState(state int, r rune, b []byte, str string) (newState int, sentenceBreak bool) {
    202 	// Determine the property of the next character.
    203 	nextProperty := property(sentenceBreakCodePoints, r)
    204 
    205 	// SB5 (Replacing Ignore Rules).
    206 	if nextProperty == prExtend || nextProperty == prFormat {
    207 		if state == sbParaSep || state == sbCR {
    208 			return sbAny, true // Make sure we don't apply SB5 to SB3 or SB4.
    209 		}
    210 		if state < 0 {
    211 			return sbAny, true // SB1.
    212 		}
    213 		return state, false
    214 	}
    215 
    216 	// Find the applicable transition in the table.
    217 	var rule int
    218 	newState, sentenceBreak, rule = sbTransitions(state, nextProperty)
    219 	if newState < 0 {
    220 		// No specific transition found. Try the less specific ones.
    221 		anyPropState, anyPropProp, anyPropRule := sbTransitions(state, prAny)
    222 		anyStateState, anyStateProp, anyStateRule := sbTransitions(sbAny, nextProperty)
    223 		if anyPropState >= 0 && anyStateState >= 0 {
    224 			// Both apply. We'll use a mix (see comments for grTransitions).
    225 			newState, sentenceBreak, rule = anyStateState, anyStateProp, anyStateRule
    226 			if anyPropRule < anyStateRule {
    227 				sentenceBreak, rule = anyPropProp, anyPropRule
    228 			}
    229 		} else if anyPropState >= 0 {
    230 			// We only have a specific state.
    231 			newState, sentenceBreak, rule = anyPropState, anyPropProp, anyPropRule
    232 			// This branch will probably never be reached because okAnyState will
    233 			// always be true given the current transition map. But we keep it here
    234 			// for future modifications to the transition map where this may not be
    235 			// true anymore.
    236 		} else if anyStateState >= 0 {
    237 			// We only have a specific property.
    238 			newState, sentenceBreak, rule = anyStateState, anyStateProp, anyStateRule
    239 		} else {
    240 			// No known transition. SB999: Any × Any.
    241 			newState, sentenceBreak, rule = sbAny, false, 9990
    242 		}
    243 	}
    244 
    245 	// SB8.
    246 	if rule > 80 && (state == sbATerm || state == sbSB8Close || state == sbSB8Sp || state == sbSB7) {
    247 		// Check the right side of the rule.
    248 		var length int
    249 		for nextProperty != prOLetter &&
    250 			nextProperty != prUpper &&
    251 			nextProperty != prLower &&
    252 			nextProperty != prSep &&
    253 			nextProperty != prCR &&
    254 			nextProperty != prLF &&
    255 			nextProperty != prATerm &&
    256 			nextProperty != prSTerm {
    257 			// Move on to the next rune.
    258 			if b != nil { // Byte slice version.
    259 				r, length = utf8.DecodeRune(b)
    260 				b = b[length:]
    261 			} else { // String version.
    262 				r, length = utf8.DecodeRuneInString(str)
    263 				str = str[length:]
    264 			}
    265 			if r == utf8.RuneError {
    266 				break
    267 			}
    268 			nextProperty = property(sentenceBreakCodePoints, r)
    269 		}
    270 		if nextProperty == prLower {
    271 			return sbLower, false
    272 		}
    273 	}
    274 
    275 	return
    276 }