src

Go monorepo.
git clone git://code.dwrz.net/src
Log | Files | Refs

wordrules.go (8395B)


      1 package uniseg
      2 
      3 import "unicode/utf8"
      4 
      5 // The states of the word break parser.
      6 const (
      7 	wbAny = iota
      8 	wbCR
      9 	wbLF
     10 	wbNewline
     11 	wbWSegSpace
     12 	wbHebrewLetter
     13 	wbALetter
     14 	wbWB7
     15 	wbWB7c
     16 	wbNumeric
     17 	wbWB11
     18 	wbKatakana
     19 	wbExtendNumLet
     20 	wbOddRI
     21 	wbEvenRI
     22 	wbZWJBit = 16 // This bit is set for any states followed by at least one zero-width joiner (see WB4 and WB3c).
     23 )
     24 
     25 // wbTransitions implements the word break parser's state transitions. It's
     26 // anologous to [grTransitions], see comments there for details.
     27 //
     28 // Unicode version 15.0.0.
     29 func wbTransitions(state, prop int) (newState int, wordBreak bool, rule int) {
     30 	switch uint64(state) | uint64(prop)<<32 {
     31 	// WB3b.
     32 	case wbAny | prNewline<<32:
     33 		return wbNewline, true, 32
     34 	case wbAny | prCR<<32:
     35 		return wbCR, true, 32
     36 	case wbAny | prLF<<32:
     37 		return wbLF, true, 32
     38 
     39 	// WB3a.
     40 	case wbNewline | prAny<<32:
     41 		return wbAny, true, 31
     42 	case wbCR | prAny<<32:
     43 		return wbAny, true, 31
     44 	case wbLF | prAny<<32:
     45 		return wbAny, true, 31
     46 
     47 	// WB3.
     48 	case wbCR | prLF<<32:
     49 		return wbLF, false, 30
     50 
     51 	// WB3d.
     52 	case wbAny | prWSegSpace<<32:
     53 		return wbWSegSpace, true, 9990
     54 	case wbWSegSpace | prWSegSpace<<32:
     55 		return wbWSegSpace, false, 34
     56 
     57 	// WB5.
     58 	case wbAny | prALetter<<32:
     59 		return wbALetter, true, 9990
     60 	case wbAny | prHebrewLetter<<32:
     61 		return wbHebrewLetter, true, 9990
     62 	case wbALetter | prALetter<<32:
     63 		return wbALetter, false, 50
     64 	case wbALetter | prHebrewLetter<<32:
     65 		return wbHebrewLetter, false, 50
     66 	case wbHebrewLetter | prALetter<<32:
     67 		return wbALetter, false, 50
     68 	case wbHebrewLetter | prHebrewLetter<<32:
     69 		return wbHebrewLetter, false, 50
     70 
     71 	// WB7. Transitions to wbWB7 handled by transitionWordBreakState().
     72 	case wbWB7 | prALetter<<32:
     73 		return wbALetter, false, 70
     74 	case wbWB7 | prHebrewLetter<<32:
     75 		return wbHebrewLetter, false, 70
     76 
     77 	// WB7a.
     78 	case wbHebrewLetter | prSingleQuote<<32:
     79 		return wbAny, false, 71
     80 
     81 	// WB7c. Transitions to wbWB7c handled by transitionWordBreakState().
     82 	case wbWB7c | prHebrewLetter<<32:
     83 		return wbHebrewLetter, false, 73
     84 
     85 	// WB8.
     86 	case wbAny | prNumeric<<32:
     87 		return wbNumeric, true, 9990
     88 	case wbNumeric | prNumeric<<32:
     89 		return wbNumeric, false, 80
     90 
     91 	// WB9.
     92 	case wbALetter | prNumeric<<32:
     93 		return wbNumeric, false, 90
     94 	case wbHebrewLetter | prNumeric<<32:
     95 		return wbNumeric, false, 90
     96 
     97 	// WB10.
     98 	case wbNumeric | prALetter<<32:
     99 		return wbALetter, false, 100
    100 	case wbNumeric | prHebrewLetter<<32:
    101 		return wbHebrewLetter, false, 100
    102 
    103 	// WB11. Transitions to wbWB11 handled by transitionWordBreakState().
    104 	case wbWB11 | prNumeric<<32:
    105 		return wbNumeric, false, 110
    106 
    107 	// WB13.
    108 	case wbAny | prKatakana<<32:
    109 		return wbKatakana, true, 9990
    110 	case wbKatakana | prKatakana<<32:
    111 		return wbKatakana, false, 130
    112 
    113 	// WB13a.
    114 	case wbAny | prExtendNumLet<<32:
    115 		return wbExtendNumLet, true, 9990
    116 	case wbALetter | prExtendNumLet<<32:
    117 		return wbExtendNumLet, false, 131
    118 	case wbHebrewLetter | prExtendNumLet<<32:
    119 		return wbExtendNumLet, false, 131
    120 	case wbNumeric | prExtendNumLet<<32:
    121 		return wbExtendNumLet, false, 131
    122 	case wbKatakana | prExtendNumLet<<32:
    123 		return wbExtendNumLet, false, 131
    124 	case wbExtendNumLet | prExtendNumLet<<32:
    125 		return wbExtendNumLet, false, 131
    126 
    127 	// WB13b.
    128 	case wbExtendNumLet | prALetter<<32:
    129 		return wbALetter, false, 132
    130 	case wbExtendNumLet | prHebrewLetter<<32:
    131 		return wbHebrewLetter, false, 132
    132 	case wbExtendNumLet | prNumeric<<32:
    133 		return wbNumeric, false, 132
    134 	case wbExtendNumLet | prKatakana<<32:
    135 		return wbKatakana, false, 132
    136 
    137 	default:
    138 		return -1, false, -1
    139 	}
    140 }
    141 
    142 // transitionWordBreakState determines the new state of the word break parser
    143 // given the current state and the next code point. It also returns whether a
    144 // word boundary was detected. If more than one code point is needed to
    145 // determine the new state, the byte slice or the string starting after rune "r"
    146 // can be used (whichever is not nil or empty) for further lookups.
    147 func transitionWordBreakState(state int, r rune, b []byte, str string) (newState int, wordBreak bool) {
    148 	// Determine the property of the next character.
    149 	nextProperty := property(workBreakCodePoints, r)
    150 
    151 	// "Replacing Ignore Rules".
    152 	if nextProperty == prZWJ {
    153 		// WB4 (for zero-width joiners).
    154 		if state == wbNewline || state == wbCR || state == wbLF {
    155 			return wbAny | wbZWJBit, true // Make sure we don't apply WB4 to WB3a.
    156 		}
    157 		if state < 0 {
    158 			return wbAny | wbZWJBit, false
    159 		}
    160 		return state | wbZWJBit, false
    161 	} else if nextProperty == prExtend || nextProperty == prFormat {
    162 		// WB4 (for Extend and Format).
    163 		if state == wbNewline || state == wbCR || state == wbLF {
    164 			return wbAny, true // Make sure we don't apply WB4 to WB3a.
    165 		}
    166 		if state == wbWSegSpace || state == wbAny|wbZWJBit {
    167 			return wbAny, false // We don't break but this is also not WB3d or WB3c.
    168 		}
    169 		if state < 0 {
    170 			return wbAny, false
    171 		}
    172 		return state, false
    173 	} else if nextProperty == prExtendedPictographic && state >= 0 && state&wbZWJBit != 0 {
    174 		// WB3c.
    175 		return wbAny, false
    176 	}
    177 	if state >= 0 {
    178 		state = state &^ wbZWJBit
    179 	}
    180 
    181 	// Find the applicable transition in the table.
    182 	var rule int
    183 	newState, wordBreak, rule = wbTransitions(state, nextProperty)
    184 	if newState < 0 {
    185 		// No specific transition found. Try the less specific ones.
    186 		anyPropState, anyPropWordBreak, anyPropRule := wbTransitions(state, prAny)
    187 		anyStateState, anyStateWordBreak, anyStateRule := wbTransitions(wbAny, nextProperty)
    188 		if anyPropState >= 0 && anyStateState >= 0 {
    189 			// Both apply. We'll use a mix (see comments for grTransitions).
    190 			newState, wordBreak, rule = anyStateState, anyStateWordBreak, anyStateRule
    191 			if anyPropRule < anyStateRule {
    192 				wordBreak, rule = anyPropWordBreak, anyPropRule
    193 			}
    194 		} else if anyPropState >= 0 {
    195 			// We only have a specific state.
    196 			newState, wordBreak, rule = anyPropState, anyPropWordBreak, anyPropRule
    197 			// This branch will probably never be reached because okAnyState will
    198 			// always be true given the current transition map. But we keep it here
    199 			// for future modifications to the transition map where this may not be
    200 			// true anymore.
    201 		} else if anyStateState >= 0 {
    202 			// We only have a specific property.
    203 			newState, wordBreak, rule = anyStateState, anyStateWordBreak, anyStateRule
    204 		} else {
    205 			// No known transition. WB999: Any รท Any.
    206 			newState, wordBreak, rule = wbAny, true, 9990
    207 		}
    208 	}
    209 
    210 	// For those rules that need to look up runes further in the string, we
    211 	// determine the property after nextProperty, skipping over Format, Extend,
    212 	// and ZWJ (according to WB4). It's -1 if not needed, if such a rune cannot
    213 	// be determined (because the text ends or the rune is faulty).
    214 	farProperty := -1
    215 	if rule > 60 &&
    216 		(state == wbALetter || state == wbHebrewLetter || state == wbNumeric) &&
    217 		(nextProperty == prMidLetter || nextProperty == prMidNumLet || nextProperty == prSingleQuote || // WB6.
    218 			nextProperty == prDoubleQuote || // WB7b.
    219 			nextProperty == prMidNum) { // WB12.
    220 		for {
    221 			var (
    222 				r      rune
    223 				length int
    224 			)
    225 			if b != nil { // Byte slice version.
    226 				r, length = utf8.DecodeRune(b)
    227 				b = b[length:]
    228 			} else { // String version.
    229 				r, length = utf8.DecodeRuneInString(str)
    230 				str = str[length:]
    231 			}
    232 			if r == utf8.RuneError {
    233 				break
    234 			}
    235 			prop := property(workBreakCodePoints, r)
    236 			if prop == prExtend || prop == prFormat || prop == prZWJ {
    237 				continue
    238 			}
    239 			farProperty = prop
    240 			break
    241 		}
    242 	}
    243 
    244 	// WB6.
    245 	if rule > 60 &&
    246 		(state == wbALetter || state == wbHebrewLetter) &&
    247 		(nextProperty == prMidLetter || nextProperty == prMidNumLet || nextProperty == prSingleQuote) &&
    248 		(farProperty == prALetter || farProperty == prHebrewLetter) {
    249 		return wbWB7, false
    250 	}
    251 
    252 	// WB7b.
    253 	if rule > 72 &&
    254 		state == wbHebrewLetter &&
    255 		nextProperty == prDoubleQuote &&
    256 		farProperty == prHebrewLetter {
    257 		return wbWB7c, false
    258 	}
    259 
    260 	// WB12.
    261 	if rule > 120 &&
    262 		state == wbNumeric &&
    263 		(nextProperty == prMidNum || nextProperty == prMidNumLet || nextProperty == prSingleQuote) &&
    264 		farProperty == prNumeric {
    265 		return wbWB11, false
    266 	}
    267 
    268 	// WB15 and WB16.
    269 	if newState == wbAny && nextProperty == prRegionalIndicator {
    270 		if state != wbOddRI && state != wbEvenRI { // Includes state == -1.
    271 			// Transition into the first RI.
    272 			return wbOddRI, true
    273 		}
    274 		if state == wbOddRI {
    275 			// Don't break pairs of Regional Indicators.
    276 			return wbEvenRI, false
    277 		}
    278 		return wbOddRI, true // We can break after a pair.
    279 	}
    280 
    281 	return
    282 }