src

Go monorepo.
git clone git://code.dwrz.net/src
Log | Files | Refs

graphemerules.go (5152B)


      1 package uniseg
      2 
// The states of the grapheme cluster parser. A state summarizes which code
// points precede the current position, as far as that matters for deciding
// where the next cluster boundary lies. The values are packed into the switch
// keys of grTransitions, so they must remain small, non-negative, untyped
// constants.
const (
	// grAny is the default state. It also serves as the "any state" wildcard
	// when looking up property-only transitions.
	grAny = iota
	// grCR is entered after a CR.
	grCR
	// grControlLF is entered after an LF or Control code point, including an
	// LF that directly follows a CR.
	grControlLF
	// grL is entered after a code point with the L property.
	grL
	// grLVV is entered after a code point with the LV or V property.
	grLVV
	// grLVTT is entered after a code point with the LVT or T property.
	grLVTT
	// grPrepend is entered after a code point with the Prepend property.
	grPrepend
	// grExtendedPictographic is entered after an ExtendedPictographic code
	// point and is kept while Extend code points follow.
	grExtendedPictographic
	// grExtendedPictographicZWJ is entered when a ZWJ follows in the
	// grExtendedPictographic state.
	grExtendedPictographicZWJ
	// grRIOdd is entered after an odd number of consecutive RegionalIndicator
	// code points.
	grRIOdd
	// grRIEven is entered after an even number of consecutive
	// RegionalIndicator code points.
	grRIEven
)
     17 
// The grapheme cluster parser's breaking instructions. They describe the
// position between the last code point and the next one.
const (
	// grNoBoundary means the two code points stay in the same cluster.
	grNoBoundary = iota
	// grBoundary means a cluster boundary lies between the two code points.
	grBoundary
)
     23 
     24 // grTransitions implements the grapheme cluster parser's state transitions.
     25 // Maps state and property to a new state, a breaking instruction, and rule
     26 // number. The breaking instruction always refers to the boundary between the
     27 // last and next code point. Returns negative values if no transition is found.
     28 //
     29 // This function is used as follows:
     30 //
     31 //  1. Find specific state + specific property. Stop if found.
     32 //  2. Find specific state + any property.
     33 //  3. Find any state + specific property.
     34 //  4. If only (2) or (3) (but not both) was found, stop.
     35 //  5. If both (2) and (3) were found, use state from (3) and breaking instruction
     36 //     from the transition with the lower rule number, prefer (3) if rule numbers
     37 //     are equal. Stop.
     38 //  6. Assume grAny and grBoundary.
     39 //
     40 // Unicode version 15.0.0.
     41 func grTransitions(state, prop int) (newState int, newProp int, boundary int) {
     42 	// It turns out that using a big switch statement is much faster than using
     43 	// a map.
     44 
     45 	switch uint64(state) | uint64(prop)<<32 {
     46 	// GB5
     47 	case grAny | prCR<<32:
     48 		return grCR, grBoundary, 50
     49 	case grAny | prLF<<32:
     50 		return grControlLF, grBoundary, 50
     51 	case grAny | prControl<<32:
     52 		return grControlLF, grBoundary, 50
     53 
     54 	// GB4
     55 	case grCR | prAny<<32:
     56 		return grAny, grBoundary, 40
     57 	case grControlLF | prAny<<32:
     58 		return grAny, grBoundary, 40
     59 
     60 	// GB3
     61 	case grCR | prLF<<32:
     62 		return grControlLF, grNoBoundary, 30
     63 
     64 	// GB6
     65 	case grAny | prL<<32:
     66 		return grL, grBoundary, 9990
     67 	case grL | prL<<32:
     68 		return grL, grNoBoundary, 60
     69 	case grL | prV<<32:
     70 		return grLVV, grNoBoundary, 60
     71 	case grL | prLV<<32:
     72 		return grLVV, grNoBoundary, 60
     73 	case grL | prLVT<<32:
     74 		return grLVTT, grNoBoundary, 60
     75 
     76 	// GB7
     77 	case grAny | prLV<<32:
     78 		return grLVV, grBoundary, 9990
     79 	case grAny | prV<<32:
     80 		return grLVV, grBoundary, 9990
     81 	case grLVV | prV<<32:
     82 		return grLVV, grNoBoundary, 70
     83 	case grLVV | prT<<32:
     84 		return grLVTT, grNoBoundary, 70
     85 
     86 	// GB8
     87 	case grAny | prLVT<<32:
     88 		return grLVTT, grBoundary, 9990
     89 	case grAny | prT<<32:
     90 		return grLVTT, grBoundary, 9990
     91 	case grLVTT | prT<<32:
     92 		return grLVTT, grNoBoundary, 80
     93 
     94 	// GB9
     95 	case grAny | prExtend<<32:
     96 		return grAny, grNoBoundary, 90
     97 	case grAny | prZWJ<<32:
     98 		return grAny, grNoBoundary, 90
     99 
    100 	// GB9a
    101 	case grAny | prSpacingMark<<32:
    102 		return grAny, grNoBoundary, 91
    103 
    104 	// GB9b
    105 	case grAny | prPrepend<<32:
    106 		return grPrepend, grBoundary, 9990
    107 	case grPrepend | prAny<<32:
    108 		return grAny, grNoBoundary, 92
    109 
    110 	// GB11
    111 	case grAny | prExtendedPictographic<<32:
    112 		return grExtendedPictographic, grBoundary, 9990
    113 	case grExtendedPictographic | prExtend<<32:
    114 		return grExtendedPictographic, grNoBoundary, 110
    115 	case grExtendedPictographic | prZWJ<<32:
    116 		return grExtendedPictographicZWJ, grNoBoundary, 110
    117 	case grExtendedPictographicZWJ | prExtendedPictographic<<32:
    118 		return grExtendedPictographic, grNoBoundary, 110
    119 
    120 	// GB12 / GB13
    121 	case grAny | prRegionalIndicator<<32:
    122 		return grRIOdd, grBoundary, 9990
    123 	case grRIOdd | prRegionalIndicator<<32:
    124 		return grRIEven, grNoBoundary, 120
    125 	case grRIEven | prRegionalIndicator<<32:
    126 		return grRIOdd, grBoundary, 120
    127 	default:
    128 		return -1, -1, -1
    129 	}
    130 }
    131 
    132 // transitionGraphemeState determines the new state of the grapheme cluster
    133 // parser given the current state and the next code point. It also returns the
    134 // code point's grapheme property (the value mapped by the [graphemeCodePoints]
    135 // table) and whether a cluster boundary was detected.
    136 func transitionGraphemeState(state int, r rune) (newState, prop int, boundary bool) {
    137 	// Determine the property of the next character.
    138 	prop = propertyGraphemes(r)
    139 
    140 	// Find the applicable transition.
    141 	nextState, nextProp, _ := grTransitions(state, prop)
    142 	if nextState >= 0 {
    143 		// We have a specific transition. We'll use it.
    144 		return nextState, prop, nextProp == grBoundary
    145 	}
    146 
    147 	// No specific transition found. Try the less specific ones.
    148 	anyPropState, anyPropProp, anyPropRule := grTransitions(state, prAny)
    149 	anyStateState, anyStateProp, anyStateRule := grTransitions(grAny, prop)
    150 	if anyPropState >= 0 && anyStateState >= 0 {
    151 		// Both apply. We'll use a mix (see comments for grTransitions).
    152 		newState = anyStateState
    153 		boundary = anyStateProp == grBoundary
    154 		if anyPropRule < anyStateRule {
    155 			boundary = anyPropProp == grBoundary
    156 		}
    157 		return
    158 	}
    159 
    160 	if anyPropState >= 0 {
    161 		// We only have a specific state.
    162 		return anyPropState, prop, anyPropProp == grBoundary
    163 		// This branch will probably never be reached because okAnyState will
    164 		// always be true given the current transition map. But we keep it here
    165 		// for future modifications to the transition map where this may not be
    166 		// true anymore.
    167 	}
    168 
    169 	if anyStateState >= 0 {
    170 		// We only have a specific property.
    171 		return anyStateState, prop, anyStateProp == grBoundary
    172 	}
    173 
    174 	// No known transition. GB999: Any รท Any.
    175 	return grAny, prop, true
    176 }