graphemerules.go (5152B)
1 package uniseg 2 3 // The states of the grapheme cluster parser. 4 const ( 5 grAny = iota 6 grCR 7 grControlLF 8 grL 9 grLVV 10 grLVTT 11 grPrepend 12 grExtendedPictographic 13 grExtendedPictographicZWJ 14 grRIOdd 15 grRIEven 16 ) 17 18 // The grapheme cluster parser's breaking instructions. 19 const ( 20 grNoBoundary = iota 21 grBoundary 22 ) 23 24 // grTransitions implements the grapheme cluster parser's state transitions. 25 // Maps state and property to a new state, a breaking instruction, and rule 26 // number. The breaking instruction always refers to the boundary between the 27 // last and next code point. Returns negative values if no transition is found. 28 // 29 // This function is used as follows: 30 // 31 // 1. Find specific state + specific property. Stop if found. 32 // 2. Find specific state + any property. 33 // 3. Find any state + specific property. 34 // 4. If only (2) or (3) (but not both) was found, stop. 35 // 5. If both (2) and (3) were found, use state from (3) and breaking instruction 36 // from the transition with the lower rule number, prefer (3) if rule numbers 37 // are equal. Stop. 38 // 6. Assume grAny and grBoundary. 39 // 40 // Unicode version 15.0.0. 41 func grTransitions(state, prop int) (newState int, newProp int, boundary int) { 42 // It turns out that using a big switch statement is much faster than using 43 // a map. 44 45 switch uint64(state) | uint64(prop)<<32 { 46 // GB5 47 case grAny | prCR<<32: 48 return grCR, grBoundary, 50 49 case grAny | prLF<<32: 50 return grControlLF, grBoundary, 50 51 case grAny | prControl<<32: 52 return grControlLF, grBoundary, 50 53 54 // GB4 55 case grCR | prAny<<32: 56 return grAny, grBoundary, 40 57 case grControlLF | prAny<<32: 58 return grAny, grBoundary, 40 59 60 // GB3 61 case grCR | prLF<<32: 62 return grControlLF, grNoBoundary, 30 63 64 // GB6 65 case grAny | prL<<32: 66 return grL, grBoundary, 9990 67 case grL | prL<<32: 68 return grL, grNoBoundary, 60 69 case grL | prV<<32: 70 return grLVV, grNoBoundary, 60 71 case grL | prLV<<32: 72 return grLVV, grNoBoundary, 60 73 case grL | prLVT<<32: 74 return grLVTT, grNoBoundary, 60 75 76 // GB7 77 case grAny | prLV<<32: 78 return grLVV, grBoundary, 9990 79 case grAny | prV<<32: 80 return grLVV, grBoundary, 9990 81 case grLVV | prV<<32: 82 return grLVV, grNoBoundary, 70 83 case grLVV | prT<<32: 84 return grLVTT, grNoBoundary, 70 85 86 // GB8 87 case grAny | prLVT<<32: 88 return grLVTT, grBoundary, 9990 89 case grAny | prT<<32: 90 return grLVTT, grBoundary, 9990 91 case grLVTT | prT<<32: 92 return grLVTT, grNoBoundary, 80 93 94 // GB9 95 case grAny | prExtend<<32: 96 return grAny, grNoBoundary, 90 97 case grAny | prZWJ<<32: 98 return grAny, grNoBoundary, 90 99 100 // GB9a 101 case grAny | prSpacingMark<<32: 102 return grAny, grNoBoundary, 91 103 104 // GB9b 105 case grAny | prPrepend<<32: 106 return grPrepend, grBoundary, 9990 107 case grPrepend | prAny<<32: 108 return grAny, grNoBoundary, 92 109 110 // GB11 111 case grAny | prExtendedPictographic<<32: 112 return grExtendedPictographic, grBoundary, 9990 113 case grExtendedPictographic | prExtend<<32: 114 return grExtendedPictographic, grNoBoundary, 110 115 case grExtendedPictographic | prZWJ<<32: 116 return grExtendedPictographicZWJ, grNoBoundary, 110 117 case grExtendedPictographicZWJ | prExtendedPictographic<<32: 118 return grExtendedPictographic, grNoBoundary, 110 119 120 // GB12 / GB13 121 case grAny | prRegionalIndicator<<32: 122 return grRIOdd, grBoundary, 9990 123 case grRIOdd | prRegionalIndicator<<32: 124 return grRIEven, grNoBoundary, 120 125 case grRIEven | prRegionalIndicator<<32: 126 return grRIOdd, grBoundary, 120 127 default: 128 return -1, -1, -1 129 } 130 } 131 132 // transitionGraphemeState determines the new state of the grapheme cluster 133 // parser given the current state and the next code point. It also returns the 134 // code point's grapheme property (the value mapped by the [graphemeCodePoints] 135 // table) and whether a cluster boundary was detected. 136 func transitionGraphemeState(state int, r rune) (newState, prop int, boundary bool) { 137 // Determine the property of the next character. 138 prop = propertyGraphemes(r) 139 140 // Find the applicable transition. 141 nextState, nextProp, _ := grTransitions(state, prop) 142 if nextState >= 0 { 143 // We have a specific transition. We'll use it. 144 return nextState, prop, nextProp == grBoundary 145 } 146 147 // No specific transition found. Try the less specific ones. 148 anyPropState, anyPropProp, anyPropRule := grTransitions(state, prAny) 149 anyStateState, anyStateProp, anyStateRule := grTransitions(grAny, prop) 150 if anyPropState >= 0 && anyStateState >= 0 { 151 // Both apply. We'll use a mix (see comments for grTransitions). 152 newState = anyStateState 153 boundary = anyStateProp == grBoundary 154 if anyPropRule < anyStateRule { 155 boundary = anyPropProp == grBoundary 156 } 157 return 158 } 159 160 if anyPropState >= 0 { 161 // We only have a specific state. 162 return anyPropState, prop, anyPropProp == grBoundary 163 // This branch will probably never be reached because okAnyState will 164 // always be true given the current transition map. But we keep it here 165 // for future modifications to the transition map where this may not be 166 // true anymore. 167 } 168 169 if anyStateState >= 0 { 170 // We only have a specific property. 171 return anyStateState, prop, anyStateProp == grBoundary 172 } 173 174 // No known transition. GB999: Any รท Any. 175 return grAny, prop, true 176 }