step.go (9698B)
1 package uniseg 2 3 import "unicode/utf8" 4 5 // The bit masks used to extract boundary information returned by [Step]. 6 const ( 7 MaskLine = 3 8 MaskWord = 4 9 MaskSentence = 8 10 ) 11 12 // The number of bits to shift the boundary information returned by [Step] to 13 // obtain the monospace width of the grapheme cluster. 14 const ShiftWidth = 4 15 16 // The bit positions by which boundary flags are shifted by the [Step] function. 17 // These must correspond to the Mask constants. 18 const ( 19 shiftWord = 2 20 shiftSentence = 3 21 // shiftwWidth is ShiftWidth above. No mask as these are always the remaining bits. 22 ) 23 24 // The bit positions by which states are shifted by the [Step] function. These 25 // values must ensure state values defined for each of the boundary algorithms 26 // don't overlap (and that they all still fit in a single int). These must 27 // correspond to the Mask constants. 28 const ( 29 shiftWordState = 4 30 shiftSentenceState = 9 31 shiftLineState = 13 32 shiftPropState = 21 // No mask as these are always the remaining bits. 33 ) 34 35 // The bit mask used to extract the state returned by the [Step] function, after 36 // shifting. These values must correspond to the shift constants. 37 const ( 38 maskGraphemeState = 0xf 39 maskWordState = 0x1f 40 maskSentenceState = 0xf 41 maskLineState = 0xff 42 ) 43 44 // Step returns the first grapheme cluster (user-perceived character) found in 45 // the given byte slice. It also returns information about the boundary between 46 // that grapheme cluster and the one following it as well as the monospace width 47 // of the grapheme cluster. There are three types of boundary information: word 48 // boundaries, sentence boundaries, and line breaks. This function is therefore 49 // a combination of [FirstGraphemeCluster], [FirstWord], [FirstSentence], and 50 // [FirstLineSegment]. 51 // 52 // The "boundaries" return value can be evaluated as follows: 53 // 54 // - boundaries&MaskWord != 0: The boundary is a word boundary. 55 // - boundaries&MaskWord == 0: The boundary is not a word boundary. 56 // - boundaries&MaskSentence != 0: The boundary is a sentence boundary. 57 // - boundaries&MaskSentence == 0: The boundary is not a sentence boundary. 58 // - boundaries&MaskLine == LineDontBreak: You must not break the line at the 59 // boundary. 60 // - boundaries&MaskLine == LineMustBreak: You must break the line at the 61 // boundary. 62 // - boundaries&MaskLine == LineCanBreak: You may or may not break the line at 63 // the boundary. 64 // - boundaries >> ShiftWidth: The width of the grapheme cluster for most 65 // monospace fonts where a value of 1 represents one character cell. 66 // 67 // This function can be called continuously to extract all grapheme clusters 68 // from a byte slice, as illustrated in the examples below. 69 // 70 // If you don't know which state to pass, for example when calling the function 71 // for the first time, you must pass -1. For consecutive calls, pass the state 72 // and rest slice returned by the previous call. 73 // 74 // The "rest" slice is the sub-slice of the original byte slice "b" starting 75 // after the last byte of the identified grapheme cluster. If the length of the 76 // "rest" slice is 0, the entire byte slice "b" has been processed. The 77 // "cluster" byte slice is the sub-slice of the input slice containing the 78 // first identified grapheme cluster. 79 // 80 // Given an empty byte slice "b", the function returns nil values. 81 // 82 // While slightly less convenient than using the Graphemes class, this function 83 // has much better performance and makes no allocations. It lends itself well to 84 // large byte slices. 85 // 86 // Note that in accordance with [UAX #14 LB3], the final segment will end with 87 // a mandatory line break (boundaries&MaskLine == LineMustBreak). You can choose 88 // to ignore this by checking if the length of the "rest" slice is 0 and calling 89 // [HasTrailingLineBreak] or [HasTrailingLineBreakInString] on the last rune. 90 // 91 // [UAX #14 LB3]: https://www.unicode.org/reports/tr14/#Algorithm 92 func Step(b []byte, state int) (cluster, rest []byte, boundaries int, newState int) { 93 // An empty byte slice returns nothing. 94 if len(b) == 0 { 95 return 96 } 97 98 // Extract the first rune. 99 r, length := utf8.DecodeRune(b) 100 if len(b) <= length { // If we're already past the end, there is nothing else to parse. 101 var prop int 102 if state < 0 { 103 prop = propertyGraphemes(r) 104 } else { 105 prop = state >> shiftPropState 106 } 107 return b, nil, LineMustBreak | (1 << shiftWord) | (1 << shiftSentence) | (runeWidth(r, prop) << ShiftWidth), grAny | (wbAny << shiftWordState) | (sbAny << shiftSentenceState) | (lbAny << shiftLineState) | (prop << shiftPropState) 108 } 109 110 // If we don't know the state, determine it now. 111 var graphemeState, wordState, sentenceState, lineState, firstProp int 112 remainder := b[length:] 113 if state < 0 { 114 graphemeState, firstProp, _ = transitionGraphemeState(state, r) 115 wordState, _ = transitionWordBreakState(state, r, remainder, "") 116 sentenceState, _ = transitionSentenceBreakState(state, r, remainder, "") 117 lineState, _ = transitionLineBreakState(state, r, remainder, "") 118 } else { 119 graphemeState = state & maskGraphemeState 120 wordState = (state >> shiftWordState) & maskWordState 121 sentenceState = (state >> shiftSentenceState) & maskSentenceState 122 lineState = (state >> shiftLineState) & maskLineState 123 firstProp = state >> shiftPropState 124 } 125 126 // Transition until we find a grapheme cluster boundary. 127 width := runeWidth(r, firstProp) 128 for { 129 var ( 130 graphemeBoundary, wordBoundary, sentenceBoundary bool 131 lineBreak, prop int 132 ) 133 134 r, l := utf8.DecodeRune(remainder) 135 remainder = b[length+l:] 136 137 graphemeState, prop, graphemeBoundary = transitionGraphemeState(graphemeState, r) 138 wordState, wordBoundary = transitionWordBreakState(wordState, r, remainder, "") 139 sentenceState, sentenceBoundary = transitionSentenceBreakState(sentenceState, r, remainder, "") 140 lineState, lineBreak = transitionLineBreakState(lineState, r, remainder, "") 141 142 if graphemeBoundary { 143 boundary := lineBreak | (width << ShiftWidth) 144 if wordBoundary { 145 boundary |= 1 << shiftWord 146 } 147 if sentenceBoundary { 148 boundary |= 1 << shiftSentence 149 } 150 return b[:length], b[length:], boundary, graphemeState | (wordState << shiftWordState) | (sentenceState << shiftSentenceState) | (lineState << shiftLineState) | (prop << shiftPropState) 151 } 152 153 if firstProp == prExtendedPictographic { 154 if r == vs15 { 155 width = 1 156 } else if r == vs16 { 157 width = 2 158 } 159 } else if firstProp != prRegionalIndicator && firstProp != prL { 160 width += runeWidth(r, prop) 161 } 162 163 length += l 164 if len(b) <= length { 165 return b, nil, LineMustBreak | (1 << shiftWord) | (1 << shiftSentence) | (width << ShiftWidth), grAny | (wbAny << shiftWordState) | (sbAny << shiftSentenceState) | (lbAny << shiftLineState) | (prop << shiftPropState) 166 } 167 } 168 } 169 170 // StepString is like [Step] but its input and outputs are strings. 171 func StepString(str string, state int) (cluster, rest string, boundaries int, newState int) { 172 // An empty byte slice returns nothing. 173 if len(str) == 0 { 174 return 175 } 176 177 // Extract the first rune. 178 r, length := utf8.DecodeRuneInString(str) 179 if len(str) <= length { // If we're already past the end, there is nothing else to parse. 180 prop := propertyGraphemes(r) 181 return str, "", LineMustBreak | (1 << shiftWord) | (1 << shiftSentence) | (runeWidth(r, prop) << ShiftWidth), grAny | (wbAny << shiftWordState) | (sbAny << shiftSentenceState) | (lbAny << shiftLineState) 182 } 183 184 // If we don't know the state, determine it now. 185 var graphemeState, wordState, sentenceState, lineState, firstProp int 186 remainder := str[length:] 187 if state < 0 { 188 graphemeState, firstProp, _ = transitionGraphemeState(state, r) 189 wordState, _ = transitionWordBreakState(state, r, nil, remainder) 190 sentenceState, _ = transitionSentenceBreakState(state, r, nil, remainder) 191 lineState, _ = transitionLineBreakState(state, r, nil, remainder) 192 } else { 193 graphemeState = state & maskGraphemeState 194 wordState = (state >> shiftWordState) & maskWordState 195 sentenceState = (state >> shiftSentenceState) & maskSentenceState 196 lineState = (state >> shiftLineState) & maskLineState 197 firstProp = state >> shiftPropState 198 } 199 200 // Transition until we find a grapheme cluster boundary. 201 width := runeWidth(r, firstProp) 202 for { 203 var ( 204 graphemeBoundary, wordBoundary, sentenceBoundary bool 205 lineBreak, prop int 206 ) 207 208 r, l := utf8.DecodeRuneInString(remainder) 209 remainder = str[length+l:] 210 211 graphemeState, prop, graphemeBoundary = transitionGraphemeState(graphemeState, r) 212 wordState, wordBoundary = transitionWordBreakState(wordState, r, nil, remainder) 213 sentenceState, sentenceBoundary = transitionSentenceBreakState(sentenceState, r, nil, remainder) 214 lineState, lineBreak = transitionLineBreakState(lineState, r, nil, remainder) 215 216 if graphemeBoundary { 217 boundary := lineBreak | (width << ShiftWidth) 218 if wordBoundary { 219 boundary |= 1 << shiftWord 220 } 221 if sentenceBoundary { 222 boundary |= 1 << shiftSentence 223 } 224 return str[:length], str[length:], boundary, graphemeState | (wordState << shiftWordState) | (sentenceState << shiftSentenceState) | (lineState << shiftLineState) | (prop << shiftPropState) 225 } 226 227 if firstProp == prExtendedPictographic { 228 if r == vs15 { 229 width = 1 230 } else if r == vs16 { 231 width = 2 232 } 233 } else if firstProp != prRegionalIndicator && firstProp != prL { 234 width += runeWidth(r, prop) 235 } 236 237 length += l 238 if len(str) <= length { 239 return str, "", LineMustBreak | (1 << shiftWord) | (1 << shiftSentence) | (width << ShiftWidth), grAny | (wbAny << shiftWordState) | (sbAny << shiftSentenceState) | (lbAny << shiftLineState) | (prop << shiftPropState) 240 } 241 } 242 }