grapheme.go (9910B)
1 package uniseg 2 3 import "unicode/utf8" 4 5 // Graphemes implements an iterator over Unicode grapheme clusters, or 6 // user-perceived characters. While iterating, it also provides information 7 // about word boundaries, sentence boundaries, line breaks, and monospace 8 // character widths. 9 // 10 // After constructing the class via [NewGraphemes] for a given string "str", 11 // [Graphemes.Next] is called for every grapheme cluster in a loop until it 12 // returns false. Inside the loop, information about the grapheme cluster as 13 // well as boundary information and character width is available via the various 14 // methods (see examples below). 15 // 16 // This class basically wraps the [StepString] parser and provides a convenient 17 // interface to it. If you are only interested in some parts of this package's 18 // functionality, using the specialized functions starting with "First" is 19 // almost always faster. 20 type Graphemes struct { 21 // The original string. 22 original string 23 24 // The remaining string to be parsed. 25 remaining string 26 27 // The current grapheme cluster. 28 cluster string 29 30 // The byte offset of the current grapheme cluster relative to the original 31 // string. 32 offset int 33 34 // The current boundary information of the [Step] parser. 35 boundaries int 36 37 // The current state of the [Step] parser. 38 state int 39 } 40 41 // NewGraphemes returns a new grapheme cluster iterator. 42 func NewGraphemes(str string) *Graphemes { 43 return &Graphemes{ 44 original: str, 45 remaining: str, 46 state: -1, 47 } 48 } 49 50 // Next advances the iterator by one grapheme cluster and returns false if no 51 // clusters are left. This function must be called before the first cluster is 52 // accessed. 53 func (g *Graphemes) Next() bool { 54 if len(g.remaining) == 0 { 55 // We're already past the end. 56 g.state = -2 57 g.cluster = "" 58 return false 59 } 60 g.offset += len(g.cluster) 61 g.cluster, g.remaining, g.boundaries, g.state = StepString(g.remaining, g.state) 62 return true 63 } 64 65 // Runes returns a slice of runes (code points) which corresponds to the current 66 // grapheme cluster. If the iterator is already past the end or [Graphemes.Next] 67 // has not yet been called, nil is returned. 68 func (g *Graphemes) Runes() []rune { 69 if g.state < 0 { 70 return nil 71 } 72 return []rune(g.cluster) 73 } 74 75 // Str returns a substring of the original string which corresponds to the 76 // current grapheme cluster. If the iterator is already past the end or 77 // [Graphemes.Next] has not yet been called, an empty string is returned. 78 func (g *Graphemes) Str() string { 79 return g.cluster 80 } 81 82 // Bytes returns a byte slice which corresponds to the current grapheme cluster. 83 // If the iterator is already past the end or [Graphemes.Next] has not yet been 84 // called, nil is returned. 85 func (g *Graphemes) Bytes() []byte { 86 if g.state < 0 { 87 return nil 88 } 89 return []byte(g.cluster) 90 } 91 92 // Positions returns the interval of the current grapheme cluster as byte 93 // positions into the original string. The first returned value "from" indexes 94 // the first byte and the second returned value "to" indexes the first byte that 95 // is not included anymore, i.e. str[from:to] is the current grapheme cluster of 96 // the original string "str". If [Graphemes.Next] has not yet been called, both 97 // values are 0. If the iterator is already past the end, both values are 1. 98 func (g *Graphemes) Positions() (int, int) { 99 if g.state == -1 { 100 return 0, 0 101 } else if g.state == -2 { 102 return 1, 1 103 } 104 return g.offset, g.offset + len(g.cluster) 105 } 106 107 // IsWordBoundary returns true if a word ends after the current grapheme 108 // cluster. 109 func (g *Graphemes) IsWordBoundary() bool { 110 if g.state < 0 { 111 return true 112 } 113 return g.boundaries&MaskWord != 0 114 } 115 116 // IsSentenceBoundary returns true if a sentence ends after the current 117 // grapheme cluster. 118 func (g *Graphemes) IsSentenceBoundary() bool { 119 if g.state < 0 { 120 return true 121 } 122 return g.boundaries&MaskSentence != 0 123 } 124 125 // LineBreak returns whether the line can be broken after the current grapheme 126 // cluster. A value of [LineDontBreak] means the line may not be broken, a value 127 // of [LineMustBreak] means the line must be broken, and a value of 128 // [LineCanBreak] means the line may or may not be broken. 129 func (g *Graphemes) LineBreak() int { 130 if g.state == -1 { 131 return LineDontBreak 132 } 133 if g.state == -2 { 134 return LineMustBreak 135 } 136 return g.boundaries & MaskLine 137 } 138 139 // Width returns the monospace width of the current grapheme cluster. 140 func (g *Graphemes) Width() int { 141 if g.state < 0 { 142 return 0 143 } 144 return g.boundaries >> ShiftWidth 145 } 146 147 // Reset puts the iterator into its initial state such that the next call to 148 // [Graphemes.Next] sets it to the first grapheme cluster again. 149 func (g *Graphemes) Reset() { 150 g.state = -1 151 g.offset = 0 152 g.cluster = "" 153 g.remaining = g.original 154 } 155 156 // GraphemeClusterCount returns the number of user-perceived characters 157 // (grapheme clusters) for the given string. 158 func GraphemeClusterCount(s string) (n int) { 159 state := -1 160 for len(s) > 0 { 161 _, s, _, state = FirstGraphemeClusterInString(s, state) 162 n++ 163 } 164 return 165 } 166 167 // ReverseString reverses the given string while observing grapheme cluster 168 // boundaries. 169 func ReverseString(s string) string { 170 str := []byte(s) 171 reversed := make([]byte, len(str)) 172 state := -1 173 index := len(str) 174 for len(str) > 0 { 175 var cluster []byte 176 cluster, str, _, state = FirstGraphemeCluster(str, state) 177 index -= len(cluster) 178 copy(reversed[index:], cluster) 179 if index <= len(str)/2 { 180 break 181 } 182 } 183 return string(reversed) 184 } 185 186 // The number of bits the grapheme property must be shifted to make place for 187 // grapheme states. 188 const shiftGraphemePropState = 4 189 190 // FirstGraphemeCluster returns the first grapheme cluster found in the given 191 // byte slice according to the rules of [Unicode Standard Annex #29, Grapheme 192 // Cluster Boundaries]. This function can be called continuously to extract all 193 // grapheme clusters from a byte slice, as illustrated in the example below. 194 // 195 // If you don't know the current state, for example when calling the function 196 // for the first time, you must pass -1. For consecutive calls, pass the state 197 // and rest slice returned by the previous call. 198 // 199 // The "rest" slice is the sub-slice of the original byte slice "b" starting 200 // after the last byte of the identified grapheme cluster. If the length of the 201 // "rest" slice is 0, the entire byte slice "b" has been processed. The 202 // "cluster" byte slice is the sub-slice of the input slice containing the 203 // identified grapheme cluster. 204 // 205 // The returned width is the width of the grapheme cluster for most monospace 206 // fonts where a value of 1 represents one character cell. 207 // 208 // Given an empty byte slice "b", the function returns nil values. 209 // 210 // While slightly less convenient than using the Graphemes class, this function 211 // has much better performance and makes no allocations. It lends itself well to 212 // large byte slices. 213 // 214 // [Unicode Standard Annex #29, Grapheme Cluster Boundaries]: http://unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries 215 func FirstGraphemeCluster(b []byte, state int) (cluster, rest []byte, width, newState int) { 216 // An empty byte slice returns nothing. 217 if len(b) == 0 { 218 return 219 } 220 221 // Extract the first rune. 222 r, length := utf8.DecodeRune(b) 223 if len(b) <= length { // If we're already past the end, there is nothing else to parse. 224 var prop int 225 if state < 0 { 226 prop = propertyGraphemes(r) 227 } else { 228 prop = state >> shiftGraphemePropState 229 } 230 return b, nil, runeWidth(r, prop), grAny | (prop << shiftGraphemePropState) 231 } 232 233 // If we don't know the state, determine it now. 234 var firstProp int 235 if state < 0 { 236 state, firstProp, _ = transitionGraphemeState(state, r) 237 } else { 238 firstProp = state >> shiftGraphemePropState 239 } 240 width += runeWidth(r, firstProp) 241 242 // Transition until we find a boundary. 243 for { 244 var ( 245 prop int 246 boundary bool 247 ) 248 249 r, l := utf8.DecodeRune(b[length:]) 250 state, prop, boundary = transitionGraphemeState(state&maskGraphemeState, r) 251 252 if boundary { 253 return b[:length], b[length:], width, state | (prop << shiftGraphemePropState) 254 } 255 256 if firstProp == prExtendedPictographic { 257 if r == vs15 { 258 width = 1 259 } else if r == vs16 { 260 width = 2 261 } 262 } else if firstProp != prRegionalIndicator && firstProp != prL { 263 width += runeWidth(r, prop) 264 } 265 266 length += l 267 if len(b) <= length { 268 return b, nil, width, grAny | (prop << shiftGraphemePropState) 269 } 270 } 271 } 272 273 // FirstGraphemeClusterInString is like [FirstGraphemeCluster] but its input and 274 // outputs are strings. 275 func FirstGraphemeClusterInString(str string, state int) (cluster, rest string, width, newState int) { 276 // An empty string returns nothing. 277 if len(str) == 0 { 278 return 279 } 280 281 // Extract the first rune. 282 r, length := utf8.DecodeRuneInString(str) 283 if len(str) <= length { // If we're already past the end, there is nothing else to parse. 284 var prop int 285 if state < 0 { 286 prop = propertyGraphemes(r) 287 } else { 288 prop = state >> shiftGraphemePropState 289 } 290 return str, "", runeWidth(r, prop), grAny | (prop << shiftGraphemePropState) 291 } 292 293 // If we don't know the state, determine it now. 294 var firstProp int 295 if state < 0 { 296 state, firstProp, _ = transitionGraphemeState(state, r) 297 } else { 298 firstProp = state >> shiftGraphemePropState 299 } 300 width += runeWidth(r, firstProp) 301 302 // Transition until we find a boundary. 303 for { 304 var ( 305 prop int 306 boundary bool 307 ) 308 309 r, l := utf8.DecodeRuneInString(str[length:]) 310 state, prop, boundary = transitionGraphemeState(state&maskGraphemeState, r) 311 312 if boundary { 313 return str[:length], str[length:], width, state | (prop << shiftGraphemePropState) 314 } 315 316 if firstProp == prExtendedPictographic { 317 if r == vs15 { 318 width = 1 319 } else if r == vs16 { 320 width = 2 321 } 322 } else if firstProp != prRegionalIndicator && firstProp != prL { 323 width += runeWidth(r, prop) 324 } 325 326 length += l 327 if len(str) <= length { 328 return str, "", width, grAny | (prop << shiftGraphemePropState) 329 } 330 } 331 }