src

Go monorepo.
git clone git://code.dwrz.net/src
Log | Files | Refs

grapheme.go (9910B)


      1 package uniseg
      2 
      3 import "unicode/utf8"
      4 
      5 // Graphemes implements an iterator over Unicode grapheme clusters, or
      6 // user-perceived characters. While iterating, it also provides information
      7 // about word boundaries, sentence boundaries, line breaks, and monospace
      8 // character widths.
      9 //
     10 // After constructing the class via [NewGraphemes] for a given string "str",
     11 // [Graphemes.Next] is called for every grapheme cluster in a loop until it
     12 // returns false. Inside the loop, information about the grapheme cluster as
     13 // well as boundary information and character width is available via the various
     14 // methods (see examples below).
     15 //
     16 // This class basically wraps the [StepString] parser and provides a convenient
     17 // interface to it. If you are only interested in some parts of this package's
     18 // functionality, using the specialized functions starting with "First" is
     19 // almost always faster.
     20 type Graphemes struct {
     21 	// The original string.
     22 	original string
     23 
     24 	// The remaining string to be parsed.
     25 	remaining string
     26 
     27 	// The current grapheme cluster.
     28 	cluster string
     29 
     30 	// The byte offset of the current grapheme cluster relative to the original
     31 	// string.
     32 	offset int
     33 
     34 	// The current boundary information of the [Step] parser.
     35 	boundaries int
     36 
     37 	// The current state of the [Step] parser.
     38 	state int
     39 }
     40 
     41 // NewGraphemes returns a new grapheme cluster iterator.
     42 func NewGraphemes(str string) *Graphemes {
     43 	return &Graphemes{
     44 		original:  str,
     45 		remaining: str,
     46 		state:     -1,
     47 	}
     48 }
     49 
     50 // Next advances the iterator by one grapheme cluster and returns false if no
     51 // clusters are left. This function must be called before the first cluster is
     52 // accessed.
     53 func (g *Graphemes) Next() bool {
     54 	if len(g.remaining) == 0 {
     55 		// We're already past the end.
     56 		g.state = -2
     57 		g.cluster = ""
     58 		return false
     59 	}
     60 	g.offset += len(g.cluster)
     61 	g.cluster, g.remaining, g.boundaries, g.state = StepString(g.remaining, g.state)
     62 	return true
     63 }
     64 
     65 // Runes returns a slice of runes (code points) which corresponds to the current
     66 // grapheme cluster. If the iterator is already past the end or [Graphemes.Next]
     67 // has not yet been called, nil is returned.
     68 func (g *Graphemes) Runes() []rune {
     69 	if g.state < 0 {
     70 		return nil
     71 	}
     72 	return []rune(g.cluster)
     73 }
     74 
     75 // Str returns a substring of the original string which corresponds to the
     76 // current grapheme cluster. If the iterator is already past the end or
     77 // [Graphemes.Next] has not yet been called, an empty string is returned.
     78 func (g *Graphemes) Str() string {
     79 	return g.cluster
     80 }
     81 
     82 // Bytes returns a byte slice which corresponds to the current grapheme cluster.
     83 // If the iterator is already past the end or [Graphemes.Next] has not yet been
     84 // called, nil is returned.
     85 func (g *Graphemes) Bytes() []byte {
     86 	if g.state < 0 {
     87 		return nil
     88 	}
     89 	return []byte(g.cluster)
     90 }
     91 
     92 // Positions returns the interval of the current grapheme cluster as byte
     93 // positions into the original string. The first returned value "from" indexes
     94 // the first byte and the second returned value "to" indexes the first byte that
     95 // is not included anymore, i.e. str[from:to] is the current grapheme cluster of
     96 // the original string "str". If [Graphemes.Next] has not yet been called, both
     97 // values are 0. If the iterator is already past the end, both values are 1.
     98 func (g *Graphemes) Positions() (int, int) {
     99 	if g.state == -1 {
    100 		return 0, 0
    101 	} else if g.state == -2 {
    102 		return 1, 1
    103 	}
    104 	return g.offset, g.offset + len(g.cluster)
    105 }
    106 
    107 // IsWordBoundary returns true if a word ends after the current grapheme
    108 // cluster.
    109 func (g *Graphemes) IsWordBoundary() bool {
    110 	if g.state < 0 {
    111 		return true
    112 	}
    113 	return g.boundaries&MaskWord != 0
    114 }
    115 
    116 // IsSentenceBoundary returns true if a sentence ends after the current
    117 // grapheme cluster.
    118 func (g *Graphemes) IsSentenceBoundary() bool {
    119 	if g.state < 0 {
    120 		return true
    121 	}
    122 	return g.boundaries&MaskSentence != 0
    123 }
    124 
    125 // LineBreak returns whether the line can be broken after the current grapheme
    126 // cluster. A value of [LineDontBreak] means the line may not be broken, a value
    127 // of [LineMustBreak] means the line must be broken, and a value of
    128 // [LineCanBreak] means the line may or may not be broken.
    129 func (g *Graphemes) LineBreak() int {
    130 	if g.state == -1 {
    131 		return LineDontBreak
    132 	}
    133 	if g.state == -2 {
    134 		return LineMustBreak
    135 	}
    136 	return g.boundaries & MaskLine
    137 }
    138 
    139 // Width returns the monospace width of the current grapheme cluster.
    140 func (g *Graphemes) Width() int {
    141 	if g.state < 0 {
    142 		return 0
    143 	}
    144 	return g.boundaries >> ShiftWidth
    145 }
    146 
    147 // Reset puts the iterator into its initial state such that the next call to
    148 // [Graphemes.Next] sets it to the first grapheme cluster again.
    149 func (g *Graphemes) Reset() {
    150 	g.state = -1
    151 	g.offset = 0
    152 	g.cluster = ""
    153 	g.remaining = g.original
    154 }
    155 
    156 // GraphemeClusterCount returns the number of user-perceived characters
    157 // (grapheme clusters) for the given string.
    158 func GraphemeClusterCount(s string) (n int) {
    159 	state := -1
    160 	for len(s) > 0 {
    161 		_, s, _, state = FirstGraphemeClusterInString(s, state)
    162 		n++
    163 	}
    164 	return
    165 }
    166 
    167 // ReverseString reverses the given string while observing grapheme cluster
    168 // boundaries.
    169 func ReverseString(s string) string {
    170 	str := []byte(s)
    171 	reversed := make([]byte, len(str))
    172 	state := -1
    173 	index := len(str)
    174 	for len(str) > 0 {
    175 		var cluster []byte
    176 		cluster, str, _, state = FirstGraphemeCluster(str, state)
    177 		index -= len(cluster)
    178 		copy(reversed[index:], cluster)
    179 		if index <= len(str)/2 {
    180 			break
    181 		}
    182 	}
    183 	return string(reversed)
    184 }
    185 
    186 // The number of bits the grapheme property must be shifted to make place for
    187 // grapheme states.
    188 const shiftGraphemePropState = 4
    189 
    190 // FirstGraphemeCluster returns the first grapheme cluster found in the given
    191 // byte slice according to the rules of [Unicode Standard Annex #29, Grapheme
    192 // Cluster Boundaries]. This function can be called continuously to extract all
    193 // grapheme clusters from a byte slice, as illustrated in the example below.
    194 //
    195 // If you don't know the current state, for example when calling the function
    196 // for the first time, you must pass -1. For consecutive calls, pass the state
    197 // and rest slice returned by the previous call.
    198 //
    199 // The "rest" slice is the sub-slice of the original byte slice "b" starting
    200 // after the last byte of the identified grapheme cluster. If the length of the
    201 // "rest" slice is 0, the entire byte slice "b" has been processed. The
    202 // "cluster" byte slice is the sub-slice of the input slice containing the
    203 // identified grapheme cluster.
    204 //
    205 // The returned width is the width of the grapheme cluster for most monospace
    206 // fonts where a value of 1 represents one character cell.
    207 //
    208 // Given an empty byte slice "b", the function returns nil values.
    209 //
    210 // While slightly less convenient than using the Graphemes class, this function
    211 // has much better performance and makes no allocations. It lends itself well to
    212 // large byte slices.
    213 //
    214 // [Unicode Standard Annex #29, Grapheme Cluster Boundaries]: http://unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries
    215 func FirstGraphemeCluster(b []byte, state int) (cluster, rest []byte, width, newState int) {
    216 	// An empty byte slice returns nothing.
    217 	if len(b) == 0 {
    218 		return
    219 	}
    220 
    221 	// Extract the first rune.
    222 	r, length := utf8.DecodeRune(b)
    223 	if len(b) <= length { // If we're already past the end, there is nothing else to parse.
    224 		var prop int
    225 		if state < 0 {
    226 			prop = propertyGraphemes(r)
    227 		} else {
    228 			prop = state >> shiftGraphemePropState
    229 		}
    230 		return b, nil, runeWidth(r, prop), grAny | (prop << shiftGraphemePropState)
    231 	}
    232 
    233 	// If we don't know the state, determine it now.
    234 	var firstProp int
    235 	if state < 0 {
    236 		state, firstProp, _ = transitionGraphemeState(state, r)
    237 	} else {
    238 		firstProp = state >> shiftGraphemePropState
    239 	}
    240 	width += runeWidth(r, firstProp)
    241 
    242 	// Transition until we find a boundary.
    243 	for {
    244 		var (
    245 			prop     int
    246 			boundary bool
    247 		)
    248 
    249 		r, l := utf8.DecodeRune(b[length:])
    250 		state, prop, boundary = transitionGraphemeState(state&maskGraphemeState, r)
    251 
    252 		if boundary {
    253 			return b[:length], b[length:], width, state | (prop << shiftGraphemePropState)
    254 		}
    255 
    256 		if firstProp == prExtendedPictographic {
    257 			if r == vs15 {
    258 				width = 1
    259 			} else if r == vs16 {
    260 				width = 2
    261 			}
    262 		} else if firstProp != prRegionalIndicator && firstProp != prL {
    263 			width += runeWidth(r, prop)
    264 		}
    265 
    266 		length += l
    267 		if len(b) <= length {
    268 			return b, nil, width, grAny | (prop << shiftGraphemePropState)
    269 		}
    270 	}
    271 }
    272 
    273 // FirstGraphemeClusterInString is like [FirstGraphemeCluster] but its input and
    274 // outputs are strings.
    275 func FirstGraphemeClusterInString(str string, state int) (cluster, rest string, width, newState int) {
    276 	// An empty string returns nothing.
    277 	if len(str) == 0 {
    278 		return
    279 	}
    280 
    281 	// Extract the first rune.
    282 	r, length := utf8.DecodeRuneInString(str)
    283 	if len(str) <= length { // If we're already past the end, there is nothing else to parse.
    284 		var prop int
    285 		if state < 0 {
    286 			prop = propertyGraphemes(r)
    287 		} else {
    288 			prop = state >> shiftGraphemePropState
    289 		}
    290 		return str, "", runeWidth(r, prop), grAny | (prop << shiftGraphemePropState)
    291 	}
    292 
    293 	// If we don't know the state, determine it now.
    294 	var firstProp int
    295 	if state < 0 {
    296 		state, firstProp, _ = transitionGraphemeState(state, r)
    297 	} else {
    298 		firstProp = state >> shiftGraphemePropState
    299 	}
    300 	width += runeWidth(r, firstProp)
    301 
    302 	// Transition until we find a boundary.
    303 	for {
    304 		var (
    305 			prop     int
    306 			boundary bool
    307 		)
    308 
    309 		r, l := utf8.DecodeRuneInString(str[length:])
    310 		state, prop, boundary = transitionGraphemeState(state&maskGraphemeState, r)
    311 
    312 		if boundary {
    313 			return str[:length], str[length:], width, state | (prop << shiftGraphemePropState)
    314 		}
    315 
    316 		if firstProp == prExtendedPictographic {
    317 			if r == vs15 {
    318 				width = 1
    319 			} else if r == vs16 {
    320 				width = 2
    321 			}
    322 		} else if firstProp != prRegionalIndicator && firstProp != prL {
    323 			width += runeWidth(r, prop)
    324 		}
    325 
    326 		length += l
    327 		if len(str) <= length {
    328 			return str, "", width, grAny | (prop << shiftGraphemePropState)
    329 		}
    330 	}
    331 }