src

Go monorepo.
git clone git://code.dwrz.net/src
Log | Files | Refs

step.go (9698B)


      1 package uniseg
      2 
      3 import "unicode/utf8"
      4 
      5 // The bit masks used to extract boundary information returned by [Step].
      6 const (
      7 	MaskLine     = 3
      8 	MaskWord     = 4
      9 	MaskSentence = 8
     10 )
     11 
     12 // The number of bits to shift the boundary information returned by [Step] to
     13 // obtain the monospace width of the grapheme cluster.
     14 const ShiftWidth = 4
     15 
     16 // The bit positions by which boundary flags are shifted by the [Step] function.
     17 // These must correspond to the Mask constants.
     18 const (
     19 	shiftWord     = 2
     20 	shiftSentence = 3
     21 	// shiftwWidth is ShiftWidth above. No mask as these are always the remaining bits.
     22 )
     23 
     24 // The bit positions by which states are shifted by the [Step] function. These
     25 // values must ensure state values defined for each of the boundary algorithms
     26 // don't overlap (and that they all still fit in a single int). These must
     27 // correspond to the Mask constants.
     28 const (
     29 	shiftWordState     = 4
     30 	shiftSentenceState = 9
     31 	shiftLineState     = 13
     32 	shiftPropState     = 21 // No mask as these are always the remaining bits.
     33 )
     34 
     35 // The bit mask used to extract the state returned by the [Step] function, after
     36 // shifting. These values must correspond to the shift constants.
     37 const (
     38 	maskGraphemeState = 0xf
     39 	maskWordState     = 0x1f
     40 	maskSentenceState = 0xf
     41 	maskLineState     = 0xff
     42 )
     43 
     44 // Step returns the first grapheme cluster (user-perceived character) found in
     45 // the given byte slice. It also returns information about the boundary between
     46 // that grapheme cluster and the one following it as well as the monospace width
     47 // of the grapheme cluster. There are three types of boundary information: word
     48 // boundaries, sentence boundaries, and line breaks. This function is therefore
     49 // a combination of [FirstGraphemeCluster], [FirstWord], [FirstSentence], and
     50 // [FirstLineSegment].
     51 //
     52 // The "boundaries" return value can be evaluated as follows:
     53 //
     54 //   - boundaries&MaskWord != 0: The boundary is a word boundary.
     55 //   - boundaries&MaskWord == 0: The boundary is not a word boundary.
     56 //   - boundaries&MaskSentence != 0: The boundary is a sentence boundary.
     57 //   - boundaries&MaskSentence == 0: The boundary is not a sentence boundary.
     58 //   - boundaries&MaskLine == LineDontBreak: You must not break the line at the
     59 //     boundary.
     60 //   - boundaries&MaskLine == LineMustBreak: You must break the line at the
     61 //     boundary.
     62 //   - boundaries&MaskLine == LineCanBreak: You may or may not break the line at
     63 //     the boundary.
     64 //   - boundaries >> ShiftWidth: The width of the grapheme cluster for most
     65 //     monospace fonts where a value of 1 represents one character cell.
     66 //
     67 // This function can be called continuously to extract all grapheme clusters
     68 // from a byte slice, as illustrated in the examples below.
     69 //
     70 // If you don't know which state to pass, for example when calling the function
     71 // for the first time, you must pass -1. For consecutive calls, pass the state
     72 // and rest slice returned by the previous call.
     73 //
     74 // The "rest" slice is the sub-slice of the original byte slice "b" starting
     75 // after the last byte of the identified grapheme cluster. If the length of the
     76 // "rest" slice is 0, the entire byte slice "b" has been processed. The
     77 // "cluster" byte slice is the sub-slice of the input slice containing the
     78 // first identified grapheme cluster.
     79 //
     80 // Given an empty byte slice "b", the function returns nil values.
     81 //
     82 // While slightly less convenient than using the Graphemes class, this function
     83 // has much better performance and makes no allocations. It lends itself well to
     84 // large byte slices.
     85 //
     86 // Note that in accordance with [UAX #14 LB3], the final segment will end with
     87 // a mandatory line break (boundaries&MaskLine == LineMustBreak). You can choose
     88 // to ignore this by checking if the length of the "rest" slice is 0 and calling
     89 // [HasTrailingLineBreak] or [HasTrailingLineBreakInString] on the last rune.
     90 //
     91 // [UAX #14 LB3]: https://www.unicode.org/reports/tr14/#Algorithm
     92 func Step(b []byte, state int) (cluster, rest []byte, boundaries int, newState int) {
     93 	// An empty byte slice returns nothing.
     94 	if len(b) == 0 {
     95 		return
     96 	}
     97 
     98 	// Extract the first rune.
     99 	r, length := utf8.DecodeRune(b)
    100 	if len(b) <= length { // If we're already past the end, there is nothing else to parse.
    101 		var prop int
    102 		if state < 0 {
    103 			prop = propertyGraphemes(r)
    104 		} else {
    105 			prop = state >> shiftPropState
    106 		}
    107 		return b, nil, LineMustBreak | (1 << shiftWord) | (1 << shiftSentence) | (runeWidth(r, prop) << ShiftWidth), grAny | (wbAny << shiftWordState) | (sbAny << shiftSentenceState) | (lbAny << shiftLineState) | (prop << shiftPropState)
    108 	}
    109 
    110 	// If we don't know the state, determine it now.
    111 	var graphemeState, wordState, sentenceState, lineState, firstProp int
    112 	remainder := b[length:]
    113 	if state < 0 {
    114 		graphemeState, firstProp, _ = transitionGraphemeState(state, r)
    115 		wordState, _ = transitionWordBreakState(state, r, remainder, "")
    116 		sentenceState, _ = transitionSentenceBreakState(state, r, remainder, "")
    117 		lineState, _ = transitionLineBreakState(state, r, remainder, "")
    118 	} else {
    119 		graphemeState = state & maskGraphemeState
    120 		wordState = (state >> shiftWordState) & maskWordState
    121 		sentenceState = (state >> shiftSentenceState) & maskSentenceState
    122 		lineState = (state >> shiftLineState) & maskLineState
    123 		firstProp = state >> shiftPropState
    124 	}
    125 
    126 	// Transition until we find a grapheme cluster boundary.
    127 	width := runeWidth(r, firstProp)
    128 	for {
    129 		var (
    130 			graphemeBoundary, wordBoundary, sentenceBoundary bool
    131 			lineBreak, prop                                  int
    132 		)
    133 
    134 		r, l := utf8.DecodeRune(remainder)
    135 		remainder = b[length+l:]
    136 
    137 		graphemeState, prop, graphemeBoundary = transitionGraphemeState(graphemeState, r)
    138 		wordState, wordBoundary = transitionWordBreakState(wordState, r, remainder, "")
    139 		sentenceState, sentenceBoundary = transitionSentenceBreakState(sentenceState, r, remainder, "")
    140 		lineState, lineBreak = transitionLineBreakState(lineState, r, remainder, "")
    141 
    142 		if graphemeBoundary {
    143 			boundary := lineBreak | (width << ShiftWidth)
    144 			if wordBoundary {
    145 				boundary |= 1 << shiftWord
    146 			}
    147 			if sentenceBoundary {
    148 				boundary |= 1 << shiftSentence
    149 			}
    150 			return b[:length], b[length:], boundary, graphemeState | (wordState << shiftWordState) | (sentenceState << shiftSentenceState) | (lineState << shiftLineState) | (prop << shiftPropState)
    151 		}
    152 
    153 		if firstProp == prExtendedPictographic {
    154 			if r == vs15 {
    155 				width = 1
    156 			} else if r == vs16 {
    157 				width = 2
    158 			}
    159 		} else if firstProp != prRegionalIndicator && firstProp != prL {
    160 			width += runeWidth(r, prop)
    161 		}
    162 
    163 		length += l
    164 		if len(b) <= length {
    165 			return b, nil, LineMustBreak | (1 << shiftWord) | (1 << shiftSentence) | (width << ShiftWidth), grAny | (wbAny << shiftWordState) | (sbAny << shiftSentenceState) | (lbAny << shiftLineState) | (prop << shiftPropState)
    166 		}
    167 	}
    168 }
    169 
    170 // StepString is like [Step] but its input and outputs are strings.
    171 func StepString(str string, state int) (cluster, rest string, boundaries int, newState int) {
    172 	// An empty byte slice returns nothing.
    173 	if len(str) == 0 {
    174 		return
    175 	}
    176 
    177 	// Extract the first rune.
    178 	r, length := utf8.DecodeRuneInString(str)
    179 	if len(str) <= length { // If we're already past the end, there is nothing else to parse.
    180 		prop := propertyGraphemes(r)
    181 		return str, "", LineMustBreak | (1 << shiftWord) | (1 << shiftSentence) | (runeWidth(r, prop) << ShiftWidth), grAny | (wbAny << shiftWordState) | (sbAny << shiftSentenceState) | (lbAny << shiftLineState)
    182 	}
    183 
    184 	// If we don't know the state, determine it now.
    185 	var graphemeState, wordState, sentenceState, lineState, firstProp int
    186 	remainder := str[length:]
    187 	if state < 0 {
    188 		graphemeState, firstProp, _ = transitionGraphemeState(state, r)
    189 		wordState, _ = transitionWordBreakState(state, r, nil, remainder)
    190 		sentenceState, _ = transitionSentenceBreakState(state, r, nil, remainder)
    191 		lineState, _ = transitionLineBreakState(state, r, nil, remainder)
    192 	} else {
    193 		graphemeState = state & maskGraphemeState
    194 		wordState = (state >> shiftWordState) & maskWordState
    195 		sentenceState = (state >> shiftSentenceState) & maskSentenceState
    196 		lineState = (state >> shiftLineState) & maskLineState
    197 		firstProp = state >> shiftPropState
    198 	}
    199 
    200 	// Transition until we find a grapheme cluster boundary.
    201 	width := runeWidth(r, firstProp)
    202 	for {
    203 		var (
    204 			graphemeBoundary, wordBoundary, sentenceBoundary bool
    205 			lineBreak, prop                                  int
    206 		)
    207 
    208 		r, l := utf8.DecodeRuneInString(remainder)
    209 		remainder = str[length+l:]
    210 
    211 		graphemeState, prop, graphemeBoundary = transitionGraphemeState(graphemeState, r)
    212 		wordState, wordBoundary = transitionWordBreakState(wordState, r, nil, remainder)
    213 		sentenceState, sentenceBoundary = transitionSentenceBreakState(sentenceState, r, nil, remainder)
    214 		lineState, lineBreak = transitionLineBreakState(lineState, r, nil, remainder)
    215 
    216 		if graphemeBoundary {
    217 			boundary := lineBreak | (width << ShiftWidth)
    218 			if wordBoundary {
    219 				boundary |= 1 << shiftWord
    220 			}
    221 			if sentenceBoundary {
    222 				boundary |= 1 << shiftSentence
    223 			}
    224 			return str[:length], str[length:], boundary, graphemeState | (wordState << shiftWordState) | (sentenceState << shiftSentenceState) | (lineState << shiftLineState) | (prop << shiftPropState)
    225 		}
    226 
    227 		if firstProp == prExtendedPictographic {
    228 			if r == vs15 {
    229 				width = 1
    230 			} else if r == vs16 {
    231 				width = 2
    232 			}
    233 		} else if firstProp != prRegionalIndicator && firstProp != prL {
    234 			width += runeWidth(r, prop)
    235 		}
    236 
    237 		length += l
    238 		if len(str) <= length {
    239 			return str, "", LineMustBreak | (1 << shiftWord) | (1 << shiftSentence) | (width << ShiftWidth), grAny | (wbAny << shiftWordState) | (sbAny << shiftSentenceState) | (lbAny << shiftLineState) | (prop << shiftPropState)
    240 		}
    241 	}
    242 }