src

Go monorepo.
git clone git://code.dwrz.net/src
Log | Files | Refs

line.go (4923B)


      1 package uniseg
      2 
      3 import "unicode/utf8"
      4 
      5 // FirstLineSegment returns the prefix of the given byte slice after which a
      6 // decision to break the string over to the next line can or must be made,
      7 // according to the rules of [Unicode Standard Annex #14]. This is used to
      8 // implement line breaking.
      9 //
     10 // Line breaking, also known as word wrapping, is the process of breaking a
     11 // section of text into lines such that it will fit in the available width of a
     12 // page, window or other display area.
     13 //
     14 // The returned "segment" may not be broken into smaller parts, unless no other
     15 // breaking opportunities present themselves, in which case you may break by
     16 // grapheme clusters (using the [FirstGraphemeCluster] function to determine the
     17 // grapheme clusters).
     18 //
     19 // The "mustBreak" flag indicates whether you MUST break the line after the
     20 // given segment (true), for example after newline characters, or you MAY break
     21 // the line after the given segment (false).
     22 //
     23 // This function can be called continuously to extract all non-breaking sub-sets
     24 // from a byte slice, as illustrated in the example below.
     25 //
     26 // If you don't know the current state, for example when calling the function
     27 // for the first time, you must pass -1. For consecutive calls, pass the state
     28 // and rest slice returned by the previous call.
     29 //
     30 // The "rest" slice is the sub-slice of the original byte slice "b" starting
     31 // after the last byte of the identified line segment. If the length of the
     32 // "rest" slice is 0, the entire byte slice "b" has been processed. The
     33 // "segment" byte slice is the sub-slice of the input slice containing the
     34 // identified line segment.
     35 //
     36 // Given an empty byte slice "b", the function returns nil values.
     37 //
     38 // Note that in accordance with [UAX #14 LB3], the final segment will end with
     39 // "mustBreak" set to true. You can choose to ignore this by checking if the
     40 // length of the "rest" slice is 0 and calling [HasTrailingLineBreak] or
     41 // [HasTrailingLineBreakInString] on the last rune.
     42 //
     43 // Note also that this algorithm may break within grapheme clusters. This is
     44 // addressed in Section 8.2 Example 6 of UAX #14. To avoid this, you can use
     45 // the [Step] function instead.
     46 //
     47 // [Unicode Standard Annex #14]: https://www.unicode.org/reports/tr14/
     48 // [UAX #14 LB3]: https://www.unicode.org/reports/tr14/#Algorithm
     49 func FirstLineSegment(b []byte, state int) (segment, rest []byte, mustBreak bool, newState int) {
     50 	// An empty byte slice returns nothing.
     51 	if len(b) == 0 {
     52 		return
     53 	}
     54 
     55 	// Extract the first rune.
     56 	r, length := utf8.DecodeRune(b)
     57 	if len(b) <= length { // If we're already past the end, there is nothing else to parse.
     58 		return b, nil, true, lbAny // LB3.
     59 	}
     60 
     61 	// If we don't know the state, determine it now.
     62 	if state < 0 {
     63 		state, _ = transitionLineBreakState(state, r, b[length:], "")
     64 	}
     65 
     66 	// Transition until we find a boundary.
     67 	var boundary int
     68 	for {
     69 		r, l := utf8.DecodeRune(b[length:])
     70 		state, boundary = transitionLineBreakState(state, r, b[length+l:], "")
     71 
     72 		if boundary != LineDontBreak {
     73 			return b[:length], b[length:], boundary == LineMustBreak, state
     74 		}
     75 
     76 		length += l
     77 		if len(b) <= length {
     78 			return b, nil, true, lbAny // LB3
     79 		}
     80 	}
     81 }
     82 
     83 // FirstLineSegmentInString is like [FirstLineSegment] but its input and outputs
     84 // are strings.
     85 func FirstLineSegmentInString(str string, state int) (segment, rest string, mustBreak bool, newState int) {
     86 	// An empty byte slice returns nothing.
     87 	if len(str) == 0 {
     88 		return
     89 	}
     90 
     91 	// Extract the first rune.
     92 	r, length := utf8.DecodeRuneInString(str)
     93 	if len(str) <= length { // If we're already past the end, there is nothing else to parse.
     94 		return str, "", true, lbAny // LB3.
     95 	}
     96 
     97 	// If we don't know the state, determine it now.
     98 	if state < 0 {
     99 		state, _ = transitionLineBreakState(state, r, nil, str[length:])
    100 	}
    101 
    102 	// Transition until we find a boundary.
    103 	var boundary int
    104 	for {
    105 		r, l := utf8.DecodeRuneInString(str[length:])
    106 		state, boundary = transitionLineBreakState(state, r, nil, str[length+l:])
    107 
    108 		if boundary != LineDontBreak {
    109 			return str[:length], str[length:], boundary == LineMustBreak, state
    110 		}
    111 
    112 		length += l
    113 		if len(str) <= length {
    114 			return str, "", true, lbAny // LB3.
    115 		}
    116 	}
    117 }
    118 
    119 // HasTrailingLineBreak returns true if the last rune in the given byte slice is
    120 // one of the hard line break code points defined in LB4 and LB5 of [UAX #14].
    121 //
    122 // [UAX #14]: https://www.unicode.org/reports/tr14/#Algorithm
    123 func HasTrailingLineBreak(b []byte) bool {
    124 	r, _ := utf8.DecodeLastRune(b)
    125 	property, _ := propertyLineBreak(r)
    126 	return property == prBK || property == prCR || property == prLF || property == prNL
    127 }
    128 
    129 // HasTrailingLineBreakInString is like [HasTrailingLineBreak] but for a string.
    130 func HasTrailingLineBreakInString(str string) bool {
    131 	r, _ := utf8.DecodeLastRuneInString(str)
    132 	property, _ := propertyLineBreak(r)
    133 	return property == prBK || property == prCR || property == prLF || property == prNL
    134 }