line.go (4923B)
1 package uniseg 2 3 import "unicode/utf8" 4 5 // FirstLineSegment returns the prefix of the given byte slice after which a 6 // decision to break the string over to the next line can or must be made, 7 // according to the rules of [Unicode Standard Annex #14]. This is used to 8 // implement line breaking. 9 // 10 // Line breaking, also known as word wrapping, is the process of breaking a 11 // section of text into lines such that it will fit in the available width of a 12 // page, window or other display area. 13 // 14 // The returned "segment" may not be broken into smaller parts, unless no other 15 // breaking opportunities present themselves, in which case you may break by 16 // grapheme clusters (using the [FirstGraphemeCluster] function to determine the 17 // grapheme clusters). 18 // 19 // The "mustBreak" flag indicates whether you MUST break the line after the 20 // given segment (true), for example after newline characters, or you MAY break 21 // the line after the given segment (false). 22 // 23 // This function can be called continuously to extract all non-breaking sub-sets 24 // from a byte slice, as illustrated in the example below. 25 // 26 // If you don't know the current state, for example when calling the function 27 // for the first time, you must pass -1. For consecutive calls, pass the state 28 // and rest slice returned by the previous call. 29 // 30 // The "rest" slice is the sub-slice of the original byte slice "b" starting 31 // after the last byte of the identified line segment. If the length of the 32 // "rest" slice is 0, the entire byte slice "b" has been processed. The 33 // "segment" byte slice is the sub-slice of the input slice containing the 34 // identified line segment. 35 // 36 // Given an empty byte slice "b", the function returns nil values. 37 // 38 // Note that in accordance with [UAX #14 LB3], the final segment will end with 39 // "mustBreak" set to true. You can choose to ignore this by checking if the 40 // length of the "rest" slice is 0 and calling [HasTrailingLineBreak] or 41 // [HasTrailingLineBreakInString] on the last rune. 42 // 43 // Note also that this algorithm may break within grapheme clusters. This is 44 // addressed in Section 8.2 Example 6 of UAX #14. To avoid this, you can use 45 // the [Step] function instead. 46 // 47 // [Unicode Standard Annex #14]: https://www.unicode.org/reports/tr14/ 48 // [UAX #14 LB3]: https://www.unicode.org/reports/tr14/#Algorithm 49 func FirstLineSegment(b []byte, state int) (segment, rest []byte, mustBreak bool, newState int) { 50 // An empty byte slice returns nothing. 51 if len(b) == 0 { 52 return 53 } 54 55 // Extract the first rune. 56 r, length := utf8.DecodeRune(b) 57 if len(b) <= length { // If we're already past the end, there is nothing else to parse. 58 return b, nil, true, lbAny // LB3. 59 } 60 61 // If we don't know the state, determine it now. 62 if state < 0 { 63 state, _ = transitionLineBreakState(state, r, b[length:], "") 64 } 65 66 // Transition until we find a boundary. 67 var boundary int 68 for { 69 r, l := utf8.DecodeRune(b[length:]) 70 state, boundary = transitionLineBreakState(state, r, b[length+l:], "") 71 72 if boundary != LineDontBreak { 73 return b[:length], b[length:], boundary == LineMustBreak, state 74 } 75 76 length += l 77 if len(b) <= length { 78 return b, nil, true, lbAny // LB3 79 } 80 } 81 } 82 83 // FirstLineSegmentInString is like [FirstLineSegment] but its input and outputs 84 // are strings. 85 func FirstLineSegmentInString(str string, state int) (segment, rest string, mustBreak bool, newState int) { 86 // An empty byte slice returns nothing. 87 if len(str) == 0 { 88 return 89 } 90 91 // Extract the first rune. 92 r, length := utf8.DecodeRuneInString(str) 93 if len(str) <= length { // If we're already past the end, there is nothing else to parse. 94 return str, "", true, lbAny // LB3. 95 } 96 97 // If we don't know the state, determine it now. 98 if state < 0 { 99 state, _ = transitionLineBreakState(state, r, nil, str[length:]) 100 } 101 102 // Transition until we find a boundary. 103 var boundary int 104 for { 105 r, l := utf8.DecodeRuneInString(str[length:]) 106 state, boundary = transitionLineBreakState(state, r, nil, str[length+l:]) 107 108 if boundary != LineDontBreak { 109 return str[:length], str[length:], boundary == LineMustBreak, state 110 } 111 112 length += l 113 if len(str) <= length { 114 return str, "", true, lbAny // LB3. 115 } 116 } 117 } 118 119 // HasTrailingLineBreak returns true if the last rune in the given byte slice is 120 // one of the hard line break code points defined in LB4 and LB5 of [UAX #14]. 121 // 122 // [UAX #14]: https://www.unicode.org/reports/tr14/#Algorithm 123 func HasTrailingLineBreak(b []byte) bool { 124 r, _ := utf8.DecodeLastRune(b) 125 property, _ := propertyLineBreak(r) 126 return property == prBK || property == prCR || property == prLF || property == prNL 127 } 128 129 // HasTrailingLineBreakInString is like [HasTrailingLineBreak] but for a string. 130 func HasTrailingLineBreakInString(str string) bool { 131 r, _ := utf8.DecodeLastRuneInString(str) 132 property, _ := propertyLineBreak(r) 133 return property == prBK || property == prCR || property == prLF || property == prNL 134 }