sentence.go (2735B)
1 package uniseg 2 3 import "unicode/utf8" 4 5 // FirstSentence returns the first sentence found in the given byte slice 6 // according to the rules of [Unicode Standard Annex #29, Sentence Boundaries]. 7 // This function can be called continuously to extract all sentences from a byte 8 // slice, as illustrated in the example below. 9 // 10 // If you don't know the current state, for example when calling the function 11 // for the first time, you must pass -1. For consecutive calls, pass the state 12 // and rest slice returned by the previous call. 13 // 14 // The "rest" slice is the sub-slice of the original byte slice "b" starting 15 // after the last byte of the identified sentence. If the length of the "rest" 16 // slice is 0, the entire byte slice "b" has been processed. The "sentence" byte 17 // slice is the sub-slice of the input slice containing the identified sentence. 18 // 19 // Given an empty byte slice "b", the function returns nil values. 20 // 21 // [Unicode Standard Annex #29, Sentence Boundaries]: http://unicode.org/reports/tr29/#Sentence_Boundaries 22 func FirstSentence(b []byte, state int) (sentence, rest []byte, newState int) { 23 // An empty byte slice returns nothing. 24 if len(b) == 0 { 25 return 26 } 27 28 // Extract the first rune. 29 r, length := utf8.DecodeRune(b) 30 if len(b) <= length { // If we're already past the end, there is nothing else to parse. 31 return b, nil, sbAny 32 } 33 34 // If we don't know the state, determine it now. 35 if state < 0 { 36 state, _ = transitionSentenceBreakState(state, r, b[length:], "") 37 } 38 39 // Transition until we find a boundary. 40 var boundary bool 41 for { 42 r, l := utf8.DecodeRune(b[length:]) 43 state, boundary = transitionSentenceBreakState(state, r, b[length+l:], "") 44 45 if boundary { 46 return b[:length], b[length:], state 47 } 48 49 length += l 50 if len(b) <= length { 51 return b, nil, sbAny 52 } 53 } 54 } 55 56 // FirstSentenceInString is like [FirstSentence] but its input and outputs are 57 // strings. 58 func FirstSentenceInString(str string, state int) (sentence, rest string, newState int) { 59 // An empty byte slice returns nothing. 60 if len(str) == 0 { 61 return 62 } 63 64 // Extract the first rune. 65 r, length := utf8.DecodeRuneInString(str) 66 if len(str) <= length { // If we're already past the end, there is nothing else to parse. 67 return str, "", sbAny 68 } 69 70 // If we don't know the state, determine it now. 71 if state < 0 { 72 state, _ = transitionSentenceBreakState(state, r, nil, str[length:]) 73 } 74 75 // Transition until we find a boundary. 76 var boundary bool 77 for { 78 r, l := utf8.DecodeRuneInString(str[length:]) 79 state, boundary = transitionSentenceBreakState(state, r, nil, str[length+l:]) 80 81 if boundary { 82 return str[:length], str[length:], state 83 } 84 85 length += l 86 if len(str) <= length { 87 return str, "", sbAny 88 } 89 } 90 }