splitfunc.go (4142B)
1 package graphemes 2 3 import ( 4 "bufio" 5 6 "github.com/clipperhouse/stringish" 7 ) 8 9 // is determines if lookup intersects propert(ies) 10 func (lookup property) is(properties property) bool { 11 return (lookup & properties) != 0 12 } 13 14 const _Ignore = _Extend 15 16 // SplitFunc is a bufio.SplitFunc implementation of Unicode grapheme cluster segmentation, for use with bufio.Scanner. 17 // 18 // See https://unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries. 19 var SplitFunc bufio.SplitFunc = splitFunc[[]byte] 20 21 func splitFunc[T stringish.Interface](data T, atEOF bool) (advance int, token T, err error) { 22 var empty T 23 if len(data) == 0 { 24 return 0, empty, nil 25 } 26 27 // These vars are stateful across loop iterations 28 var pos int 29 var lastExIgnore property = 0 // "last excluding ignored categories" 30 var lastLastExIgnore property = 0 // "last one before that" 31 var regionalIndicatorCount int 32 33 // Rules are usually of the form Cat1 × Cat2; "current" refers to the first property 34 // to the right of the ×, from which we look back or forward 35 36 current, w := lookup(data[pos:]) 37 if w == 0 { 38 if !atEOF { 39 // Rune extends past current data, request more 40 return 0, empty, nil 41 } 42 pos = len(data) 43 return pos, data[:pos], nil 44 } 45 46 // https://unicode.org/reports/tr29/#GB1 47 // Start of text always advances 48 pos += w 49 50 for { 51 eot := pos == len(data) // "end of text" 52 53 if eot { 54 if !atEOF { 55 // Token extends past current data, request more 56 return 0, empty, nil 57 } 58 59 // https://unicode.org/reports/tr29/#GB2 60 break 61 } 62 63 /* 64 We've switched the evaluation order of GB1↓ and GB2↑. It's ok: 65 because we've checked for len(data) at the top of this function, 66 sot and eot are mutually exclusive, order doesn't matter. 67 */ 68 69 // Rules are usually of the form Cat1 × Cat2; "current" refers to the first property 70 // to the right of the ×, from which we look back or forward 71 72 // Remember previous properties to avoid lookups/lookbacks 73 last := current 74 if !last.is(_Ignore) { 75 lastLastExIgnore = lastExIgnore 76 lastExIgnore = last 77 } 78 79 current, w = lookup(data[pos:]) 80 if w == 0 { 81 if atEOF { 82 // Just return the bytes, we can't do anything with them 83 pos = len(data) 84 break 85 } 86 // Rune extends past current data, request more 87 return 0, empty, nil 88 } 89 90 // Optimization: no rule can possibly apply 91 if current|last == 0 { // i.e. both are zero 92 break 93 } 94 95 // https://unicode.org/reports/tr29/#GB3 96 if current.is(_LF) && last.is(_CR) { 97 pos += w 98 continue 99 } 100 101 // https://unicode.org/reports/tr29/#GB4 102 // https://unicode.org/reports/tr29/#GB5 103 if (current | last).is(_Control | _CR | _LF) { 104 break 105 } 106 107 // https://unicode.org/reports/tr29/#GB6 108 if current.is(_L|_V|_LV|_LVT) && last.is(_L) { 109 pos += w 110 continue 111 } 112 113 // https://unicode.org/reports/tr29/#GB7 114 if current.is(_V|_T) && last.is(_LV|_V) { 115 pos += w 116 continue 117 } 118 119 // https://unicode.org/reports/tr29/#GB8 120 if current.is(_T) && last.is(_LVT|_T) { 121 pos += w 122 continue 123 } 124 125 // https://unicode.org/reports/tr29/#GB9 126 if current.is(_Extend | _ZWJ) { 127 pos += w 128 continue 129 } 130 131 // https://unicode.org/reports/tr29/#GB9a 132 if current.is(_SpacingMark) { 133 pos += w 134 continue 135 } 136 137 // https://unicode.org/reports/tr29/#GB9b 138 if last.is(_Prepend) { 139 pos += w 140 continue 141 } 142 143 // https://unicode.org/reports/tr29/#GB9c 144 // TODO(clipperhouse): 145 // It appears to be added in Unicode 15.1.0: 146 // https://unicode.org/versions/Unicode15.1.0/#Migration 147 // This package currently supports Unicode 15.0.0, so 148 // out of scope for now 149 150 // https://unicode.org/reports/tr29/#GB11 151 if current.is(_ExtendedPictographic) && last.is(_ZWJ) && lastLastExIgnore.is(_ExtendedPictographic) { 152 pos += w 153 continue 154 } 155 156 // https://unicode.org/reports/tr29/#GB12 157 // https://unicode.org/reports/tr29/#GB13 158 if (current & last).is(_RegionalIndicator) { 159 regionalIndicatorCount++ 160 161 odd := regionalIndicatorCount%2 == 1 162 if odd { 163 pos += w 164 continue 165 } 166 } 167 168 // If we fall through all the above rules, it's a grapheme cluster break 169 break 170 } 171 172 // Return token 173 return pos, data[:pos], nil 174 }