src

Go monorepo.
git clone git://code.dwrz.net/src
Log | Files | Refs

splitfunc.go (4142B)


      1 package graphemes
      2 
      3 import (
      4 	"bufio"
      5 
      6 	"github.com/clipperhouse/stringish"
      7 )
      8 
      9 // is determines if lookup intersects propert(ies)
     10 func (lookup property) is(properties property) bool {
     11 	return (lookup & properties) != 0
     12 }
     13 
// _Ignore is the set of properties that splitFunc skips over when it records
// the previously seen properties it later looks back on.
const _Ignore = _Extend
     15 
// SplitFunc is a bufio.SplitFunc implementation of Unicode grapheme cluster segmentation, for use with bufio.Scanner.
// It is the generic splitFunc instantiated for []byte.
//
// See https://unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries.
var SplitFunc bufio.SplitFunc = splitFunc[[]byte]
     20 
     21 func splitFunc[T stringish.Interface](data T, atEOF bool) (advance int, token T, err error) {
     22 	var empty T
     23 	if len(data) == 0 {
     24 		return 0, empty, nil
     25 	}
     26 
     27 	// These vars are stateful across loop iterations
     28 	var pos int
     29 	var lastExIgnore property = 0     // "last excluding ignored categories"
     30 	var lastLastExIgnore property = 0 // "last one before that"
     31 	var regionalIndicatorCount int
     32 
     33 	// Rules are usually of the form Cat1 × Cat2; "current" refers to the first property
     34 	// to the right of the ×, from which we look back or forward
     35 
     36 	current, w := lookup(data[pos:])
     37 	if w == 0 {
     38 		if !atEOF {
     39 			// Rune extends past current data, request more
     40 			return 0, empty, nil
     41 		}
     42 		pos = len(data)
     43 		return pos, data[:pos], nil
     44 	}
     45 
     46 	// https://unicode.org/reports/tr29/#GB1
     47 	// Start of text always advances
     48 	pos += w
     49 
     50 	for {
     51 		eot := pos == len(data) // "end of text"
     52 
     53 		if eot {
     54 			if !atEOF {
     55 				// Token extends past current data, request more
     56 				return 0, empty, nil
     57 			}
     58 
     59 			// https://unicode.org/reports/tr29/#GB2
     60 			break
     61 		}
     62 
     63 		/*
     64 			We've switched the evaluation order of GB1↓ and GB2↑. It's ok:
     65 			because we've checked for len(data) at the top of this function,
     66 			sot and eot are mutually exclusive, order doesn't matter.
     67 		*/
     68 
     69 		// Rules are usually of the form Cat1 × Cat2; "current" refers to the first property
     70 		// to the right of the ×, from which we look back or forward
     71 
     72 		// Remember previous properties to avoid lookups/lookbacks
     73 		last := current
     74 		if !last.is(_Ignore) {
     75 			lastLastExIgnore = lastExIgnore
     76 			lastExIgnore = last
     77 		}
     78 
     79 		current, w = lookup(data[pos:])
     80 		if w == 0 {
     81 			if atEOF {
     82 				// Just return the bytes, we can't do anything with them
     83 				pos = len(data)
     84 				break
     85 			}
     86 			// Rune extends past current data, request more
     87 			return 0, empty, nil
     88 		}
     89 
     90 		// Optimization: no rule can possibly apply
     91 		if current|last == 0 { // i.e. both are zero
     92 			break
     93 		}
     94 
     95 		// https://unicode.org/reports/tr29/#GB3
     96 		if current.is(_LF) && last.is(_CR) {
     97 			pos += w
     98 			continue
     99 		}
    100 
    101 		// https://unicode.org/reports/tr29/#GB4
    102 		// https://unicode.org/reports/tr29/#GB5
    103 		if (current | last).is(_Control | _CR | _LF) {
    104 			break
    105 		}
    106 
    107 		// https://unicode.org/reports/tr29/#GB6
    108 		if current.is(_L|_V|_LV|_LVT) && last.is(_L) {
    109 			pos += w
    110 			continue
    111 		}
    112 
    113 		// https://unicode.org/reports/tr29/#GB7
    114 		if current.is(_V|_T) && last.is(_LV|_V) {
    115 			pos += w
    116 			continue
    117 		}
    118 
    119 		// https://unicode.org/reports/tr29/#GB8
    120 		if current.is(_T) && last.is(_LVT|_T) {
    121 			pos += w
    122 			continue
    123 		}
    124 
    125 		// https://unicode.org/reports/tr29/#GB9
    126 		if current.is(_Extend | _ZWJ) {
    127 			pos += w
    128 			continue
    129 		}
    130 
    131 		// https://unicode.org/reports/tr29/#GB9a
    132 		if current.is(_SpacingMark) {
    133 			pos += w
    134 			continue
    135 		}
    136 
    137 		// https://unicode.org/reports/tr29/#GB9b
    138 		if last.is(_Prepend) {
    139 			pos += w
    140 			continue
    141 		}
    142 
    143 		// https://unicode.org/reports/tr29/#GB9c
    144 		// TODO(clipperhouse):
    145 		// It appears to be added in Unicode 15.1.0:
    146 		// https://unicode.org/versions/Unicode15.1.0/#Migration
    147 		// This package currently supports Unicode 15.0.0, so
    148 		// out of scope for now
    149 
    150 		// https://unicode.org/reports/tr29/#GB11
    151 		if current.is(_ExtendedPictographic) && last.is(_ZWJ) && lastLastExIgnore.is(_ExtendedPictographic) {
    152 			pos += w
    153 			continue
    154 		}
    155 
    156 		// https://unicode.org/reports/tr29/#GB12
    157 		// https://unicode.org/reports/tr29/#GB13
    158 		if (current & last).is(_RegionalIndicator) {
    159 			regionalIndicatorCount++
    160 
    161 			odd := regionalIndicatorCount%2 == 1
    162 			if odd {
    163 				pos += w
    164 				continue
    165 			}
    166 		}
    167 
    168 		// If we fall through all the above rules, it's a grapheme cluster break
    169 		break
    170 	}
    171 
    172 	// Return token
    173 	return pos, data[:pos], nil
    174 }