src

Go monorepo.
git clone git://code.dwrz.net/src
Log | Files | Refs

properties.go (3509B)


      1 package uniseg
      2 
      3 // The Unicode properties as used in the various parsers. Only the ones needed
      4 // in the context of this package are included.
      5 const (
      6 	prXX      = 0    // Same as prAny.
      7 	prAny     = iota // prAny must be 0.
      8 	prPrepend        // Grapheme properties must come first, to reduce the number of bits stored in the state vector.
      9 	prCR
     10 	prLF
     11 	prControl
     12 	prExtend
     13 	prRegionalIndicator
     14 	prSpacingMark
     15 	prL
     16 	prV
     17 	prT
     18 	prLV
     19 	prLVT
     20 	prZWJ
     21 	prExtendedPictographic
     22 	prNewline
     23 	prWSegSpace
     24 	prDoubleQuote
     25 	prSingleQuote
     26 	prMidNumLet
     27 	prNumeric
     28 	prMidLetter
     29 	prMidNum
     30 	prExtendNumLet
     31 	prALetter
     32 	prFormat
     33 	prHebrewLetter
     34 	prKatakana
     35 	prSp
     36 	prSTerm
     37 	prClose
     38 	prSContinue
     39 	prATerm
     40 	prUpper
     41 	prLower
     42 	prSep
     43 	prOLetter
     44 	prCM
     45 	prBA
     46 	prBK
     47 	prSP
     48 	prEX
     49 	prQU
     50 	prAL
     51 	prPR
     52 	prPO
     53 	prOP
     54 	prCP
     55 	prIS
     56 	prHY
     57 	prSY
     58 	prNU
     59 	prCL
     60 	prNL
     61 	prGL
     62 	prAI
     63 	prBB
     64 	prHL
     65 	prSA
     66 	prJL
     67 	prJV
     68 	prJT
     69 	prNS
     70 	prZW
     71 	prB2
     72 	prIN
     73 	prWJ
     74 	prID
     75 	prEB
     76 	prCJ
     77 	prH2
     78 	prH3
     79 	prSG
     80 	prCB
     81 	prRI
     82 	prEM
     83 	prN
     84 	prNa
     85 	prA
     86 	prW
     87 	prH
     88 	prF
     89 	prEmojiPresentation
     90 )
     91 
     92 // Unicode General Categories. Only the ones needed in the context of this
     93 // package are included.
     94 const (
     95 	gcNone = iota // gcNone must be 0.
     96 	gcCc
     97 	gcZs
     98 	gcPo
     99 	gcSc
    100 	gcPs
    101 	gcPe
    102 	gcSm
    103 	gcPd
    104 	gcNd
    105 	gcLu
    106 	gcSk
    107 	gcPc
    108 	gcLl
    109 	gcSo
    110 	gcLo
    111 	gcPi
    112 	gcCf
    113 	gcNo
    114 	gcPf
    115 	gcLC
    116 	gcLm
    117 	gcMn
    118 	gcMe
    119 	gcMc
    120 	gcNl
    121 	gcZl
    122 	gcZp
    123 	gcCn
    124 	gcCs
    125 	gcCo
    126 )
    127 
    128 // Special code points.
    129 const (
    130 	vs15 = 0xfe0e // Variation Selector-15 (text presentation)
    131 	vs16 = 0xfe0f // Variation Selector-16 (emoji presentation)
    132 )
    133 
    134 // propertySearch performs a binary search on a property slice and returns the
    135 // entry whose range (start = first array element, end = second array element)
    136 // includes r, or an array of 0's if no such entry was found.
    137 func propertySearch[E interface{ [3]int | [4]int }](dictionary []E, r rune) (result E) {
    138 	// Run a binary search.
    139 	from := 0
    140 	to := len(dictionary)
    141 	for to > from {
    142 		middle := (from + to) / 2
    143 		cpRange := dictionary[middle]
    144 		if int(r) < cpRange[0] {
    145 			to = middle
    146 			continue
    147 		}
    148 		if int(r) > cpRange[1] {
    149 			from = middle + 1
    150 			continue
    151 		}
    152 		return cpRange
    153 	}
    154 	return
    155 }
    156 
    157 // property returns the Unicode property value (see constants above) of the
    158 // given code point.
    159 func property(dictionary [][3]int, r rune) int {
    160 	return propertySearch(dictionary, r)[2]
    161 }
    162 
    163 // propertyLineBreak returns the Unicode property value and General Category
    164 // (see constants above) of the given code point, as listed in the line break
    165 // code points table, while fast tracking ASCII digits and letters.
    166 func propertyLineBreak(r rune) (property, generalCategory int) {
    167 	if r >= 'a' && r <= 'z' {
    168 		return prAL, gcLl
    169 	}
    170 	if r >= 'A' && r <= 'Z' {
    171 		return prAL, gcLu
    172 	}
    173 	if r >= '0' && r <= '9' {
    174 		return prNU, gcNd
    175 	}
    176 	entry := propertySearch(lineBreakCodePoints, r)
    177 	return entry[2], entry[3]
    178 }
    179 
    180 // propertyGraphemes returns the Unicode grapheme cluster property value of the
    181 // given code point while fast tracking ASCII characters.
    182 func propertyGraphemes(r rune) int {
    183 	if r >= 0x20 && r <= 0x7e {
    184 		return prAny
    185 	}
    186 	if r == 0x0a {
    187 		return prLF
    188 	}
    189 	if r == 0x0d {
    190 		return prCR
    191 	}
    192 	if r >= 0 && r <= 0x1f || r == 0x7f {
    193 		return prControl
    194 	}
    195 	return property(graphemeCodePoints, r)
    196 }
    197 
    198 // propertyEastAsianWidth returns the Unicode East Asian Width property value of
    199 // the given code point while fast tracking ASCII characters.
    200 func propertyEastAsianWidth(r rune) int {
    201 	if r >= 0x20 && r <= 0x7e {
    202 		return prNa
    203 	}
    204 	if r >= 0 && r <= 0x1f || r == 0x7f {
    205 		return prN
    206 	}
    207 	return property(eastAsianWidth, r)
    208 }