linerules.go (16008B)
1 package uniseg 2 3 import "unicode/utf8" 4 5 // The states of the line break parser. 6 const ( 7 lbAny = iota 8 lbBK 9 lbCR 10 lbLF 11 lbNL 12 lbSP 13 lbZW 14 lbWJ 15 lbGL 16 lbBA 17 lbHY 18 lbCL 19 lbCP 20 lbEX 21 lbIS 22 lbSY 23 lbOP 24 lbQU 25 lbQUSP 26 lbNS 27 lbCLCPSP 28 lbB2 29 lbB2SP 30 lbCB 31 lbBB 32 lbLB21a 33 lbHL 34 lbAL 35 lbNU 36 lbPR 37 lbEB 38 lbIDEM 39 lbNUNU 40 lbNUSY 41 lbNUIS 42 lbNUCL 43 lbNUCP 44 lbPO 45 lbJL 46 lbJV 47 lbJT 48 lbH2 49 lbH3 50 lbOddRI 51 lbEvenRI 52 lbExtPicCn 53 lbZWJBit = 64 54 lbCPeaFWHBit = 128 55 ) 56 57 // These constants define whether a given text may be broken into the next line. 58 // If the break is optional (LineCanBreak), you may choose to break or not based 59 // on your own criteria, for example, if the text has reached the available 60 // width. 61 const ( 62 LineDontBreak = iota // You may not break the line here. 63 LineCanBreak // You may or may not break the line here. 64 LineMustBreak // You must break the line here. 65 ) 66 67 // lbTransitions implements the line break parser's state transitions. It's 68 // anologous to [grTransitions], see comments there for details. 69 // 70 // Unicode version 15.0.0. 71 func lbTransitions(state, prop int) (newState, lineBreak, rule int) { 72 switch uint64(state) | uint64(prop)<<32 { 73 // LB4. 74 case lbBK | prAny<<32: 75 return lbAny, LineMustBreak, 40 76 77 // LB5. 78 case lbCR | prLF<<32: 79 return lbLF, LineDontBreak, 50 80 case lbCR | prAny<<32: 81 return lbAny, LineMustBreak, 50 82 case lbLF | prAny<<32: 83 return lbAny, LineMustBreak, 50 84 case lbNL | prAny<<32: 85 return lbAny, LineMustBreak, 50 86 87 // LB6. 88 case lbAny | prBK<<32: 89 return lbBK, LineDontBreak, 60 90 case lbAny | prCR<<32: 91 return lbCR, LineDontBreak, 60 92 case lbAny | prLF<<32: 93 return lbLF, LineDontBreak, 60 94 case lbAny | prNL<<32: 95 return lbNL, LineDontBreak, 60 96 97 // LB7. 98 case lbAny | prSP<<32: 99 return lbSP, LineDontBreak, 70 100 case lbAny | prZW<<32: 101 return lbZW, LineDontBreak, 70 102 103 // LB8. 104 case lbZW | prSP<<32: 105 return lbZW, LineDontBreak, 70 106 case lbZW | prAny<<32: 107 return lbAny, LineCanBreak, 80 108 109 // LB11. 110 case lbAny | prWJ<<32: 111 return lbWJ, LineDontBreak, 110 112 case lbWJ | prAny<<32: 113 return lbAny, LineDontBreak, 110 114 115 // LB12. 116 case lbAny | prGL<<32: 117 return lbGL, LineCanBreak, 310 118 case lbGL | prAny<<32: 119 return lbAny, LineDontBreak, 120 120 121 // LB13 (simple transitions). 122 case lbAny | prCL<<32: 123 return lbCL, LineCanBreak, 310 124 case lbAny | prCP<<32: 125 return lbCP, LineCanBreak, 310 126 case lbAny | prEX<<32: 127 return lbEX, LineDontBreak, 130 128 case lbAny | prIS<<32: 129 return lbIS, LineCanBreak, 310 130 case lbAny | prSY<<32: 131 return lbSY, LineCanBreak, 310 132 133 // LB14. 134 case lbAny | prOP<<32: 135 return lbOP, LineCanBreak, 310 136 case lbOP | prSP<<32: 137 return lbOP, LineDontBreak, 70 138 case lbOP | prAny<<32: 139 return lbAny, LineDontBreak, 140 140 141 // LB15. 142 case lbQU | prSP<<32: 143 return lbQUSP, LineDontBreak, 70 144 case lbQU | prOP<<32: 145 return lbOP, LineDontBreak, 150 146 case lbQUSP | prOP<<32: 147 return lbOP, LineDontBreak, 150 148 149 // LB16. 150 case lbCL | prSP<<32: 151 return lbCLCPSP, LineDontBreak, 70 152 case lbNUCL | prSP<<32: 153 return lbCLCPSP, LineDontBreak, 70 154 case lbCP | prSP<<32: 155 return lbCLCPSP, LineDontBreak, 70 156 case lbNUCP | prSP<<32: 157 return lbCLCPSP, LineDontBreak, 70 158 case lbCL | prNS<<32: 159 return lbNS, LineDontBreak, 160 160 case lbNUCL | prNS<<32: 161 return lbNS, LineDontBreak, 160 162 case lbCP | prNS<<32: 163 return lbNS, LineDontBreak, 160 164 case lbNUCP | prNS<<32: 165 return lbNS, LineDontBreak, 160 166 case lbCLCPSP | prNS<<32: 167 return lbNS, LineDontBreak, 160 168 169 // LB17. 170 case lbAny | prB2<<32: 171 return lbB2, LineCanBreak, 310 172 case lbB2 | prSP<<32: 173 return lbB2SP, LineDontBreak, 70 174 case lbB2 | prB2<<32: 175 return lbB2, LineDontBreak, 170 176 case lbB2SP | prB2<<32: 177 return lbB2, LineDontBreak, 170 178 179 // LB18. 180 case lbSP | prAny<<32: 181 return lbAny, LineCanBreak, 180 182 case lbQUSP | prAny<<32: 183 return lbAny, LineCanBreak, 180 184 case lbCLCPSP | prAny<<32: 185 return lbAny, LineCanBreak, 180 186 case lbB2SP | prAny<<32: 187 return lbAny, LineCanBreak, 180 188 189 // LB19. 190 case lbAny | prQU<<32: 191 return lbQU, LineDontBreak, 190 192 case lbQU | prAny<<32: 193 return lbAny, LineDontBreak, 190 194 195 // LB20. 196 case lbAny | prCB<<32: 197 return lbCB, LineCanBreak, 200 198 case lbCB | prAny<<32: 199 return lbAny, LineCanBreak, 200 200 201 // LB21. 202 case lbAny | prBA<<32: 203 return lbBA, LineDontBreak, 210 204 case lbAny | prHY<<32: 205 return lbHY, LineDontBreak, 210 206 case lbAny | prNS<<32: 207 return lbNS, LineDontBreak, 210 208 case lbAny | prBB<<32: 209 return lbBB, LineCanBreak, 310 210 case lbBB | prAny<<32: 211 return lbAny, LineDontBreak, 210 212 213 // LB21a. 214 case lbAny | prHL<<32: 215 return lbHL, LineCanBreak, 310 216 case lbHL | prHY<<32: 217 return lbLB21a, LineDontBreak, 210 218 case lbHL | prBA<<32: 219 return lbLB21a, LineDontBreak, 210 220 case lbLB21a | prAny<<32: 221 return lbAny, LineDontBreak, 211 222 223 // LB21b. 224 case lbSY | prHL<<32: 225 return lbHL, LineDontBreak, 212 226 case lbNUSY | prHL<<32: 227 return lbHL, LineDontBreak, 212 228 229 // LB22. 230 case lbAny | prIN<<32: 231 return lbAny, LineDontBreak, 220 232 233 // LB23. 234 case lbAny | prAL<<32: 235 return lbAL, LineCanBreak, 310 236 case lbAny | prNU<<32: 237 return lbNU, LineCanBreak, 310 238 case lbAL | prNU<<32: 239 return lbNU, LineDontBreak, 230 240 case lbHL | prNU<<32: 241 return lbNU, LineDontBreak, 230 242 case lbNU | prAL<<32: 243 return lbAL, LineDontBreak, 230 244 case lbNU | prHL<<32: 245 return lbHL, LineDontBreak, 230 246 case lbNUNU | prAL<<32: 247 return lbAL, LineDontBreak, 230 248 case lbNUNU | prHL<<32: 249 return lbHL, LineDontBreak, 230 250 251 // LB23a. 252 case lbAny | prPR<<32: 253 return lbPR, LineCanBreak, 310 254 case lbAny | prID<<32: 255 return lbIDEM, LineCanBreak, 310 256 case lbAny | prEB<<32: 257 return lbEB, LineCanBreak, 310 258 case lbAny | prEM<<32: 259 return lbIDEM, LineCanBreak, 310 260 case lbPR | prID<<32: 261 return lbIDEM, LineDontBreak, 231 262 case lbPR | prEB<<32: 263 return lbEB, LineDontBreak, 231 264 case lbPR | prEM<<32: 265 return lbIDEM, LineDontBreak, 231 266 case lbIDEM | prPO<<32: 267 return lbPO, LineDontBreak, 231 268 case lbEB | prPO<<32: 269 return lbPO, LineDontBreak, 231 270 271 // LB24. 272 case lbAny | prPO<<32: 273 return lbPO, LineCanBreak, 310 274 case lbPR | prAL<<32: 275 return lbAL, LineDontBreak, 240 276 case lbPR | prHL<<32: 277 return lbHL, LineDontBreak, 240 278 case lbPO | prAL<<32: 279 return lbAL, LineDontBreak, 240 280 case lbPO | prHL<<32: 281 return lbHL, LineDontBreak, 240 282 case lbAL | prPR<<32: 283 return lbPR, LineDontBreak, 240 284 case lbAL | prPO<<32: 285 return lbPO, LineDontBreak, 240 286 case lbHL | prPR<<32: 287 return lbPR, LineDontBreak, 240 288 case lbHL | prPO<<32: 289 return lbPO, LineDontBreak, 240 290 291 // LB25 (simple transitions). 292 case lbPR | prNU<<32: 293 return lbNU, LineDontBreak, 250 294 case lbPO | prNU<<32: 295 return lbNU, LineDontBreak, 250 296 case lbOP | prNU<<32: 297 return lbNU, LineDontBreak, 250 298 case lbHY | prNU<<32: 299 return lbNU, LineDontBreak, 250 300 case lbNU | prNU<<32: 301 return lbNUNU, LineDontBreak, 250 302 case lbNU | prSY<<32: 303 return lbNUSY, LineDontBreak, 250 304 case lbNU | prIS<<32: 305 return lbNUIS, LineDontBreak, 250 306 case lbNUNU | prNU<<32: 307 return lbNUNU, LineDontBreak, 250 308 case lbNUNU | prSY<<32: 309 return lbNUSY, LineDontBreak, 250 310 case lbNUNU | prIS<<32: 311 return lbNUIS, LineDontBreak, 250 312 case lbNUSY | prNU<<32: 313 return lbNUNU, LineDontBreak, 250 314 case lbNUSY | prSY<<32: 315 return lbNUSY, LineDontBreak, 250 316 case lbNUSY | prIS<<32: 317 return lbNUIS, LineDontBreak, 250 318 case lbNUIS | prNU<<32: 319 return lbNUNU, LineDontBreak, 250 320 case lbNUIS | prSY<<32: 321 return lbNUSY, LineDontBreak, 250 322 case lbNUIS | prIS<<32: 323 return lbNUIS, LineDontBreak, 250 324 case lbNU | prCL<<32: 325 return lbNUCL, LineDontBreak, 250 326 case lbNU | prCP<<32: 327 return lbNUCP, LineDontBreak, 250 328 case lbNUNU | prCL<<32: 329 return lbNUCL, LineDontBreak, 250 330 case lbNUNU | prCP<<32: 331 return lbNUCP, LineDontBreak, 250 332 case lbNUSY | prCL<<32: 333 return lbNUCL, LineDontBreak, 250 334 case lbNUSY | prCP<<32: 335 return lbNUCP, LineDontBreak, 250 336 case lbNUIS | prCL<<32: 337 return lbNUCL, LineDontBreak, 250 338 case lbNUIS | prCP<<32: 339 return lbNUCP, LineDontBreak, 250 340 case lbNU | prPO<<32: 341 return lbPO, LineDontBreak, 250 342 case lbNUNU | prPO<<32: 343 return lbPO, LineDontBreak, 250 344 case lbNUSY | prPO<<32: 345 return lbPO, LineDontBreak, 250 346 case lbNUIS | prPO<<32: 347 return lbPO, LineDontBreak, 250 348 case lbNUCL | prPO<<32: 349 return lbPO, LineDontBreak, 250 350 case lbNUCP | prPO<<32: 351 return lbPO, LineDontBreak, 250 352 case lbNU | prPR<<32: 353 return lbPR, LineDontBreak, 250 354 case lbNUNU | prPR<<32: 355 return lbPR, LineDontBreak, 250 356 case lbNUSY | prPR<<32: 357 return lbPR, LineDontBreak, 250 358 case lbNUIS | prPR<<32: 359 return lbPR, LineDontBreak, 250 360 case lbNUCL | prPR<<32: 361 return lbPR, LineDontBreak, 250 362 case lbNUCP | prPR<<32: 363 return lbPR, LineDontBreak, 250 364 365 // LB26. 366 case lbAny | prJL<<32: 367 return lbJL, LineCanBreak, 310 368 case lbAny | prJV<<32: 369 return lbJV, LineCanBreak, 310 370 case lbAny | prJT<<32: 371 return lbJT, LineCanBreak, 310 372 case lbAny | prH2<<32: 373 return lbH2, LineCanBreak, 310 374 case lbAny | prH3<<32: 375 return lbH3, LineCanBreak, 310 376 case lbJL | prJL<<32: 377 return lbJL, LineDontBreak, 260 378 case lbJL | prJV<<32: 379 return lbJV, LineDontBreak, 260 380 case lbJL | prH2<<32: 381 return lbH2, LineDontBreak, 260 382 case lbJL | prH3<<32: 383 return lbH3, LineDontBreak, 260 384 case lbJV | prJV<<32: 385 return lbJV, LineDontBreak, 260 386 case lbJV | prJT<<32: 387 return lbJT, LineDontBreak, 260 388 case lbH2 | prJV<<32: 389 return lbJV, LineDontBreak, 260 390 case lbH2 | prJT<<32: 391 return lbJT, LineDontBreak, 260 392 case lbJT | prJT<<32: 393 return lbJT, LineDontBreak, 260 394 case lbH3 | prJT<<32: 395 return lbJT, LineDontBreak, 260 396 397 // LB27. 398 case lbJL | prPO<<32: 399 return lbPO, LineDontBreak, 270 400 case lbJV | prPO<<32: 401 return lbPO, LineDontBreak, 270 402 case lbJT | prPO<<32: 403 return lbPO, LineDontBreak, 270 404 case lbH2 | prPO<<32: 405 return lbPO, LineDontBreak, 270 406 case lbH3 | prPO<<32: 407 return lbPO, LineDontBreak, 270 408 case lbPR | prJL<<32: 409 return lbJL, LineDontBreak, 270 410 case lbPR | prJV<<32: 411 return lbJV, LineDontBreak, 270 412 case lbPR | prJT<<32: 413 return lbJT, LineDontBreak, 270 414 case lbPR | prH2<<32: 415 return lbH2, LineDontBreak, 270 416 case lbPR | prH3<<32: 417 return lbH3, LineDontBreak, 270 418 419 // LB28. 420 case lbAL | prAL<<32: 421 return lbAL, LineDontBreak, 280 422 case lbAL | prHL<<32: 423 return lbHL, LineDontBreak, 280 424 case lbHL | prAL<<32: 425 return lbAL, LineDontBreak, 280 426 case lbHL | prHL<<32: 427 return lbHL, LineDontBreak, 280 428 429 // LB29. 430 case lbIS | prAL<<32: 431 return lbAL, LineDontBreak, 290 432 case lbIS | prHL<<32: 433 return lbHL, LineDontBreak, 290 434 case lbNUIS | prAL<<32: 435 return lbAL, LineDontBreak, 290 436 case lbNUIS | prHL<<32: 437 return lbHL, LineDontBreak, 290 438 439 default: 440 return -1, -1, -1 441 } 442 } 443 444 // transitionLineBreakState determines the new state of the line break parser 445 // given the current state and the next code point. It also returns the type of 446 // line break: LineDontBreak, LineCanBreak, or LineMustBreak. If more than one 447 // code point is needed to determine the new state, the byte slice or the string 448 // starting after rune "r" can be used (whichever is not nil or empty) for 449 // further lookups. 450 func transitionLineBreakState(state int, r rune, b []byte, str string) (newState int, lineBreak int) { 451 // Determine the property of the next character. 452 nextProperty, generalCategory := propertyLineBreak(r) 453 454 // Prepare. 455 var forceNoBreak, isCPeaFWH bool 456 if state >= 0 && state&lbCPeaFWHBit != 0 { 457 isCPeaFWH = true // LB30: CP but ea is not F, W, or H. 458 state = state &^ lbCPeaFWHBit 459 } 460 if state >= 0 && state&lbZWJBit != 0 { 461 state = state &^ lbZWJBit // Extract zero-width joiner bit. 462 forceNoBreak = true // LB8a. 463 } 464 465 defer func() { 466 // Transition into LB30. 467 if newState == lbCP || newState == lbNUCP { 468 ea := propertyEastAsianWidth(r) 469 if ea != prF && ea != prW && ea != prH { 470 newState |= lbCPeaFWHBit 471 } 472 } 473 474 // Override break. 475 if forceNoBreak { 476 lineBreak = LineDontBreak 477 } 478 }() 479 480 // LB1. 481 if nextProperty == prAI || nextProperty == prSG || nextProperty == prXX { 482 nextProperty = prAL 483 } else if nextProperty == prSA { 484 if generalCategory == gcMn || generalCategory == gcMc { 485 nextProperty = prCM 486 } else { 487 nextProperty = prAL 488 } 489 } else if nextProperty == prCJ { 490 nextProperty = prNS 491 } 492 493 // Combining marks. 494 if nextProperty == prZWJ || nextProperty == prCM { 495 var bit int 496 if nextProperty == prZWJ { 497 bit = lbZWJBit 498 } 499 mustBreakState := state < 0 || state == lbBK || state == lbCR || state == lbLF || state == lbNL 500 if !mustBreakState && state != lbSP && state != lbZW && state != lbQUSP && state != lbCLCPSP && state != lbB2SP { 501 // LB9. 502 return state | bit, LineDontBreak 503 } else { 504 // LB10. 505 if mustBreakState { 506 return lbAL | bit, LineMustBreak 507 } 508 return lbAL | bit, LineCanBreak 509 } 510 } 511 512 // Find the applicable transition in the table. 513 var rule int 514 newState, lineBreak, rule = lbTransitions(state, nextProperty) 515 if newState < 0 { 516 // No specific transition found. Try the less specific ones. 517 anyPropProp, anyPropLineBreak, anyPropRule := lbTransitions(state, prAny) 518 anyStateProp, anyStateLineBreak, anyStateRule := lbTransitions(lbAny, nextProperty) 519 if anyPropProp >= 0 && anyStateProp >= 0 { 520 // Both apply. We'll use a mix (see comments for grTransitions). 521 newState, lineBreak, rule = anyStateProp, anyStateLineBreak, anyStateRule 522 if anyPropRule < anyStateRule { 523 lineBreak, rule = anyPropLineBreak, anyPropRule 524 } 525 } else if anyPropProp >= 0 { 526 // We only have a specific state. 527 newState, lineBreak, rule = anyPropProp, anyPropLineBreak, anyPropRule 528 // This branch will probably never be reached because okAnyState will 529 // always be true given the current transition map. But we keep it here 530 // for future modifications to the transition map where this may not be 531 // true anymore. 532 } else if anyStateProp >= 0 { 533 // We only have a specific property. 534 newState, lineBreak, rule = anyStateProp, anyStateLineBreak, anyStateRule 535 } else { 536 // No known transition. LB31: ALL รท ALL. 537 newState, lineBreak, rule = lbAny, LineCanBreak, 310 538 } 539 } 540 541 // LB12a. 542 if rule > 121 && 543 nextProperty == prGL && 544 (state != lbSP && state != lbBA && state != lbHY && state != lbLB21a && state != lbQUSP && state != lbCLCPSP && state != lbB2SP) { 545 return lbGL, LineDontBreak 546 } 547 548 // LB13. 549 if rule > 130 && state != lbNU && state != lbNUNU { 550 switch nextProperty { 551 case prCL: 552 return lbCL, LineDontBreak 553 case prCP: 554 return lbCP, LineDontBreak 555 case prIS: 556 return lbIS, LineDontBreak 557 case prSY: 558 return lbSY, LineDontBreak 559 } 560 } 561 562 // LB25 (look ahead). 563 if rule > 250 && 564 (state == lbPR || state == lbPO) && 565 nextProperty == prOP || nextProperty == prHY { 566 var r rune 567 if b != nil { // Byte slice version. 568 r, _ = utf8.DecodeRune(b) 569 } else { // String version. 570 r, _ = utf8.DecodeRuneInString(str) 571 } 572 if r != utf8.RuneError { 573 pr, _ := propertyLineBreak(r) 574 if pr == prNU { 575 return lbNU, LineDontBreak 576 } 577 } 578 } 579 580 // LB30 (part one). 581 if rule > 300 { 582 if (state == lbAL || state == lbHL || state == lbNU || state == lbNUNU) && nextProperty == prOP { 583 ea := propertyEastAsianWidth(r) 584 if ea != prF && ea != prW && ea != prH { 585 return lbOP, LineDontBreak 586 } 587 } else if isCPeaFWH { 588 switch nextProperty { 589 case prAL: 590 return lbAL, LineDontBreak 591 case prHL: 592 return lbHL, LineDontBreak 593 case prNU: 594 return lbNU, LineDontBreak 595 } 596 } 597 } 598 599 // LB30a. 600 if newState == lbAny && nextProperty == prRI { 601 if state != lbOddRI && state != lbEvenRI { // Includes state == -1. 602 // Transition into the first RI. 603 return lbOddRI, lineBreak 604 } 605 if state == lbOddRI { 606 // Don't break pairs of Regional Indicators. 607 return lbEvenRI, LineDontBreak 608 } 609 return lbOddRI, lineBreak 610 } 611 612 // LB30b. 613 if rule > 302 { 614 if nextProperty == prEM { 615 if state == lbEB || state == lbExtPicCn { 616 return prAny, LineDontBreak 617 } 618 } 619 graphemeProperty := propertyGraphemes(r) 620 if graphemeProperty == prExtendedPictographic && generalCategory == gcCn { 621 return lbExtPicCn, LineCanBreak 622 } 623 } 624 625 return 626 }