utf8.c (2198B)
1 #include "utf8.h" 2 3 size_t codepoint_to_utf8(const uint32_t codepoint, unsigned char buffer[4]) { 4 if (codepoint <= 0x7F) { 5 buffer[0] = codepoint; 6 return 1; 7 } 8 if (codepoint >= 0x80 && codepoint <= 0x07FF) { 9 buffer[0] = 0xC0 | (codepoint >> 6); 10 buffer[1] = 0x80 | (codepoint & 0x3F); 11 return 2; 12 } 13 if (codepoint >= 0x0800 && codepoint <= 0xFFFF) { 14 buffer[0] = 0xE0 | (codepoint >> 12); 15 buffer[1] = 0x80 | ((codepoint >> 6) & 0x3F); 16 buffer[2] = 0x80 | (codepoint & 0x3F); 17 return 3; 18 } 19 20 if (codepoint >= 0x10000 && codepoint <= 0x10FFFF) { 21 buffer[0] = 0xF0 | (codepoint >> 18); 22 buffer[1] = 0x80 | ((codepoint >> 12) & 0x3F); 23 buffer[2] = 0x80 | ((codepoint >> 6) & 0x3F); 24 buffer[3] = 0x80 | (codepoint & 0x3F); 25 return 4; 26 } 27 return 0; 28 } 29 30 bool utf8_to_codepoint(const unsigned char buffer[4], const size_t len, 31 uint32_t *codepoint) { 32 *codepoint = 0; 33 if (len == 1 && buffer[0] <= 0x7F) { 34 *codepoint = buffer[0]; 35 return true; 36 } 37 if (len == 2 && (buffer[0] >= 0xC0 && buffer[0] <= 0xDF) && 38 (buffer[1] >= 0x80 && buffer[1] <= 0xBF)) { 39 *codepoint = buffer[0] & 0x1F; 40 *codepoint = *codepoint << 6; 41 *codepoint = *codepoint | (buffer[1] & 0x3F); 42 return true; 43 } 44 if (len == 3 && (buffer[0] >= 0xE0 && buffer[0] <= 0xEF) && 45 (buffer[1] >= 0x80 && buffer[1] <= 0xBF) && 46 (buffer[2] >= 0x80 && buffer[2] <= 0xBF)) { 47 *codepoint = buffer[0] & 0xF; 48 *codepoint = *codepoint << 6; 49 *codepoint = *codepoint | (buffer[1] & 0x3F); 50 *codepoint = *codepoint << 6; 51 *codepoint = *codepoint | (buffer[2] & 0x3F); 52 return true; 53 } 54 if (len == 4 && (buffer[0] >= 0xF0 && buffer[0] <= 0xF7) && 55 (buffer[1] >= 0x80 && buffer[1] <= 0xBF) && 56 (buffer[2] >= 0x80 && buffer[2] <= 0xBF) && 57 (buffer[3] >= 0x80 && buffer[3] <= 0xBF)) { 58 *codepoint = buffer[0] & 7; 59 *codepoint = *codepoint << 6; 60 *codepoint = *codepoint | (buffer[1] & 0x3F); 61 *codepoint = *codepoint << 6; 62 *codepoint = *codepoint | (buffer[2] & 0x3F); 63 *codepoint = *codepoint << 6; 64 *codepoint = *codepoint | (buffer[3] & 0x3F); 65 return true; 66 } 67 68 return false; 69 }