I have a program that outputs a textual table using UTF-8 strings, and I need to measure the number of monospaced character cells used by a string so I can align it properly.
The following code takes ill-formed byte sequences into consideration. The example string data comes from "Table 3-8. Use of U+FFFD in UTF-8 Conversion" in the Unicode Standard 6.3.
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/* True for a UTF-8 continuation (trail) byte, i.e. 0x80..0xBF.
 * Argument is fully parenthesized so expressions like is_trail(a + b)
 * expand correctly. */
#define is_trail(c) ((c) > 0x7F && (c) < 0xC0)
/* Status codes returned by utf8_get_next_char(): an enum instead of
 * #define so the constants are visible to debuggers and obey scope. */
enum { SUCCESS = 1, FAILURE = -1 };
int utf8_get_next_char(const unsigned char*, size_t, size_t*, int*, unsigned int*);
int utf8_length(unsigned char*, size_t);
void utf8_print_each_char(unsigned char*, size_t);
int main(void)
{
    /* Sample bytes from "Table 3-8. Use of U+FFFD in UTF-8 Conversion"
     * in the Unicode Standard 6.3: it decodes to 10 characters, seven
     * of which are replacement characters (U+FFFD). */
    unsigned char *bytes =
        (unsigned char *) "\x61\xF1\x80\x80\xE1\x80\xC2\x62\x80\x63\x80\xBF\x64";
    size_t byte_count = strlen((const char *) bytes);

    /* Self-check: the decoder must see exactly 10 characters. */
    if (utf8_length(bytes, byte_count) == 10) {
        puts("true");
    } else {
        puts("false");
    }
    utf8_print_each_char(bytes, byte_count);
    return EXIT_SUCCESS;
}
/* Count the characters in a UTF-8 byte string of str_size bytes.
 * Each well-formed sequence counts as one character; each maximal
 * ill-formed subpart also counts as one (it would be shown as U+FFFD).
 * Returns the character count (0 for an empty string).
 *
 * Fix: the original declared an unused local `pos`, which triggers
 * -Wunused-variable; it has been removed. */
int utf8_length(unsigned char *str, size_t str_size)
{
    int length = 0;
    size_t next_pos = 0;          /* decode cursor, advanced by the callee */
    int is_valid = 0;             /* out-param; value unused when counting */
    unsigned int code_point = 0;  /* out-param; value unused when counting */

    while (utf8_get_next_char(str, str_size, &next_pos, &is_valid, &code_point) == SUCCESS) {
        ++length;
    }
    return length;
}
/* Print each character of a UTF-8 byte string on its own line.
 * Well-formed sequences are echoed verbatim; each maximal ill-formed
 * subpart is replaced by U+FFFD REPLACEMENT CHARACTER.
 *
 * Fixes: removed the unused local `length`; replaced the
 * `is_valid == true` comparison with a plain truth test (comparing a
 * flag against `true` is fragile if the flag is ever nonzero-but-not-1);
 * compute the byte count with a single cast of the difference instead of
 * casting each operand. */
void utf8_print_each_char(unsigned char *str, size_t str_size)
{
    size_t pos = 0;               /* first byte of the current character */
    size_t next_pos = 0;          /* one past the current character */
    int is_valid = 0;
    unsigned int code_point = 0;  /* required out-param; unused here */

    while (utf8_get_next_char(str, str_size, &next_pos, &is_valid, &code_point) == SUCCESS) {
        if (is_valid) {
            /* %.*s limits output to the character's bytes. */
            printf("%.*s\n", (int) (next_pos - pos), str + pos);
        } else {
            /* U+FFFD REPLACEMENT CHARACTER, UTF-8 encoded. */
            puts("\xEF\xBF\xBD");
        }
        pos = next_pos;
    }
}
/* True for a UTF-8 continuation (trail) byte, i.e. 0x80..0xBF. */
static int utf8_is_trail_byte(unsigned char c)
{
    return c > 0x7F && c < 0xC0;
}

/* Decode one character from a UTF-8 byte string.
 *
 * str, str_size  the bytes to decode and their count (not NUL-dependent)
 * cursor         in: offset to decode at; out: offset past the consumed bytes
 * is_valid       out: 1 for a well-formed sequence, 0 for an ill-formed one
 * code_point     out: the decoded scalar value when *is_valid is 1, else 0
 *
 * Returns 1 (SUCCESS) when bytes were consumed, -1 (FAILURE) when *cursor
 * is already at or past the end; on FAILURE no output is meaningful.
 *
 * An ill-formed sequence consumes its "maximal subpart" (Unicode 6.3,
 * D93b), so the caller can substitute exactly one U+FFFD for it.
 * Validation follows "Table 3-7. Well-Formed UTF-8 Byte Sequences".
 *
 * Fixes: removed the dead `rest_size < 1` branch (unreachable after the
 * end-of-string guard); stopped pre-setting *is_valid to SUCCESS, which
 * conflated the status-code constants with the boolean validity flag;
 * the 3-byte lead mask is now 0x0F (a 3-byte lead carries 4 payload
 * bits; the old 0x1F only worked because bit 4 of E0..EF is always 0). */
int utf8_get_next_char(const unsigned char *str, size_t str_size, size_t *cursor, int *is_valid, unsigned int *code_point)
{
    size_t pos = *cursor;
    size_t rest_size;
    unsigned char c;
    unsigned char min;
    unsigned char max;

    *code_point = 0;
    *is_valid = 0;
    if (pos >= str_size) {
        return -1; /* FAILURE: nothing left to decode */
    }
    rest_size = str_size - pos;
    c = str[pos];
    if (c < 0x80) {
        /* 1 byte: 00-7F (ASCII). */
        *code_point = c;
        *is_valid = 1;
        pos += 1;
    } else if (c < 0xC2) {
        /* Lone trail byte (80-BF) or overlong 2-byte lead (C0-C1). */
        pos += 1;
    } else if (c < 0xE0) {
        /* 2 bytes: C2-DF 80-BF. */
        if (rest_size < 2 || !utf8_is_trail_byte(str[pos + 1])) {
            pos += 1;
        } else {
            *code_point = ((c & 0x1F) << 6) | (str[pos + 1] & 0x3F);
            *is_valid = 1;
            pos += 2;
        }
    } else if (c < 0xF0) {
        /* 3 bytes: E0 A0-BF | E1-EC 80-BF | ED 80-9F | EE-EF 80-BF, then 80-BF. */
        min = (c == 0xE0) ? 0xA0 : 0x80; /* reject overlong encodings */
        max = (c == 0xED) ? 0x9F : 0xBF; /* reject UTF-16 surrogates  */
        if (rest_size < 2 || str[pos + 1] < min || max < str[pos + 1]) {
            pos += 1;
        } else if (rest_size < 3 || !utf8_is_trail_byte(str[pos + 2])) {
            pos += 2;
        } else {
            *code_point = ((c & 0x0F) << 12)
                | ((str[pos + 1] & 0x3F) << 6)
                | (str[pos + 2] & 0x3F);
            *is_valid = 1;
            pos += 3;
        }
    } else if (c < 0xF5) {
        /* 4 bytes: F0 90-BF | F1-F3 80-BF | F4 80-8F, then 80-BF 80-BF. */
        min = (c == 0xF0) ? 0x90 : 0x80; /* reject overlong encodings */
        max = (c == 0xF4) ? 0x8F : 0xBF; /* reject values > U+10FFFF  */
        if (rest_size < 2 || str[pos + 1] < min || max < str[pos + 1]) {
            pos += 1;
        } else if (rest_size < 3 || !utf8_is_trail_byte(str[pos + 2])) {
            pos += 2;
        } else if (rest_size < 4 || !utf8_is_trail_byte(str[pos + 3])) {
            pos += 3;
        } else {
            *code_point = ((c & 0x07) << 18)
                | ((str[pos + 1] & 0x3F) << 12)
                | ((str[pos + 2] & 0x3F) << 6)
                | (str[pos + 3] & 0x3F);
            *is_valid = 1;
            pos += 4;
        }
    } else {
        /* F5-FF can never start a well-formed sequence. */
        pos += 1;
    }
    *cursor = pos;
    return 1; /* SUCCESS: one character (or maximal subpart) consumed */
}
When writing UTF-8 handling code, I refer to "Table 3-7. Well-Formed UTF-8 Byte Sequences" in the Unicode Standard 6.3:
Code Points First Byte Second Byte Third Byte Fourth Byte
U+0000 - U+007F 00 - 7F
U+0080 - U+07FF C2 - DF 80 - BF
U+0800 - U+0FFF E0 A0 - BF 80 - BF
U+1000 - U+CFFF E1 - EC 80 - BF 80 - BF
U+D000 - U+D7FF ED 80 - 9F 80 - BF
U+E000 - U+FFFF EE - EF 80 - BF 80 - BF
U+10000 - U+3FFFF F0 90 - BF 80 - BF 80 - BF
U+40000 - U+FFFFF F1 - F3 80 - BF 80 - BF 80 - BF
U+100000 - U+10FFFF F4 80 - 8F 80 - BF 80 - BF