Number of character cells used by string

前端 未结 6 740
离开以前
离开以前 2020-12-02 14:56

I have a program that outputs a textual table using UTF-8 strings, and I need to measure the number of monospaced character cells used by a string so I can align it properly.

6条回答
  •  谎友^
    谎友^ (楼主)
    2020-12-02 15:03

    The following code takes ill-formed byte sequences into consideration. The example string data comes from "Table 3-8. Use of U+FFFD in UTF-8 Conversion" in the Unicode Standard 6.3.

    #include <stdbool.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    
    /*
     * True when byte c lies in 0x80..0xBF, the range Table 3-7 allows for
     * the trailing bytes of a multi-byte sequence. The argument is
     * parenthesized so that expressions such as `a | b` are classified as a
     * whole; note that it is still evaluated twice, so do not pass an
     * expression with side effects.
     */
    #define is_trail(c) ((c) > 0x7F && (c) < 0xC0)

    /* Return codes of utf8_get_next_char(). */
    #define SUCCESS 1
    #define FAILURE (-1)

    /* Decode the character at *cursor and advance the cursor past it. */
    int utf8_get_next_char(const unsigned char *str, size_t str_size, size_t *cursor, int *is_valid, unsigned int *code_point);
    /* Count the characters (well-formed or not) in a buffer. */
    int utf8_length(unsigned char *str, size_t str_size);
    /* Print each character of a buffer on its own line. */
    void utf8_print_each_char(unsigned char *str, size_t str_size);
    
    /*
     * Demo driver: runs the counter and the printer over a byte string that
     * mixes well-formed and ill-formed UTF-8 sequences.
     */
    int main(void)
    {
        /* Sample bytes; the check below expects them to count as 10 characters. */
        unsigned char *sample = (unsigned char *) "\x61\xF1\x80\x80\xE1\x80\xC2\x62\x80\x63\x80\xBF\x64";
        const size_t sample_size = strlen((const char *) sample);
        const int char_count = utf8_length(sample, sample_size);

        if (char_count == 10) {
            puts("true");
        } else {
            puts("false");
        }

        utf8_print_each_char(sample, sample_size);

        return EXIT_SUCCESS;
    }
    
    /*
     * Count the characters in the first str_size bytes of str.
     *
     * Every successful call to utf8_get_next_char() adds one to the count,
     * whether or not the decoded sequence was reported as valid, so each
     * ill-formed sequence counts as one character.
     *
     * Returns the number of characters found (0 for an empty buffer).
     */
    int utf8_length(unsigned char *str, size_t str_size)
    {
        size_t cursor = 0;
        int valid = 0;
        unsigned int cp = 0;
        int count = 0;

        for (;;) {
            if (utf8_get_next_char(str, str_size, &cursor, &valid, &cp) != SUCCESS) {
                break;
            }
            count += 1;
        }

        return count;
    }
    
    /*
     * Print every character found in the first str_size bytes of str, one
     * per line on stdout.
     *
     * A sequence reported as valid by utf8_get_next_char() is printed
     * verbatim; an ill-formed sequence is printed as the UTF-8 encoding of
     * U+FFFD (EF BF BD).
     */
    void utf8_print_each_char(unsigned char *str, size_t str_size)
    {
        size_t pos = 0;       /* start of the character being printed */
        size_t next_pos = 0;  /* decoder cursor: one past that character */
        int is_valid = 0;
        unsigned int code_point = 0;

        while (
            utf8_get_next_char(str, str_size, &next_pos, &is_valid, &code_point) == SUCCESS
        ) {
            /* Test the flag for truthiness rather than equality with `true`. */
            if (is_valid) {
                /* Subtract in size_t first; the span of one character is small. */
                printf("%.*s\n", (int) (next_pos - pos), (const char *) (str + pos));
            } else {
                puts("\xEF\xBF\xBD");
            }

            pos = next_pos;
        }
    }
    
    /*
     * Decode the character that starts at byte offset *cursor of str.
     *
     * str        - buffer to read from
     * str_size   - number of bytes in str
     * cursor     - in: offset of the character to decode;
     *              out: offset just past the bytes consumed
     * is_valid   - out: true for a well-formed sequence, false otherwise
     * code_point - out: decoded code point, or 0 for an ill-formed sequence
     *
     * Returns SUCCESS when at least one byte was consumed and FAILURE when
     * *cursor is already at or beyond str_size. On FAILURE *cursor is left
     * untouched, *code_point is 0 and *is_valid holds SUCCESS.
     *
     * An ill-formed sequence consumes the lead byte plus whatever following
     * bytes were accepted before the first missing or offending byte.
     */
    int utf8_get_next_char(const unsigned char *str, size_t str_size, size_t *cursor, int *is_valid, unsigned int *code_point)
    {
        const size_t start = *cursor;
        const unsigned char *seq;
        size_t avail;
        size_t consumed = 1;        /* default: skip a lone ill-formed byte */
        unsigned int decoded = 0;
        int well_formed = false;
        unsigned char lead;
        unsigned char lo;
        unsigned char hi;

        *code_point = 0;
        *is_valid = SUCCESS;

        if (start >= str_size) {
            return FAILURE;
        }

        seq = str + start;
        avail = str_size - start;   /* at least 1 here */
        lead = seq[0];

        if (lead <= 0x7F) {
            /* Single-byte (ASCII) character. */
            decoded = lead;
            well_formed = true;
        } else if (lead >= 0xC2 && lead <= 0xDF) {
            /* Two-byte sequence. */
            if (avail >= 2 && is_trail(seq[1])) {
                decoded = ((unsigned int) (lead & 0x1F) << 6)
                        |  (unsigned int) (seq[1] & 0x3F);
                consumed = 2;
                well_formed = true;
            }
        } else if (lead >= 0xE0 && lead <= 0xEF) {
            /* Three-byte sequence; E0 and ED restrict the second byte. */
            lo = (lead == 0xE0) ? 0xA0 : 0x80;
            hi = (lead == 0xED) ? 0x9F : 0xBF;

            if (avail >= 2 && seq[1] >= lo && seq[1] <= hi) {
                if (avail >= 3 && is_trail(seq[2])) {
                    /* The lead's payload is its low four bits. */
                    decoded = ((unsigned int) (lead   & 0x0F) << 12)
                            | ((unsigned int) (seq[1] & 0x3F) << 6)
                            |  (unsigned int) (seq[2] & 0x3F);
                    consumed = 3;
                    well_formed = true;
                } else {
                    consumed = 2;
                }
            }
        } else if (lead >= 0xF0 && lead <= 0xF4) {
            /* Four-byte sequence; F0 and F4 restrict the second byte. */
            lo = (lead == 0xF0) ? 0x90 : 0x80;
            hi = (lead == 0xF4) ? 0x8F : 0xBF;

            if (avail >= 2 && seq[1] >= lo && seq[1] <= hi) {
                if (avail < 3 || !is_trail(seq[2])) {
                    consumed = 2;
                } else if (avail < 4 || !is_trail(seq[3])) {
                    consumed = 3;
                } else {
                    decoded = ((unsigned int) (lead   & 0x07) << 18)
                            | ((unsigned int) (seq[1] & 0x3F) << 12)
                            | ((unsigned int) (seq[2] & 0x3F) << 6)
                            |  (unsigned int) (seq[3] & 0x3F);
                    consumed = 4;
                    well_formed = true;
                }
            }
        }
        /* Any other lead byte (0x80..0xC1, 0xF5..0xFF) keeps the defaults. */

        *code_point = decoded;
        *is_valid = well_formed;
        *cursor = start + consumed;

        return SUCCESS;
    }
    

    When I write code for UTF-8, I refer to "Table 3-7. Well-Formed UTF-8 Byte Sequences" in the Unicode Standard 6.3.

           Code Points    First Byte Second Byte Third Byte Fourth Byte
      U+0000 -   U+007F   00 - 7F
      U+0080 -   U+07FF   C2 - DF    80 - BF
      U+0800 -   U+0FFF   E0         A0 - BF     80 - BF
      U+1000 -   U+CFFF   E1 - EC    80 - BF     80 - BF
      U+D000 -   U+D7FF   ED         80 - 9F     80 - BF
      U+E000 -   U+FFFF   EE - EF    80 - BF     80 - BF
     U+10000 -  U+3FFFF   F0         90 - BF     80 - BF    80 - BF
     U+40000 -  U+FFFFF   F1 - F3    80 - BF     80 - BF    80 - BF
    U+100000 - U+10FFFF   F4         80 - 8F     80 - BF    80 - BF
    

提交回复
热议问题