C++ iterate or split UTF-8 string into array of symbols?

后端 未结 5 1670
庸人自扰
庸人自扰 2020-12-25 08:26

I am looking for a way to iterate over a UTF-8 string, or to split it into an array of UTF-8 symbols (code points), that is platform-independent and does not rely on any third-party library.

Please post a code snippet.

5条回答
  •  粉色の甜心
    2020-12-25 09:12

    Off the cuff:

    // Decode the UTF-8 bytes in s and append one code point per decoded
    // sequence to u (u is not cleared first).
    //
    // Returns the number of bytes of s that were consumed. On success this
    // equals s.length(). On a malformed byte the return value is the index of
    // the byte where decoding failed; code points decoded before that byte
    // have already been appended to u.
    //
    // success is set to true only if the whole string decoded cleanly.
    // Always check it: a string that ends in the middle of a multi-byte
    // sequence still returns s.length(), with success == false.
    //
    // Only the structure of the encoding is validated (lead byte followed by
    // the right number of 10xxxxxx continuation bytes). Overlong encodings,
    // surrogate code points and values above U+10FFFF are not rejected.
    int convert(std::vector<int>& u, const std::string& s, bool& success) {
        success = false;
        int cp = 0;      // code point being assembled
        int runlen = 0;  // continuation bytes still expected
        for (std::string::const_iterator it = s.begin(), end = s.end(); it != end; ++it) {
            // Go through unsigned char so bytes >= 0x80 do not become negative.
            const int ch = static_cast<unsigned char>(*it);
            if (runlen > 0) {
                // Parentheses are required: != binds tighter than &.
                if ((ch & 0xc0) != 0x80) return static_cast<int>(it - s.begin());
                cp = (cp << 6) | (ch & 0x3f);
                if (--runlen == 0) {
                    u.push_back(cp);
                    cp = 0;
                }
            }
            else if (ch < 0x80)  { u.push_back(ch); }               // 0xxxxxxx: ASCII
            else if (ch >= 0xf8) return static_cast<int>(it - s.begin()); // 11111xxx: never valid
            else if (ch >= 0xf0) { cp = ch & 0x07; runlen = 3; }    // 11110xxx
            else if (ch >= 0xe0) { cp = ch & 0x0f; runlen = 2; }    // 1110xxxx
            else if (ch >= 0xc0) { cp = ch & 0x1f; runlen = 1; }    // 110xxxxx
            else return static_cast<int>(it - s.begin());           // 10xxxxxx: stray continuation
        }
        success = runlen == 0; // verify we are between codepoints
        return static_cast<int>(s.length());
    }
    

提交回复
热议问题