How to read a file that contains \uXXXX escape sequences in VC++

后端 未结 4 1872
遇见更好的自我
遇见更好的自我 2020-12-12 00:14

I have a .txt file whose contents are:

\\u041f\\u0435\\u0440\\u0432\\u044b\\u0439_\\u0438\\u043d\\u0442\\u0435\\u0440\\u0430\\u043a\\u0442\\u0438\\u0432\

相关标签:
4条回答
  • 2020-12-12 00:44

    It's not very easy to do while you're reading the file in; it's easier to do it as a post-processing step afterwards. You can use Boost.Regex to look for the pattern "\\u[0-9A-Fa-f]{4}" (the backslash has to be escaped in the regex) and replace each match with the corresponding single character.

    0 讨论(0)
  • 2020-12-12 00:57

    Here is an example for MSalters's suggestion:

    #include <iostream>
    #include <string>
    #include <fstream>
    #include <algorithm>
    #include <sstream>
    #include <iomanip>
    #include <locale>
    
    #include <boost/scoped_array.hpp>
    #include <boost/regex.hpp>
    #include <boost/numeric/conversion/cast.hpp>
    
    std::wstring convert_unicode_escape_sequences(const std::string& source) {
      const boost::regex regex("\\\\u([0-9A-Fa-f]{4})");  // NB: no support for non-BMP characters
      boost::scoped_array<wchar_t> buffer(new wchar_t[source.size()]);
      wchar_t* const output_begin = buffer.get();
      wchar_t* output_iter = output_begin;
      std::string::const_iterator last_match = source.begin();
      for (boost::sregex_iterator input_iter(source.begin(), source.end(), regex), input_end; input_iter != input_end; ++input_iter) {
        const boost::smatch& match = *input_iter;
        output_iter = std::copy(match.prefix().first, match.prefix().second, output_iter);
        std::stringstream stream;
        stream << std::hex << match[1].str() << std::ends;
        unsigned int value;
        stream >> value;
        *output_iter++ = boost::numeric_cast<wchar_t>(value);
        last_match = match[0].second;
      }
      output_iter = std::copy(last_match, source.end(), output_iter);
      return std::wstring(output_begin, output_iter);
    }
    
    // Reads test.txt as raw bytes, decodes its \uXXXX escapes and prints the
    // result to std::wcout. Returns 0 on success and 1 if the file cannot be
    // opened or read.
    int wmain() {
      std::locale::global(std::locale(""));
      const std::wstring filename = L"test.txt";
      // NOTE(review): constructing std::ifstream from a wchar_t* path is a
      // Microsoft extension; this snippet targets VC++.
      std::ifstream stream(filename.c_str(), std::ios::in | std::ios::binary);
      if (!stream) {
        std::wcerr << L"Cannot open " << filename << std::endl;
        return 1;
      }

      // Determine the file size by seeking to the end; tellg() reports -1 on failure.
      stream.seekg(0, std::ios::end);
      const std::streamoff size = stream.tellg();
      if (size < 0) {
        std::wcerr << L"Cannot determine the size of " << filename << std::endl;
        return 1;
      }
      stream.seekg(0);

      // Read straight into the string; skip the read for an empty file so that
      // &source[0] is never taken on an empty string.
      std::string source(static_cast<std::string::size_type>(size), '\0');
      if (size > 0 && !stream.read(&source[0], static_cast<std::streamsize>(size))) {
        std::wcerr << L"Cannot read " << filename << std::endl;
        return 1;
      }

      const std::wstring result = convert_unicode_escape_sequences(source);
      std::wcout << result << std::endl;
      return 0;
    }
    

    I'm always surprised how complicated seemingly simple things like this are in C++.

    0 讨论(0)
  • 2020-12-12 01:03

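    My solution. It uses std::wstring_convert with std::codecvt_utf8 (from <codecvt>) for the wide-character to UTF-8 conversion; Boost is used only for numeric_cast.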

    #include <cctype>
    #include <codecvt>
    #include <cstdint>
    #include <fstream>
    #include <locale>
    #include <string>

    #include <boost/numeric/conversion/cast.hpp>
    
    //------------------------------------------------------------------------------
    
    /// Returns the value (0-15) of the hexadecimal digit `c`
    /// ('0'-'9', 'A'-'F' or 'a'-'f'), or 0 when `c` is not a hex digit.
    inline uint8_t hex_digit_value(uint8_t c)
    {
        // Explicit range checks: a plain `c - '0' < 10` is evaluated in signed int
        // and would also accept every character below '0'.
        if (c >= '0' && c <= '9')
            return static_cast<uint8_t>(c - '0');
        if (c >= 'A' && c <= 'F')
            return static_cast<uint8_t>(c - 'A' + 0x0A);
        if (c >= 'a' && c <= 'f')
            return static_cast<uint8_t>(c - 'a' + 0x0A);
        return 0;
    }

    /// Combines two hexadecimal digit characters into one byte.
    ///
    /// @param h character for the high nibble
    /// @param l character for the low nibble
    /// @return (value of h << 4) | value of l; a character that is not a hex
    ///         digit contributes 0 to its nibble, so the result is always
    ///         well defined.
    inline uint8_t get_uint8(uint8_t h, uint8_t l)
    {
        return static_cast<uint8_t>((hex_digit_value(h) << 4) | hex_digit_value(l));
    }
    
    /// Decodes `\uXXXX` escapes in `source` and returns the result as UTF-8.
    ///
    /// A backslash followed by 'u' and exactly four hexadecimal digits is replaced
    /// by the character with that code. Anything else, including a `\u` that is
    /// not followed by four hex digits, is copied through one character per byte.
    /// Each escape is converted on its own; two escapes that form a surrogate pair
    /// are not combined.
    ///
    /// @param source text that may contain `\uXXXX` escapes
    /// @return the decoded text, converted with std::codecvt_utf8<wchar_t>
    std::string convert_unicode_escape_sequences(const std::string& source) 
    {
        // Length of one escape: backslash, 'u' and four hex digits.
        const std::string::difference_type escape_length = 6;
        const auto is_hex = [](char c) {
            return std::isxdigit(static_cast<unsigned char>(c)) != 0;
        };

        std::wstring wide;
        wide.reserve(source.size());  // decoding never produces more characters than input bytes

        auto s = source.begin();
        while (s != source.end())
        {
            // Check the remaining length first so that s + 1 .. s + 5 are valid.
            const bool is_escape =
                source.end() - s >= escape_length &&
                *s == '\\' && *(s + 1) == 'u' &&
                is_hex(*(s + 2)) && is_hex(*(s + 3)) &&
                is_hex(*(s + 4)) && is_hex(*(s + 5));

            if (is_escape)
            {
                unsigned int v = get_uint8(*(s + 2), *(s + 3)) << 8;
                v |= get_uint8(*(s + 4), *(s + 5));

                wide.push_back(boost::numeric_cast<wchar_t>(v));
                s += escape_length;
                continue;
            }

            // Go through unsigned char so bytes >= 0x80 are not sign-extended
            // where plain char is signed.
            wide.push_back(static_cast<wchar_t>(static_cast<unsigned char>(*s)));
            ++s;
        }

        std::wstring_convert<std::codecvt_utf8<wchar_t>> myconv;
        return myconv.to_bytes(wide);
    }
    
    0 讨论(0)
  • 2020-12-12 01:07

    Check this code :) The Windows SDK (ATL) already provides this for you; the folks at Microsoft thought of this too. You can find more details in this post: http://weblogs.asp.net/kennykerr/archive/2008/07/24/visual-c-in-short-converting-between-unicode-and-utf-8.aspx

    #include <atlconv.h>
    #include <atlstr.h>
    
    #define ASSERT ATLASSERT
    
    // Demo: converts a wide string to UTF-8 and back with the ATL conversion
    // classes CW2A / CA2W, asserting that the round trip reproduces the original.
    // ASSERT is ATLASSERT here (see the #define above).
    int main()
    {
        const CStringW unicode1 = L"\u041f and \x03A9"; // U+041F (Cyrillic capital Pe) and U+03A9 (Greek capital Omega)
    
        // Wide -> narrow conversion, with CP_UTF8 selecting UTF-8 as the target code page.
        const CStringA utf8 = CW2A(unicode1, CP_UTF8);
    
        // Both non-ASCII characters need more than one byte in UTF-8, so the
        // narrow string must be longer than the wide one.
        ASSERT(utf8.GetLength() > unicode1.GetLength());
    
        // Narrow (UTF-8) -> wide conversion.
        const CStringW unicode2 = CA2W(utf8, CP_UTF8);
    
        // The round trip must give back exactly the original string.
        ASSERT(unicode1 == unicode2);   
    
        return 0;
    }
    

    This code has been tested by me and it works fine.

    0 讨论(0)
提交回复
热议问题