How to read unicode (utf-8) / binary file line by line

前端未结

关注

 6  1720

醉酒成梦 2020-12-16 15:56

Hi programmers,

I want read line by line a Unicode (UTF-8) text file created by Notepad, i don\'t want display the Unicode string in the screen, i w

6条回答

北荒 (楼主)

2020-12-16 16:28

In this article a coding and decoding routine is written and it is explained how the unicode is encoded:

http://www.codeguru.com/cpp/misc/misc/multi-lingualsupport/article.php/c10451/

It can be easily adjusted to C. Simply encode your ANSI or decode the UTF-8 String and make a byte compare

EDIT: After the OP said that it is too hard to rewrite the function from C++ here a template:

What is needed:
+ Free the allocated memory (or wait till the process ends or ignore it)
+ Add the 4 byte functions
+ Tell me that short and int is not guaranteed to be 2 and 4 bytes long (I know, but C is really stupid !) and finally
+ Find some other errors

#include 
#include 

#define         MASKBITS                0x3F
#define         MASKBYTE                0x80
#define         MASK2BYTES              0xC0
#define         MASK3BYTES              0xE0
#define         MASK4BYTES              0xF0
#define         MASK5BYTES              0xF8
#define         MASK6BYTES              0xFC

char* UTF8Encode2BytesUnicode(unsigned short* input)
{
   int size = 0,
       cindex = 0;
   while (input[size] != 0)
     size++;
   // Reserve enough place; The amount of 
   char* result = (char*) malloc(size);
   for (int i=0; i> 6));
         result[cindex++] = ((char)(MASKBYTE | input[i] & MASKBITS));
      }
      // 1110xxxx 10xxxxxx 10xxxxxx
      else if(input[i] < 0x10000)
      {
         result[cindex++] = ((char)(MASK3BYTES | input[i] >> 12));
         result[cindex++] = ((char)(MASKBYTE | input[i] >> 6 & MASKBITS));
         result[cindex++] = ((char)(MASKBYTE | input[i] & MASKBITS));
      }
   }
}

wchar_t* UTF8Decode2BytesUnicode(char* input)
{
  int size = strlen(input);
  wchar_t* result = (wchar_t*) malloc(size*sizeof(wchar_t));
  int rindex = 0,
      windex = 0;
  while (rindex < size)
  {
      wchar_t ch;

      // 1110xxxx 10xxxxxx 10xxxxxx
      if((input[rindex] & MASK3BYTES) == MASK3BYTES)
      {
         ch = ((input[rindex] & 0x0F) << 12) | (
               (input[rindex+1] & MASKBITS) << 6)
              | (input[rindex+2] & MASKBITS);
         rindex += 3;
      }
      // 110xxxxx 10xxxxxx
      else if((input[rindex] & MASK2BYTES) == MASK2BYTES)
      {
         ch = ((input[rindex] & 0x1F) << 6) | (input[rindex+1] & MASKBITS);
         rindex += 2;
      }
      // 0xxxxxxx
      else if(input[rindex] < MASKBYTE)
      {
         ch = input[rindex];
         rindex += 1;
      }

      result[windex] = ch;
   }
}

char* getUnicodeToUTF8(wchar_t* myString) {
  int size = sizeof(wchar_t);
  if (size == 1)
    return (char*) myString;
  else if (size == 2)
    return UTF8Encode2BytesUnicode((unsigned short*) myString);
  else
    return UTF8Encode4BytesUnicode((unsigned int*) myString);
}

0 讨论(0)

查看其它6个回答