Getting error while reading unicode file in C

笑着哭i 提交于 2020-01-06 12:45:25

问题


I want to read a unicode file in C (Cygwin/GCC) using the following code:

#include <stdio.h>
#include <stdlib.h>
#include <glib.h>


/*
 * Split one "key=value" line at the first '=' and print both parts to stdout
 * as "Key = <key> : Value = <value>\n".
 *
 * text: NUL-terminated line to parse; it is not modified. A NULL pointer is
 *       ignored.
 *
 * A line without '=' has no value token, and an empty line has no tokens at
 * all; in both cases the missing part is printed as an empty string rather
 * than handing a NULL pointer to printf("%s"), which is undefined behaviour.
 */
void split_parse(char* text){
    if (text == NULL) {
        return;
    }

    /* max_tokens = 2: only the first '=' separates key from value, so any
     * later '=' characters remain part of the value. */
    char **res = g_strsplit(text, "=", 2);

    const char *key = "";
    const char *value = "";
    if (res != NULL && res[0] != NULL) {
        key = res[0];
        if (res[1] != NULL) {
            value = res[1];
        }
    }

    printf("Key = %s : ", key);
    printf("Value = %s", value);
    printf("\n");

    /* g_strsplit returns a newly allocated vector that the caller owns. */
    g_strfreev(res);
}

int main(int argc, char **argv)
{
    setenv ("CYGWIN", "nodosfilewarning", 1);

    GIOChannel *channel;
    GError *err = NULL;
    int reading = 0;
    const gchar* enc;
    guchar magic[2] = { 0 };
    gsize bytes_read = 0;

    const char* filename = "C:\\CONFIG";


    channel = g_io_channel_new_file (filename, "r", &err);

    if (!channel) {
        g_print("%s", err->message);
        return 1;
    }

    if (g_io_channel_set_encoding(channel, NULL, &err) != G_IO_STATUS_NORMAL) {
        g_print("g_io_channel_set_encoding: %s\n", err->message);
        return 1;
    }

    if (g_io_channel_read_chars(channel, (gchar*) magic, 2, &bytes_read, &err) != G_IO_STATUS_NORMAL) {
        g_print("g_io_channel_read_chars: %s\n", err->message);
        return 1;
    }

    if (magic[0] == 0xFF && magic[1] == 0xFE)
    {
        enc = "UTF-16LE";
    }
    else if (magic[0] == 0xFE && magic[1] == 0xFF)
    {
        enc = "UTF-16BE";
    }
    else
    {
        enc = "UTF-8";
        if (g_io_channel_seek_position(channel, 0, G_SEEK_CUR, &err) == G_IO_STATUS_ERROR)
        {
            g_print("g_io_channel_seek: failed\n");
            return 1;
        }
    }

    if (g_io_channel_set_encoding (channel, enc, &err) != G_IO_STATUS_NORMAL) {
        g_print("%s", err->message);
        return 1;
    }

    reading = 1;
    GIOStatus status;
    char* str = NULL;
    size_t len;

    while(reading){

        status = g_io_channel_read_line(channel, &str, &len, NULL, &err);
        switch(status){
            case G_IO_STATUS_EOF:
                reading = 0;
                break;
            case G_IO_STATUS_NORMAL:
                if(len == 0) continue;
                split_parse(str);
                break;
            case G_IO_STATUS_AGAIN: continue;
            case G_IO_STATUS_ERROR:
            default:
                //throw error;
                reading = 0;
                break;
        }
    }

    g_free(str);
    g_io_channel_unref(channel);

    return(EXIT_SUCCESS);
}

The file (C:\CONFIG) content is as follows:

h-debug="1"
name=ME
ÃÆÿЮ©=2¾1¼

While reading it I am always getting the following error message at "g_io_channel_read_line" inside the while loop:

0x800474f8 "Invalid byte sequence in conversion input"

What am I doing wrong? How to read a file like this in C using glib?

EDIT: Hexdump of the file


回答1:


Your file starts with the 3-byte UTF-8 byte-order mark (BOM): EF BB BF.

Your code defaults to UTF8, but does not consume the BOM.

channel, 0, G_SEEK_CUR, &err

should be

channel, 3, G_SEEK_CUR, &err

Further, I would recommend extending your magic code to read 4 bytes and affirmatively discern the BOM.

If you do not find a BOM, you could assume encoding NULL, which I think is binary; or throw an error; or fix the wayward text file; or, if you are pedantic, sequentially try all known encoding types.


UTF32BE "\x00\x00\xFE\xFF"
UTF32LE "\xFF\xFE\x00\x00"
UTF8 "\xEF\xBB\xBF"
UTF16BE "\xFE\xFF"
UTF16LE "\xFF\xFE"
NULL for binary



来源:https://stackoverflow.com/questions/17383930/getting-error-while-reading-unicode-file-in-c

标签
易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!