C how to skip BOM when checking if x is at the start of a file

六月ゝ 毕业季﹏ 提交于 2021-01-28 08:50:57

问题


In a C array/string, how do I correctly detect whether something is at the start of a file when the file may have a BOM? Sometimes the BOM takes up 1 character, other times it takes up 3 characters, and other times it is not present at all, so the actual location of x does not always start at index 0.

Most of the time it is this (in hex): "ef bb bf". For example:

ef bb bf 23 21 2f 62 69 6e 2f 62 61 73 68 0a 61 20 26 26 20 62 0a 67 20 : ...#!/bin/bash.a && b.g 

Would it be something like this?

#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>

/*
 * Result of probing a buffer for a byte order mark (BOM).
 *
 * The fields are filled in by the ifbom/elifbom/elbom macros used in
 * builtin__BOM_get().
 */
struct BOM {
    int is_BOM;             /* non-zero when a BOM was recognised */
    int length;             /* number of bytes in the BOM; 0 when none was found */
    int type;               /* numeric id of the matched table entry; 0 when none */
    char * type_as_string;  /* encoding name, or "Not present" */
    char * BOM;             /* the BOM bytes themselves, or "Not present" */
};  /* the terminating ';' is required: without it the next declaration fails to parse */

/*
 * Report whether the first length_ bytes of string_ equal a BOM pattern.
 *
 * BOM_    : pattern whose first character is a one-byte marker (the callers
 *           in this file use '^'); it is skipped and the length_ bytes that
 *           follow it are compared.
 * string_ : buffer to test. Bytes are read until the first mismatch or until
 *           length_ bytes have matched, so the caller must make sure that
 *           many bytes are readable.
 * length_ : number of BOM bytes to compare.
 *
 * Returns 1 when all length_ bytes match, 0 otherwise. A NULL argument or a
 * length_ of zero or less yields 0 (nothing to match is not treated as a BOM).
 */
int matches(const char * BOM_, const char * string_, int length_) {
    if (BOM_ == NULL || string_ == NULL || length_ <= 0) {
        return 0;
    }

    const char * pattern = BOM_ + 1; /* skip the leading marker character */
    for (int i = 0; i < length_; i++) {
        if (string_[i] != pattern[i]) {
            return 0;
        }
    }
    return 1;
}

/*
 * First link of a BOM-detection if/else chain.
 *
 * Expands to `if (matches(BOM_, string_, length_)) { ... }` and, on a match,
 * stores the remaining arguments in bom_struct. BOM_ is expected to start
 * with a one-byte marker ('^' at the call sites); the stored BOM pointer
 * skips it. The expansion deliberately ends with the closing brace of the
 * `if` so that elifbom/elbom can follow it directly, which is why it is not
 * wrapped in do { } while (0).
 *
 * Every argument is parenthesized so that expressions passed as arguments
 * keep their intended precedence. Arguments may be evaluated more than once.
 */
#define ifbom(bom_struct, is_BOM_, length_, type_, type_as_string_, BOM_, string_) if (matches((BOM_), (string_), (length_))) { \
    (bom_struct).is_BOM = (is_BOM_); \
    (bom_struct).length = (length_); \
    (bom_struct).type = (type_); \
    (bom_struct).type_as_string = (type_as_string_); \
    (bom_struct).BOM = (BOM_) + 1 /* remove the ^ at the start */ ; \
}

/*
 * Subsequent link of a BOM-detection chain: an `else` followed by ifbom()
 * with the same arguments. Must come directly after an ifbom()/elifbom().
 */
#define elifbom(bom_struct, is_BOM_, length_, type_, type_as_string_, BOM_, string_) \
    else ifbom(bom_struct, is_BOM_, length_, type_, type_as_string_, BOM_, string_)

/*
 * Final link of a BOM-detection chain: an unconditional `else { ... }` that
 * stores the given values in bom_struct. Unlike ifbom(), BOM_ is stored as
 * given (no leading marker is skipped) and no comparison is performed.
 *
 * Every argument is parenthesized so that expressions passed as arguments
 * keep their intended precedence.
 */
#define elbom(bom_struct, is_BOM_, length_, type_, type_as_string_, BOM_) else { \
    (bom_struct).is_BOM = (is_BOM_); \
    (bom_struct).length = (length_); \
    (bom_struct).type = (type_); \
    (bom_struct).type_as_string = (type_as_string_); \
    (bom_struct).BOM = (BOM_); \
}

/*
 * Output modes tested by __hexdump():
 *   cat  - printable-text column only
 *   hex  - hex byte column only
 *   both - hex column, tab padding and a ": " separator, then the text column
 *   json - handled the same way as cat by __hexdump() (text column only)
 * NOTE(review): these are lowercase object-like macros, so any later use of
 * the identifiers cat/hex/both/json is rewritten by the preprocessor.
 */
#define cat 0
#define hex 1
#define both 2
#define json 3

/* Currently selected output mode; builtin__BOM_print sets it to `both`. */
int mode;

/*
 * Print one row of a dump according to the global `mode`, followed by '\n'.
 *
 * buffer : bytes to print
 * index  : number of valid bytes in buffer
 * width  : nominal row width; only used in `both` mode, where one tab is
 *          emitted for every byte the row falls short of width
 *
 * hex/both print each byte as two hex digits and a space; both additionally
 * prints the padding and ": "; cat/both/json print each byte as a character,
 * substituting '.' for anything outside 32..126.
 */
void __hexdump(unsigned char *buffer, unsigned long index, unsigned long width)
{
    const int show_bytes = (mode == both) || (mode == hex);
    const int show_text = (mode == cat) || (mode == both) || (mode == json);

    if (show_bytes) {
        for (unsigned long pos = 0; pos < index; ++pos) {
            printf("%02x ", buffer[pos]);
        }
    }

    if (mode == both) {
        for (unsigned long pad = index; pad < width; ++pad) {
            putchar('\t');
        }
        fputs(": ", stdout);
    }

    if (show_text) {
        for (unsigned long pos = 0; pos < index; ++pos) {
            const unsigned char byte = buffer[pos];
            const int printable = (byte >= 32) && (byte < 127);
            putchar(printable ? byte : '.');
        }
    }

    putchar('\n');
}

/*
 * Dump the bytes of a NUL-terminated string whose offsets lie in the
 * inclusive range [start, stop], width bytes per row, via __hexdump().
 *
 * infile : NUL-terminated string to dump; scanning stops at the first NUL,
 *          so data containing embedded zero bytes is cut short there
 * start  : offset of the first byte to include
 * stop   : offset of the last byte to include (inclusive)
 * width  : number of bytes per output row; must be non-zero
 *
 * Returns 0 on success, -1 when the arguments are invalid (NULL infile or
 * zero width) or the row buffer cannot be allocated; a message is printed
 * to stdout in either failure case.
 */
int __hexdump_string(char *infile, unsigned long start, unsigned long stop, unsigned long width)
{
    /* A zero width would make every store land outside a zero-size buffer. */
    if (infile == NULL || width == 0)
    {
        printf("Invalid arguments for __hexdump_string\n");
        return -1;
    }

    unsigned char *byte_buffer = malloc(width);
    if (byte_buffer == NULL)
    {
        printf("Could not allocate memory for byte_buffer\n");
        return -1;
    }

    unsigned long bb_index = 0;
    /* Nothing past `stop` is ever printed, so the scan ends there. */
    for (unsigned long f_index = 0; infile[f_index] != '\0' && f_index <= stop; f_index++)
    {
        if (f_index < start)
            continue;

        byte_buffer[bb_index] = (unsigned char)infile[f_index];
        bb_index++;

        /* Row is full: emit it and start a new one. */
        if (bb_index >= width)
        {
            __hexdump(byte_buffer, bb_index, width);
            bb_index = 0;
        }
    }

    /* Emit the final, partially filled row if there is one. */
    if (bb_index)
        __hexdump(byte_buffer, bb_index, width);

    free(byte_buffer);
    return 0;
}

/*
 * Print every field of a struct BOM, labelled with the spelling of the
 * argument expression (via #bom_struct), then dump the BOM bytes.
 *
 * Side effect: sets the global `mode` to `both` before dumping.
 *
 * __hexdump_string() takes an inclusive `stop` offset, so the last byte to
 * dump is length - 1; when length is zero or negative nothing is dumped and
 * only the line terminator is printed.
 * NOTE(review): __hexdump_string() stops at the first NUL byte, so BOMs that
 * contain 0x00 (the UTF-32 ones) are not shown in full.
 *
 * Wrapped in do { } while (0) so the macro acts as a single statement and
 * is safe inside unbraced if/else; the argument is evaluated several times.
 */
#define builtin__BOM_print(bom_struct) do { \
    printf("%s.is_BOM = %s\n%s.length = %d\n%s.type = %d\n%s.type_as_string = %s\n%s.BOM = ", #bom_struct, (bom_struct).is_BOM?"yes":"no", #bom_struct, (bom_struct).length, #bom_struct, (bom_struct).type, #bom_struct,(bom_struct).type_as_string, #bom_struct); \
    mode = both; \
    if ((bom_struct).length > 0) \
        __hexdump_string((bom_struct).BOM, 0, (unsigned long)(bom_struct).length - 1, 5); \
    else \
        printf("\n"); \
} while (0)


/*
 * Identify the byte order mark, if any, at the start of `string`.
 *
 * string : buffer holding the first bytes of the data. It is compared
 *          byte-wise, not as a C string, so that BOMs containing 0x00 can
 *          be recognised.
 *          NOTE(review): the comparison may read up to 5 bytes; callers
 *          should make sure that many bytes are readable (a very short
 *          buffer whose leading bytes are 0x00 could be read past its end).
 *
 * Returns a struct BOM describing the match: is_BOM is true, length is the
 * BOM size in bytes (the offset at which the real data starts), type is the
 * numeric id from the table below, and BOM points at the BOM bytes. When
 * nothing matches, or string is NULL, is_BOM is false, length and type are 0
 * and both strings are "Not present".
 */
struct BOM builtin__BOM_get(char * string) {
    struct BOM bom;

    if (string == NULL) {
        bom.is_BOM = false;
        bom.length = 0;
        bom.type = 0;
        bom.type_as_string = "Not present";
        bom.BOM = "Not present";
        return bom;
    }

    /*
     * Order matters: a longer BOM must be tested before any shorter BOM that
     * is a prefix of it. UTF-32 (LE) FF FE 00 00 begins with the UTF-16 (LE)
     * BOM FF FE, so it is tested first; otherwise it could never be reported.
     * The type ids are unchanged.
     */
    ifbom(bom, true, 3, 1, "UTF-8", "^\xef\xbb\xbf", string)
    elifbom(bom, true, 4, 5, "UTF-32 (LE)", "^\xff\xfe\x00\x00", string)
    elifbom(bom, true, 2, 2, "UTF-16 (BE)", "^\xfe\xff", string)
    elifbom(bom, true, 2, 3, "UTF-16 (LE)", "^\xff\xfe", string)
    elifbom(bom, true, 4, 4, "UTF-32 (BE)", "^\x00\x00\xfe\xff", string)
    /* NOTE(review): published BOM tables list the 5-byte UTF-7 form as
     * 2B 2F 76 38 2D; the last byte here is 0x3d - confirm which is intended. */
    elifbom(bom, true, 5, 6, "UTF-7", "^\x2b\x2f\x76\x38\x3d", string)
    elifbom(bom, true, 4, 7, "UTF-7", "^\x2b\x2f\x76\x38", string)
    elifbom(bom, true, 4, 8, "UTF-7", "^\x2b\x2f\x76\x39", string)
    elifbom(bom, true, 4, 9, "UTF-7", "^\x2b\x2f\x76\x2b", string)
    elifbom(bom, true, 4, 10, "UTF-7", "^\x2b\x2f\x76\x2f", string)
    elifbom(bom, true, 3, 11, "UTF-1", "^\xf7\x64\x4c", string)
    elifbom(bom, true, 4, 12, "UTF-EBCDIC", "^\xdd\x73\x66\x73", string)
    elifbom(bom, true, 3, 13, "SCSU", "^\x0e\xfe\xff", string)
    elifbom(bom, true, 3, 14, "BOCU-1", "^\xfb\xee\x28", string)
    elifbom(bom, true, 4, 15, "GB-18030", "^\x84\x31\x95\x33", string)
    elbom(bom, false, 0, 0, "Not present", "Not present")

    /* No cast: ISO C does not allow casting to a struct type. */
    return bom;
}

/*
 * Demo: probe the literal "test" for a BOM and print the result.
 */
int main()
{
    /* The variable name is stringified by builtin__BOM_print and therefore
     * appears verbatim as the field prefix in the printed output. */
    struct BOM t = builtin__BOM_get("test");
    builtin__BOM_print(t);
    return 0;
}

回答1:


You should read the first few bytes of the file to find out whether a BOM is present.

  • if the first 4 bytes are FF FE 00 00: little-endian UTF-32
  • else if the first 2 bytes are FF FE: little-endian UTF-16
  • else if the first 4 bytes are 00 00 FE FF: big-endian UTF-32
  • else if the first 2 bytes are FE FF: big-endian UTF-16
  • else if the first 3 bytes are EF BB BF: UTF-8
  • etc...

Depending on the BOM length, you know at which index the real file data starts.

You can find a more complete list of BOMs on the Wikipedia page: https://en.wikipedia.org/wiki/Byte_order_mark#Byte_order_marks_by_encoding



来源:https://stackoverflow.com/questions/51518244/c-how-to-skip-bom-when-checking-if-x-is-at-the-start-of-a-file

易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!