How am I allowed to workaround DOS functions that used strings containing accented characters (ASCII to UTF-8)?

瘦欲@ 提交于 2021-01-28 09:12:01

问题


I was writing software in which I wanted to reuse some old C code written in the early '80s. This code did some conversions on strings. It also used accented characters that, at that time (DOS), were coded in the ASCII table (codes greater than 127).

Now the new systems use UTF-8 encoding, so the old code works very badly. I am using Linux (Ubuntu 17 / gcc gcc (Ubuntu 7.2.0-8ubuntu3) 7.2.0).

I'm looking for a workaround that lets me make the fewest possible changes. I have begun to do some tests to analyze the issues that arise. I made two main programs: one uses char * strings and char elements, the other uses wchar_t * strings and wchar_t elements. Neither works correctly.

The first (using char * and char) requires, for example, a workaround when strchr recognizes a multi-byte code; it doesn't print (printf) the multi-byte char correctly, although it prints the char * correctly. Furthermore, it generates a lot of warnings relevant to the use of multi-byte chars.

The second (using wchar_t * and wchar_t) runs, but doesn't print the multi-byte characters correctly; they appear as '?' both when they are printed as wchar_t and as wchar_t * (strings).

MAIN1:

#include <stdio.h>
#include <string.h>
#include <inttypes.h>

/* http://clc-wiki.net/wiki/strchr
 * standard C implementation
 */
char *_strchr(const char *s, int c);

/* Locate the first char in s that equals c converted to char.
 * The terminating '\0' takes part in the search, so looking for 0
 * yields a pointer to the terminator.
 * Returns a pointer to the matching char, or a null pointer when
 * the end of the string is reached without a match.
 */
char *_strchr(const char *s, int c)
{
    const char wanted = (char)c;

    for (;; s++) {
        if (*s == wanted)
            return (char *)s;
        if (*s == '\0')
            return 0;
    }
}


/* Byte-oriented (char) experiment: prints a string literal containing the
 * accented characters 'è' and 'ò', dumps it byte by byte, then searches for
 * each accented character with both the library strchr() and the local
 * _strchr(), and finally prints the character constants themselves.
 * The compiler warnings quoted in the comments below show that each accented
 * character constant is treated as a multi-character constant.
 */
int main()
{
    char          * p1 = NULL;
    const char    * t1 = "Sergio è un Italiano e andò via!";

    printf("Text --> %s\n\n",t1);

    /* Dump every byte of t1: its value in hex, then the same byte via %c. */
    for(size_t i=0;i<strlen(t1);i++) {
        printf("%02X %c|",(uint8_t)t1[i],t1[i]);
    }
    puts("\n");

    puts("Searching ò");
    /*warning: multi-character character constant [-Wmultichar]
                      p1 = strchr(t1,'ò');
                                     ^~~~
    */
    /* NOTE(review): p1 is not checked for NULL before p1-1 is formed.
       The -1 presumably steps back to the first byte of the multi-byte
       sequence because the search matched a later byte of it -- confirm. */
    p1 = strchr(t1,'ò');
    printf("%s\n",p1-1); // -1 needs to correct the position

    /*warning: multi-character character constant [-Wmultichar]
                      p1 = _strchr(t1,'ò');
                                     ^~~~
    */
    /* Same search through the local _strchr(), which compares against (char)c. */
    p1 = _strchr(t1,'ò');
    printf("%s\n",p1-1);    // -1 needs to correct the position
    puts("");

    puts("Searching è");
    /*warning: multi-character character constant [-Wmultichar]
                      p1 = strchr(t1,'è');
                                     ^~~~
    */
    p1 = strchr(t1,'è');
    printf("%s\n",p1-1);    // -1 needs to correct the position

    /*warning: multi-character character constant [-Wmultichar]
                      p1 = _strchr(t1,'è');
                                     ^~~~
    */
    p1 = _strchr(t1,'è');
    printf("%s\n",p1-1);    // -1 needs to correct the position
    puts("");

    /*warning: multi-character character constant [-Wmultichar]
         printf("%c %c %08X %08X\n",'è','ò','è','ò');
                                    ^~~~
         printf("%c %c %08X %08X\n",'è','ò','è','ò');
                                        ^~~~
         printf("%c %c %08X %08X\n",'è','ò','è','ò');
                                            ^~~~
         printf("%c %c %08X %08X\n",'è','ò','è','ò');
                                                ^~~~
    */
    /* Print each constant once with %c and once as its full int value in hex. */
    printf("%c %c %08X %08X\n",'è','ò','è','ò');

    /*multi-character character constant [-Wmultichar]
     printf("%c %c %08X %08X\n",'è','ò',(uint8_t)'è',(uint8_t)'ò');
                                ^~~~
     printf("%c %c %08X %08X\n",'è','ò',(uint8_t)'è',(uint8_t)'ò');
                                    ^~~~
     printf("%c %c %08X %08X\n",'è','ò',(uint8_t)'è',(uint8_t)'ò');
                                                 ^~~~
     printf("%c %c %08X %08X\n",'è','ò',(uint8_t)'è',(uint8_t)'ò');
                                                              ^~~~
    */
    /* Same as above, but the hex values are first narrowed to uint8_t,
       so only the low 8 bits of each constant are shown. */
    printf("%c %c %08X %08X\n",'è','ò',(uint8_t)'è',(uint8_t)'ò');

    puts("");
    return 0;
}

Output:

MAIN2:

#include <stdio.h>
#include <string.h>
#include <wchar.h>
#include <inttypes.h>

/* Wide-character counterpart of puts(): prints s followed by a newline
   through wprintf(). s must be a string literal, because the "\n" is
   appended by adjacent string-literal concatenation.
   NOTE(review): s is used as the wprintf() format string, so a '%' inside
   it would be interpreted as a conversion specification. */
#define wputs(s) wprintf(s"\n")

/* https://opensource.apple.com/source/Libc/Libc-498.1.1/string/wcschr-fbsd.c
 * FBSD C implementation
 */
wchar_t * _wcschr(const wchar_t *s, wchar_t c);

/* Locate the first wide character in s that equals c.
 * The terminating L'\0' takes part in the search, so looking for L'\0'
 * yields a pointer to the terminator.
 * Returns a pointer to the matching wide character, or NULL when the
 * end of the string is reached without a match.
 */
wchar_t * _wcschr(const wchar_t *s, wchar_t c)
{
    for (;; s++) {
        if (*s == c)
            return (wchar_t *)s;
        if (*s == L'\0')
            return NULL;
    }
}

/* Wide-character (wchar_t) experiment: prints a wide string literal
 * containing 'è' and 'ò', dumps its storage byte by byte, then searches for
 * each accented character with both the library wcschr() and the local
 * _wcschr(), and finally prints the wide character constants themselves.
 * All output goes through wprintf(); no setlocale() call is made first.
 */
int main()
{
    wchar_t       * p1 = NULL;
    const wchar_t * t1 = L"Sergio è un Italiano e andò via!";
    const wchar_t * f0 = L"%02X %c|";       /* one dumped byte: hex value, then a printable stand-in */
    const wchar_t * f1 = L"Text --> %ls\n\n";
    const wchar_t * f2 = L"%ls\n";

    /* Byte-wise view of the wide string's storage, used by the dump loop. */
    uint8_t * p = (uint8_t *)t1;

    wprintf(f1,t1);

    /* Dump the storage one byte at a time. Bytes below ' ' are shown as '.',
       bytes above 127 as '*'. The loop has no length bound: it stops at the
       first byte equal to '!'. */
    for(size_t i=0;;i++) {
        uint8_t c=*(p+i);

        wprintf(f0,c,(c<' ')?'.':(c>127)?'*':c);
        if ( c=='!' )
            break;
    }
    wputs(L"\n");

    wputs(L"Searching ò");

    /* The result is printed with %ls without a NULL check. */
    p1 = wcschr(t1,L'ò');
    wprintf(f2,p1);

    p1 = _wcschr(t1,L'ò');
    wprintf(f2,p1);
    wputs(L"---");

    wputs(L"Searching è");

    p1 = wcschr(t1,L'è');
    wprintf(f2,p1);

    p1 = _wcschr(t1,L'è');
    wprintf(f2,p1);
    wputs(L"");

    /* Print each wide constant with %lc and as a hex value; the second line
       first narrows the values to uint8_t, keeping only the low 8 bits. */
    wprintf(L"%lc %lc %08X %08X\n",L'è',L'ò',L'è',L'ò');
    wprintf(L"%lc %lc %08X %08X\n",L'è',L'ò',(uint8_t)L'è',(uint8_t)L'ò');

    wputs(L"");

    return 0;
}

Output:


回答1:


You need to localize your program, if you want to use wide-character I/O. It's not difficult, just a setlocale() call, plus optionally fwide() to see if the user locale supports wide I/O on the desired stream(s).

In your main(), before any input/output, run

    if (!setlocale(LC_ALL, "")) {
        /* Current locale is not supported
           by the C library; abort. */
    }

As the comment says, this tells your C library, that this program is locale-aware, and that it should do the setup and preparations needed to follow the rules of the locale the user has set up. See man 7 locale for further information. Essentially, the C library does not automatically pick up the current locale the user has set up, but uses the default C/POSIX locale. This command tells the C library to try and conform to the currently set up locale.

In POSIX C, each FILE handle has an orientation, that can be queried and set (but only before reading or writing to it) using fwide(). Note that it is a property of the file handle, not files themselves; and it only determines whether the C library uses byte-oriented (normal/narrow) or wide-character functions to read from and write to the stream. If you don't call it, the C library tries to do it automatically based on the first read/write function you use to access the stream, if the locale has been set. However, using for example

    if (fwide(stdout, 1) <= 0) {
        /* The C library does not support wide-character
           orientation for standard output in this locale.
           Abort.
        */
    }

after the locale setup, means you can detect if the C library does not support the user locale or if the user locale does not support wide characters at all, for that particular stream; and abort the program. (It is always better to tell the user that the results would be garbage, than silently try to do your best, and possibly garble the user data. The user can, after all, always use a different tool; but silently garbling the user data means this particular tool would simply be untrustworthy: worthless.)

You must not mix wprintf() and printf(); nor fwprintf() and fprintf() to the same stream. It either fails (does not print anything), confuses the C library, or produces garbled results. Similarly, you must not mix fgetc() and fgetwc() on the same stream. Simply put, you must not mix byte-oriented or wide-character-oriented functions on the same stream.

This does not mean that you cannot print a byte-oriented (or multibyte) string to a wide-character-oriented stream, or vice versa; quite the opposite. It works very logically, %s and %c always refer to a byte-oriented string or character, and %ls and %lc a wide string or character. For example, if you have

const wchar_t *ws = L"Hello";
const char     *s = "world!";

you can print them both to byte-oriented standard output using

printf("%ls, %s\n", ws, s);

or to a wide-character-oriented standard output using

wprintf(L"%ls, %s\n", ws, s);

This is basically a limitation in the POSIX C library: you must use byte-oriented functions for byte-oriented streams, and wide-character oriented functions for wide-character oriented streams. It might feel weird at first, but if you think about it, it's a very clear and simple rule.


Let's look at an example program roughly similar to yours; expanded to read the (unlimited-length) strings line by line from standard input, using any newline convention (CR, LF, CRLF, LFCR):

#define _POSIX_C_SOURCE 200809L
#include <stdlib.h>
#include <locale.h>
#include <wchar.h>
#include <string.h>
#include <errno.h>
#include <stdio.h>

/* Function to read a wide-character line from stream `in`,
   using any newline convention (LF, CR, CRLF, LFCR); the newline
   characters are kept at the end of the line. Embedded NUL wide
   characters (L'\0') are skipped, and the buffer is dynamically
   reallocated as needed.
   If *lineptr==NULL or *sizeptr==0, the buffer is dynamically allocated.
   The caller owns *lineptr and releases it with free().
   Returns the number of wide characters read (excluding the final L'\0').
   If an error occurs (invalid arguments, out of memory, or a read or
   encoding error on the stream), returns zero, with errno set.
   At end of input, returns zero, with errno zero.
*/
size_t wide_line(wchar_t **lineptr, size_t *sizeptr, FILE *in)
{
    wchar_t *line;
    size_t   size, used = 0;
    wint_t   wc;

    if (!lineptr || !sizeptr || !in) {
        errno = EINVAL;
        return 0;
    }
    if (ferror(in)) {
        errno = EIO;
        return 0;
    }

    /* Reuse the caller's buffer only when both pointer and size are usable. */
    if (*lineptr && *sizeptr) {
        line = *lineptr;
        size = *sizeptr;
    } else {
        *lineptr = line = NULL;
        *sizeptr = size = 0;
    }

    while (1) {

        /* Keep room for up to two newline characters plus the terminator. */
        if (used + 3 >= size) {
            wchar_t *grown;

            /* Conservative dynamic memory reallocation policy. */
            if (used < 126)
                size = 128;
            else
            if (used < 2097152)
                size = (used * 3) / 2;
            else
                size = (used | 1048575) + 1048579;

            /* Check for size overflow, both in elements and in bytes. */
            if (used + 2 >= size || size > (size_t)-1 / sizeof line[0]) {
                errno = ENOMEM;
                return 0;
            }

            /* On failure the old buffer stays valid and owned by *lineptr. */
            grown = realloc(line, size * sizeof line[0]);
            if (!grown) {
                errno = ENOMEM;
                return 0;
            }
            line = grown;

            *lineptr = line;
            *sizeptr = size;
        }

        /* Clear errno so a failure reported by fgetwc() can be told apart
           from a stale value. */
        errno = 0;
        wc = fgetwc(in);
        if (wc == WEOF) {
            const int saved_errno = errno;

            line[used] = L'\0';

            /* A read or encoding error is not a clean end of input:
               report it instead of clearing errno. */
            if (ferror(in) || saved_errno == EILSEQ) {
                errno = saved_errno ? saved_errno : EIO;
                return 0;
            }

            errno = 0;
            return used;

        } else
        if (wc == L'\n') {
            line[used++] = L'\n';

            /* Look ahead one character to recognize LFCR. */
            wc = fgetwc(in);
            if (wc == L'\r')
                line[used++] = L'\r';
            else
            if (wc != WEOF)
                ungetwc(wc, in);

            line[used] = L'\0';
            errno = 0;
            return used;

        } else
        if (wc == L'\r') {
            line[used++] = L'\r';

            /* Look ahead one character to recognize CRLF. */
            wc = fgetwc(in);
            if (wc == L'\n')
                line[used++] = L'\n';
            else
            if (wc != WEOF)
                ungetwc(wc, in);

            line[used] = L'\0';
            errno = 0;
            return used;
        } else
        if (wc != L'\0')
            line[used++] = wc;
    }
}

/* Convert the multibyte string src into a newly allocated wide string.
   A NULL or empty src yields a newly allocated empty wide string.
   The caller releases the result with free().
   Returns NULL with errno set to EILSEQ if src cannot be converted,
   or to ENOMEM if the allocation fails. */
wchar_t *dup_mbstowcs(const char *src)
{
    wchar_t *wide;
    size_t   count;

    /* Nothing to convert: hand back just a terminator. */
    if (src == NULL || *src == '\0') {
        wide = malloc(sizeof *wide);
        if (wide == NULL) {
            errno = ENOMEM;
            return NULL;
        }
        wide[0] = L'\0';
        return wide;
    }

    /* First pass only measures the converted length. */
    count = mbstowcs(NULL, src, 0);
    if (count == (size_t)-1) {
        errno = EILSEQ;
        return NULL;
    }

    wide = malloc((count + 1) * sizeof *wide);
    if (wide == NULL) {
        errno = ENOMEM;
        return NULL;
    }

    /* Second pass converts; a different length means the conversion failed. */
    if (mbstowcs(wide, src, count + 1) != count) {
        free(wide);
        errno = EILSEQ;
        return NULL;
    }

    /* Terminate explicitly, whatever mbstowcs() wrote. */
    wide[count] = L'\0';
    return wide;
}

/* For every line read from standard input and every command-line argument,
   look for the first wide character of the argument in the line; when it is
   found, print the line followed by a marker line that places the character
   under its position in the line.
   Returns EXIT_SUCCESS at end of input (or after printing the usage text),
   EXIT_FAILURE on locale, orientation, memory, conversion or read errors. */
int main(int argc, char *argv[])
{
    wchar_t **argw;
    wchar_t  *buffer = NULL;
    size_t    capacity = 0;
    int       n;

    /* Adopt the user's locale before any input or output is done. */
    if (setlocale(LC_ALL, "") == NULL) {
        fprintf(stderr, "Current locale is unsupported.\n");
        return EXIT_FAILURE;
    }

    /* Both standard streams must accept wide-character orientation. */
    if (fwide(stdin, 1) <= 0) {
        fprintf(stderr, "Standard input does not support wide characters.\n");
        return EXIT_FAILURE;
    }
    if (fwide(stdout, 1) <= 0) {
        fprintf(stderr, "Standard output does not support wide characters.\n");
        return EXIT_FAILURE;
    }

    if (argc < 2) {
        fprintf(stderr, "\n");
        fprintf(stderr, "Usage: %s WIDE-CHARACTER [ WIDE-CHARACTER ... ]\n", argv[0]);
        fprintf(stderr, "\n");
        fprintf(stderr, "This program will look for the first instance of each wide character\n");
        fprintf(stderr, "in each line of input.\n");
        return EXIT_SUCCESS;
    }

    /* Build argw[], a NULL-terminated wide-character copy of argv[]. */
    argw = malloc((size_t)(argc + 1) * sizeof *argw);
    if (argw == NULL) {
        fprintf(stderr, "Out of memory.\n");
        return EXIT_FAILURE;
    }
    for (n = 0; n < argc; n++) {
        argw[n] = dup_mbstowcs(argv[n]);
        if (argw[n] == NULL) {
            fprintf(stderr, "Error converting argv[%d]: %s.\n", n, strerror(errno));
            return EXIT_FAILURE;
        }
    }
    argw[argc] = NULL;

    for (;;) {
        const size_t count = wide_line(&buffer, &capacity, stdin);

        if (count == 0) {
            if (errno != 0) {
                fprintf(stderr, "Error reading standard input: %s.\n", strerror(errno));
                return EXIT_FAILURE;
            }
            if (ferror(stdin)) {
                fprintf(stderr, "Error reading standard input.\n");
                return EXIT_FAILURE;
            }
            /* A zero length without any error is the end of input. */
            break;
        }

        for (n = 1; n < argc; n++) {
            const wchar_t  needle = argw[n][0];
            wchar_t       *hit;

            if (needle == L'\0')
                continue;

            hit = wcschr(buffer, needle);
            if (hit == NULL)
                continue;

            /* A field width of (offset + 1) right-aligns the character
               under the place where it was found. */
            fputws(buffer, stdout);
            wprintf(L"%*lc\n", (int)((size_t)(hit - buffer) + 1), needle);
        }
    }

    /* Not strictly needed right before exit, but this is the proper
       way to release the line buffer. */
    free(buffer);
    buffer = NULL;
    capacity = 0;

    return EXIT_SUCCESS;
}

Because POSIX has not standardized the wide-character version of getline(), we implement our own variant as wide_line(). It supports all four newline conventions, and returns a size_t; 0 (with errno set) if an error occurs.

Because of the universal newline support, wide_line is not well suited for interactive input, as it tends to be one character "late". (For line-buffered input, as terminals tend to be, that means one full line late.)

I included the wide_line() implementation, because it, or something very much like it, solves most of the problems when reading wide-input files that were written on various systems.

The dup_mbstowcs() function is most useful when the command line parameters are needed as wide character strings. It simply does the conversion to a dynamically allocated buffer. Essentially, argw[] is the wide-character copy of argv[] array.

Other than those two functions, and the code that creates the argw[] array, there is not much code at all. (Feel free to poach the functions, or the entire code, to be used in your own projects later on; I consider the code to be in Public Domain.)

If you save the above as example.c, you can compile it using e.g.

gcc -Wall -O2 example.c -o example

If you run e.g.

printf 'Sergio è un Italiano e andò via!\n' | ./example 'o' 'ò' 'è'

the output will be

Sergio è un Italiano e andò via!
     o
Sergio è un Italiano e andò via!
                          ò
Sergio è un Italiano e andò via!
       è

The indentation "trick" is that if i is the position you want the wide character to be printed at, then (i+1) is the width of that logical field. When we use * as the width field in the print specification, the width is read from an int parameter preceding the actual parameter being printed.




回答2:


You need to convert to and from the expected character encodings. Say the old system expects some Windows code page, and the new code expects UTF-8. Then to call old functions from the new stuff you need to:

  1. Check you can perform the conversion safely (the input may contain characters which cannot be represented in the desired Windows code page form)...
  2. Convert from UTF-8 to the desired Windows code page representation. This should yield a new buffer/string in the compatible representation (a copy).
  3. Call the old code with the newly converted representation of the original argument
  4. Receive the output in some buffer, it will be in the Windows code page representation.
  5. So convert that output into a UTF-8 copy.
  6. Cleanup the temporary copy of the input, the original output buffer from the old code.
  7. Return the converted UTF-8 output copy to the new code.

And you'd need to do the reverse dance if you want to call the new UTF-8 code from the old stuff.

EDIT: Note that your old system cannot have been expecting purely ASCII, because ASCII is a 7-bit encoding, and UTF-8 is explicitly backwards compatible with that. So your first task is to correct your understanding of what is the actual encoding being used.



来源:https://stackoverflow.com/questions/48730724/how-am-i-allowed-to-workaround-dos-functions-that-used-strings-containing-accent

标签
易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!