Conversion between UTF-8 ArrayBuffer and String

I have an ArrayBuffer which contains a string encoded using UTF-8 and I can't find a standard way of converting such ArrayBuffer into a JS String (which I understand is encoded using UTF-16).

I've seen this code in numerous places, but I fail to see how it would work with any UTF-8 code points that are longer than 1 byte.

return String.fromCharCode.apply(null, new Uint8Array(data));

Similarly, I can't find a standard way of converting from a String to a UTF-8 encoded ArrayBuffer.

function stringToUint(string) {
    var string = btoa(unescape(encodeURIComponent(string))),
        charList = string.split(''),
        uintArray = [];
    for (var i = 0; i < charList.length; i++) {
        uintArray.push(charList[i].charCodeAt(0));
    }
    return new Uint8Array(uintArray);
}

function uintToString(uintArray) {
    var encodedString = String.fromCharCode.apply(null, uintArray),
        decodedString = decodeURIComponent(escape(atob(encodedString)));
    return decodedString;
}

I have done, with some help from the internet, these little functions, they should solve your problems! Here is the working JSFiddle.

EDIT:

Since the source of the Uint8Array is external and you can't use atob you just need to remove it(working fiddle):

function uintToString(uintArray) {
    var encodedString = String.fromCharCode.apply(null, uintArray),
        decodedString = decodeURIComponent(escape(encodedString));
    return decodedString;
}

PPB

Using TextEncoder and TextDecoder

var uint8array = new TextEncoder("utf-8").encode("Plain Text");
var string = new TextDecoder().decode(uint8array);
console.log(uint8array ,string )

Albert

This should work:

// http://www.onicos.com/staff/iz/amuse/javascript/expert/utf.txt

/* utf.js - UTF-8 <=> UTF-16 convertion
 *
 * Copyright (C) 1999 Masanao Izumo <iz@onicos.co.jp>
 * Version: 1.0
 * LastModified: Dec 25 1999
 * This library is free.  You can redistribute it and/or modify it.
 */

function Utf8ArrayToStr(array) {
  var out, i, len, c;
  var char2, char3;

  out = "";
  len = array.length;
  i = 0;
  while (i < len) {
    c = array[i++];
    switch (c >> 4)
    { 
      case 0: case 1: case 2: case 3: case 4: case 5: case 6: case 7:
        // 0xxxxxxx
        out += String.fromCharCode(c);
        break;
      case 12: case 13:
        // 110x xxxx   10xx xxxx
        char2 = array[i++];
        out += String.fromCharCode(((c & 0x1F) << 6) | (char2 & 0x3F));
        break;
      case 14:
        // 1110 xxxx  10xx xxxx  10xx xxxx
        char2 = array[i++];
        char3 = array[i++];
        out += String.fromCharCode(((c & 0x0F) << 12) |
                                   ((char2 & 0x3F) << 6) |
                                   ((char3 & 0x3F) << 0));
        break;
    }
  }    
  return out;
}

It's somewhat cleaner as the other solutions because it doesn't use any hacks nor depends on Browser JS functions, e.g. works also in other JS environments.

Check out the JSFiddle demo.

Also see the related questions: here, here

There's a polyfill for Encoding over on Github: text-encoding. It's easy for Node or the browser, and the Readme advises the following:

var uint8array = TextEncoder(encoding).encode(string);
var string = TextDecoder(encoding).decode(uint8array);

If I recall, 'utf-8' is the encoding you need, and of course you'll need to wrap your buffer:

var uint8array = new Uint8Array(utf8buffer);

Hope it works as well for you as it has for me.

If you are doing this in browser there are no character encoding libraries built-in, but you can get by with:

function pad(n) {
    return n.length < 2 ? "0" + n : n;
}

var array = new Uint8Array(data);
var str = "";
for( var i = 0, len = array.length; i < len; ++i ) {
    str += ( "%" + pad(array[i].toString(16)))
}

str = decodeURIComponent(str);

Here's a demo that decodes a 3-byte UTF-8 unit: http://jsfiddle.net/Z9pQE/

The methods readAsArrayBuffer and readAsText from a FileReader object converts a Blob object to an ArrayBuffer or to a DOMString asynchronous.

A Blob object type can be created from a raw text or byte array, for example.

let blob = new Blob([text], { type: "text/plain" });

let reader = new FileReader();
reader.onload = event =>
{
    let buffer = event.target.result;
};
reader.readAsArrayBuffer(blob);

I think it's better to pack up this in a promise:

function textToByteArray(text)
{
    let blob = new Blob([text], { type: "text/plain" });
    let reader = new FileReader();
    let done = function() { };

    reader.onload = event =>
    {
        done(new Uint8Array(event.target.result));
    };
    reader.readAsArrayBuffer(blob);

    return { done: function(callback) { done = callback; } }
}

function byteArrayToText(bytes, encoding)
{
    let blob = new Blob([bytes], { type: "application/octet-stream" });
    let reader = new FileReader();
    let done = function() { };

    reader.onload = event =>
    {
        done(event.target.result);
    };

    if(encoding) { reader.readAsText(blob, encoding); } else { reader.readAsText(blob); }

    return { done: function(callback) { done = callback; } }
}

let text = "\uD83D\uDCA9 = \u2661";
textToByteArray(text).done(bytes =>
{
    console.log(bytes);
    byteArrayToText(bytes, 'UTF-8').done(text => 
    {
        console.log(text); // 💩 = ♡
    });
});

The main problem of programmers looking for conversion from byte array into a string is UTF-8 encoding (compression) of unicode characters. This code will help you:

var getString = function (strBytes) {

    var MAX_SIZE = 0x4000;
    var codeUnits = [];
    var highSurrogate;
    var lowSurrogate;
    var index = -1;

    var result = '';

    while (++index < strBytes.length) {
        var codePoint = Number(strBytes[index]);

        if (codePoint === (codePoint & 0x7F)) {

        } else if (0xF0 === (codePoint & 0xF0)) {
            codePoint ^= 0xF0;
            codePoint = (codePoint << 6) | (strBytes[++index] ^ 0x80);
            codePoint = (codePoint << 6) | (strBytes[++index] ^ 0x80);
            codePoint = (codePoint << 6) | (strBytes[++index] ^ 0x80);
        } else if (0xE0 === (codePoint & 0xE0)) {
            codePoint ^= 0xE0;
            codePoint = (codePoint << 6) | (strBytes[++index] ^ 0x80);
            codePoint = (codePoint << 6) | (strBytes[++index] ^ 0x80);
        } else if (0xC0 === (codePoint & 0xC0)) {
            codePoint ^= 0xC0;
            codePoint = (codePoint << 6) | (strBytes[++index] ^ 0x80);
        }

        if (!isFinite(codePoint) || codePoint < 0 || codePoint > 0x10FFFF || Math.floor(codePoint) != codePoint)
            throw RangeError('Invalid code point: ' + codePoint);

        if (codePoint <= 0xFFFF)
            codeUnits.push(codePoint);
        else {
            codePoint -= 0x10000;
            highSurrogate = (codePoint >> 10) | 0xD800;
            lowSurrogate = (codePoint % 0x400) | 0xDC00;
            codeUnits.push(highSurrogate, lowSurrogate);
        }
        if (index + 1 == strBytes.length || codeUnits.length > MAX_SIZE) {
            result += String.fromCharCode.apply(null, codeUnits);
            codeUnits.length = 0;
        }
    }

    return result;
}

All the best !

For those that are working in Node.js >= 0.1.99

Node.js offers a built-in module called StringDecoder that is expressly designed to decode multi-bytes buffers into strings.

https://nodejs.org/api/string_decoder.html

From the link above:

The string_decoder module provides an API for decoding Buffer objects into strings in a manner that preserves encoded multi-byte UTF-8 and UTF-16 characters

Example (borrowed from the above docs):


const { StringDecoder } = require('string_decoder');
const decoder = new StringDecoder('utf8');

const cent = Buffer.from([0xC2, 0xA2]);
console.log(decoder.write(cent)); // ¢

const euro = Buffer.from([0xE2, 0x82, 0xAC]);
console.log(decoder.write(euro)); // €

来源：https://stackoverflow.com/questions/17191945/conversion-between-utf-8-arraybuffer-and-string

标签

javascript

string

utf-8

arraybuffer