问题
I have an ArrayBuffer
which contains a string encoded using UTF-8 and I can\'t find a standard way of converting such ArrayBuffer
into a JS String
(which I understand is encoded using UTF-16).
I\'ve seen this code in numerous places, but I fail to see how it would work with any UTF-8 code points that are longer than 1 byte.
return String.fromCharCode.apply(null, new Uint8Array(data));
Similarly, I can\'t find a standard way of converting from a String
to a UTF-8 encoded ArrayBuffer
.
回答1:
function stringToUint(string) {
var string = btoa(unescape(encodeURIComponent(string))),
charList = string.split(''),
uintArray = [];
for (var i = 0; i < charList.length; i++) {
uintArray.push(charList[i].charCodeAt(0));
}
return new Uint8Array(uintArray);
}
function uintToString(uintArray) {
var encodedString = String.fromCharCode.apply(null, uintArray),
decodedString = decodeURIComponent(escape(atob(encodedString)));
return decodedString;
}
I have done, with some help from the internet, these little functions, they should solve your problems! Here is the working JSFiddle.
EDIT:
Since the source of the Uint8Array is external and you can't use atob
you just need to remove it(working fiddle):
function uintToString(uintArray) {
var encodedString = String.fromCharCode.apply(null, uintArray),
decodedString = decodeURIComponent(escape(encodedString));
return decodedString;
}
Warning: escape and unescape is removed from web standards. See this.
回答2:
Using TextEncoder and TextDecoder
var uint8array = new TextEncoder("utf-8").encode("Plain Text");
var string = new TextDecoder().decode(uint8array);
console.log(uint8array ,string )
回答3:
This should work:
// http://www.onicos.com/staff/iz/amuse/javascript/expert/utf.txt
/* utf.js - UTF-8 <=> UTF-16 convertion
*
* Copyright (C) 1999 Masanao Izumo <iz@onicos.co.jp>
* Version: 1.0
* LastModified: Dec 25 1999
* This library is free. You can redistribute it and/or modify it.
*/
function Utf8ArrayToStr(array) {
var out, i, len, c;
var char2, char3;
out = "";
len = array.length;
i = 0;
while (i < len) {
c = array[i++];
switch (c >> 4)
{
case 0: case 1: case 2: case 3: case 4: case 5: case 6: case 7:
// 0xxxxxxx
out += String.fromCharCode(c);
break;
case 12: case 13:
// 110x xxxx 10xx xxxx
char2 = array[i++];
out += String.fromCharCode(((c & 0x1F) << 6) | (char2 & 0x3F));
break;
case 14:
// 1110 xxxx 10xx xxxx 10xx xxxx
char2 = array[i++];
char3 = array[i++];
out += String.fromCharCode(((c & 0x0F) << 12) |
((char2 & 0x3F) << 6) |
((char3 & 0x3F) << 0));
break;
}
}
return out;
}
It's somewhat cleaner as the other solutions because it doesn't use any hacks nor depends on Browser JS functions, e.g. works also in other JS environments.
Check out the JSFiddle demo.
Also see the related questions: here, here
回答4:
There's a polyfill for Encoding over on Github: text-encoding. It's easy for Node or the browser, and the Readme advises the following:
var uint8array = TextEncoder(encoding).encode(string);
var string = TextDecoder(encoding).decode(uint8array);
If I recall, 'utf-8'
is the encoding
you need, and of course you'll need to wrap your buffer:
var uint8array = new Uint8Array(utf8buffer);
Hope it works as well for you as it has for me.
回答5:
If you are doing this in browser there are no character encoding libraries built-in, but you can get by with:
function pad(n) {
return n.length < 2 ? "0" + n : n;
}
var array = new Uint8Array(data);
var str = "";
for( var i = 0, len = array.length; i < len; ++i ) {
str += ( "%" + pad(array[i].toString(16)))
}
str = decodeURIComponent(str);
Here's a demo that decodes a 3-byte UTF-8 unit: http://jsfiddle.net/Z9pQE/
回答6:
The methods readAsArrayBuffer and readAsText from a FileReader object converts a Blob object to an ArrayBuffer or to a DOMString asynchronous.
A Blob object type can be created from a raw text or byte array, for example.
let blob = new Blob([text], { type: "text/plain" });
let reader = new FileReader();
reader.onload = event =>
{
let buffer = event.target.result;
};
reader.readAsArrayBuffer(blob);
I think it's better to pack up this in a promise:
function textToByteArray(text)
{
let blob = new Blob([text], { type: "text/plain" });
let reader = new FileReader();
let done = function() { };
reader.onload = event =>
{
done(new Uint8Array(event.target.result));
};
reader.readAsArrayBuffer(blob);
return { done: function(callback) { done = callback; } }
}
function byteArrayToText(bytes, encoding)
{
let blob = new Blob([bytes], { type: "application/octet-stream" });
let reader = new FileReader();
let done = function() { };
reader.onload = event =>
{
done(event.target.result);
};
if(encoding) { reader.readAsText(blob, encoding); } else { reader.readAsText(blob); }
return { done: function(callback) { done = callback; } }
}
let text = "\uD83D\uDCA9 = \u2661";
textToByteArray(text).done(bytes =>
{
console.log(bytes);
byteArrayToText(bytes, 'UTF-8').done(text =>
{
console.log(text); // 💩 = ♡
});
});
回答7:
The main problem of programmers looking for conversion from byte array into a string is UTF-8 encoding (compression) of unicode characters. This code will help you:
var getString = function (strBytes) {
var MAX_SIZE = 0x4000;
var codeUnits = [];
var highSurrogate;
var lowSurrogate;
var index = -1;
var result = '';
while (++index < strBytes.length) {
var codePoint = Number(strBytes[index]);
if (codePoint === (codePoint & 0x7F)) {
} else if (0xF0 === (codePoint & 0xF0)) {
codePoint ^= 0xF0;
codePoint = (codePoint << 6) | (strBytes[++index] ^ 0x80);
codePoint = (codePoint << 6) | (strBytes[++index] ^ 0x80);
codePoint = (codePoint << 6) | (strBytes[++index] ^ 0x80);
} else if (0xE0 === (codePoint & 0xE0)) {
codePoint ^= 0xE0;
codePoint = (codePoint << 6) | (strBytes[++index] ^ 0x80);
codePoint = (codePoint << 6) | (strBytes[++index] ^ 0x80);
} else if (0xC0 === (codePoint & 0xC0)) {
codePoint ^= 0xC0;
codePoint = (codePoint << 6) | (strBytes[++index] ^ 0x80);
}
if (!isFinite(codePoint) || codePoint < 0 || codePoint > 0x10FFFF || Math.floor(codePoint) != codePoint)
throw RangeError('Invalid code point: ' + codePoint);
if (codePoint <= 0xFFFF)
codeUnits.push(codePoint);
else {
codePoint -= 0x10000;
highSurrogate = (codePoint >> 10) | 0xD800;
lowSurrogate = (codePoint % 0x400) | 0xDC00;
codeUnits.push(highSurrogate, lowSurrogate);
}
if (index + 1 == strBytes.length || codeUnits.length > MAX_SIZE) {
result += String.fromCharCode.apply(null, codeUnits);
codeUnits.length = 0;
}
}
return result;
}
All the best !
回答8:
For those that are working in Node.js >= 0.1.99
Node.js offers a built-in module called StringDecoder
that is expressly designed to decode multi-bytes buffers into strings.
https://nodejs.org/api/string_decoder.html
From the link above:
The string_decoder module provides an API for decoding Buffer objects into strings in a manner that preserves encoded multi-byte UTF-8 and UTF-16 characters
Example (borrowed from the above docs):
const { StringDecoder } = require('string_decoder');
const decoder = new StringDecoder('utf8');
const cent = Buffer.from([0xC2, 0xA2]);
console.log(decoder.write(cent)); // ¢
const euro = Buffer.from([0xE2, 0x82, 0xAC]);
console.log(decoder.write(euro)); // €
来源:https://stackoverflow.com/questions/17191945/conversion-between-utf-8-arraybuffer-and-string