Skip to content

Commit 33d95ea

Browse files
NFC: Use TextEncoder for stringToUTF8Array and lengthBytesUTF8 Function
1 parent 803d1a6 commit 33d95ea

13 files changed

+140
-50
lines changed

AUTHORS

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -601,3 +601,4 @@ a license to everyone to use it as detailed in LICENSE.)
601601
* Artur Gatin <agatin@teladochealth.com> (copyright owned by Teladoc Health, Inc.)
602602
* Christian Lloyd <clloyd@teladochealth.com> (copyright owned by Teladoc Health, Inc.)
603603
* Sean Morris <sean@seanmorr.is>
604+
* Pt. Prashant Tripathi <ptprashanttripathi@outlook.com>

site/source/docs/tools_reference/settings_reference.rst

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2700,6 +2700,18 @@ of ENVIRONMENT since TextDecoder is not available in those environments).
27002700

27012701
Default value: 1
27022702

2703+
.. _textencoder:
2704+
2705+
TEXTENCODER
2706+
===========
2707+
2708+
The default value of 1 means the generated code will use TextEncoder if
2709+
available and fall back to custom encoding code when it is not available.
2710+
If set to 2, we assume TextEncoder is always present and usable, and no
2711+
fallback JS code will be emitted.
2712+
2713+
Default value: 1
2714+
27032715
.. _embind_std_string_is_utf8:
27042716

27052717
EMBIND_STD_STRING_IS_UTF8

src/lib/libstrings.js

Lines changed: 74 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,10 @@
88
#error "TEXTDECODER must be either 1 or 2"
99
#endif
1010

11+
#if TEXTENCODER != 1 && TEXTENCODER != 2
12+
#error "TEXTENCODER must be either 1 or 2"
13+
#endif
14+
1115
addToLibrary({
1216
// TextDecoder constructor defaults to UTF-8
1317
#if TEXTDECODER == 2
@@ -16,6 +20,13 @@ addToLibrary({
1620
$UTF8Decoder: "typeof TextDecoder != 'undefined' ? new TextDecoder() : undefined",
1721
#endif
1822

23+
// TextEncoder constructor defaults to UTF-8
24+
#if TEXTENCODER == 2
25+
$UTF8Encoder: "new TextEncoder()",
26+
#else
27+
$UTF8Encoder: "typeof TextEncoder != 'undefined' ? new TextEncoder() : undefined",
28+
#endif
29+
1930
$findStringEnd: (heapOrArray, idx, maxBytesToRead, ignoreNul) => {
2031
var maxIdx = idx + maxBytesToRead;
2132
if (ignoreNul) return maxIdx;
@@ -133,7 +144,7 @@ addToLibrary({
133144
* terminator) that this function will write.
134145
*
135146
* @param {string} str - The Javascript string to copy.
136-
* @param {ArrayBufferView|Array<number>} heap - The array to copy to. Each
147+
* @param {ArrayBufferView} heap - The array to copy to. Each
137148
* index in this array is assumed
138149
to be one 8-bit element.
139150
* @param {number} outIdx - The starting offset in the array to begin the copying.
@@ -147,9 +158,12 @@ addToLibrary({
147158
* terminator.
148159
* @return {number} The number of bytes written, EXCLUDING the null terminator.
149160
*/
161+
$stringToUTF8Array__deps: [
162+
'$UTF8Encoder',
150163
#if ASSERTIONS
151-
$stringToUTF8Array__deps: ['$warnOnce'],
164+
'$warnOnce',
152165
#endif
166+
],
153167
$stringToUTF8Array: (str, heap, outIdx, maxBytesToWrite) => {
154168
#if CAN_ADDRESS_2GB
155169
outIdx >>>= 0;
@@ -162,6 +176,26 @@ addToLibrary({
162176
if (!(maxBytesToWrite > 0))
163177
return 0;
164178

179+
#if TEXTENCODER == 2
180+
// Always use TextEncoder when TEXTENCODER == 2
181+
var encoded = UTF8Encoder.encode(str);
182+
var bytesToWrite = Math.min(encoded.length, maxBytesToWrite - 1); // -1 for null terminator
183+
encoded = encoded.subarray(0, bytesToWrite)
184+
heap.set(encoded, outIdx)
185+
heap[outIdx + bytesToWrite] = 0;
186+
return bytesToWrite;
187+
#else
188+
// When using conditional TextEncoder, use it for longer strings if available
189+
if (str.length > 16 && UTF8Encoder) {
190+
var encoded = UTF8Encoder.encode(str);
191+
var bytesToWrite = Math.min(encoded.length, maxBytesToWrite - 1); // -1 for null terminator
192+
encoded = encoded.subarray(0, bytesToWrite)
193+
heap.set(encoded, outIdx)
194+
heap[outIdx + bytesToWrite] = 0;
195+
return bytesToWrite;
196+
}
197+
198+
// Fallback: manual UTF-8 encoding
165199
var startIdx = outIdx;
166200
var endIdx = outIdx + maxBytesToWrite - 1; // -1 for string null terminator.
167201
for (var i = 0; i < str.length; ++i) {
@@ -198,6 +232,7 @@ addToLibrary({
198232
// Null-terminate the pointer to the buffer.
199233
heap[outIdx] = 0;
200234
return outIdx - startIdx;
235+
#endif // TEXTENCODER == 2
201236
},
202237

203238
/**
@@ -218,24 +253,54 @@ addToLibrary({
218253
},
219254

220255
/**
221-
* Returns the number of bytes the given JavaScript string takes if encoded as a
256+
* Returns the number of bytes the given JavaScript string takes if encoded as a
222257
* UTF8 byte array, EXCLUDING the null terminator byte.
223258
*
224-
* @param {string} str - The JavaScript string to operate on.
225-
* @return {number} The length, in bytes, of the UTF-8 encoded string.
259+
* @param {string} str - JavaScript string to operate on
260+
* @return {number} Length, in bytes, of the UTF8 encoded string.
226261
*/
262+
$lengthBytesUTF8__deps: ['$UTF8Encoder'],
227263
$lengthBytesUTF8: (str) => {
228-
return UTF8Decoder.encode(str).length;
264+
#if TEXTENCODER == 2
265+
// Always use TextEncoder when TEXTENCODER == 2
266+
return UTF8Encoder.encode(str).length;
267+
#else
268+
// When using conditional TextEncoder, use it whenever it is available
269+
if (UTF8Encoder) {
270+
return UTF8Encoder.encode(str).length;
271+
}
272+
273+
// Fallback: manual calculation
274+
var len = 0;
275+
for (var i = 0; i < str.length; ++i) {
276+
// Gotcha: charCodeAt returns a 16-bit word that is a UTF-16 encoded code
277+
// unit, not a Unicode code point of the character! So decode
278+
// UTF16->UTF32->UTF8.
279+
// See http://unicode.org/faq/utf_bom.html#utf16-3
280+
var c = str.charCodeAt(i); // possibly a lead surrogate
281+
if (c <= 0x7F) {
282+
len++;
283+
} else if (c <= 0x7FF) {
284+
len += 2;
285+
} else if (c >= 0xD800 && c <= 0xDFFF) {
286+
len += 4; ++i;
287+
} else {
288+
len += 3;
289+
}
290+
}
291+
return len;
292+
#endif // TEXTENCODER == 2
229293
},
230294

231295
$intArrayFromString__docs: '/** @type {function(string, boolean=, number=)} */',
232296
$intArrayFromString__deps: ['$lengthBytesUTF8', '$stringToUTF8Array'],
233297
$intArrayFromString: (stringy, dontAddNull, length) => {
234298
var len = length > 0 ? length : lengthBytesUTF8(stringy)+1;
235-
var u8array = new Array(len);
299+
var u8array = new Uint8Array(len);
236300
var numBytesWritten = stringToUTF8Array(stringy, u8array, 0, u8array.length);
237-
if (dontAddNull) u8array.length = numBytesWritten;
238-
return u8array;
301+
if (dontAddNull)
302+
u8array = u8array.subarray(0, numBytesWritten);
303+
return Array.from(u8array);
239304
},
240305

241306
$intArrayToString: (array) => {

src/settings.js

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1776,6 +1776,13 @@ var EVAL_CTORS = 0;
17761776
// [link]
17771777
var TEXTDECODER = 1;
17781778

1779+
// The default value of 1 means the generated code will use TextEncoder if
1780+
// available and fall back to custom encoding code when it is not available.
1781+
// If set to 2, we assume TextEncoder is always present and usable, and no
1782+
// fallback JS code will be emitted.
1783+
// [link]
1784+
var TEXTENCODER = 1;
1785+
17791786
// Embind specific: If enabled, assume UTF-8 encoded data in std::string binding.
17801787
// Disable this to support binary data transfer.
17811788
// [link]
Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,10 @@
11
{
22
"a.html": 552,
33
"a.html.gz": 373,
4-
"a.js": 7255,
5-
"a.js.gz": 3313,
4+
"a.js": 6890,
5+
"a.js.gz": 3182,
66
"a.wasm": 7315,
77
"a.wasm.gz": 3368,
8-
"total": 15122,
9-
"total_gz": 7054
8+
"total": 14757,
9+
"total_gz": 6923
1010
}
Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,10 @@
11
{
22
"a.html": 552,
33
"a.html.gz": 373,
4-
"a.js": 5356,
5-
"a.js.gz": 2526,
4+
"a.js": 4991,
5+
"a.js.gz": 2382,
66
"a.wasm": 5852,
77
"a.wasm.gz": 2743,
8-
"total": 11760,
9-
"total_gz": 5642
8+
"total": 11395,
9+
"total_gz": 5498
1010
}

test/code_size/test_minimal_runtime_code_size_hello_webgl2_wasm.json

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,8 @@
33
"a.html.gz": 321,
44
"a.js": 4437,
55
"a.js.gz": 2281,
6-
"a.wasm": 8317,
7-
"a.wasm.gz": 5660,
8-
"total": 13208,
9-
"total_gz": 8262
6+
"a.wasm": 8290,
7+
"a.wasm.gz": 5639,
8+
"total": 13181,
9+
"total_gz": 8241
1010
}
Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
{
22
"a.html": 346,
33
"a.html.gz": 255,
4-
"a.js": 18207,
5-
"a.js.gz": 9835,
6-
"total": 18553,
7-
"total_gz": 10090
4+
"a.js": 18186,
5+
"a.js.gz": 9818,
6+
"total": 18532,
7+
"total_gz": 10073
88
}

test/code_size/test_minimal_runtime_code_size_hello_webgl_wasm.json

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,8 @@
33
"a.html.gz": 321,
44
"a.js": 3975,
55
"a.js.gz": 2123,
6-
"a.wasm": 8317,
7-
"a.wasm.gz": 5660,
8-
"total": 12746,
9-
"total_gz": 8104
6+
"a.wasm": 8290,
7+
"a.wasm.gz": 5639,
8+
"total": 12719,
9+
"total_gz": 8083
1010
}
Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
{
22
"a.html": 346,
33
"a.html.gz": 255,
4-
"a.js": 17733,
5-
"a.js.gz": 9669,
6-
"total": 18079,
7-
"total_gz": 9924
4+
"a.js": 17713,
5+
"a.js.gz": 9651,
6+
"total": 18059,
7+
"total_gz": 9906
88
}

0 commit comments

Comments
 (0)