NFC: Use TextEncoder for stringToUTF8Array and lengthBytesUTF8 Function

PtPrashantTripathi · PtPrashantTripathi · commit 33d95ea603bf · 2025-09-04T19:16:14.000Z
diff --git a/AUTHORS b/AUTHORS
@@ -601,3 +601,4 @@ a license to everyone to use it as detailed in LICENSE.)
 * Artur Gatin <agatin@teladochealth.com> (copyright owned by Teladoc Health, Inc.)
 * Christian Lloyd <clloyd@teladochealth.com> (copyright owned by Teladoc Health, Inc.)
 * Sean Morris <sean@seanmorr.is>
+* Pt. Prashant Tripathi <ptprashanttripathi@outlook.com>
diff --git a/site/source/docs/tools_reference/settings_reference.rst b/site/source/docs/tools_reference/settings_reference.rst
@@ -2700,6 +2700,18 @@ of ENVIRONMENT since TextDecoder is not available in those environments).
 
 Default value: 1
 
+.. _textencoder:
+
+TEXTENCODER
+===========
+
+The default value of 1 means the generated code will use TextEncoder if
+available and fall back to custom encoding code when it is not available.  
+If set to 2, we assume TextEncoder is always present and usable, and no
+fallback JS code will be emitted.
+
+Default value: 1
+
 .. _embind_std_string_is_utf8:
 
 EMBIND_STD_STRING_IS_UTF8
diff --git a/src/lib/libstrings.js b/src/lib/libstrings.js
@@ -8,6 +8,10 @@
 #error "TEXTDECODER must be either 1 or 2"
 #endif
 
+#if TEXTENCODER != 1 && TEXTENCODER != 2
+#error "TEXTENCODER must be either 1 or 2"
+#endif
+
 addToLibrary({
   // TextDecoder constructor defaults to UTF-8
 #if TEXTDECODER == 2
@@ -16,6 +20,13 @@ addToLibrary({
   $UTF8Decoder: "typeof TextDecoder != 'undefined' ? new TextDecoder() : undefined",
 #endif
 
+  // TextEncoder constructor defaults to UTF-8
+#if TEXTENCODER == 2
+  $UTF8Encoder: "new TextEncoder()",
+#else
+  $UTF8Encoder: "typeof TextEncoder != 'undefined' ? new TextEncoder() : undefined",
+#endif
+
   $findStringEnd: (heapOrArray, idx, maxBytesToRead, ignoreNul) => {
     var maxIdx = idx + maxBytesToRead;
     if (ignoreNul) return maxIdx;
@@ -133,7 +144,7 @@ addToLibrary({
    * terminator) that this function will write.
    *
    * @param {string} str - The Javascript string to copy.
-   * @param {ArrayBufferView|Array<number>} heap - The array to copy to. Each
+   * @param {ArrayBufferView} heap - The array to copy to. Each
    *                                               index in this array is assumed
    *                                               to be one 8-byte element.
    * @param {number} outIdx - The starting offset in the array to begin the copying.
@@ -147,9 +158,12 @@ addToLibrary({
    *                                   terminator.
    * @return {number} The number of bytes written, EXCLUDING the null terminator.
    */
+  $stringToUTF8Array__deps: [
+    '$UTF8Encoder',
 #if ASSERTIONS
-  $stringToUTF8Array__deps: ['$warnOnce'],
+    '$warnOnce',
 #endif
+  ],
   $stringToUTF8Array: (str, heap, outIdx, maxBytesToWrite) => {
 #if CAN_ADDRESS_2GB
     outIdx >>>= 0;
@@ -162,6 +176,26 @@ addToLibrary({
     if (!(maxBytesToWrite > 0))
       return 0;
 
+#if TEXTENCODER == 2
+    // Always use TextEncoder when TEXTENCODER == 2
+    var encoded = UTF8Encoder.encode(str);
+    var bytesToWrite = Math.min(encoded.length, maxBytesToWrite - 1); // -1 for null terminator
+    encoded = encoded.subarray(0, bytesToWrite)
+    heap.set(encoded, outIdx)
+    heap[outIdx + bytesToWrite] = 0;
+    return bytesToWrite;
+#else
+    // When using conditional TextEncoder, use it for longer strings if available
+    if (str.length > 16 && UTF8Encoder) {
+      var encoded = UTF8Encoder.encode(str);
+      var bytesToWrite = Math.min(encoded.length, maxBytesToWrite - 1); // -1 for null terminator
+      encoded = encoded.subarray(0, bytesToWrite)
+      heap.set(encoded, outIdx)
+      heap[outIdx + bytesToWrite] = 0;
+      return bytesToWrite;
+    }
+    
+    // Fallback: manual UTF-8 encoding
     var startIdx = outIdx;
     var endIdx = outIdx + maxBytesToWrite - 1; // -1 for string null terminator.
     for (var i = 0; i < str.length; ++i) {
@@ -198,6 +232,7 @@ addToLibrary({
     // Null-terminate the pointer to the buffer.
     heap[outIdx] = 0;
     return outIdx - startIdx;
+#endif // TEXTENCODER == 2
   },
 
   /**
@@ -218,24 +253,54 @@ addToLibrary({
   },
 
   /**
-   * Returns the number of bytes the given JavaScript string takes if encoded as a
+   * Returns the number of bytes the given JavaScript string takes if encoded as a
    * UTF8 byte array, EXCLUDING the null terminator byte.
    *
-   * @param {string} str - The JavaScript string to operate on.
-   * @return {number} The length, in bytes, of the UTF-8 encoded string.
+   * @param {string} str - JavaScript string to operate on
+   * @return {number} Length, in bytes, of the UTF8 encoded string.
    */
+  $lengthBytesUTF8__deps: ['$UTF8Encoder'],
   $lengthBytesUTF8: (str) => {
-    return UTF8Decoder.encode(str).length;
+#if TEXTENCODER == 2
+    // Always use TextEncoder when TEXTENCODER == 2
+    return UTF8Encoder.encode(str).length;
+#else
+    // With conditional TextEncoder (TEXTENCODER == 1), use it whenever it is available
+    if (UTF8Encoder) {
+      return UTF8Encoder.encode(str).length;
+    }
+    
+    // Fallback: manual calculation
+    var len = 0;
+    for (var i = 0; i < str.length; ++i) {
+      // Gotcha: charCodeAt returns a 16-bit word that is a UTF-16 encoded code
+      // unit, not a Unicode code point of the character! So decode
+      // UTF16->UTF32->UTF8.
+      // See http://unicode.org/faq/utf_bom.html#utf16-3
+      var c = str.charCodeAt(i); // possibly a lead surrogate
+      if (c <= 0x7F) {
+        len++;
+      } else if (c <= 0x7FF) {
+        len += 2;
+      } else if (c >= 0xD800 && c <= 0xDFFF) {
+        len += 4; ++i;
+      } else {
+        len += 3;
+      }
+    }
+    return len;
+#endif // TEXTENCODER == 2
   },
 
   $intArrayFromString__docs: '/** @type {function(string, boolean=, number=)} */',
   $intArrayFromString__deps: ['$lengthBytesUTF8', '$stringToUTF8Array'],
   $intArrayFromString: (stringy, dontAddNull, length) => {
     var len = length > 0 ? length : lengthBytesUTF8(stringy)+1;
-    var u8array = new Array(len);
+    var u8array = new Uint8Array(len);
     var numBytesWritten = stringToUTF8Array(stringy, u8array, 0, u8array.length);
-    if (dontAddNull) u8array.length = numBytesWritten;
-    return u8array;
+    if (dontAddNull) 
+      u8array = u8array.subarray(0, numBytesWritten);
+    return Array.from(u8array);
   },
 
   $intArrayToString: (array) => {
diff --git a/src/settings.js b/src/settings.js
@@ -1776,6 +1776,13 @@ var EVAL_CTORS = 0;
 // [link]
 var TEXTDECODER = 1;
 
+// The default value of 1 means the generated code will use TextEncoder if
+// available and fall back to custom encoding code when it is not available.  
+// If set to 2, we assume TextEncoder is always present and usable, and no
+// fallback JS code will be emitted.
+// [link]
+var TEXTENCODER = 1;
+
 // Embind specific: If enabled, assume UTF-8 encoded data in std::string binding.
 // Disable this to support binary data transfer.
 // [link]
diff --git a/test/code_size/test_minimal_runtime_code_size_hello_embind.json b/test/code_size/test_minimal_runtime_code_size_hello_embind.json
@@ -1,10 +1,10 @@
 {
   "a.html": 552,
   "a.html.gz": 373,
-  "a.js": 7255,
-  "a.js.gz": 3313,
+  "a.js": 6890,
+  "a.js.gz": 3182,
   "a.wasm": 7315,
   "a.wasm.gz": 3368,
-  "total": 15122,
-  "total_gz": 7054
+  "total": 14757,
+  "total_gz": 6923
 }
diff --git a/test/code_size/test_minimal_runtime_code_size_hello_embind_val.json b/test/code_size/test_minimal_runtime_code_size_hello_embind_val.json
@@ -1,10 +1,10 @@
 {
   "a.html": 552,
   "a.html.gz": 373,
-  "a.js": 5356,
-  "a.js.gz": 2526,
+  "a.js": 4991,
+  "a.js.gz": 2382,
   "a.wasm": 5852,
   "a.wasm.gz": 2743,
-  "total": 11760,
-  "total_gz": 5642
+  "total": 11395,
+  "total_gz": 5498
 }
diff --git a/test/code_size/test_minimal_runtime_code_size_hello_webgl2_wasm.json b/test/code_size/test_minimal_runtime_code_size_hello_webgl2_wasm.json
@@ -3,8 +3,8 @@
   "a.html.gz": 321,
   "a.js": 4437,
   "a.js.gz": 2281,
-  "a.wasm": 8317,
-  "a.wasm.gz": 5660,
-  "total": 13208,
-  "total_gz": 8262
+  "a.wasm": 8290,
+  "a.wasm.gz": 5639,
+  "total": 13181,
+  "total_gz": 8241
 }
diff --git a/test/code_size/test_minimal_runtime_code_size_hello_webgl2_wasm2js.json b/test/code_size/test_minimal_runtime_code_size_hello_webgl2_wasm2js.json
@@ -1,8 +1,8 @@
 {
   "a.html": 346,
   "a.html.gz": 255,
-  "a.js": 18207,
-  "a.js.gz": 9835,
-  "total": 18553,
-  "total_gz": 10090
+  "a.js": 18186,
+  "a.js.gz": 9818,
+  "total": 18532,
+  "total_gz": 10073
 }
diff --git a/test/code_size/test_minimal_runtime_code_size_hello_webgl_wasm.json b/test/code_size/test_minimal_runtime_code_size_hello_webgl_wasm.json
@@ -3,8 +3,8 @@
   "a.html.gz": 321,
   "a.js": 3975,
   "a.js.gz": 2123,
-  "a.wasm": 8317,
-  "a.wasm.gz": 5660,
-  "total": 12746,
-  "total_gz": 8104
+  "a.wasm": 8290,
+  "a.wasm.gz": 5639,
+  "total": 12719,
+  "total_gz": 8083
 }
diff --git a/test/code_size/test_minimal_runtime_code_size_hello_webgl_wasm2js.json b/test/code_size/test_minimal_runtime_code_size_hello_webgl_wasm2js.json
@@ -1,8 +1,8 @@
 {
   "a.html": 346,
   "a.html.gz": 255,
-  "a.js": 17733,
-  "a.js.gz": 9669,
-  "total": 18079,
-  "total_gz": 9924
+  "a.js": 17713,
+  "a.js.gz": 9651,
+  "total": 18059,
+  "total_gz": 9906
 }
diff --git a/test/code_size/test_minimal_runtime_code_size_random_printf_wasm.json b/test/code_size/test_minimal_runtime_code_size_random_printf_wasm.json
@@ -1,4 +1,4 @@
 {
   "a.html": 12507,
-  "a.html.gz": 6824
+  "a.html.gz": 6822
 }
diff --git a/test/code_size/test_unoptimized_code_size.json b/test/code_size/test_unoptimized_code_size.json
@@ -1,16 +1,16 @@
 {
-  "hello_world.js": 53881,
-  "hello_world.js.gz": 17016,
+  "hello_world.js": 53898,
+  "hello_world.js.gz": 17019,
   "hello_world.wasm": 15127,
-  "hello_world.wasm.gz": 7450,
+  "hello_world.wasm.gz": 7448,
   "no_asserts.js": 26352,
   "no_asserts.js.gz": 8789,
   "no_asserts.wasm": 12227,
-  "no_asserts.wasm.gz": 6010,
-  "strict.js": 51919,
-  "strict.js.gz": 16352,
+  "no_asserts.wasm.gz": 6008,
+  "strict.js": 51936,
+  "strict.js.gz": 16357,
   "strict.wasm": 15127,
-  "strict.wasm.gz": 7447,
-  "total": 174633,
-  "total_gz": 63064
+  "strict.wasm.gz": 7445,
+  "total": 174667,
+  "total_gz": 63066
 }
diff --git a/tools/link.py b/tools/link.py
@@ -1068,16 +1068,21 @@ def limit_incoming_module_api():
   if 'noExitRuntime' in settings.INCOMING_MODULE_JS_API:
     settings.DEFAULT_LIBRARY_FUNCS_TO_INCLUDE.append('$noExitRuntime')
 
-  # Default to TEXTDECODER=2 (always use TextDecoder to decode UTF-8 strings)
-  # in -Oz builds, since custom decoder for UTF-8 takes up space.
-  # When supporting shell environments, do not do this as TextDecoder is not
-  # widely supported there.
-  # In Audio Worklets TextDecoder API is intentionally not exposed
-  # (https://github.com/WebAudio/web-audio-api/issues/2499) so we also need to
-  # keep the JavaScript-based fallback.
+  # Default to TEXTDECODER=2 (always use TextDecoder for decoding UTF-8 strings)
+  # and TEXTENCODER=2 (always use TextEncoder for encoding UTF-8 strings)
+  # in -Oz builds to minimize code size, as custom UTF-8 encoder/decoder logic
+  # adds extra code footprint.
+  #
+  # Exceptions:
+  # - Shell environments: TextDecoder and TextEncoder are not consistently
+  #   available, so we must retain the JavaScript-based fallback.
+  # - Audio Worklets: The TextDecoder API is intentionally not exposed
+  #   (see: https://github.com/WebAudio/web-audio-api/issues/2499), requiring
+  #   the fallback decoder to remain in place.
   if settings.SHRINK_LEVEL >= 2 and not settings.AUDIO_WORKLET and \
      not settings.ENVIRONMENT_MAY_BE_SHELL:
     default_setting('TEXTDECODER', 2)
+    default_setting('TEXTENCODER', 2)
 
   # If set to 1, we will run the autodebugger (the automatic debugging tool, see
   # tools/autodebugger).  Note that this will disable inclusion of libraries. This

Original file line number	Diff line number	Diff line change
`@@ -1,8 +1,8 @@`
`1`	`1`	`{`
`2`	`2`	`"a.html": 346,`
`3`	`3`	`"a.html.gz": 255,`
`4`		`- "a.js": 18207,`
`5`		`- "a.js.gz": 9835,`
`6`		`- "total": 18553,`
`7`		`- "total_gz": 10090`
	`4`	`+ "a.js": 18186,`
	`5`	`+ "a.js.gz": 9818,`
	`6`	`+ "total": 18532,`
	`7`	`+ "total_gz": 10073`
`8`	`8`	`}`