From f3fb6a11fe8b19e6556b9eaa70f7c7d457d63070 Mon Sep 17 00:00:00 2001
From: Ruben Bridgewater <ruben@bridgewater.de>
Date: Tue, 31 Dec 2019 15:14:04 +0100
Subject: [PATCH] src: change GetStringWidth's expand_emoji_sequence option
 default
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The option is now set to true by default. Most terminals do not have
full emoji support and visualize emojis with zero width joiners as
individual emojis.
Also verify that at least one argument is always passed through to the
function and remove support for passing through code points. Only
accept strings from now on to simplify the API.

PR-URL: https://github.com/nodejs/node/pull/31112
Reviewed-By: Michaël Zasso <targos@protonmail.com>
Reviewed-By: James M Snell <jasnell@gmail.com>
---
 lib/internal/readline/utils.js        |  3 ++
 src/node_i18n.cc                      | 13 ++---
 test/parallel/test-icu-stringwidth.js | 78 +++++++++++++--------------
 3 files changed, 43 insertions(+), 51 deletions(-)
diff --git a/lib/internal/readline/utils.js b/lib/internal/readline/utils.js
index 7c24a8e195a301..3257ee3a1e4a1f 100644
--- a/lib/internal/readline/utils.js
+++ b/lib/internal/readline/utils.js
@@ -63,6 +63,9 @@ if (internalBinding('config').hasIntl) {
   const icu = internalBinding('icu');
   // icu.getStringWidth(string, ambiguousAsFullWidth, expandEmojiSequence)
   // Defaults: ambiguousAsFullWidth = false; expandEmojiSequence = true;
+  // TODO(BridgeAR): Expose the options to the user. That is probably the
+  // best thing possible at the moment, since it's difficult to know what
+  // the receiving end supports.
   getStringWidth = function getStringWidth(str) {
     let width = 0;
     str = stripVTControlCharacters(str);
diff --git a/src/node_i18n.cc b/src/node_i18n.cc
index 7cb7ab35d5060c..46c6ef39f861ec 100644
--- a/src/node_i18n.cc
+++ b/src/node_i18n.cc
@@ -767,18 +767,10 @@ static int GetColumnWidth(UChar32 codepoint,
 // Returns the column width for the given String.
 static void GetStringWidth(const FunctionCallbackInfo<Value>& args) {
   Environment* env = Environment::GetCurrent(args);
-  if (args.Length() < 1)
-    return;
+  CHECK(args[0]->IsString());
 
   bool ambiguous_as_full_width = args[1]->IsTrue();
-  bool expand_emoji_sequence = args[2]->IsTrue();
-
-  if (args[0]->IsNumber()) {
-    uint32_t val;
-    if (!args[0]->Uint32Value(env->context()).To(&val)) return;
-    args.GetReturnValue().Set(GetColumnWidth(val, ambiguous_as_full_width));
-    return;
-  }
+  bool expand_emoji_sequence = !args[2]->IsBoolean() || args[2]->IsTrue();
 
   TwoByteValue value(env->isolate(), args[0]);
   // reinterpret_cast is required by windows to compile
@@ -803,6 +795,7 @@ static void GetStringWidth(const FunctionCallbackInfo<Value>& args) {
     // in advance if a particular sequence is going to be supported.
     // The expand_emoji_sequence option allows the caller to skip this
     // check and count each code within an emoji sequence separately.
+    // https://www.unicode.org/reports/tr51/tr51-16.html#Emoji_ZWJ_Sequences
     if (!expand_emoji_sequence &&
         n > 0 && p == 0x200d &&  // 0x200d == ZWJ (zero width joiner)
         (u_hasBinaryProperty(c, UCHAR_EMOJI_PRESENTATION) ||
diff --git a/test/parallel/test-icu-stringwidth.js b/test/parallel/test-icu-stringwidth.js
index 48384f916d9126..a427601803afd3 100644
--- a/test/parallel/test-icu-stringwidth.js
+++ b/test/parallel/test-icu-stringwidth.js
@@ -12,82 +12,78 @@ const readline = require('internal/readline/utils');
 
 // Ll (Lowercase Letter): LATIN SMALL LETTER A
 assert.strictEqual(readline.getStringWidth('a'), 1);
-assert.strictEqual(readline.getStringWidth(0x0061), 1);
+assert.strictEqual(readline.getStringWidth(String.fromCharCode(0x0061)), 1);
 // Lo (Other Letter)
 assert.strictEqual(readline.getStringWidth('丁'), 2);
-assert.strictEqual(readline.getStringWidth(0x4E01), 2);
+assert.strictEqual(readline.getStringWidth(String.fromCharCode(0x4E01)), 2);
 // Surrogate pairs
-assert.strictEqual(readline.getStringWidth('\ud83d\udc78\ud83c\udfff'), 2);
+assert.strictEqual(readline.getStringWidth('\ud83d\udc78\ud83c\udfff'), 4);
 assert.strictEqual(readline.getStringWidth('👅'), 2);
 // Cs (Surrogate): High Surrogate
 assert.strictEqual(readline.getStringWidth('\ud83d'), 1);
 // Cs (Surrogate): Low Surrogate
 assert.strictEqual(readline.getStringWidth('\udc78'), 1);
 // Cc (Control): NULL
-assert.strictEqual(readline.getStringWidth(0), 0);
+assert.strictEqual(readline.getStringWidth('\u0000'), 0);
 // Cc (Control): BELL
-assert.strictEqual(readline.getStringWidth(0x0007), 0);
+assert.strictEqual(readline.getStringWidth(String.fromCharCode(0x0007)), 0);
 // Cc (Control): LINE FEED
 assert.strictEqual(readline.getStringWidth('\n'), 0);
 // Cf (Format): SOFT HYPHEN
-assert.strictEqual(readline.getStringWidth(0x00AD), 1);
+assert.strictEqual(readline.getStringWidth(String.fromCharCode(0x00AD)), 1);
 // Cf (Format): LEFT-TO-RIGHT MARK
 // Cf (Format): RIGHT-TO-LEFT MARK
 assert.strictEqual(readline.getStringWidth('\u200Ef\u200F'), 1);
 // Cn (Unassigned): Not a character
-assert.strictEqual(readline.getStringWidth(0x10FFEF), 1);
+assert.strictEqual(readline.getStringWidth(String.fromCharCode(0x10FFEF)), 1);
 // Cn (Unassigned): Not a character (but in a CJK range)
-assert.strictEqual(readline.getStringWidth(0x3FFEF), 2);
+assert.strictEqual(readline.getStringWidth(String.fromCharCode(0x3FFEF)), 1);
 // Mn (Nonspacing Mark): COMBINING ACUTE ACCENT
-assert.strictEqual(readline.getStringWidth(0x0301), 0);
+assert.strictEqual(readline.getStringWidth(String.fromCharCode(0x0301)), 0);
 // Mc (Spacing Mark): BALINESE ADEG ADEG
 // Chosen as its Canonical_Combining_Class is not 0, but is not a 0-width
 // character.
-assert.strictEqual(readline.getStringWidth(0x1B44), 1);
+assert.strictEqual(readline.getStringWidth(String.fromCharCode(0x1B44)), 1);
 // Me (Enclosing Mark): COMBINING ENCLOSING CIRCLE
-assert.strictEqual(readline.getStringWidth(0x20DD), 0);
+assert.strictEqual(readline.getStringWidth(String.fromCharCode(0x20DD)), 0);
 
-// The following is an emoji sequence. In some implementations, it is
-// represented as a single glyph, in other implementations as a sequence
-// of individual glyphs. By default, the algorithm will assume the single
-// glyph interpretation and return a value of 2. By passing the
-// expandEmojiSequence: true option, each component will be counted
-// individually.
-assert.strictEqual(readline.getStringWidth('👩‍👩‍👧‍👧'), 2);
-assert.strictEqual(
-  readline.getStringWidth('👩‍👩‍👧‍👧', { expandEmojiSequence: true }), 8);
+// The following is an emoji sequence with ZWJ (zero-width-joiner). In some
+// implementations, it is represented as a single glyph, in other
+// implementations as a sequence of individual glyphs. By default, each
+// component will be counted individually, since not a lot of systems support
+// these fully.
+// See https://www.unicode.org/reports/tr51/tr51-16.html#Emoji_ZWJ_Sequences
+assert.strictEqual(readline.getStringWidth('👩‍👩‍👧‍👧'), 8);
+// TODO(BridgeAR): This should have a width of two and six. The heart contains
+// the \uFE0F variation selector that indicates that it should be displayed as
+// emoji instead of as text. Emojis are all full width characters when not being
+// rendered as text.
+// https://en.wikipedia.org/wiki/Variation_Selectors_(Unicode_block)
+assert.strictEqual(readline.getStringWidth('❤️'), 1);
+assert.strictEqual(readline.getStringWidth('👩‍❤️‍👩'), 5);
+// The length of one is correct. It is an emoji treated as text.
+assert.strictEqual(readline.getStringWidth('❤'), 1);
 
 // By default, unicode characters whose width is considered ambiguous will
 // be considered half-width. For these characters, getStringWidth will return
 // 1. In some contexts, however, it is more appropriate to consider them full
-// width. By default, the algorithm will assume half width. By passing
-// the ambiguousAsFullWidth: true option, ambiguous characters will be counted
-// as 2 columns.
+// width. By default, the algorithm will assume half width.
 assert.strictEqual(readline.getStringWidth('\u01d4'), 1);
-assert.strictEqual(
-  readline.getStringWidth('\u01d4', { ambiguousAsFullWidth: true }), 2);
 
 // Control chars and combining chars are zero
 assert.strictEqual(readline.getStringWidth('\u200E\n\u220A\u20D2'), 1);
 
 // Test that the fast path for ASCII characters yields results consistent
 // with the 'slow' path.
-for (const ambiguousAsFullWidth of [ false, true ]) {
-  for (let i = 0; i < 256; i++) {
-    const char = String.fromCharCode(i);
-    assert.strictEqual(
-      readline.getStringWidth(i, { ambiguousAsFullWidth }),
-      readline.getStringWidth(char, { ambiguousAsFullWidth }));
-    assert.strictEqual(
-      readline.getStringWidth(char + '🎉', { ambiguousAsFullWidth }),
-      readline.getStringWidth(char, { ambiguousAsFullWidth }) + 2);
+for (let i = 0; i < 256; i++) {
+  const char = String.fromCharCode(i);
+  assert.strictEqual(
+    readline.getStringWidth(char + '🎉'),
+    readline.getStringWidth(char) + 2);
 
-    if (i < 32 || (i >= 127 && i < 160)) {  // Control character
-      assert.strictEqual(
-        readline.getStringWidth(i, { ambiguousAsFullWidth }), 0);
-    } else if (i < 127) {  // Regular ASCII character
-      assert.strictEqual(
-        readline.getStringWidth(i, { ambiguousAsFullWidth }), 1);
-    }
+  if (i < 32 || (i >= 127 && i < 160)) {  // Control character
+    assert.strictEqual(readline.getStringWidth(char), 0);
+  } else {  // Regular ASCII character
+    assert.strictEqual(readline.getStringWidth(char), 1);
   }
 }