Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

util: improve unicode support #31319

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
2 changes: 1 addition & 1 deletion lib/internal/cli_table.js
Expand Up @@ -6,7 +6,7 @@ const {
ObjectPrototypeHasOwnProperty,
} = primordials;

const { getStringWidth } = require('internal/readline/utils');
const { getStringWidth } = require('internal/util/inspect');

// The use of Unicode characters below is the only non-comment use of non-ASCII
// Unicode characters in Node.js built-in modules. If they are ever removed or
Expand Down
117 changes: 0 additions & 117 deletions lib/internal/readline/utils.js
@@ -1,25 +1,13 @@
'use strict';

const {
RegExp,
Symbol,
} = primordials;

// Regex used for ansi escape code splitting
// Adopted from https://github.com/chalk/ansi-regex/blob/master/index.js
// License: MIT, authors: @sindresorhus, Qix-, arjunmehta and LitoMore
// Matches all ansi escape code sequences in a string
// The pattern is assembled from three pieces that are joined verbatim:
//   1. the introducer (\u001B or \u009B) plus any run of the characters
//      `[ ] ( ) # ; ?`,
//   2. a sequence that ends with \u0007, or
//   3. optional `;`-separated numeric parameters and a single final character.
const ansiPattern = [
  '[\\u001B\\u009B][[\\]()#;?]*',
  '(?:(?:(?:[a-zA-Z\\d]*(?:;[-a-zA-Z\\d\\/#&.:=?%@~_]*)*)?\\u0007)',
  '|(?:(?:\\d{1,4}(?:;\\d{0,4})*)?[\\dA-PR-TZcf-ntqry=><~]))',
].join('');
// Global flag so a single `replace()` call removes every occurrence.
const ansi = new RegExp(ansiPattern, 'g');

const kUTF16SurrogateThreshold = 0x10000; // 2 ** 16
const kEscape = '\x1b';
const kSubstringSearch = Symbol('kSubstringSearch');

let getStringWidth;

function CSI(strings, ...args) {
let ret = `${kEscape}[`;
for (let n = 0; n < strings.length; n++) {
Expand Down Expand Up @@ -59,109 +47,6 @@ function charLengthAt(str, i) {
return str.codePointAt(i) >= kUTF16SurrogateThreshold ? 2 : 1;
}

if (internalBinding('config').hasIntl) {
const icu = internalBinding('icu');
// icu.getStringWidth(string, ambiguousAsFullWidth, expandEmojiSequence)
// Defaults: ambiguousAsFullWidth = false; expandEmojiSequence = true;
// TODO(BridgeAR): Expose the options to the user. That is probably the
// best thing possible at the moment, since it's difficult to know what
// the receiving end supports.
/**
 * Returns the number of columns required to display the given string.
 * VT control sequences are stripped before measuring.
 */
getStringWidth = function getStringWidth(str) {
  const text = stripVTControlCharacters(str);
  let columns = 0;
  let index = 0;
  while (index < text.length) {
    const unit = text.charCodeAt(index);
    // Stay in JavaScript for the leading ASCII portion. The first code unit
    // at or above 127 hands the rest of the string over to C++ in one call;
    // a fully ASCII string never reaches the binding.
    if (unit >= 127) {
      return columns + icu.getStringWidth(text.slice(index));
    }
    // Code units below 32 take no column, the remaining ASCII ones take one.
    if (unit >= 32) {
      columns++;
    }
    index++;
  }
  return columns;
};
} else {
/**
 * Computes how many columns the given string occupies when displayed.
 * VT control sequences are removed first; afterwards every code point
 * contributes two columns (full-width), zero columns (zero-width) or one.
 */
getStringWidth = function getStringWidth(str) {
  const visible = stripVTControlCharacters(str);
  let columns = 0;

  // Iterating with for-of walks code points, not UTF-16 code units.
  for (const symbol of visible) {
    const point = symbol.codePointAt(0);
    if (isFullWidthCodePoint(point)) {
      columns += 2;
      continue;
    }
    if (isZeroWidthCodePoint(point)) {
      continue;
    }
    columns += 1;
  }

  return columns;
};

/**
 * Reports whether the character represented by the given Unicode code point
 * is full-width. Every other code point yields false.
 */
const isFullWidthCodePoint = (code) => {
  // Code points are partially derived from:
  // http://www.unicode.org/Public/UNIDATA/EastAsianWidth.txt

  // Nothing below Hangul Jamo is treated as full-width.
  if (!(code >= 0x1100)) {
    return false;
  }
  // Hangul Jamo
  if (code <= 0x115f) {
    return true;
  }
  // LEFT-POINTING ANGLE BRACKET and RIGHT-POINTING ANGLE BRACKET
  if (code === 0x2329 || code === 0x232a) {
    return true;
  }
  // CJK Radicals Supplement .. Enclosed CJK Letters and Months,
  // with 0x303f carved out of the range.
  if (code >= 0x2e80 && code <= 0x3247) {
    return code !== 0x303f;
  }
  // Enclosed CJK Letters and Months .. CJK Unified Ideographs Extension A
  if (code >= 0x3250 && code <= 0x4dbf) {
    return true;
  }
  // CJK Unified Ideographs .. Yi Radicals
  if (code >= 0x4e00 && code <= 0xa4c6) {
    return true;
  }
  // Hangul Jamo Extended-A
  if (code >= 0xa960 && code <= 0xa97c) {
    return true;
  }
  // Hangul Syllables
  if (code >= 0xac00 && code <= 0xd7a3) {
    return true;
  }
  // CJK Compatibility Ideographs
  if (code >= 0xf900 && code <= 0xfaff) {
    return true;
  }
  // Vertical Forms
  if (code >= 0xfe10 && code <= 0xfe19) {
    return true;
  }
  // CJK Compatibility Forms .. Small Form Variants
  if (code >= 0xfe30 && code <= 0xfe6b) {
    return true;
  }
  // Halfwidth and Fullwidth Forms
  if ((code >= 0xff01 && code <= 0xff60) ||
      (code >= 0xffe0 && code <= 0xffe6)) {
    return true;
  }
  // Kana Supplement
  if (code >= 0x1b000 && code <= 0x1b001) {
    return true;
  }
  // Enclosed Ideographic Supplement
  if (code >= 0x1f200 && code <= 0x1f251) {
    return true;
  }
  // Miscellaneous Symbols and Pictographs .. Emoticons
  if (code >= 0x1f300 && code <= 0x1f64f) {
    return true;
  }
  // CJK Unified Ideographs Extension B .. Tertiary Ideographic Plane
  return code >= 0x20000 && code <= 0x3fffd;
};

/**
 * Returns true if the character represented by a given Unicode code point
 * takes up no column when displayed (control codes, combining marks and
 * variation selectors). Otherwise returns false.
 */
const isZeroWidthCodePoint = (code) => {
  return code <= 0x1F || // C0 control codes
    // DEL (0x7F) is a control code as well, so the range starts at 0x7F
    // instead of skipping it.
    (code >= 0x7F && code <= 0x9F) || // DEL and C1 control codes
    (code >= 0x0300 && code <= 0x036F) || // Combining Diacritical Marks
    (code >= 0x200B && code <= 0x200F) || // Modifying Invisible Characters
    // Combining Diacritical Marks for Symbols
    (code >= 0x20D0 && code <= 0x20FF) ||
    (code >= 0xFE00 && code <= 0xFE0F) || // Variation Selectors
    (code >= 0xFE20 && code <= 0xFE2F) || // Combining Half Marks
    (code >= 0xE0100 && code <= 0xE01EF); // Variation Selectors Supplement
};
}

/**
 * Attempts to drop every VT control character from `str`. Intended for
 * estimating the displayed width of a string. The removal is regex based
 * rather than driven by a real state machine, so it may be buggy.
 */
function stripVTControlCharacters(str) {
  const visibleText = str.replace(ansi, '');
  return visibleText;
}

/*
Some patterns seen in terminal key escape codes, derived from combos seen
at http://www.midnight-commander.org/browser/lib/tty/key.c
Expand Down Expand Up @@ -477,8 +362,6 @@ module.exports = {
charLengthLeft,
commonPrefix,
emitKeys,
getStringWidth,
kSubstringSearch,
stripVTControlCharacters,
CSI
};
6 changes: 4 additions & 2 deletions lib/internal/repl/utils.js
Expand Up @@ -32,11 +32,13 @@ const {

const {
commonPrefix,
getStringWidth,
kSubstringSearch,
} = require('internal/readline/utils');

const { inspect } = require('util');
const {
getStringWidth,
inspect,
} = require('internal/util/inspect');

const debug = require('internal/util/debuglog').debuglog('repl');

Expand Down
131 changes: 122 additions & 9 deletions lib/internal/util/inspect.js
Expand Up @@ -192,6 +192,17 @@ const meta = [
'\\x98', '\\x99', '\\x9A', '\\x9B', '\\x9C', '\\x9D', '\\x9E', '\\x9F', // x9F
];

// Regex used for ansi escape code splitting
// Adopted from https://github.com/chalk/ansi-regex/blob/master/index.js
// License: MIT, authors: @sindresorhus, Qix-, arjunmehta and LitoMore
// Matches all ansi escape code sequences in a string
// Introducer: \u001B or \u009B, then any run of the characters `[ ] ( ) # ; ?`.
const ansiIntroducer = '[\\u001B\\u009B][[\\]()#;?]*';
// First alternative: an optional `;`-separated payload that ends with \u0007.
const ansiBellTerminated =
  '(?:(?:[a-zA-Z\\d]*(?:;[-a-zA-Z\\d\\/#&.:=?%@~_]*)*)?\\u0007)';
// Second alternative: optional numeric parameters and one final character.
const ansiFinalCharacter =
  '(?:(?:\\d{1,4}(?:;\\d{0,4})*)?[\\dA-PR-TZcf-ntqry=><~])';
const ansiPattern =
  `${ansiIntroducer}(?:${ansiBellTerminated}|${ansiFinalCharacter})`;
// Global flag so a single `replace()` call removes every occurrence.
const ansi = new RegExp(ansiPattern, 'g');

let getStringWidth;

function getUserOptions(ctx) {
return {
stylize: ctx.stylize,
Expand Down Expand Up @@ -1154,7 +1165,7 @@ function groupArrayElements(ctx, output, value) {
// entries length of all output entries. We have to remove colors first,
// otherwise the length would not be calculated properly.
for (; i < outputLength; i++) {
const len = ctx.colors ? removeColors(output[i]).length : output[i].length;
const len = getStringWidth(output[i], ctx.colors);
dataLen[i] = len;
totalLength += len + separatorSpace;
if (maxLength < len)
Expand Down Expand Up @@ -1197,8 +1208,6 @@ function groupArrayElements(ctx, output, value) {
if (columns <= 1) {
return output;
}
// TODO(BridgeAR): Add unicode support. Use the readline getStringWidth
// function.
const tmp = [];
const maxLineLength = [];
for (let i = 0; i < columns; i++) {
Expand Down Expand Up @@ -1565,11 +1574,8 @@ function formatProperty(ctx, value, recurseTimes, key, type, desc) {
const diff = (ctx.compact !== true || type !== kObjectType) ? 2 : 3;
ctx.indentationLvl += diff;
str = formatValue(ctx, desc.value, recurseTimes);
if (diff === 3) {
const len = ctx.colors ? removeColors(str).length : str.length;
if (ctx.breakLength < len) {
extra = `\n${' '.repeat(ctx.indentationLvl)}`;
}
if (diff === 3 && ctx.breakLength < getStringWidth(str, ctx.colors)) {
extra = `\n${' '.repeat(ctx.indentationLvl)}`;
}
ctx.indentationLvl -= diff;
} else if (desc.get !== undefined) {
Expand Down Expand Up @@ -1889,9 +1895,116 @@ function formatWithOptionsInternal(inspectOptions, ...args) {
return str;
}

if (internalBinding('config').hasIntl) {
const icu = internalBinding('icu');
// icu.getStringWidth(string, ambiguousAsFullWidth, expandEmojiSequence)
// Defaults: ambiguousAsFullWidth = false; expandEmojiSequence = true;
// TODO(BridgeAR): Expose the options to the user. That is probably the
// best thing possible at the moment, since it's difficult to know what
// the receiving end supports.
/**
 * Returns the number of columns required to display the given string.
 * When `removeControlChars` is truthy (the default), VT control sequences
 * are stripped before measuring.
 */
getStringWidth = function getStringWidth(str, removeControlChars = true) {
  const text = removeControlChars ? stripVTControlCharacters(str) : str;
  let columns = 0;
  let index = 0;
  while (index < text.length) {
    const unit = text.charCodeAt(index);
    // Stay in JavaScript for the leading ASCII portion. The first code unit
    // at or above 127 hands the rest of the string over to C++ in one call;
    // a fully ASCII string never reaches the binding.
    if (unit >= 127) {
      return columns + icu.getStringWidth(text.slice(index));
    }
    // Code units below 32 take no column, the remaining ASCII ones take one.
    if (unit >= 32) {
      columns++;
    }
    index++;
  }
  return columns;
};
} else {
/**
 * Computes how many columns the given string occupies when displayed.
 * When `removeControlChars` is truthy (the default), VT control sequences
 * are removed first; afterwards every code point contributes two columns
 * (full-width), zero columns (zero-width) or one.
 */
getStringWidth = function getStringWidth(str, removeControlChars = true) {
  const visible = removeControlChars ? stripVTControlCharacters(str) : str;
  let columns = 0;

  // Iterating with for-of walks code points, not UTF-16 code units.
  for (const symbol of visible) {
    const point = symbol.codePointAt(0);
    if (isFullWidthCodePoint(point)) {
      columns += 2;
      continue;
    }
    if (isZeroWidthCodePoint(point)) {
      continue;
    }
    columns += 1;
  }

  return columns;
};

/**
* Returns true if the character represented by a given
* Unicode code point is full-width. Otherwise returns false.
*/
const isFullWidthCodePoint = (code) => {
// Code points are partially derived from:
// http://www.unicode.org/Public/UNIDATA/EastAsianWidth.txt
return code >= 0x1100 && (
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So this might be doable as a regex… it could be compiled as a regex, though I don't think there's an East Asian Width property available in regex.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This could definitely be a regular expression. I guess it's slower that way but I did not check. I'll have a look soon.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ICU4C also has API to get the East Asian Width.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, we use that in case Node.js is built with ICU, but this is the fallback code.

code <= 0x115f || // Hangul Jamo
code === 0x2329 || // LEFT-POINTING ANGLE BRACKET
code === 0x232a || // RIGHT-POINTING ANGLE BRACKET
// CJK Radicals Supplement .. Enclosed CJK Letters and Months
(code >= 0x2e80 && code <= 0x3247 && code !== 0x303f) ||
// Enclosed CJK Letters and Months .. CJK Unified Ideographs Extension A
(code >= 0x3250 && code <= 0x4dbf) ||
// CJK Unified Ideographs .. Yi Radicals
(code >= 0x4e00 && code <= 0xa4c6) ||
// Hangul Jamo Extended-A
(code >= 0xa960 && code <= 0xa97c) ||
// Hangul Syllables
(code >= 0xac00 && code <= 0xd7a3) ||
// CJK Compatibility Ideographs
(code >= 0xf900 && code <= 0xfaff) ||
// Vertical Forms
(code >= 0xfe10 && code <= 0xfe19) ||
// CJK Compatibility Forms .. Small Form Variants
(code >= 0xfe30 && code <= 0xfe6b) ||
// Halfwidth and Fullwidth Forms
(code >= 0xff01 && code <= 0xff60) ||
(code >= 0xffe0 && code <= 0xffe6) ||
// Kana Supplement
(code >= 0x1b000 && code <= 0x1b001) ||
// Enclosed Ideographic Supplement
(code >= 0x1f200 && code <= 0x1f251) ||
// Miscellaneous Symbols and Pictographs 0x1f300 - 0x1f5ff
// Emoticons 0x1f600 - 0x1f64f
(code >= 0x1f300 && code <= 0x1f64f) ||
// CJK Unified Ideographs Extension B .. Tertiary Ideographic Plane
(code >= 0x20000 && code <= 0x3fffd)
);
};

/**
 * Returns true if the character represented by a given Unicode code point
 * takes up no column when displayed (control codes, combining marks and
 * variation selectors). Otherwise returns false.
 */
const isZeroWidthCodePoint = (code) => {
  return code <= 0x1F || // C0 control codes
    // DEL (0x7F) is a control code as well, so the range starts at 0x7F
    // instead of skipping it.
    (code >= 0x7F && code <= 0x9F) || // DEL and C1 control codes
    (code >= 0x300 && code <= 0x36F) || // Combining Diacritical Marks
    (code >= 0x200B && code <= 0x200F) || // Modifying Invisible Characters
    // Combining Diacritical Marks for Symbols
    (code >= 0x20D0 && code <= 0x20FF) ||
    (code >= 0xFE00 && code <= 0xFE0F) || // Variation Selectors
    (code >= 0xFE20 && code <= 0xFE2F) || // Combining Half Marks
    (code >= 0xE0100 && code <= 0xE01EF); // Variation Selectors Supplement
};
}

/**
 * Drops every VT control character sequence matched by the `ansi` pattern
 * from `str`. Intended for estimating the displayed width of a string.
 */
function stripVTControlCharacters(str) {
  const visibleText = str.replace(ansi, '');
  return visibleText;
}

module.exports = {
inspect,
format,
formatWithOptions,
inspectDefaultOptions
getStringWidth,
inspectDefaultOptions,
stripVTControlCharacters
};
8 changes: 5 additions & 3 deletions lib/readline.js
Expand Up @@ -46,17 +46,19 @@ const {
ERR_INVALID_OPT_VALUE
} = require('internal/errors').codes;
const { validateString } = require('internal/validators');
const { inspect } = require('internal/util/inspect');
const {
inspect,
getStringWidth,
stripVTControlCharacters,
} = require('internal/util/inspect');
const EventEmitter = require('events');
const {
charLengthAt,
charLengthLeft,
commonPrefix,
CSI,
emitKeys,
getStringWidth,
kSubstringSearch,
stripVTControlCharacters
} = require('internal/readline/utils');

const { clearTimeout, setTimeout } = require('timers');
Expand Down