Skip to content

Commit

Permalink
Add cell width support for emoji sequences (#68)
Browse files Browse the repository at this point in the history
  • Loading branch information
ajalt committed Jul 30, 2022
1 parent 5ec50e3 commit 1cf64be
Show file tree
Hide file tree
Showing 7 changed files with 2,572 additions and 571 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
## Unreleased
### Added
- Implemented `hideInput` for prompts on native targets [(#63)](https://github.com/ajalt/mordant/issues/63)
- Improved cell-width calculation for emoji sequences such as skin tone modifiers. [(#64)](https://github.com/ajalt/mordant/issues/64)

## 2.0.0-beta7
### Added
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
package com.github.ajalt.mordant.internal

import com.github.ajalt.mordant.internal.gen.CELL_WIDTH_TABLE
import com.github.ajalt.mordant.internal.gen.*


/*
Expand Down Expand Up @@ -42,5 +42,38 @@ internal fun cellWidth(codepoint: Int): Int {

/**
 * Return the width, in terminal cells, of the given [string].
 *
 * Codepoints that are not part of an emoji sequence contribute their own [cellWidth]. A run of
 * codepoints that forms a sequence found in [EMOJI_SEQUENCES] is counted as a single two-cell glyph.
 * Codepoints that extend a candidate sequence without ever completing it contribute their own width.
 */
internal fun stringCellWidth(string: String): Int {
    var sum = 0
    // Width of the codepoints in the current candidate that are not covered by a completed sequence
    var sumSinceZwj = 0
    // True once some prefix of the current candidate is a complete sequence
    var matched = false
    // Trie node reached by the candidate consumed so far, or null when we aren't inside a candidate
    var zwjSeq: IntTrie? = null

    for (codepoint in codepointSequence(string)) {
        val width = cellWidth(codepoint)
        val node = zwjSeq
        if (node != null) {
            val completesSeq = codepoint in node.values
            val next = node.children[codepoint]
            if (completesSeq || next != null) {
                if (completesSeq) {
                    // Everything up to and including this codepoint combines into one glyph
                    matched = true
                    sumSinceZwj = 0
                } else {
                    sumSinceZwj += width
                }
                zwjSeq = next
                if (next == null) {
                    // Nothing can extend the sequence any further, so commit it now.
                    // All sequences combine to one glyph, which is always an emoji, so add 2 for the width of
                    // the emoji. Unfortunately, some of these emoji are wider than two cells, but given that
                    // their size is font-dependent and usually not cell-aligned anyway, there's no perfect
                    // solution here. Thanks, unicode.
                    sum += 2
                    matched = false
                }
                continue
            }
            // This codepoint doesn't belong to the candidate: commit the matched glyph (if any) plus the width
            // of everything after it, then fall through so the codepoint itself is measured normally and may
            // start a new sequence.
            sum += sumSinceZwj + (if (matched) 2 else 0)
            sumSinceZwj = 0
            matched = false
            zwjSeq = null
        }

        // We do a fast range check to skip sequence processing for most codepoints
        val start: IntTrie? = if (couldStartEmojiSeq(codepoint)) EMOJI_SEQUENCES.children[codepoint] else null
        if (start == null) {
            sum += width
        } else {
            // Remember the start codepoint's own width in case the sequence never completes
            zwjSeq = start
            sumSinceZwj = width
        }
    }
    // If we were in a sequence at the end of the string, add whatever was left to the sum
    return sum + sumSinceZwj + (if (matched) 2 else 0)
}
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,13 @@ internal class CellWidthTest {
row("媒人", 4),
row("🙊🙉🙈", 6),
row("en\u0303e", 3),
row("👍🏿", 2),
row("🇩🇪", 2),
row("\uD83D\uDC68\uD83C\uDFFE\u200D\uD83E\uDDB1", 2), // MAN, FITZPATRICK TYPE-5, ZWJ, CURLY HAIR
row("\uD83D\uDC69\u200D\uD83D\uDC67", 2), // Emoji_ZWJ_Sequence ; family: woman, girl (👩‍👧)
row("\uD83D\uDC69\u200D\uD83D\uDC67\u200D\uD83D\uDC66", 2), //Emoji_ZWJ_Sequence ; family: woman, girl, boy (👩‍👧‍👦)
row("\uD83D\uDC69\u200D\uD83D\uDC69\u200D\uD83D\uDC66\u200D\uD83D\uDC66", 2), // Emoji_ZWJ_Sequence ; family: woman, woman, boy, boy (👩‍👩‍👦‍👦)

) { str, width ->
stringCellWidth(str) shouldBe width
}
Expand Down

Large diffs are not rendered by default.

1,835 changes: 1,835 additions & 0 deletions mordant/src/gen/kotlin/com/github/ajalt/mordant/internal/gen/emojiseqtable.kt

Large diffs are not rendered by default.

298 changes: 154 additions & 144 deletions scripts/generate_cellwidth_table.py
Original file line number Diff line number Diff line change
@@ -1,144 +1,154 @@
import requests

# Unicode Character Database files downloaded by parse_categories() and parse_east_asian().
# Both point at the "latest" UCD directory, so the output depends on the Unicode version published there.
categories_url = 'https://www.unicode.org/Public/UCD/latest/ucd/extracted/DerivedGeneralCategory.txt'
east_asian_url = 'https://www.unicode.org/Public/UCD/latest/ucd/EastAsianWidth.txt'


def parse_categories():
    """Download and parse the latest Unicode DerivedGeneralCategory document.

    Return a list of (low, high, width, description) tuples, one for each range in the
    categories Enclosing Marks (Me), Non-Spacing Marks (Mn) and Control Codes (Cc),
    all of which are zero-width.
    """
    # example lines:
    # 0591..05BD    ; Mn #  [45] HEBREW ACCENT ETNAHTA..HEBREW POINT METEG
    # 05BF          ; Mn #       HEBREW POINT RAFE
    text = requests.get(categories_url).text
    # Only these general categories are zero-width; punctuation classes occupy a cell.
    categories = ('Me', 'Mn', 'Cc')
    ranges = []

    for line in text.splitlines():
        if not line or line.startswith("#"):
            continue
        # Whitespace-separated fields: code point(s), ';', category, '#', description
        points, _, category, _, desc = line.split(maxsplit=4)

        if category in categories:
            low, high = parse_points(points)
            ranges.append((low, high, 0, parse_desc(desc)))

    return ranges


def parse_east_asian():
    """Download and parse the latest Unicode East Asian Width document.

    The document covers all codepoints, not just East Asian characters.

    Return a list of (low, high, width, description) tuples for every range whose width
    property is `F` (Fullwidth) or `W` (Wide), each given a width of two cells, followed by
    the hand-written emoji block overrides below.
    """
    # Every glyph in these blocks has an emoji representation (see
    # https://en.wikipedia.org/wiki/Emoji#Emoji_versus_text_presentation), yet EastAsianWidth.txt
    # marks a number of their codepoints as 'N' or 'A', often in ways that look arbitrary.
    #
    # For example, U+1F004 MAHJONG TILE RED DRAGON is listed as 'W',
    # but U+1F005 MAHJONG TILE GREEN DRAGON is listed as 'N'.
    #
    # Most modern terminals draw all of these codepoints as emoji, so whole blocks are listed as
    # 2 cells wide. The list could be refined further; for example, the Miscellaneous Symbols
    # block mixes single and double width characters, but not all of its double width characters
    # are listed as such.
    override_ranges = [
        (0x1f000, 0x1f02f, 2, 'Mahjong Tiles'),
        (0x1f0a0, 0x1f0ff, 2, 'Playing Cards'),
        (0x1f300, 0x1f5ff, 2, 'Miscellaneous Symbols and Pictographs'),
        (0x1f600, 0x1f64f, 2, 'Emoticons'),
        (0x1f680, 0x1f6ff, 2, 'Transport and Map Symbols'),
        (0x1f900, 0x1f9ff, 2, 'Supplemental Symbols and Pictographs'),
        (0x1fa70, 0x1faff, 2, 'Symbols and Pictographs Extended-A'),
    ]
    wide_properties = ('F', 'W')

    def is_overridden(codepoint):
        # Only the first codepoint of a parsed range is tested against the override blocks.
        return any(start <= codepoint <= end for start, end, _, _ in override_ranges)

    # example lines
    # 2322..2328;N     # So     [7] FROWN..KEYBOARD
    # 2329;W           # Ps         LEFT-POINTING ANGLE BRACKET
    document = requests.get(east_asian_url).text
    wide = []
    for entry in document.splitlines():
        if entry and not entry.startswith("#"):
            field, _, _, desc = entry.split(maxsplit=3)
            points, prop = field.split(";")
            if prop not in wide_properties:
                continue
            low, high = parse_points(points)
            if not is_overridden(low):
                wide.append((low, high, 2, parse_desc(desc)))

    return wide + override_ranges


def parse_cf():
    """Return the hand-curated table of zero-width codepoint ranges.

    These ranges come from categories that mix zero-width and non-zero-width characters, so
    they can't be derived from the Unicode documents and are maintained by hand instead.
    Each entry is a (low, high, width, description) tuple with a width of 0.
    """
    zero_width = (
        (0x034F, 0x034F, 'COMBINING GRAPHEME JOINER'),
        (0x200B, 0x200F, 'ZERO WIDTH SPACE..RIGHT-TO-LEFT MARK'),
        (0x2028, 0x202E, 'LINE SEPARATOR..RIGHT-TO-LEFT OVERRIDE'),
        (0x2060, 0x2063, 'WORD JOINER..INVISIBLE SEPARATOR'),
    )
    return [(low, high, 0, name) for low, high, name in zero_width]


def parse_desc(desc):
    """Strip a leading "[count] " prefix from a range description, if there is one.

    Descriptions that don't start with '[' are returned unchanged.
    """
    if not desc.startswith('['):
        return desc
    # Skip the closing bracket and the single character that follows it.
    closing = desc.index(']')
    return desc[closing + 2:]


def parse_points(points):
    """Parse a hexadecimal codepoint field into an inclusive (low, high) pair.

    The field is either a single codepoint ("05BF") or a range ("0591..05BD"); a single
    codepoint yields a pair with both elements equal.
    """
    if '..' not in points:
        single = int(points, 16)
        return single, single
    first, last = points.split('..')
    return int(first, 16), int(last, 16)


def parse_all():
    """Collect every width range and merge neighbours that share a width.

    Combines the zero-width categories, the wide East Asian / emoji ranges and the
    hand-curated zero-width table, sorts them by starting codepoint, and joins ranges that
    are directly adjacent and have the same width. Return a list of
    (low, high, width, description) tuples.
    """
    combined = sorted(
        parse_categories() + parse_east_asian() + parse_cf(), key=lambda it: it[0]
    )
    # concat adjacent ranges
    ranges = []
    iterator = iter(combined)
    prev = next(iterator)
    for low, high, width, desc in iterator:
        if width == prev[2] and prev[1] + 1 == low:
            # Describe the merged range by the first name of the earlier range
            # and the last name of the later one.
            first = prev[3].split('..')[0] if '..' in prev[3] else prev[3]
            last = desc.split('..')[-1] if '..' in desc else desc
            prev = prev[0], high, width, f'{first}..{last}'
        else:
            ranges.append(prev)
            prev = (low, high, width, desc)
    ranges.append(prev)
    return ranges


def main():
    """Print the Kotlin source of the generated cell width table to stdout."""
    print('''package com.github.ajalt.mordant.internal.gen
internal class CellWidthTableEntry(val low: Int, val high: Int, val width: Byte)
internal val CELL_WIDTH_TABLE : Array<CellWidthTableEntry> = arrayOf<CellWidthTableEntry>('''
    )
    for low, high, width, desc in parse_all():
        # Each element must be a CellWidthTableEntry to match the declared array type,
        # and has to carry the width, which is the whole point of the table.
        print(f" CellWidthTableEntry({hex(low)}, {hex(high)}, {width}), // {desc}")
    print(')')


if __name__ == '__main__':
    main()
import requests

# Unicode Character Database files downloaded by parse_categories() and parse_east_asian().
# Both point at the "latest" UCD directory, so the output depends on the Unicode version published there.
categories_url = (
    "https://www.unicode.org/Public/UCD/latest/ucd/extracted/DerivedGeneralCategory.txt"
)
east_asian_url = "https://www.unicode.org/Public/UCD/latest/ucd/EastAsianWidth.txt"


def parse_categories():
    """Fetch the latest Unicode DerivedGeneralCategory document and extract zero-width ranges.

    Return a list of (low, high, width, description) tuples, one per range whose general
    category is Enclosing Mark (Me), Non-Spacing Mark (Mn) or Control Code (Cc). All of them
    get a width of 0.
    """
    zero_width_categories = ("Me", "Mn", "Cc")

    # example lines:
    # 0591..05BD    ; Mn #  [45] HEBREW ACCENT ETNAHTA..HEBREW POINT METEG
    # 05BF          ; Mn #       HEBREW POINT RAFE
    document = requests.get(categories_url).text

    zero_width = []
    for entry in document.splitlines():
        if entry and not entry.startswith("#"):
            # Whitespace-separated fields: code point(s), ';', category, '#', description
            points, _, category, _, desc = entry.split(maxsplit=4)
            if category not in zero_width_categories:
                continue
            first, last = parse_points(points)
            zero_width.append((first, last, 0, parse_desc(desc)))
    return zero_width


def parse_east_asian():
    """Fetch the latest Unicode East Asian Width document and extract double-width ranges.

    The document covers all codepoints, not just East Asian characters.

    Return a list of (low, high, width, description) tuples for every range whose width
    property is `F` (Fullwidth) or `W` (Wide), each with a width of two cells, followed by
    the hand-written emoji block overrides below.
    """
    # Every glyph in these blocks has an emoji representation (see
    # https://en.wikipedia.org/wiki/Emoji#Emoji_versus_text_presentation), yet EastAsianWidth.txt
    # marks a number of their codepoints as 'N' or 'A', often in ways that look arbitrary.
    #
    # For example, U+1F004 MAHJONG TILE RED DRAGON is listed as 'W',
    # but U+1F005 MAHJONG TILE GREEN DRAGON is listed as 'N'.
    #
    # Most modern terminals draw all of these codepoints as emoji, so whole blocks are listed as
    # 2 cells wide. The list could be refined further; for example, the Miscellaneous Symbols
    # block mixes single and double width characters, but not all of its double width characters
    # are listed as such.
    override_ranges = [
        (0x1F000, 0x1F02F, 2, "Mahjong Tiles"),
        (0x1F0A0, 0x1F0FF, 2, "Playing Cards"),
        (0x1F300, 0x1F5FF, 2, "Miscellaneous Symbols and Pictographs"),
        (0x1F600, 0x1F64F, 2, "Emoticons"),
        (0x1F680, 0x1F6FF, 2, "Transport and Map Symbols"),
        (0x1F900, 0x1F9FF, 2, "Supplemental Symbols and Pictographs"),
        (0x1FA70, 0x1FAFF, 2, "Symbols and Pictographs Extended-A"),
    ]
    wide_properties = ("F", "W")

    def starts_in_override(codepoint):
        # Only the first codepoint of a parsed range is tested against the override blocks.
        for block_low, block_high, _, _ in override_ranges:
            if block_low <= codepoint <= block_high:
                return True
        return False

    # example lines
    # 2322..2328;N     # So     [7] FROWN..KEYBOARD
    # 2329;W           # Ps         LEFT-POINTING ANGLE BRACKET
    document = requests.get(east_asian_url).text
    wide = []
    for entry in document.splitlines():
        if entry and not entry.startswith("#"):
            field, _, _, desc = entry.split(maxsplit=3)
            points, prop = field.split(";")
            if prop in wide_properties:
                low, high = parse_points(points)
                if not starts_in_override(low):
                    wide.append((low, high, 2, parse_desc(desc)))

    return wide + override_ranges


def parse_cf():
    """Return the hand-curated table of zero-width codepoint ranges.

    These ranges come from categories that mix zero-width and non-zero-width characters, so
    they can't be derived from the Unicode documents and are maintained by hand instead.
    Each entry is a (low, high, width, description) tuple with a width of 0.
    """
    curated = (
        (0x034F, 0x034F, "COMBINING GRAPHEME JOINER"),
        (0x200B, 0x200F, "ZERO WIDTH SPACE..RIGHT-TO-LEFT MARK"),
        (0x2028, 0x202E, "LINE SEPARATOR..RIGHT-TO-LEFT OVERRIDE"),
        (0x2060, 0x2063, "WORD JOINER..INVISIBLE SEPARATOR"),
    )
    table = []
    for low, high, name in curated:
        table.append((low, high, 0, name))
    return table


def parse_desc(desc):
    """Strip a leading "[count] " prefix from a range description, if there is one.

    Descriptions that don't start with "[" are returned unchanged.
    """
    has_count_prefix = desc.startswith("[")
    if not has_count_prefix:
        return desc
    # Resume two characters past the closing bracket, i.e. after "]" and the character following it.
    resume_at = desc.index("]") + 2
    return desc[resume_at:]


def parse_points(points):
    """Parse a hexadecimal codepoint field into an inclusive (low, high) pair.

    The field is either a single codepoint ("05BF") or a range ("0591..05BD"); a single
    codepoint yields a pair with both elements equal.
    """
    if ".." not in points:
        codepoint = int(points, 16)
        return codepoint, codepoint
    start, end = points.split("..")
    return int(start, 16), int(end, 16)


def parse_all():
    """Collect every width range and merge neighbours that share a width.

    Combines the zero-width categories, the wide East Asian / emoji ranges and the
    hand-curated zero-width table, sorts them by starting codepoint, and joins ranges that
    are directly adjacent and have the same width. Return a list of
    (low, high, width, description) tuples.
    """

    def name_pair(description):
        # Split "FIRST..LAST" into its two names; a plain name counts as both.
        if ".." in description:
            first, last = description.split("..")
            return first, last
        return description, description

    entries = parse_categories() + parse_east_asian() + parse_cf()
    entries.sort(key=lambda entry: entry[0])

    merged = []
    remaining = iter(entries)
    current = next(remaining)
    for low, high, width, desc in remaining:
        touches = current[1] + 1 == low
        if width != current[2] or not touches:
            merged.append(current)
            current = (low, high, width, desc)
            continue
        # Extend the current range and describe it by its first and last names.
        first_name, _ = name_pair(current[3])
        _, last_name = name_pair(desc)
        current = (current[0], high, width, f"{first_name}..{last_name}")
    merged.append(current)
    return merged


def main():
    """Print the Kotlin source of the generated cell width table to stdout."""
    header = """package com.github.ajalt.mordant.internal.gen
import kotlin.native.concurrent.SharedImmutable
internal class CellWidthTableEntry(val low: Int, val high: Int, val width: Byte)
@SharedImmutable
internal val CELL_WIDTH_TABLE: Array<CellWidthTableEntry> = arrayOf<CellWidthTableEntry>("""
    print(header)
    # One array element per merged range, with the range description as a trailing comment.
    for entry in parse_all():
        low, high, width, desc = entry
        print(f" CellWidthTableEntry({hex(low)}, {hex(high)}, {width}), // {desc}")
    print(")")


if __name__ == "__main__":
    main()

0 comments on commit 1cf64be

Please sign in to comment.