Skip to content

Commit

Permalink
NEW Add function to manipulate emojis
Browse files Browse the repository at this point in the history
  • Loading branch information
eldy committed May 14, 2024
1 parent eacae62 commit 1b2bad3
Show file tree
Hide file tree
Showing 6 changed files with 175 additions and 69 deletions.
5 changes: 2 additions & 3 deletions htdocs/core/lib/functions.lib.php
Original file line number Diff line number Diff line change
Expand Up @@ -8118,13 +8118,12 @@ function dol_htmlwithnojs($stringtoencode, $nouseofiframesandbox = 0, $check = '
// like '<h1>Foo</h1><p>bar</p>' that wrongly ends up, without the trick, with '<h1>Foo<p>bar</p></h1>'
// like 'abc' that wrongly ends up, without the trick, with '<p>abc</p>'

// TODO Must accept emoji with MAIN_RESTRICTHTML_ONLY_VALID_HTML...

if (dol_textishtml($out)) {
$out = '<?xml encoding="UTF-8"><div class="tricktoremove">'.$out.'</div>';
} else {
$out = '<?xml encoding="UTF-8"><div class="tricktoremove">'.dol_nl2br($out).'</div>';
}

$dom->loadHTML($out, LIBXML_HTML_NODEFDTD | LIBXML_ERR_NONE | LIBXML_HTML_NOIMPLIED | LIBXML_NONET | LIBXML_NOWARNING | LIBXML_NOERROR | LIBXML_NOXMLDECL);
$out = trim($dom->saveHTML());

Expand Down Expand Up @@ -8195,7 +8194,7 @@ static function ($m) {
},
$out
);

var_dump($out);

Check warning on line 8197 in htdocs/core/lib/functions.lib.php

View workflow job for this annotation

GitHub Actions / phan / Run phan

functions.lib.php: NoVarDumpPlugin: var_dump() should be commented in submitted code

// Now we remove all remaining HTML entities starting with a number. We don't want such entities.
$out = preg_replace('/&#x?[0-9]+/i', '', $out); // For example if we have j&#x61vascript with an entities without the ; to hide the 'a' of 'javascript'.
Expand Down
59 changes: 59 additions & 0 deletions htdocs/core/lib/functions2.lib.php
Original file line number Diff line number Diff line change
Expand Up @@ -2982,3 +2982,62 @@ function removeGlobalParenthesis($string)

return $string;
}


/**
 * Return the Unicode ranges considered as the main known emojis.
 *
 * Each entry is a pair (first code point, last code point), both inclusive and both written
 * as hexadecimal strings without any prefix, so they can be injected into a \x{...} regex
 * escape or converted with hexdec().
 *
 * @return array<string,array<int,string>>	Array of emoji ranges in hexadecimal, indexed by a short range name
 */
function getArrayOfEmoji()
{
	// Note: the check mark emoji U+2705 (decimal 9989) is part of the Dingbats range. It must not be
	// added as '9989', because every value here is read as hexadecimal and U+9989 is a CJK ideograph.
	$arrayofcommonemoji = array(
		'misc' => array('2600', '26FF'),	// Miscellaneous Symbols
		'ding' => array('2700', '27BF'),	// Dingbats
		'vars' => array('FE00', 'FE0F'),	// Variation Selectors
		'pict' => array('1F300', '1F5FF'),	// Miscellaneous Symbols and Pictographs
		'emot' => array('1F600', '1F64F'),	// Emoticons
		'tran' => array('1F680', '1F6FF'),	// Transport and Map Symbols
		'flag' => array('1F1E0', '1F1FF'),	// Flags (note: may be 1F1E6 instead of 1F1E0)
		'supp' => array('1F900', '1F9FF'),	// Supplemental Symbols and Pictographs
	);

	return $arrayofcommonemoji;
}

/**
 * Remove emojis from a string.
 *
 * Note that to accept emojis in database, you must use utf8mb4, utf8mb3 is not enough.
 *
 * @param	string	$text			String to sanitize
 * @param	int		$allowedemoji	Mode to allow emoji:
 * 									0=no emoji (large removal of all chars from U+2600 to U+10FFFF),
 * 									1=remove the main known emojis returned by getArrayOfEmoji() (default),
 * 									2=keep only the main known (not implemented, text is returned unchanged),
 * 									3=accept all (text is returned unchanged)
 * @return	string					Sanitized string. If a regex fails (for example because $text is not valid UTF-8), the text is returned as it was before the failing replacement instead of being emptied.
 */
function removeEmoji($text, $allowedemoji = 1)
{
	$patterns = array();

	if ($allowedemoji == 0) {
		// For a large removal:
		$patterns[] = '/[\x{2600}-\x{FFFF}]/u';
		$patterns[] = '/[\x{10000}-\x{10FFFF}]/u';
	} elseif ($allowedemoji == 1) {
		// Delete emoji chars with a regex, one per known range
		// See https://www.unicode.org/emoji/charts/full-emoji-list.html
		foreach (getArrayOfEmoji() as $valarray) {
			$patterns[] = '/[\x{'.$valarray[0].'}-\x{'.$valarray[1].'}]/u';
		}
	}
	// Mode 2 is not yet implemented and mode 3 accepts everything: no pattern to apply in both cases.

	foreach ($patterns as $pattern) {
		$cleaned = preg_replace($pattern, '', $text);
		if ($cleaned === null) {
			// preg_replace() returns null on error (typically invalid UTF-8 with the /u modifier).
			// Stop here and keep the text rather than silently losing all the content.
			return $text;
		}
		$text = $cleaned;
	}

	return $text;
}
28 changes: 4 additions & 24 deletions htdocs/emailcollector/class/emailcollector.class.php
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@

require_once DOL_DOCUMENT_ROOT .'/core/class/commonobject.class.php';
require_once DOL_DOCUMENT_ROOT .'/core/lib/files.lib.php';
require_once DOL_DOCUMENT_ROOT .'/core/lib/functions2.lib.php';

require_once DOL_DOCUMENT_ROOT .'/comm/propal/class/propal.class.php'; // Customer Proposal
require_once DOL_DOCUMENT_ROOT .'/commande/class/commande.class.php'; // Sale Order
Expand Down Expand Up @@ -1785,7 +1786,7 @@ public function doCollectOneCollector($mode = 0)
dol_syslog("msgid=".$overview['message_id']." date=".dol_print_date($overview['date'], 'dayrfc', 'gmt')." from=".$overview['from']." to=".$overview['to']." subject=".$overview['subject']);

// Removed emojis
$overview['subject'] = preg_replace('/[\x{10000}-\x{10FFFF}]/u', "\xEF\xBF\xBD", $overview['subject']);
$overview['subject'] = removeEmoji($overview['subject'], getDolGlobalInt('MAIN_EMAIL_COLLECTOR_ACCEPT_EMOJIS', 1));
} else {
dol_syslog("msgid=".$overview[0]->message_id." date=".dol_print_date($overview[0]->udate, 'dayrfc', 'gmt')." from=".$overview[0]->from." to=".$overview[0]->to." subject=".$overview[0]->subject);

Expand All @@ -1794,7 +1795,7 @@ public function doCollectOneCollector($mode = 0)
$overview[0]->from = $this->decodeSMTPSubject($overview[0]->from);

// Removed emojis
$overview[0]->subject = preg_replace('/[\x{10000}-\x{10FFFF}]/u', "\xEF\xBF\xBD", $overview[0]->subject);
$overview[0]->subject = removeEmoji($overview[0]->subject, getDolGlobalInt('MAIN_EMAIL_COLLECTOR_ACCEPT_EMOJIS', 1));
}
// GET IMAP email structure/content
global $htmlmsg, $plainmsg, $charset, $attachments;
Expand Down Expand Up @@ -1825,8 +1826,7 @@ public function doCollectOneCollector($mode = 0)
// Removed emojis

if (utf8_valid($messagetext)) {
//$messagetext = preg_replace('/[\x{10000}-\x{10FFFF}]/u', "\xEF\xBF\xBD", $messagetext);
$messagetext = $this->removeEmoji($messagetext);
$messagetext = removeEmoji($messagetext, getDolGlobalInt('MAIN_EMAIL_COLLECTOR_ACCEPT_EMOJIS', 1));
} else {
$operationslog .= '<br>Discarded - Email body is not valid utf8';
dol_syslog(" Discarded - Email body is not valid utf8");
Expand Down Expand Up @@ -3714,26 +3714,6 @@ protected function decodeSMTPSubject($subject)
return $subject;
}

/**
* Remove emoji from email content by deleting every character that falls
* into a fixed list of Unicode ranges.
*
* @param string $text String to sanitize
* @return string Sanitized string
*/
protected function removeEmoji($text)
{
// Delete the emoji characters using regular expressions, one call per Unicode range
$text = preg_replace('/[\x{1F600}-\x{1F64F}]/u', '', $text); // Emoticons
$text = preg_replace('/[\x{1F300}-\x{1F5FF}]/u', '', $text); // Miscellaneous Symbols and Pictographs
$text = preg_replace('/[\x{1F680}-\x{1F6FF}]/u', '', $text); // Transport and Map Symbols
$text = preg_replace('/[\x{2600}-\x{26FF}]/u', '', $text); // Miscellaneous Symbols
$text = preg_replace('/[\x{2700}-\x{27BF}]/u', '', $text); // Dingbats
$text = preg_replace('/[\x{1F900}-\x{1F9FF}]/u', '', $text); // Supplemental Symbols and Pictographs
$text = preg_replace('/[\x{1F1E0}-\x{1F1FF}]/u', '', $text); // Flags

return $text;
}

/**
* saveAttachment
*
Expand Down
18 changes: 15 additions & 3 deletions htdocs/main.inc.php
Original file line number Diff line number Diff line change
Expand Up @@ -58,23 +58,35 @@
* Return the real char for a numeric entities.
* WARNING: This function is required by testSqlAndScriptInject() and the GETPOST 'restricthtml'. Regex calling must be similar.
*
* @param string $matches String of numeric entity
* @return string New value
* @param array<int,string> $matches Array with the full numeric entity in key 0 and the value without the &# in key 1

Check warning on line 61 in htdocs/main.inc.php

View workflow job for this annotation

GitHub Actions / phan / Run phan

main.inc.php: PhanUnextractableAnnotationElementName: Saw possibly unextractable annotation for a fragment of comment '* @param array&lt;int:string&gt; $matches Array with a decimal numeric entity into key 0, value without the &amp;# into the key 1': after array, did not see an element name (will guess based on comment order)

Check warning on line 61 in htdocs/main.inc.php

View workflow job for this annotation

GitHub Actions / phan / Run phan

main.inc.php: PhanUnextractableAnnotationSuffix: Saw a token Phan may have failed to parse after '* @param array&lt;int:string&gt; $matches Array with a decimal numeric entity into key 0, value without the &amp;# into the key 1': after array, saw '&lt;'
* @return string New value
*/
function realCharForNumericEntities($matches)
{
	// Keep only the numeric part of the entity: the trailing ';' is optional in the calling regex
	$newstringnumentity = preg_replace('/;$/', '', $matches[1]);
	//print ' $newstringnumentity='.$newstringnumentity;

	if (preg_match('/^x/i', $newstringnumentity)) {		// if numeric is hexadecimal
		$newstringnumentity = hexdec(preg_replace('/^x/i', '', $newstringnumentity));
	} else {
		$newstringnumentity = (int) $newstringnumentity;
	}

	// The numeric value we don't want as entities because they encode ascii char, and why using html entities on ascii except for hacking ?
	if (($newstringnumentity >= 65 && $newstringnumentity <= 90) || ($newstringnumentity >= 97 && $newstringnumentity <= 122)) {
		return chr((int) $newstringnumentity);
	}

	// The numeric value we want in UTF8 instead of entities because it is emoji
	include_once DOL_DOCUMENT_ROOT.'/core/lib/functions2.lib.php';
	$arrayofemojis = getArrayOfEmoji();
	foreach ($arrayofemojis as $valarray) {
		if ($newstringnumentity >= hexdec($valarray[0]) && $newstringnumentity <= hexdec($valarray[1])) {
			// This is a known emoji.
			// The entity is rebuilt in its canonical decimal form with the final ';' because html_entity_decode()
			// leaves a numeric entity without ';' untouched, and $matches[0] may come without it.
			return html_entity_decode('&#'.((int) $newstringnumentity).';', ENT_COMPAT | ENT_HTML5, 'UTF-8');
		}
	}

	return '&#'.$matches[1]; // Value will be unchanged because regex was /&#(  )/
}

Expand Down
30 changes: 29 additions & 1 deletion test/phpunit/Functions2LibTest.php
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
<?php
/* Copyright (C) 2010-2012 Laurent Destailleur <eldy@users.sourceforge.net>
* Copyright (C) 2023 Alexandre Janniaux <alexandre.janniaux@gmail.com>
* Copyright (C) 2024 MDW <mdeweerd@users.noreply.github.com>
* Copyright (C) 2024 MDW <mdeweerd@users.noreply.github.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
Expand Down Expand Up @@ -92,6 +92,8 @@ public function testJsUnEscape()
*/
public function testIsValidMailDomain()
{
print __METHOD__."\n";

$mail = 'bidon@invalid.invalid';
$result = isValidMailDomain($mail);
$this->assertEquals(0, $result, 'Email isValidMailDomain('.$mail.') should return 0 (not valid) but returned '.$result);
Expand All @@ -108,6 +110,8 @@ public function testIsValidMailDomain()
*/
public function testIsValidUrl()
{
print __METHOD__."\n";

//Simple check
$result = isValidUrl('http://google.com');
$this->assertEquals(1, $result);
Expand Down Expand Up @@ -283,6 +287,30 @@ public function testNumeroSemaine($time_str, $expected_week)
{
$time = strtotime($time_str);
$str = date(DATE_ATOM, $time).PHP_EOL;
print __METHOD__." time=".$time."\n";
$this->assertEquals($expected_week, numero_semaine($time), "Computed week incorrect for $str");
}


/**
 * Test removeEmoji() for each value of the $allowedemoji mode
 *
 * @return void
 */
public function testRemoveEmoji()
{
	print __METHOD__."\n";

	$text = 'abc ✅ def';

	// Only the emoji itself is removed, so the space before and the space after it
	// are both kept: the expected string contains two consecutive spaces.
	$result = removeEmoji($text, 0);
	$this->assertEquals('abc  def', $result, 'testRemoveEmoji 0');

	$result = removeEmoji($text, 1);
	$this->assertEquals('abc  def', $result, 'testRemoveEmoji 1');

	// Mode 2 is not implemented yet: the text must come back unchanged
	$result = removeEmoji($text, 2);
	$this->assertEquals($text, $result, 'testRemoveEmoji 2');

	// Mode 3 accepts all emojis: the text must come back unchanged
	$result = removeEmoji($text, 3);
	$this->assertEquals($text, $result, 'testRemoveEmoji 3');
}
}
104 changes: 66 additions & 38 deletions test/phpunit/SecurityTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -1110,44 +1110,6 @@ public function testDolEval()
$this->assertStringContainsString('Bad string syntax to evaluate', $result);
}

/**
* testDolHtmlWithNoJs()
*
* Check that dol_htmlwithnojs() returns a text containing an emoji unchanged
* when MAIN_RESTRICTHTML_ONLY_VALID_HTML is 0 and MAIN_RESTRICTHTML_ONLY_VALID_HTML_TIDY is 1.
*
* @return int
*/
public function testDolHtmlWithNoJs()
{
global $conf;

// Save the current value of the two options so they can be put back after the call
$sav1 = $conf->global->MAIN_RESTRICTHTML_ONLY_VALID_HTML;
$sav2 = $conf->global->MAIN_RESTRICTHTML_ONLY_VALID_HTML_TIDY;

// Test with an emoji
$test = 'abc ✅ def';

// Force the options for the call, then restore the saved values before asserting
$conf->global->MAIN_RESTRICTHTML_ONLY_VALID_HTML = 0;
$conf->global->MAIN_RESTRICTHTML_ONLY_VALID_HTML_TIDY = 1;
$result = dol_htmlwithnojs($test);
$conf->global->MAIN_RESTRICTHTML_ONLY_VALID_HTML = $sav1;
$conf->global->MAIN_RESTRICTHTML_ONLY_VALID_HTML_TIDY = $sav2;

print __METHOD__." result for dol_htmlwithnojs and MAIN_RESTRICTHTML_ONLY_VALID_HTML=0 with emoji = ".$result."\n";
$this->assertEquals($test, $result, 'dol_htmlwithnojs failed with an emoji when MAIN_RESTRICTHTML_ONLY_VALID_HTML=0');

// Disabled case: same check with MAIN_RESTRICTHTML_ONLY_VALID_HTML=1
/*
$conf->global->MAIN_RESTRICTHTML_ONLY_VALID_HTML = 1;
$conf->global->MAIN_RESTRICTHTML_ONLY_VALID_HTML_TIDY = 1;
$result = dol_htmlwithnojs($test);
$conf->global->MAIN_RESTRICTHTML_ONLY_VALID_HTML = $sav1;
$conf->global->MAIN_RESTRICTHTML_ONLY_VALID_HTML_TIDY = $sav2;
print __METHOD__." result for dol_htmlwithnojs and MAIN_RESTRICTHTML_ONLY_VALID_HTML=1 with emoji = ".$result."\n";
$this->assertEquals($test, $result, 'dol_htmlwithnojs failed with an emoji when MAIN_RESTRICTHTML_ONLY_VALID_HTML=1');
*/

return 0;
}

/**
* testDolPrintHTML.
* This method include calls to dol_htmlwithnojs()
Expand Down Expand Up @@ -1246,4 +1208,70 @@ public function testCheckLoginPassEntity()
print __METHOD__." login=".$login."\n";
$this->assertEquals('', $login, 'Error'); // Expected '' because should failed because login 'auto' does not exists
}


/**
 * testRealCharforNumericEntities()
 *
 * Check the value returned by realCharForNumericEntities() for an ASCII letter,
 * for an emoji (decimal and hexadecimal form) and for an entity that must stay encoded.
 *
 * @return int
 */
public function testRealCharforNumericEntities()
{
	// Each case is: array($matches given to the function, expected result).
	// In $matches, key 0 is the full entity and key 1 is the entity without the leading '&#'.
	$cases = array(
		// Must return an ascii char when code is inside the ascii letters range
		'ascii letter, decimal' => array(array(0 => '&#97;', 1 => '97;'), 'a'),
		// Must return an emoji utf8 char when code is inside an emoji range
		'emoji, decimal' => array(array(0 => '&#9989;', 1 => '9989;'), '✅'),
		'emoji, hexadecimal' => array(array(0 => '&#x2705;', 1 => 'x2705;'), '✅'),
		// Must return the entity unchanged when code is neither an ascii letter nor a known emoji
		'other char, decimal' => array(array(0 => '&#233;', 1 => '233;'), '&#233;'),
	);

	foreach ($cases as $label => $case) {
		$result = realCharForNumericEntities($case[0]);
		$this->assertEquals($case[1], $result, 'realCharForNumericEntities failed for case: '.$label);
	}

	return 0;
}


/**
 * testDolHtmlWithNoJs()
 *
 * Check that dol_htmlwithnojs() returns a text containing an emoji unchanged,
 * with MAIN_RESTRICTHTML_ONLY_VALID_HTML disabled and then enabled.
 *
 * @return int
 */
public function testDolHtmlWithNoJs()
{
	global $conf;

	// The options may not be defined at all, so read them without triggering a warning
	$sav1 = $conf->global->MAIN_RESTRICTHTML_ONLY_VALID_HTML ?? null;
	$sav2 = $conf->global->MAIN_RESTRICTHTML_ONLY_VALID_HTML_TIDY ?? null;

	// Test with an emoji
	$test = 'abc ✅ def';

	// Each case is: array(value for MAIN_RESTRICTHTML_ONLY_VALID_HTML, value for MAIN_RESTRICTHTML_ONLY_VALID_HTML_TIDY)
	$cases = array(
		array(0, 1),
		array(1, 0),
	);

	foreach ($cases as $case) {
		try {
			$conf->global->MAIN_RESTRICTHTML_ONLY_VALID_HTML = $case[0];
			$conf->global->MAIN_RESTRICTHTML_ONLY_VALID_HTML_TIDY = $case[1];
			$result = dol_htmlwithnojs($test);
		} finally {
			// Always put the configuration back, even if the call fails, so other tests are not impacted
			$conf->global->MAIN_RESTRICTHTML_ONLY_VALID_HTML = $sav1;
			$conf->global->MAIN_RESTRICTHTML_ONLY_VALID_HTML_TIDY = $sav2;
		}

		print __METHOD__." result for dol_htmlwithnojs and MAIN_RESTRICTHTML_ONLY_VALID_HTML=".$case[0]." with emoji = ".$result."\n";
		$this->assertEquals($test, $result, 'dol_htmlwithnojs failed with an emoji when MAIN_RESTRICTHTML_ONLY_VALID_HTML='.$case[0]);
	}

	return 0;
}
}

0 comments on commit 1b2bad3

Please sign in to comment.