Skip to content

Commit

Permalink
Test and fix Bytes.is_valid_utf_8.
Browse files Browse the repository at this point in the history
  • Loading branch information
dbuenzli committed Oct 27, 2021
1 parent 08a8d9b commit 4ad5340
Show file tree
Hide file tree
Showing 2 changed files with 40 additions and 21 deletions.
35 changes: 21 additions & 14 deletions stdlib/bytes.ml
Original file line number Diff line number Diff line change
Expand Up @@ -674,48 +674,55 @@ let is_valid_utf_8 b =
| '\x00' .. '\x7F' -> loop max b (i + 1)
| '\xC2' .. '\xDF' ->
let last = i + 1 in
last > max
if last > max
|| not_in_x80_to_xBF (get b last)
|| loop max b (last + 1)
then false
else loop max b (last + 1)
| '\xE0' ->
let last = i + 2 in
last > max
if last > max
|| not_in_xA0_to_xBF (get b (i + 1))
|| not_in_x80_to_xBF (get b last)
|| loop max b (last + 1)
then false
else loop max b (last + 1)
| '\xE1' .. '\xEC' | '\xEE' .. '\xEF' ->
let last = i + 2 in
last > max
if last > max
|| not_in_x80_to_xBF (get b (i + 1))
|| not_in_x80_to_xBF (get b last)
|| loop max b (last + 1)
then false
else loop max b (last + 1)
| '\xED' ->
let last = i + 2 in
last > max
if last > max
|| not_in_x80_to_x9F (get b (i + 1))
|| not_in_x80_to_xBF (get b last)
|| loop max b (last + 1)
then false
else loop max b (last + 1)
| '\xF0' ->
let last = i + 3 in
last > max
if last > max
|| not_in_x90_to_xBF (get b (i + 1))
|| not_in_x80_to_xBF (get b (i + 2))
|| not_in_x80_to_xBF (get b last)
|| loop max b (last + 1)
then false
else loop max b (last + 1)
| '\xF1' .. '\xF3' ->
let last = i + 3 in
last > max
if last > max
|| not_in_x80_to_xBF (get b (i + 1))
|| not_in_x80_to_xBF (get b (i + 2))
|| not_in_x80_to_xBF (get b last)
|| loop max b (last + 1)
then false
else loop max b (last + 1)
| '\xF4' ->
let last = i + 3 in
last > max
if last > max
|| not_in_x80_to_x8F (get b (i + 1))
|| not_in_x80_to_xBF (get b (i + 2))
|| not_in_x80_to_xBF (get b last)
|| loop max b (last + 1)
then false
else loop max b (last + 1)
| _ -> false
in
loop (length b - 1) b 0
Expand Down
26 changes: 19 additions & 7 deletions testsuite/tests/lib-bytes-utf/test.ml
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,13 @@ let () =
assert (Bytes.set_utf_16le_uchar b 0 Uchar.max = 0 && Bytes.get b 0 = '\xab');
()

let () =
(* Test bug found during review *)
let b = Bytes.create 2 in
let () = Bytes.set_uint8 b 0 0xC3 in
let () = Bytes.set_uint8 b 1 0x00 in
assert (not (Bytes.is_valid_utf_8 b))

let () =
(* Test used bytes and replacement according to WHATWG recommendation.
This is just a recommendation.
Expand Down Expand Up @@ -215,17 +222,20 @@ let test_invalid_decodes () =
in
fold_uchars add (Sset.empty, Sset.empty, Sset.empty)
in
let test_seqs utf utf_encs get_utf_char =
let test_seqs utf utf_encs get_utf_char is_valid_utf =
let test seq =
let dec = get_utf_char seq 0 in
let valid = Uchar.utf_decode_is_valid dec in
let is_valid = is_valid_utf seq in
let is_enc = Sset.mem (Bytes.unsafe_to_string seq) utf_encs in
if not ((valid && is_enc) || (not valid && not is_enc)) then begin
if not ((valid && is_enc) || (not valid && not is_enc)) ||
not ((is_valid && is_enc) || (not is_valid && not is_enc))
then begin
for i = 0 to Bytes.length seq - 1 do
Printf.printf "%02X " (Bytes.get_uint8 seq i);
done;
Printf.printf "valid: %b is_encoding: %b decode: U+%04X\n" valid is_enc
(Uchar.to_int (Uchar.utf_decode_uchar dec));
Printf.printf "valid: %b is_encoding: %b decode: U+%04X\n is_valid:%b"
valid is_enc (Uchar.to_int (Uchar.utf_decode_uchar dec)) is_valid;
assert false
end;
valid
Expand Down Expand Up @@ -257,7 +267,9 @@ let test_invalid_decodes () =
end
done
in
test_seqs "UTF-8" utf_8_encs Bytes.get_utf_8_uchar;
test_seqs "UTF-16BE" utf_16be_encs Bytes.get_utf_16be_uchar;
test_seqs "UTF-16LE" utf_16le_encs Bytes.get_utf_16le_uchar;
test_seqs "UTF-8" utf_8_encs Bytes.get_utf_8_uchar Bytes.is_valid_utf_8;
test_seqs "UTF-16BE"
utf_16be_encs Bytes.get_utf_16be_uchar Bytes.is_valid_utf_16be;
test_seqs "UTF-16LE" utf_16le_encs Bytes.get_utf_16le_uchar
Bytes.is_valid_utf_16le;
()

0 comments on commit 4ad5340

Please sign in to comment.