Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Synchronise functions in Bytes and String #9448

Merged
merged 6 commits into from
Mar 15, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
6 changes: 6 additions & 0 deletions Changes
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,12 @@ Working version

### Standard library:

- #9448: Add String.{empty,cat} as dual of Bytes.{empty,cat},
String.{of,to}_bytes as aliases of Bytes.{to,of}_string,
Bytes.split_on_char as dual of String.split_on_char, and binary decoding
functions in String to match those in Bytes.
(David Allsopp, review by Damien Doligez, Gabriel Scherer and others)

- #9533: Added String.starts_with and String.ends_with.
(Bernhard Schommer, review by Daniel Bünzli, Gabriel Scherer and
Alain Frisch)
Expand Down
35 changes: 35 additions & 0 deletions stdlib/bytes.ml
Original file line number Diff line number Diff line change
Expand Up @@ -236,6 +236,27 @@ let apply1 f s =
(* Both helpers hand the matching ASCII case-conversion function from [Char]
   to [apply1], which decides where in the byte sequence it is applied. *)
let capitalize_ascii b = b |> apply1 Char.uppercase_ascii
let uncapitalize_ascii b = b |> apply1 Char.lowercase_ascii

(* duplicated in string.ml *)
(* [starts_with ~prefix s] holds when the first [length prefix] bytes of [s]
   are exactly the bytes of [prefix]. The length test runs first, so the
   unchecked reads in [matches_from] never go past the end of [s]. *)
let starts_with ~prefix s =
  let n = length prefix in
  let rec matches_from k =
    k = n
    || (unsafe_get s k = unsafe_get prefix k && matches_from (k + 1))
  in
  n <= length s && matches_from 0

(* duplicated in string.ml *)
(* [ends_with ~suffix s] holds when the last [length suffix] bytes of [s]
   are exactly the bytes of [suffix]. [shift] is the offset in [s] at which
   the candidate suffix begins; it is negative when [suffix] is longer than
   [s], and that is checked before any unchecked read happens. *)
let ends_with ~suffix s =
  let n = length suffix in
  let shift = length s - n in
  let rec matches_from k =
    k = n
    || (unsafe_get s (shift + k) = unsafe_get suffix k
        && matches_from (k + 1))
  in
  shift >= 0 && matches_from 0

(* duplicated in string.ml *)
let rec index_rec s lim i c =
if i >= lim then raise Not_found else
Expand Down Expand Up @@ -322,6 +343,18 @@ type t = bytes
(* Comparison of byte sequences: the polymorphic [Stdlib.compare], with both
   arguments constrained to [t]. *)
let compare (x: t) (y: t) = Stdlib.compare x y
(* Equality is bound directly to the runtime primitive "caml_bytes_equal";
   [@@noalloc] declares that this C stub performs no OCaml allocation. *)
external equal : t -> t -> bool = "caml_bytes_equal" [@@noalloc]

(* duplicated in string.ml *)
(* [split_on_char sep s] cuts [s] at every occurrence of [sep] and returns
   the pieces in left-to-right order (the separators themselves are dropped).

   The sequence is scanned from its last byte down to its first so that the
   pieces can be consed onto [acc] in their final order. [stop] is the index
   just past the end of the piece currently being delimited. *)
let split_on_char sep s =
  let rec scan pos stop acc =
    if pos < 0 then
      (* Whatever precedes the left-most separator is the first piece. *)
      sub s 0 stop :: acc
    else if unsafe_get s pos = sep then
      scan (pos - 1) pos (sub s (pos + 1) (stop - pos - 1) :: acc)
    else
      scan (pos - 1) stop acc
  in
  scan (length s - 1) (length s) []
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is literally a copy of String.split_on_char, since it lexically draws in Bytes.sub and Bytes.unsafe_get instead.


(* Deprecated functions implemented via other deprecated functions *)
(* Warning 3 is the compiler's deprecation warning. It is switched off from
   this point on so that the deprecated functions used below do not trigger
   it. *)
[@@@ocaml.warning "-3"]
(* Passes the deprecated [Char.uppercase] to [map] over [s]. *)
let uppercase s = map Char.uppercase s
Expand Down Expand Up @@ -371,6 +404,8 @@ let of_seq i =

(** {6 Binary encoding/decoding of integers} *)

(* The get_ functions are all duplicated in string.ml *)

(* Native-endian readers bound directly to compiler primitives (the
   "%"-prefixed names). *)

(* Uses the "%bytes_safe_get" primitive, declared here with an [int] result
   so the byte is read as an unsigned 8-bit integer. *)
external get_uint8 : bytes -> int -> int = "%bytes_safe_get"
(* 16-bit native-endian read, returned as an [int]. *)
external get_uint16_ne : bytes -> int -> int = "%caml_bytes_get16"
(* 32-bit native-endian read, returned as an [int32]. *)
external get_int32_ne : bytes -> int -> int32 = "%caml_bytes_get32"
Expand Down
28 changes: 28 additions & 0 deletions stdlib/bytes.mli
Original file line number Diff line number Diff line change
Expand Up @@ -323,6 +323,19 @@ val equal: t -> t -> bool
(** The equality function for byte sequences.
@since 4.03.0 (4.05.0 in BytesLabels) *)

val starts_with :
prefix (* comment thwarts tools/sync_stdlib_docs *) :bytes -> bytes -> bool
(** [starts_with ][~][prefix s] is [true] if and only if [s] starts with
    [prefix]. In particular, an empty [prefix] is a prefix of every [s].

    @since 4.13.0 *)

val ends_with :
suffix (* comment thwarts tools/sync_stdlib_docs *) :bytes -> bytes -> bool
(** [ends_with ][~][suffix s] is [true] if and only if [s] ends with
    [suffix]. In particular, an empty [suffix] is a suffix of every [s].

    @since 4.13.0 *)

(** {1:unsafe Unsafe conversions (for advanced users)}

This section describes unsafe, low-level conversion functions
Expand Down Expand Up @@ -453,6 +466,21 @@ let s = Bytes.of_string "hello"
*)


val split_on_char: char -> bytes -> bytes list
(** [split_on_char sep s] returns the list of all (possibly empty)
    subsequences of [s] that are delimited by the [sep] character.

    The function's output is specified by the following invariants:

    - The list is not empty (an empty [s] yields a single empty byte
      sequence).
    - Concatenating its elements using [sep] as a separator returns a
      byte sequence equal to the input ([Bytes.concat (Bytes.make 1 sep)
      (Bytes.split_on_char sep s) = s]).
    - No byte sequence in the result contains the [sep] character.

    @since 4.13.0
*)

(** {1 Iterators} *)

val to_seq : t -> char Seq.t
Expand Down
28 changes: 28 additions & 0 deletions stdlib/bytesLabels.mli
Original file line number Diff line number Diff line change
Expand Up @@ -323,6 +323,19 @@ val equal: t -> t -> bool
(** The equality function for byte sequences.
@since 4.03.0 (4.05.0 in BytesLabels) *)

val starts_with :
prefix (* comment thwarts tools/sync_stdlib_docs *) :bytes -> bytes -> bool
(** [starts_with ][~][prefix s] is [true] if and only if [s] starts with
    [prefix]. In particular, an empty [prefix] is a prefix of every [s].

    @since 4.13.0 *)

val ends_with :
suffix (* comment thwarts tools/sync_stdlib_docs *) :bytes -> bytes -> bool
(** [ends_with ][~][suffix s] is [true] if and only if [s] ends with
    [suffix]. In particular, an empty [suffix] is a suffix of every [s].

    @since 4.13.0 *)

(** {1:unsafe Unsafe conversions (for advanced users)}

This section describes unsafe, low-level conversion functions
Expand Down Expand Up @@ -453,6 +466,21 @@ let s = Bytes.of_string "hello"
*)


val split_on_char: sep:char -> bytes -> bytes list
(** [split_on_char ~sep s] returns the list of all (possibly empty)
    subsequences of [s] that are delimited by the [sep] character.

    The function's output is specified by the following invariants:

    - The list is not empty (an empty [s] yields a single empty byte
      sequence).
    - Concatenating its elements using [sep] as a separator returns a
      byte sequence equal to the input ([Bytes.concat (Bytes.make 1 sep)
      (Bytes.split_on_char ~sep s) = s]).
    - No byte sequence in the result contains the [sep] character.

    @since 4.13.0
*)

(** {1 Iterators} *)

val to_seq : t -> char Seq.t
Expand Down
26 changes: 26 additions & 0 deletions stdlib/string.ml
Original file line number Diff line number Diff line change
Expand Up @@ -41,8 +41,11 @@ let make n c =
B.make n c |> bts
(* These String operations are thin wrappers over their Bytes counterparts
   in [B]. [bos] and [bts] are defined earlier in this file; presumably they
   are the string->bytes and bytes->string casts -- confirm against their
   definitions. *)
let init n f = bts (B.init n f)

(* The empty string. *)
let empty = ""

let copy s = bts (B.copy (bos s))

(* Conversions to and from bytes, named from String's point of view. *)
let of_bytes b = B.to_string b
let to_bytes s = B.of_string s

let sub s ofs len = bts (B.sub (bos s) ofs len)
let fill =
Expand Down Expand Up @@ -73,6 +76,8 @@ let concat sep = function
(B.create (sum_lengths 0 seplen l))
0 sep seplen l

(* [cat s1 s2] is [s1 ^ s2]; it mirrors [Bytes.cat]. *)
let cat s1 s2 = s1 ^ s2

(* duplicated in bytes.ml *)
(* [iter f s] calls [f] on each character of [s] in increasing index order.
   The length is read once up front, and every index handed to [unsafe_get]
   is below it. *)
let iter f s =
  let n = length s in
  let rec go i =
    if i < n then begin
      f (unsafe_get s i);
      go (i + 1)
    end
  in
  go 0
Expand Down Expand Up @@ -197,6 +202,7 @@ let capitalize_ascii s =
(* Runs the Bytes implementation on [bos s] and passes the result through
   [bts] to get a string back. *)
let uncapitalize_ascii s = bts (B.uncapitalize_ascii (bos s))

(* duplicated in bytes.ml *)
let starts_with ~prefix s =
let len_s = length s
and len_pre = length prefix in
Expand All @@ -206,6 +212,7 @@ let starts_with ~prefix s =
else aux (i + 1)
in len_s >= len_pre && aux 0

(* duplicated in bytes.ml *)
let ends_with ~suffix s =
let len_s = length s
and len_suf = length suffix in
Expand All @@ -216,6 +223,7 @@ let ends_with ~suffix s =
else aux (i + 1)
in diff >= 0 && aux 0

(* duplicated in bytes.ml *)
let split_on_char sep s =
let r = ref [] in
let j = ref (length s) in
Expand Down Expand Up @@ -250,3 +258,21 @@ let to_seq s = bos s |> B.to_seq
(* Sequence of (index, character) pairs, produced by the Bytes iterator
   applied to [bos s]. *)
let to_seqi s = B.to_seqi (bos s)

(* Builds the bytes with [B.of_seq], then passes the result through [bts]. *)
let of_seq g = bts (B.of_seq g)

(** {6 Binary decoding of integers} *)

(* Native-endian readers bound directly to the string flavour of the compiler
   primitives. Only the primitive names differ from the corresponding
   declarations in bytes.ml. *)

(* Uses the "%string_safe_get" primitive, declared here with an [int] result
   so the character is read as an unsigned 8-bit integer. *)
external get_uint8 : string -> int -> int = "%string_safe_get"
(* 16-bit native-endian read, returned as an [int]. *)
external get_uint16_ne : string -> int -> int = "%caml_string_get16"
(* 32-bit native-endian read, returned as an [int32]. *)
external get_int32_ne : string -> int -> int32 = "%caml_string_get32"
(* 64-bit native-endian read, returned as an [int64]. *)
external get_int64_ne : string -> int -> int64 = "%caml_string_get64"
Comment on lines +264 to +267
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The runtime already has the correctly-named primitives for the string_ versions (the names of these four primitives constitute the only difference in the code between the String and Bytes versions)


(* The remaining decoders have no primitive of their own: each one passes
   [bos str] to the Bytes function of the same name, reading at index
   [pos]. *)

(* 8-bit, signed *)
let get_int8 str pos = B.get_int8 (bos str) pos

(* 16-bit, unsigned *)
let get_uint16_le str pos = B.get_uint16_le (bos str) pos
let get_uint16_be str pos = B.get_uint16_be (bos str) pos

(* 16-bit, signed *)
let get_int16_ne str pos = B.get_int16_ne (bos str) pos
let get_int16_le str pos = B.get_int16_le (bos str) pos
let get_int16_be str pos = B.get_int16_be (bos str) pos

(* 32-bit *)
let get_int32_le str pos = B.get_int32_le (bos str) pos
let get_int32_be str pos = B.get_int32_be (bos str) pos

(* 64-bit *)
let get_int64_le str pos = B.get_int64_le (bos str) pos
let get_int64_be str pos = B.get_int64_be (bos str) pos
146 changes: 146 additions & 0 deletions stdlib/string.mli
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,26 @@ val init : int -> (int -> char) -> string
@raise Invalid_argument if [n < 0] or [n > ]{!Sys.max_string_length}.
@since 4.02.0 *)

val empty : string
(** The empty string ([""]).

    @since 4.13.0
*)

val of_bytes : bytes -> string
(** [of_bytes b] returns a new string that contains the same bytes as the
    byte sequence [b]. This is an alias of {!Bytes.to_string}.

    @since 4.13.0
*)

val to_bytes : string -> bytes
(** [to_bytes s] returns a new byte sequence that contains the same bytes
    as the string [s]. This is an alias of {!Bytes.of_string}.

    @since 4.13.0
*)

external length : string -> int = "%string_length"
(** [length s] is the length (number of bytes/characters) of [s]. *)

Expand All @@ -116,6 +136,12 @@ val concat : string -> string list -> string
@raise Invalid_argument if the result is longer than
{!Sys.max_string_length} bytes. *)

val cat : string -> string -> string
(** [cat s1 s2] concatenates [s1] and [s2] ([s1 ^ s2]).

    @since 4.13.0
*)

(** {1:predicates Predicates and comparisons} *)

val equal : t -> t -> bool
Expand Down Expand Up @@ -398,6 +424,126 @@ val uncapitalize : string -> string

@deprecated Functions operating on Latin-1 character set are deprecated. *)

(** {1 Binary decoding of integers} *)

(** The functions in this section binary decode integers from strings.

All following functions raise [Invalid_argument] if the characters
needed at index [i] to decode the integer are not available.

Little-endian (resp. big-endian) encoding means that least
(resp. most) significant bytes are stored first. Big-endian is
also known as network byte order. Native-endian encoding is
either little-endian or big-endian depending on {!Sys.big_endian}.

32-bit and 64-bit integers are represented by the [int32] and
[int64] types, which can be interpreted either as signed or
unsigned numbers.

8-bit and 16-bit integers are represented by the [int] type,
which has more bits than the binary encoding. These extra bits
are sign-extended (or zero-extended) for functions which decode 8-bit
or 16-bit integers and represent them with [int] values.
*)

val get_uint8 : string -> int -> int
(** [get_uint8 s i] is [s]'s unsigned 8-bit integer starting at character
    index [i].

    @since 4.13.0
*)

val get_int8 : string -> int -> int
(** [get_int8 s i] is [s]'s signed 8-bit integer starting at character
    index [i].

    @since 4.13.0
*)

val get_uint16_ne : string -> int -> int
(** [get_uint16_ne s i] is [s]'s native-endian unsigned 16-bit integer
    starting at character index [i].

    @since 4.13.0
*)

val get_uint16_be : string -> int -> int
(** [get_uint16_be s i] is [s]'s big-endian unsigned 16-bit integer
    starting at character index [i].

    @since 4.13.0
*)

val get_uint16_le : string -> int -> int
(** [get_uint16_le s i] is [s]'s little-endian unsigned 16-bit integer
    starting at character index [i].

    @since 4.13.0
*)

val get_int16_ne : string -> int -> int
(** [get_int16_ne s i] is [s]'s native-endian signed 16-bit integer
    starting at character index [i].

    @since 4.13.0
*)

val get_int16_be : string -> int -> int
(** [get_int16_be s i] is [s]'s big-endian signed 16-bit integer
    starting at character index [i].

    @since 4.13.0
*)

val get_int16_le : string -> int -> int
(** [get_int16_le s i] is [s]'s little-endian signed 16-bit integer
    starting at character index [i].

    @since 4.13.0
*)

val get_int32_ne : string -> int -> int32
(** [get_int32_ne s i] is [s]'s native-endian 32-bit integer
    starting at character index [i].

    @since 4.13.0
*)

val get_int32_be : string -> int -> int32
(** [get_int32_be s i] is [s]'s big-endian 32-bit integer
    starting at character index [i].

    @since 4.13.0
*)

val get_int32_le : string -> int -> int32
(** [get_int32_le s i] is [s]'s little-endian 32-bit integer
    starting at character index [i].

    @since 4.13.0
*)

val get_int64_ne : string -> int -> int64
(** [get_int64_ne s i] is [s]'s native-endian 64-bit integer
    starting at character index [i].

    @since 4.13.0
*)

val get_int64_be : string -> int -> int64
(** [get_int64_be s i] is [s]'s big-endian 64-bit integer
    starting at character index [i].

    @since 4.13.0
*)

val get_int64_le : string -> int -> int64
(** [get_int64_le s i] is [s]'s little-endian 64-bit integer
    starting at character index [i].

    @since 4.13.0
*)

(**/**)

(* The following is for system use only. Do not call directly. *)
Expand Down