Merge pull request #9448 from dra27/missing-bytes-string

Synchronise functions in Bytes and String
ocaml · Mar 15, 2021 · 3576548 · 3576548
2 parents cbe61cc + 34606a5
commit 3576548
Show file tree

Hide file tree

Showing 8 changed files with 531 additions and 0 deletions.
diff --git a/Changes b/Changes
@@ -101,6 +101,12 @@ Working version
 
 ### Standard library:
 
+- #9448: Add String.{empty,cat} as dual of Bytes.{empty,cat},
+  String.{of,to}_bytes as aliases of Bytes.{to,of}_string,
+  Bytes.split_on_char as dual of String.split_on_char, and binary decoding
+  functions in String to match those in Bytes.
+  (David Allsopp, review by Damien Doligez, Gabriel Scherer and others)
+
 - #9533: Added String.starts_with and String.ends_with.
   (Bernhard Schommer, review by Daniel Bünzli, Gabriel Scherer and
   Alain Frisch)

diff --git a/stdlib/bytes.ml b/stdlib/bytes.ml
@@ -236,6 +236,27 @@ let apply1 f s =
 let capitalize_ascii s = apply1 Char.uppercase_ascii s
 let uncapitalize_ascii s = apply1 Char.lowercase_ascii s
 
+(* duplicated in string.ml *)
+let starts_with ~prefix s =
+  let len_s = length s
+  and len_pre = length prefix in
+  let rec aux i =
+    if i = len_pre then true
+    else if unsafe_get s i <> unsafe_get prefix i then false
+    else aux (i + 1)
+  in len_s >= len_pre && aux 0
+
+(* duplicated in string.ml *)
+let ends_with ~suffix s =
+  let len_s = length s
+  and len_suf = length suffix in
+  let diff = len_s - len_suf in
+  let rec aux i =
+    if i = len_suf then true
+    else if unsafe_get s (diff + i) <> unsafe_get suffix i then false
+    else aux (i + 1)
+  in diff >= 0 && aux 0
+
 (* duplicated in string.ml *)
 let rec index_rec s lim i c =
   if i >= lim then raise Not_found else
@@ -322,6 +343,18 @@ type t = bytes
 let compare (x: t) (y: t) = Stdlib.compare x y
 external equal : t -> t -> bool = "caml_bytes_equal" [@@noalloc]
 
+(* duplicated in string.ml *)
+let split_on_char sep s =
+  let r = ref [] in
+  let j = ref (length s) in
+  for i = length s - 1 downto 0 do
+    if unsafe_get s i = sep then begin
+      r := sub s (i + 1) (!j - i - 1) :: !r;
+      j := i
+    end
+  done;
+  sub s 0 !j :: !r
+
 (* Deprecated functions implemented via other deprecated functions *)
 [@@@ocaml.warning "-3"]
 let uppercase s = map Char.uppercase s
@@ -371,6 +404,8 @@ let of_seq i =
 
 (** {6 Binary encoding/decoding of integers} *)
 
+(* The get_ functions are all duplicated in string.ml *)
+
 external get_uint8 : bytes -> int -> int = "%bytes_safe_get"
 external get_uint16_ne : bytes -> int -> int = "%caml_bytes_get16"
 external get_int32_ne : bytes -> int -> int32 = "%caml_bytes_get32"

diff --git a/stdlib/bytes.mli b/stdlib/bytes.mli
@@ -323,6 +323,19 @@ val equal: t -> t -> bool
 (** The equality function for byte sequences.
     @since 4.03.0 (4.05.0 in BytesLabels) *)
 
+val starts_with :
+  prefix (* comment thwarts tools/sync_stdlib_docs *) :bytes -> bytes -> bool
+(** [starts_with ][~][prefix s] is [true] if and only if [s] starts with
+    [prefix].
+
+    @since 4.13.0 *)
+
+val ends_with :
+  suffix (* comment thwarts tools/sync_stdlib_docs *) :bytes -> bytes -> bool
+(** [ends_with ][~][suffix s] is [true] if and only if [s] ends with [suffix].
+
+    @since 4.13.0 *)
+
 (** {1:unsafe Unsafe conversions (for advanced users)}
 
     This section describes unsafe, low-level conversion functions
@@ -453,6 +466,21 @@ let s = Bytes.of_string "hello"
 *)
 
 
+val split_on_char: char -> bytes -> bytes list
+(** [split_on_char sep s] returns the list of all (possibly empty)
+    subsequences of [s] that are delimited by the [sep] character.
+
+    The function's output is specified by the following invariants:
+
+    - The list is not empty.
+    - Concatenating its elements using [sep] as a separator returns a
+      byte sequence equal to the input ([Bytes.concat (Bytes.make 1 sep)
+      (Bytes.split_on_char sep s) = s]).
+    - No byte sequence in the result contains the [sep] character.
+
+    @since 4.13.0
+*)
+
 (** {1 Iterators} *)
 
 val to_seq : t -> char Seq.t

diff --git a/stdlib/bytesLabels.mli b/stdlib/bytesLabels.mli
@@ -323,6 +323,19 @@ val equal: t -> t -> bool
 (** The equality function for byte sequences.
     @since 4.03.0 (4.05.0 in BytesLabels) *)
 
+val starts_with :
+  prefix (* comment thwarts tools/sync_stdlib_docs *) :bytes -> bytes -> bool
+(** [starts_with ][~][prefix s] is [true] if and only if [s] starts with
+    [prefix].
+
+    @since 4.13.0 *)
+
+val ends_with :
+  suffix (* comment thwarts tools/sync_stdlib_docs *) :bytes -> bytes -> bool
+(** [ends_with ][~][suffix s] is [true] if and only if [s] ends with [suffix].
+
+    @since 4.13.0 *)
+
 (** {1:unsafe Unsafe conversions (for advanced users)}
 
     This section describes unsafe, low-level conversion functions
@@ -453,6 +466,21 @@ let s = Bytes.of_string "hello"
 *)
 
 
+val split_on_char: sep:char -> bytes -> bytes list
+(** [split_on_char ~sep s] returns the list of all (possibly empty)
+    subsequences of [s] that are delimited by the [sep] character.
+
+    The function's output is specified by the following invariants:
+
+    - The list is not empty.
+    - Concatenating its elements using [sep] as a separator returns a
+      byte sequence equal to the input ([Bytes.concat (Bytes.make 1 sep)
+      (Bytes.split_on_char sep s) = s]).
+    - No byte sequence in the result contains the [sep] character.
+
+    @since 4.13.0
+*)
+
 (** {1 Iterators} *)
 
 val to_seq : t -> char Seq.t

diff --git a/stdlib/string.ml b/stdlib/string.ml
@@ -41,8 +41,11 @@ let make n c =
   B.make n c |> bts
 let init n f =
   B.init n f |> bts
+let empty = ""
 let copy s =
   B.copy (bos s) |> bts
+let of_bytes = B.to_string
+let to_bytes = B.of_string
 let sub s ofs len =
   B.sub (bos s) ofs len |> bts
 let fill =
@@ -73,6 +76,8 @@ let concat sep = function
             (B.create (sum_lengths 0 seplen l))
             0 sep seplen l
 
+let cat = ( ^ )
+
 (* duplicated in bytes.ml *)
 let iter f s =
   for i = 0 to length s - 1 do f (unsafe_get s i) done
@@ -197,6 +202,7 @@ let capitalize_ascii s =
 let uncapitalize_ascii s =
   B.uncapitalize_ascii (bos s) |> bts
 
+(* duplicated in bytes.ml *)
 let starts_with ~prefix s =
   let len_s = length s
   and len_pre = length prefix in
@@ -206,6 +212,7 @@ let starts_with ~prefix s =
     else aux (i + 1)
   in len_s >= len_pre && aux 0
 
+(* duplicated in bytes.ml *)
 let ends_with ~suffix s =
   let len_s = length s
   and len_suf = length suffix in
@@ -216,6 +223,7 @@ let ends_with ~suffix s =
     else aux (i + 1)
   in diff >= 0 && aux 0
 
+(* duplicated in bytes.ml *)
 let split_on_char sep s =
   let r = ref [] in
   let j = ref (length s) in
@@ -250,3 +258,21 @@ let to_seq s = bos s |> B.to_seq
 let to_seqi s = bos s |> B.to_seqi
 
 let of_seq g = B.of_seq g |> bts
+
+(** {6 Binary decoding of integers} *)
+
+external get_uint8 : string -> int -> int = "%string_safe_get"
+external get_uint16_ne : string -> int -> int = "%caml_string_get16"
+external get_int32_ne : string -> int -> int32 = "%caml_string_get32"
+external get_int64_ne : string -> int -> int64 = "%caml_string_get64"
+
+let get_int8 s i = B.get_int8 (bos s) i
+let get_uint16_le s i = B.get_uint16_le (bos s) i
+let get_uint16_be s i = B.get_uint16_be (bos s) i
+let get_int16_ne s i = B.get_int16_ne (bos s) i
+let get_int16_le s i = B.get_int16_le (bos s) i
+let get_int16_be s i = B.get_int16_be (bos s) i
+let get_int32_le s i = B.get_int32_le (bos s) i
+let get_int32_be s i = B.get_int32_be (bos s) i
+let get_int64_le s i = B.get_int64_le (bos s) i
+let get_int64_be s i = B.get_int64_be (bos s) i
diff --git a/stdlib/string.mli b/stdlib/string.mli
@@ -95,6 +95,26 @@ val init : int -> (int -> char) -> string
     @raise Invalid_argument if [n < 0] or [n > ]{!Sys.max_string_length}.
     @since 4.02.0 *)
 
+val empty : string
+(** The empty string.
+
+    @since 4.13.0
+*)
+
+val of_bytes : bytes -> string
+(** Return a new string that contains the same bytes as the given byte
+    sequence.
+
+    @since 4.13.0
+*)
+
+val to_bytes : string -> bytes
+(** Return a new byte sequence that contains the same bytes as the given
+    string.
+
+    @since 4.13.0
+*)
+
 external length : string -> int = "%string_length"
 (** [length s] is the length (number of bytes/characters) of [s]. *)
 
@@ -116,6 +136,12 @@ val concat : string -> string list -> string
     @raise Invalid_argument if the result is longer than
     {!Sys.max_string_length} bytes. *)
 
+val cat : string -> string -> string
+(** [cat s1 s2] concatenates [s1] and [s2] ([s1 ^ s2]).
+
+    @since 4.13.0
+*)
+
 (** {1:predicates Predicates and comparisons} *)
 
 val equal : t -> t -> bool
@@ -398,6 +424,126 @@ val uncapitalize : string -> string
 
     @deprecated Functions operating on Latin-1 character set are deprecated. *)
 
+(** {1 Binary decoding of integers} *)
+
+(** The functions in this section binary decode integers from strings.
+
+    All following functions raise [Invalid_argument] if the characters
+    needed at index [i] to decode the integer are not available.
+
+    Little-endian (resp. big-endian) encoding means that least
+    (resp. most) significant bytes are stored first.  Big-endian is
+    also known as network byte order.  Native-endian encoding is
+    either little-endian or big-endian depending on {!Sys.big_endian}.
+
+    32-bit and 64-bit integers are represented by the [int32] and
+    [int64] types, which can be interpreted either as signed or
+    unsigned numbers.
+
+    8-bit and 16-bit integers are represented by the [int] type,
+    which has more bits than the binary encoding.  These extra bits
+    are sign-extended (or zero-extended) for functions which decode 8-bit
+    or 16-bit integers and represent them with [int] values.
+*)
+
+val get_uint8 : string -> int -> int
+(** [get_uint8 b i] is [b]'s unsigned 8-bit integer starting at character
+    index [i].
+
+    @since 4.13.0
+*)
+
+val get_int8 : string -> int -> int
+(** [get_int8 b i] is [b]'s signed 8-bit integer starting at character
+    index [i].
+
+    @since 4.13.0
+*)
+
+val get_uint16_ne : string -> int -> int
+(** [get_uint16_ne b i] is [b]'s native-endian unsigned 16-bit integer
+    starting at character index [i].
+
+    @since 4.13.0
+*)
+
+val get_uint16_be : string -> int -> int
+(** [get_uint16_be b i] is [b]'s big-endian unsigned 16-bit integer
+    starting at character index [i].
+
+    @since 4.13.0
+*)
+
+val get_uint16_le : string -> int -> int
+(** [get_uint16_le b i] is [b]'s little-endian unsigned 16-bit integer
+    starting at character index [i].
+
+    @since 4.13.0
+*)
+
+val get_int16_ne : string -> int -> int
+(** [get_int16_ne b i] is [b]'s native-endian signed 16-bit integer
+    starting at character index [i].
+
+    @since 4.13.0
+*)
+
+val get_int16_be : string -> int -> int
+(** [get_int16_be b i] is [b]'s big-endian signed 16-bit integer
+    starting at character index [i].
+
+    @since 4.13.0
+*)
+
+val get_int16_le : string -> int -> int
+(** [get_int16_le b i] is [b]'s little-endian signed 16-bit integer
+    starting at character index [i].
+
+    @since 4.13.0
+*)
+
+val get_int32_ne : string -> int -> int32
+(** [get_int32_ne b i] is [b]'s native-endian 32-bit integer
+    starting at character index [i].
+
+    @since 4.13.0
+*)
+
+val get_int32_be : string -> int -> int32
+(** [get_int32_be b i] is [b]'s big-endian 32-bit integer
+    starting at character index [i].
+
+    @since 4.13.0
+*)
+
+val get_int32_le : string -> int -> int32
+(** [get_int32_le b i] is [b]'s little-endian 32-bit integer
+    starting at character index [i].
+
+    @since 4.13.0
+*)
+
+val get_int64_ne : string -> int -> int64
+(** [get_int64_ne b i] is [b]'s native-endian 64-bit integer
+    starting at character index [i].
+
+    @since 4.13.0
+*)
+
+val get_int64_be : string -> int -> int64
+(** [get_int64_be b i] is [b]'s big-endian 64-bit integer
+    starting at character index [i].
+
+    @since 4.13.0
+*)
+
+val get_int64_le : string -> int -> int64
+(** [get_int64_le b i] is [b]'s little-endian 64-bit integer
+    starting at character index [i].
+
+    @since 4.13.0
+*)
+
 (**/**)
 
 (* The following is for system use only. Do not call directly. *)