Skip to content

Commit

Permalink
Merge pull request #9448 from dra27/missing-bytes-string
Browse files Browse the repository at this point in the history
Synchronise functions in Bytes and String
  • Loading branch information
dra27 committed Mar 15, 2021
2 parents cbe61cc + 34606a5 commit 3576548
Show file tree
Hide file tree
Showing 8 changed files with 531 additions and 0 deletions.
6 changes: 6 additions & 0 deletions Changes
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,12 @@ Working version

### Standard library:

- #9448: Add String.{empty,cat} as dual of Bytes.{empty,cat},
String.{of,to}_bytes as aliases of Bytes.{to,of}_string,
Bytes.split_on_char as dual of String.split_on_char, and binary decoding
functions in String to match those in Bytes.
(David Allsopp, review by Damien Doligez, Gabriel Scherer and others)

- #9533: Added String.starts_with and String.ends_with.
(Bernhard Schommer, review by Daniel Bünzli, Gabriel Scherer and
Alain Frisch)
Expand Down
35 changes: 35 additions & 0 deletions stdlib/bytes.ml
Original file line number Diff line number Diff line change
Expand Up @@ -236,6 +236,27 @@ let apply1 f s =
(* ASCII case helpers: each one hands the matching [Char] conversion
   function and [s] to [apply1], defined just above. *)
let capitalize_ascii s = apply1 Char.uppercase_ascii s
let uncapitalize_ascii s = apply1 Char.lowercase_ascii s

(* duplicated in string.ml *)
let starts_with ~prefix s =
  let n_prefix = length prefix in
  (* [agrees_from k] holds when [s] and [prefix] carry the same byte at
     every index from [k] up to [n_prefix - 1]. *)
  let rec agrees_from k =
    k = n_prefix
    || (unsafe_get s k = unsafe_get prefix k && agrees_from (k + 1))
  in
  (* The length test runs first so the unchecked reads stay inside [s]. *)
  n_prefix <= length s && agrees_from 0

(* duplicated in string.ml *)
let ends_with ~suffix s =
  let n_suffix = length suffix in
  (* Index in [s] at which the candidate suffix would begin; negative when
     [suffix] is longer than [s]. *)
  let start = length s - n_suffix in
  (* [agrees_from k] holds when [suffix] and the tail of [s] carry the same
     byte at every index from [k] up to [n_suffix - 1]. *)
  let rec agrees_from k =
    k = n_suffix
    || (unsafe_get s (start + k) = unsafe_get suffix k
        && agrees_from (k + 1))
  in
  (* The sign test runs first so the unchecked reads stay inside [s]. *)
  start >= 0 && agrees_from 0

(* duplicated in string.ml *)
let rec index_rec s lim i c =
if i >= lim then raise Not_found else
Expand Down Expand Up @@ -322,6 +343,18 @@ type t = bytes
(* Ordering on byte sequences: delegates to [Stdlib.compare] with both
   arguments constrained to [t]. *)
let compare (x: t) (y: t) = Stdlib.compare x y
(* Equality test bound directly to the C primitive "caml_bytes_equal",
   declared with the [noalloc] attribute. *)
external equal : t -> t -> bool = "caml_bytes_equal" [@@noalloc]

(* duplicated in string.ml *)
let split_on_char sep s =
  (* Walk [s] from its last index down to 0.  [stop] is the exclusive end of
     the piece currently being delimited and [acc] holds the pieces already
     cut, in left-to-right order. *)
  let rec scan i stop acc =
    if i < 0 then
      (* Whatever precedes the leftmost separator (possibly nothing) is the
         first piece, so the result is never empty. *)
      sub s 0 stop :: acc
    else if unsafe_get s i = sep then
      scan (i - 1) i (sub s (i + 1) (stop - i - 1) :: acc)
    else
      scan (i - 1) stop acc
  in
  let n = length s in
  scan (n - 1) n []

(* Deprecated functions implemented via other deprecated functions *)
[@@@ocaml.warning "-3"]
(* Applies [Char.uppercase] to [s] through [map]; it sits under the
   warning-3 suppression above because it relies on a deprecated function. *)
let uppercase s = map Char.uppercase s
Expand Down Expand Up @@ -371,6 +404,8 @@ let of_seq i =

(** {6 Binary encoding/decoding of integers} *)

(* The get_ functions are all duplicated in string.ml *)

(* Reads bound directly to compiler primitives (the "%..." names); the
   [_ne] suffix marks the native-endian variants. *)
external get_uint8 : bytes -> int -> int = "%bytes_safe_get"
external get_uint16_ne : bytes -> int -> int = "%caml_bytes_get16"
external get_int32_ne : bytes -> int -> int32 = "%caml_bytes_get32"
Expand Down
28 changes: 28 additions & 0 deletions stdlib/bytes.mli
Original file line number Diff line number Diff line change
Expand Up @@ -323,6 +323,19 @@ val equal: t -> t -> bool
(** The equality function for byte sequences.
@since 4.03.0 (4.05.0 in BytesLabels) *)

val starts_with :
prefix (* comment thwarts tools/sync_stdlib_docs *) :bytes -> bytes -> bool
(** [starts_with ][~][prefix s] is [true] if and only if [s] starts with
[prefix].
@since 4.13.0 *)

val ends_with :
suffix (* comment thwarts tools/sync_stdlib_docs *) :bytes -> bytes -> bool
(** [ends_with ][~][suffix s] is [true] if and only if [s] ends with [suffix].
@since 4.13.0 *)

(** {1:unsafe Unsafe conversions (for advanced users)}
This section describes unsafe, low-level conversion functions
Expand Down Expand Up @@ -453,6 +466,21 @@ let s = Bytes.of_string "hello"
*)


val split_on_char: char -> bytes -> bytes list
(** [split_on_char sep s] returns the list of all (possibly empty)
subsequences of [s] that are delimited by the [sep] character.
The function's output is specified by the following invariants:
- The list is not empty.
- Concatenating its elements using [sep] as a separator returns a
byte sequence equal to the input ([Bytes.concat (Bytes.make 1 sep)
(Bytes.split_on_char sep s) = s]).
- No byte sequence in the result contains the [sep] character.
@since 4.13.0
*)

(** {1 Iterators} *)

val to_seq : t -> char Seq.t
Expand Down
28 changes: 28 additions & 0 deletions stdlib/bytesLabels.mli
Original file line number Diff line number Diff line change
Expand Up @@ -323,6 +323,19 @@ val equal: t -> t -> bool
(** The equality function for byte sequences.
@since 4.03.0 (4.05.0 in BytesLabels) *)

val starts_with :
prefix (* comment thwarts tools/sync_stdlib_docs *) :bytes -> bytes -> bool
(** [starts_with ][~][prefix s] is [true] if and only if [s] starts with
[prefix].
@since 4.13.0 *)

val ends_with :
suffix (* comment thwarts tools/sync_stdlib_docs *) :bytes -> bytes -> bool
(** [ends_with ][~][suffix s] is [true] if and only if [s] ends with [suffix].
@since 4.13.0 *)

(** {1:unsafe Unsafe conversions (for advanced users)}
This section describes unsafe, low-level conversion functions
Expand Down Expand Up @@ -453,6 +466,21 @@ let s = Bytes.of_string "hello"
*)


val split_on_char: sep:char -> bytes -> bytes list
(** [split_on_char ~sep s] returns the list of all (possibly empty)
subsequences of [s] that are delimited by the [sep] character.
The function's output is specified by the following invariants:
- The list is not empty.
- Concatenating its elements using [sep] as a separator returns a
byte sequence equal to the input ([Bytes.concat (Bytes.make 1 sep)
(Bytes.split_on_char sep s) = s]).
- No byte sequence in the result contains the [sep] character.
@since 4.13.0
*)

(** {1 Iterators} *)

val to_seq : t -> char Seq.t
Expand Down
26 changes: 26 additions & 0 deletions stdlib/string.ml
Original file line number Diff line number Diff line change
Expand Up @@ -41,8 +41,11 @@ let make n c =
B.make n c |> bts
(* In this run, [B] is the module providing the bytes implementations, and
   [bos] / [bts] (both defined outside this excerpt) convert a string to
   bytes and bytes back to a string. *)

(* [init n f]: built by [B.init], then converted with [bts]. *)
let init n f =
B.init n f |> bts
(* The empty string. *)
let empty = ""
(* [copy s]: [B.copy] applied to [bos s], converted back with [bts]. *)
let copy s =
B.copy (bos s) |> bts
(* Conversions to and from [bytes]: plain aliases of [B.to_string] and
   [B.of_string]. *)
let of_bytes = B.to_string
let to_bytes = B.of_string
(* [sub s ofs len]: [B.sub] applied to [bos s], converted back with [bts]. *)
let sub s ofs len =
B.sub (bos s) ofs len |> bts
let fill =
Expand Down Expand Up @@ -73,6 +76,8 @@ let concat sep = function
(B.create (sum_lengths 0 seplen l))
0 sep seplen l

(* [cat s1 s2] is [s1 ^ s2]: a named form of the concatenation operator. *)
let cat s1 s2 = s1 ^ s2

(* duplicated in bytes.ml *)
let iter f s =
  (* The length is read once, before the first call to [f]. *)
  let n = length s in
  (* Call [f] on the characters at indices [k], [k + 1], ..., [n - 1]. *)
  let rec visit k =
    if k < n then begin
      f (unsafe_get s k);
      visit (k + 1)
    end
  in
  visit 0
Expand Down Expand Up @@ -197,6 +202,7 @@ let capitalize_ascii s =
(* Delegates to [B.uncapitalize_ascii] on [bos s] and converts the result
   back with [bts]. *)
let uncapitalize_ascii s =
B.uncapitalize_ascii (bos s) |> bts

(* duplicated in bytes.ml *)
let starts_with ~prefix s =
let len_s = length s
and len_pre = length prefix in
Expand All @@ -206,6 +212,7 @@ let starts_with ~prefix s =
else aux (i + 1)
in len_s >= len_pre && aux 0

(* duplicated in bytes.ml *)
let ends_with ~suffix s =
let len_s = length s
and len_suf = length suffix in
Expand All @@ -216,6 +223,7 @@ let ends_with ~suffix s =
else aux (i + 1)
in diff >= 0 && aux 0

(* duplicated in bytes.ml *)
let split_on_char sep s =
let r = ref [] in
let j = ref (length s) in
Expand Down Expand Up @@ -250,3 +258,21 @@ let to_seq s = bos s |> B.to_seq
(* [to_seqi s]: delegates to [B.to_seqi] on [bos s]. *)
let to_seqi s = bos s |> B.to_seqi

(* [of_seq g]: builds bytes with [B.of_seq], then converts with [bts]. *)
let of_seq g = B.of_seq g |> bts

(** {6 Binary decoding of integers} *)

(* Only decoders ([get_] functions) are defined in this section.  These four
   are bound directly to string compiler primitives (the "%..." names); the
   [_ne] suffix marks the native-endian variants. *)
external get_uint8 : string -> int -> int = "%string_safe_get"
external get_uint16_ne : string -> int -> int = "%caml_string_get16"
external get_int32_ne : string -> int -> int32 = "%caml_string_get32"
external get_int64_ne : string -> int -> int64 = "%caml_string_get64"

(* The remaining decoders call the function of the same name in [B] on
   [bos s] ([bos] is defined outside this excerpt). *)
let get_int8 s i = B.get_int8 (bos s) i
let get_uint16_le s i = B.get_uint16_le (bos s) i
let get_uint16_be s i = B.get_uint16_be (bos s) i
let get_int16_ne s i = B.get_int16_ne (bos s) i
let get_int16_le s i = B.get_int16_le (bos s) i
let get_int16_be s i = B.get_int16_be (bos s) i
let get_int32_le s i = B.get_int32_le (bos s) i
let get_int32_be s i = B.get_int32_be (bos s) i
let get_int64_le s i = B.get_int64_le (bos s) i
let get_int64_be s i = B.get_int64_be (bos s) i
146 changes: 146 additions & 0 deletions stdlib/string.mli
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,26 @@ val init : int -> (int -> char) -> string
@raise Invalid_argument if [n < 0] or [n > ]{!Sys.max_string_length}.
@since 4.02.0 *)

val empty : string
(** The empty string.
@since 4.13.0
*)

val of_bytes : bytes -> string
(** Return a new string that contains the same bytes as the given byte
sequence.
@since 4.13.0
*)

val to_bytes : string -> bytes
(** Return a new byte sequence that contains the same bytes as the given
string.
@since 4.13.0
*)

external length : string -> int = "%string_length"
(** [length s] is the length (number of bytes/characters) of [s]. *)

Expand All @@ -116,6 +136,12 @@ val concat : string -> string list -> string
@raise Invalid_argument if the result is longer than
{!Sys.max_string_length} bytes. *)

val cat : string -> string -> string
(** [cat s1 s2] concatenates [s1] and [s2] ([s1 ^ s2]).
@since 4.13.0
*)

(** {1:predicates Predicates and comparisons} *)

val equal : t -> t -> bool
Expand Down Expand Up @@ -398,6 +424,126 @@ val uncapitalize : string -> string
@deprecated Functions operating on Latin-1 character set are deprecated. *)

(** {1 Binary decoding of integers} *)

(** The functions in this section binary decode integers from strings.
All following functions raise [Invalid_argument] if the characters
needed at index [i] to decode the integer are not available.
Little-endian (resp. big-endian) encoding means that least
(resp. most) significant bytes are stored first. Big-endian is
also known as network byte order. Native-endian encoding is
either little-endian or big-endian depending on {!Sys.big_endian}.
32-bit and 64-bit integers are represented by the [int32] and
[int64] types, which can be interpreted either as signed or
unsigned numbers.
8-bit and 16-bit integers are represented by the [int] type,
which has more bits than the binary encoding. These extra bits
are sign-extended (or zero-extended) for functions which decode 8-bit
or 16-bit integers and represent them with [int] values.
*)

val get_uint8 : string -> int -> int
(** [get_uint8 b i] is [b]'s unsigned 8-bit integer starting at character
index [i].
@since 4.13.0
*)

val get_int8 : string -> int -> int
(** [get_int8 b i] is [b]'s signed 8-bit integer starting at character
index [i].
@since 4.13.0
*)

val get_uint16_ne : string -> int -> int
(** [get_uint16_ne b i] is [b]'s native-endian unsigned 16-bit integer
starting at character index [i].
@since 4.13.0
*)

val get_uint16_be : string -> int -> int
(** [get_uint16_be b i] is [b]'s big-endian unsigned 16-bit integer
starting at character index [i].
@since 4.13.0
*)

val get_uint16_le : string -> int -> int
(** [get_uint16_le b i] is [b]'s little-endian unsigned 16-bit integer
starting at character index [i].
@since 4.13.0
*)

val get_int16_ne : string -> int -> int
(** [get_int16_ne b i] is [b]'s native-endian signed 16-bit integer
starting at character index [i].
@since 4.13.0
*)

val get_int16_be : string -> int -> int
(** [get_int16_be b i] is [b]'s big-endian signed 16-bit integer
starting at character index [i].
@since 4.13.0
*)

val get_int16_le : string -> int -> int
(** [get_int16_le b i] is [b]'s little-endian signed 16-bit integer
starting at character index [i].
@since 4.13.0
*)

val get_int32_ne : string -> int -> int32
(** [get_int32_ne b i] is [b]'s native-endian 32-bit integer
starting at character index [i].
@since 4.13.0
*)

val get_int32_be : string -> int -> int32
(** [get_int32_be b i] is [b]'s big-endian 32-bit integer
starting at character index [i].
@since 4.13.0
*)

val get_int32_le : string -> int -> int32
(** [get_int32_le b i] is [b]'s little-endian 32-bit integer
starting at character index [i].
@since 4.13.0
*)

val get_int64_ne : string -> int -> int64
(** [get_int64_ne b i] is [b]'s native-endian 64-bit integer
starting at character index [i].
@since 4.13.0
*)

val get_int64_be : string -> int -> int64
(** [get_int64_be b i] is [b]'s big-endian 64-bit integer
starting at character index [i].
@since 4.13.0
*)

val get_int64_le : string -> int -> int64
(** [get_int64_le b i] is [b]'s little-endian 64-bit integer
starting at character index [i].
@since 4.13.0
*)

(**/**)

(* The following is for system use only. Do not call directly. *)
Expand Down

0 comments on commit 3576548

Please sign in to comment.