From 2215d360dc531ec46c112223763404650b0a8be1 Mon Sep 17 00:00:00 2001 From: Eric Eldredge Date: Tue, 13 Feb 2024 18:38:00 -0500 Subject: [PATCH] Adjacency list optimizations (#9444) * Parameterize AdjacencyList * perf: reduce memory overhead in AdjacencyList Previously, we were allocating extra space for 'buckets' to accommodate hash collisions, but this turns out to waste a lot of space in large graphs. Additionally, we are no longer allocating space for nodes ahead of time; now, the nodes array will grow on demand, as edges are added. * refactor: extract edge linking behavior from addEdges method This unlocks the ability to resize without creating a new intermediary AdjacencyList. * fix: improve map capacity overflow detection * fix: resizing computations * Remove loadFactor * fix: node resizing * fix: avg collisions calculation * Rename capacity to initialCapacity * fix tests * fix: remove erroneous assertion The (incorrect) assumption was that there should be the same node record count after a resize of edges, but this is not necessarily the case; if there were deleted edges before the resize, then there may be node records that will also be deleted (by virtue of no longer having any edges connected to them) as part of the resize. 
* Enforce assumption that linked edge types must match * Add docs * Update AdjacencyList.md * Update AdjacencyList.md * Update AdjacencyList.md * Refactor link results to an enum * Update AdjacencyList.md --- docs/AdjacencyList.md | 753 +++++++++++++++++ packages/core/graph/src/AdjacencyList.js | 778 +++++++++++++----- .../core/graph/test/AdjacencyList.test.js | 60 +- 3 files changed, 1384 insertions(+), 207 deletions(-) create mode 100644 docs/AdjacencyList.md diff --git a/docs/AdjacencyList.md b/docs/AdjacencyList.md new file mode 100644 index 00000000000..4f359a98f5d --- /dev/null +++ b/docs/AdjacencyList.md @@ -0,0 +1,753 @@ +# AdjacencyList + +The `AdjacencyList` in Parcel is a complex implementation of +an otherwise [straightforward data structure](https://en.wikipedia.org/wiki/Adjacency_list). +The core ideas behind the `AdjacencyList` are: + +- Any two **nodes** that are connected by an **edge** are said to be _adjacent_ +- For _any_ node in the graph, every _adjacent node_ should be discoverable + in _linear time_. + +Conceptually, this is achieved by associating each edge that is added +_to or from_ a node with a previous edge that was added to or from that node. + +Where Parcel's `AdjacencyList` gets complex is in its _data layer_, which is +designed for: + +- shared access in a multithreaded runtime +- fast serialization and deserialization + +## A list of lists + +The simplest way of thinking about `AdjacencyList` is as a _list of lists_. +The primary list contains (conceptually) _node_ structs (ordered by numeric id), +with each field on the struct pointing to _doubly linked lists_ of (again, conceptual) +_edge_ structs that also connect to or from the same node (ordered by insertion). + +This looks something like: + +```js +// NOTE: Not a real type, just an illustration! 
+type AdjacencyList<{ + id: NodeId, + incoming?: EdgeList, + incomingReverse?: EdgeList, + outgoing?: EdgeList, + outgoingReverse?: EdgeList, +}> + +type EdgeList<{ + from: NodeId, + to: NodeId, +}> +``` + + + + + +
So, given a graph like:you can imagine doubly linked lists like:
+ +```mermaid +graph TD; + 0 --> |a| 1 + 0 --> |b| 2 + 1 --> |c| 2 +``` + + + +```mermaid +graph LR + subgraph 0[Node 0] + direction LR + 0o([outgoing]) --- 0oa[[a]] <--> 0ob[[b]] --- 0or([outgoingReverse]) + end + + subgraph 1[Node 1] + direction LR + 1i([incoming]) --- 1ia[[a]] --- 1ir([incomingReverse]) + 1o([outgoing]) --- 1oc[[c]] --- 1or([outgoingReverse]) + end + + subgraph 2[Node 2] + direction LR + 2i([incoming]) --- 2ib[[b]] <--> 2ic[[c]] --- 2ir([incomingReverse]) + end +``` + +This makes traversal of every edge of the graph _from any node_ +a straightforward process of following the links. + +
+ +## SharedTypeMap + +Under the hood, things are less straightforward. + +Since Parcel is multi-threaded, its implementation of `AdjacencyList` +features another core idea: + +- The data stored in the `AdjacencyList` should be accessible + from multiple threads with _no overhead_ + +Here, _overhead_ refers to the serialization costs that add up +in multi-threading scenarios. + +To accommodate this, `AdjacencyList` features a custom data structure called +the `SharedTypeMap`: + +- **Shared** because it is a [`Uint32Array`](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Uint32Array) + backed by a [`SharedArrayBuffer`](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/SharedArrayBuffer) +- **Type** because each item has a `TYPE` field +- **Map** because it is an implementation of a [hash map](https://en.wikipedia.org/wiki/Hash_table), + using a [coalesced hashing](https://en.wikipedia.org/wiki/Coalesced_hashing) strategy + +This means that data being added to `SharedTypeMap` gets a pass through +a hash function to produce a stable key for storing and retrieving that data. + +### Why _Type_, though? + +Two nodes in a Parcel graph may be related to each other in more than one way. +In order to represent multiple relationships between the same two nodes, +each _edge_ that connects the nodes has a _type_. + +Thus, when storing data about connections between adjacent nodes, +`AdjacencyList` uses the _type_ as part of the identifying information. + +### Uint32Array partitions + +As mentioned above, to deal with collisions, `SharedTypeMap` employs a version +of a coalesced hashing strategy. 
In this version: + +- A contiguous array of memory is maintained for storing the data +- A portion of the array is designated as the hash table +- The rest of the array is the addressable space, which is where values are stored +- the hash table stores pointers into the addressable space + +The `SharedTypeMap` is partitioned into 3 subarrays: + +```mermaid +graph LR + header ~~~ hashtable ~~~ items + + subgraph header + direction LR + 0[0: CAPACITY] ~~~ 1[1: COUNT] + end + + subgraph hashtable[hash table] + direction LR + 2[2: hash 0] ~~~ 3[3: hash 1] ~~~ 4[4: hash 2] + end + + subgraph items[addressable space] + direction LR + item1 ~~~ item2 ~~~ item3 ~~~ item4 + + subgraph item1[item 1] + direction LR + 5([5: NEXT]) ~~~ 6[6: TYPE] + end + + subgraph item2[item 2] + direction LR + 7([7: NEXT]) ~~~ 8[8: TYPE] + end + + subgraph item3[item 3] + direction LR + 9([9: NEXT]) ~~~ 10[10: TYPE] + end + + subgraph item4[item 4] + direction LR + 11([11: NEXT]) ~~~ 12[12: TYPE] + end + end +``` + +- The _header_ partition stores metadata: + - `CAPACITY`: the total number of items that can fit in the array. + This is always equal to the length of the _hash table_. + - `COUNT`: the number of items that are currently in + the _addressable space_ partition. +- The _hash table_ partition stores _addresses_, which are pointers + into the _addressable space_ partition + - each index represents the head of a linked list known as a _bucket_ +- The _addressable space_ partition stores contiguous slices of _item data_: + - `NEXT`: the _address_ of the next item with the same hash as this item; + this is a _link_ in the hash _bucket_. + - `TYPE`: the item's type + +Note that the fields described above are _inherited_ by `NodeTypeMap` +and `EdgeTypeMap`. They both extend this base set with fields of their own. + +When there is a hash collision, the previous value is augmented with a link +to the address of the colliding value. 
+ +These linked lists of values that have hash collisions are known as _buckets_. + +## Hash buckets: another list of lists + +Though `SharedTypeMap` stores data in a _hash table_, it presents +an API that is more like a _linked list_. Instead of adding an item +to the `SharedTypeMap` through something like an `add` method, +`SharedTypeMap` has a method called `link`: + +```js + link(hash: THash, item: TAddress, type: TItemType): void; +``` + +This method pairs with `getNextAddress`: + +```js + getNextAddress(): TAddress +``` + +This is because `SharedTypeMap` is really a _base class_ implementation; +its subclasses (See [NodeTypeMap](#nodetypemap) and [EdgeTypeMap](#edgetypemap)) +extend this API with more map-like `add` methods. + +As `link` implies, each index in the hash table can be thought of +as the head of a _linked list_. + +When an item is added to a `SharedTypeMap` (or rather, to one of its subclasses), +the `getNextAddress` method is invoked to find the next location in the +_addressable space_ that is unoccupied. The next address is always computed +in constant time by adding the current item count (multiplied by the item size) +to the offset of the first addressable space. This has the effect of storing items +in the addressable space _in insertion order_. + +For example, here we see that hash `0` has 1 collision: + + + + + +
given an insertion order:imagine a list like:
+ +``` +// map.link(hash, address, type) +map.link(0, map.getNextAddress(), 1) +map.link(1, map.getNextAddress(), 2) +map.link(2, map.getNextAddress(), 3) +map.link(0, map.getNextAddress(), 1) +``` + + + +```mermaid +graph LR + 2[2: hash 0] -- head --> 5([5: NEXT]) -- next --> 11([11: NEXT]) + 3[3: hash 1] -- head --> 7([7: NEXT]) + 4[4: hash 2] -- head --> 9([9: NEXT]) + +``` + +
+ +Or, as raw `Uint32Array` data: + +| label | CAPACITY | COUNT | # | # | # | NEXT | TYPE | NEXT | TYPE | NEXT | TYPE | NEXT | TYPE | +| ----- | -------- | ----- | --- | --- | --- | ---- | ---- | ---- | ---- | ---- | ---- | ---- | ---- | +| index | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | +| value | 4 | 4 | 5 | 7 | 9 | 11 | 1 | 0 | 2 | 0 | 3 | 0 | 1 | + +Notice that, thanks to `getNextAddress()`, the items are stored +in _insertion order_, not by type or hash! + +Also notice that the hashes we used to `link` are mapped to indices in the hash table +(the `#` above), and that the values stored at those indices are the _addresses_ +returned by `getNextAddress`. + +You may have also noticed that our last `link` has a _collision_ with our first `link` +(they both use hash `0`). In that case, the `address` for our last `link` is stored +in the `NEXT` field of the item at the `address` that is currently stored for hash `0`: + +```mermaid +graph LR + hash0[hash 0] -- HEADER_SIZE + 0 --> index2[2] + hash1[hash 1] -- HEADER_SIZE + 1 --> index3[3] + hash2[hash 2] -- HEADER_SIZE + 2 --> index4[4] + index2[2] -- head --> address5[5] -- next --> address11[11] + index3[3] -- head --> address7[7] + index4[4] -- head --> address9[9] +``` + +## How AdjacencyList uses SharedTypeMap + +As mentioned above, `SharedTypeMap` is a _base implementation_; it isn't used +directly. 
`AdjacencyList` uses two subclasses of `SharedTypeMap`: + +- `NodeTypeMap`, which is referred to internally as `nodes` +- `EdgeTypeMap`, which is referred to internally as `edges` + +The business of `AdjacencyList` is therefore interacting with these two maps when: + +- **adding** edges by _linking_ records in `nodes` with records in `edges` +- **deleting** edges by _unlinking_ records in `nodes` from records in `edges` +- **resizing** either map when they run low on space +- **traversing** edges by following the links from records in `nodes` + to records in `edges` (and back) + +## EdgeTypeMap + +The `EdgeTypeMap` extends `SharedTypeMap` with API for: + +- **adding** edges +- **deleting** edges +- **linking** adjacent edges +- **unlinking** adjacent edges + +### Edge hashes + +Edges are identified by a hash of their `to`, `from`, and `type` values. + +For any given set of these values, the hash function deterministically produces +a number that that fits within the **current capacity** of the `EdgeTypeMap`. + +That hash number is then the index in the hash table where the _head_ +of the linked list of edges with this hash is stored. + +See [Hash buckets: another list of lists](#hash-buckets-another-list-of-lists) for more. + +### Edge records + +The `EdgeTypeMap` records can be thought of as the links +in the linked lists of edges connected to or from a given node. +As described in [Node records](#node-records), There is a linked list _per edge type_, and also _per direction_: + + + + + +
given a graph like:imagine lists like:
+ +```mermaid +graph TD; + 0 --> |1| 1 + 0 --> |2| 1 + 1 --> |1| 2 + 0 --> |2| 2 + 1 --> |2| 2 +``` + + + +```mermaid +graph LR + subgraph e4[edge 4] + direction LR + e4n1[1] -- 2 --> e4n2[2] + end + subgraph e3[edge 3] + direction LR + e3n1[1] -- 1 --> e3n2[2] + end + subgraph e2[edge 2] + direction LR + e2n0[0] -- 2 --> e2n2[2] + end + subgraph e1[edge 1] + direction LR + e1n0[0] -- 2 --> e1n1[1] + end + subgraph e0[edge 0] + direction LR + e0n0[0] -- 1 --> e0n1[1] + end + + e1 -- next out --> e2 -- prev out --> e1 + e2 -- next in --> e4 -- prev in --> e2 +``` + +Because edge records are only created once per unique pair of node ids +and edge type, and deleted edges do not get reclaimed (without a resize +and rehash of every edge, see [Resizing the `EdgeTypeMap`](#resizing-the-edgetypemap)), +the maximum number of edge records that will be stored is `d + n * n * e`, +where `n` is the number of nodes and `e` is the number of _unique_ edge types, +and `d` is the number of deleted edges. + +
+ +### Edge fields + +Edge records occupy **32 bytes** of contiguous space, +labelled as the following **`uint32` (4 byte)** fields: + +- `NEXT`: The next edge record with the same hash +- `TYPE`: The edge type +- `FROM`: The id of the originating node +- `TO`: The id of the terminating node +- `NEXT_IN`: The address of the next incoming edge of the same type to the same terminating node +- `PREV_IN`: The address of the previous incoming edge of the same type to the same terminating node +- `NEXT_OUT`: The address of the next outgoing edge of the same type from the same originating node +- `PREV_OUT`: The address of the previous outgoing edge of the same type from the same originating node + +#### Sizing the EdgeTypeMap + +The capacity of `EdgeTypeMap` is always _at least_ the total number of edges +in the graph. + +Note that between any two nodes, there may only be one edge per unique edge type. + +Recalling that the `Uint32Array` is partitioned into 3 subarrays: + +| header | hash table | addressable space | +| ------ | ---------- | ----------------- | + +Given a capacity of `c`: + +- The first 3 values in the `Uint32Array` are the header: `HEADER_SIZE = 3` +- The hash table is always equal to the capacity: `c` +- The addressable space is equal to the capacity times the item size: `ITEM_SIZE * c` +- The item size is equal to the number of fields on an edge record: `ITEM_SIZE = 8` + +The size of the `Uint32Array` is therefore `3 + 9c`. + +#### Resizing the EdgeTypeMap + +The `EdgeTypeMap` is resized on demand in `addEdge`. +There are two scenarios in which the `EdgeTypeMap` is resized: + +- if there is not enough capacity in the addressable space + to accommodate an additional edge + +- if there are enough deletes that the capacity + can be reduced and still fit the number of edges plus an additional edge. + +In the first case, the capacity must increase. This is accomplished by +computing a next capacity from the current capacity. 
The next capacity +follows a linear progression, starting from the `maxGrowFactor (defaulting to 8)`, +and scaled inversely linearly the `minGrowFactor (defaulting to 2)` from capacity +0 to `peakCapacity`. Roughly the following formula: + +```js +maxGrowFactor + (minGrowFactor - maxGrowFactor) * (capacity / peakCapacity); +``` + +Once the `capacity` exceeds `peakCapacity`, the capacity grows by the `minGrowFactor`. + +In the second case, the capacity can decrease. This is accomplished by _halving_ the capacity. + +In both cases, after computing the next capacity, a new `EdgeTypeMap` +is created with that new capacity along with a new `NodeTypeMap` that retains +its current capacity. Every edge in the current `EdgeTypeMap` is then +_rehashed_ and _relinked_ into the new maps. + +## NodeTypeMap + +The `NodeTypeMap` extends `SharedTypeMap` with API for: + +- **adding** nodes +- **linking** nodes to edges +- **unlinking** nodes from edges + +### Node records + +The `NodeTypeMap` records can be thought of as the heads of +linked lists of edges connected to or from a given node. +There is a linked list _per edge type_, and also _per direction_: + + + + + +
given a graph like:imagine lists like:
+ +```mermaid +graph TD; + 0 --> |1| 1 + 0 --> |2| 1 + 1 --> |1| 2 + 0 --> |2| 2 +``` + + + +```mermaid +graph LR + subgraph n0[node 0] + et1(edge type 1) + et2(edge type 2) + end + + subgraph edges + subgraph e0[edge 0] + direction LR + e0n0[0] -- 1 --> e0n1[1] + end + + subgraph e1[edge 1] + direction LR + e1n0[0] -- 2 --> e1n1[1] + end + + subgraph e2[edge 2] + direction LR + e2n0[0] -- 2 --> e2n2[2] + end + end + + et1 -- first out --> e0 + et1 -- last out --> e0 + + et2 -- first out --> e1 + et2 -- last out --> e1 + + e1 -- next out --> e2 + e2 -- prev out --> e1 +``` + +```mermaid +graph LR + subgraph n1[node 1] + et1(edge type 1) + et2(edge type 2) + end + + + subgraph edges + subgraph e0[edge 0] + direction LR + e0n0[0] -- 1 --> e0n1[1] + end + + subgraph e1[edge 1] + direction LR + e1n0[0] -- 2 --> e1n1[1] + end + + subgraph e3[edge 3] + direction LR + e3n1[1] -- 1 --> e3n2[2] + end + end + + et1 -- first in --> e0 + et1 -- last in --> e0 + et1 -- first out --> e3 + et1 -- last out --> e3 + + et2 -- first in --> e1 + et2 -- last in --> e1 +``` + +```mermaid +graph LR + subgraph n2[node 2] + et1(edge type 1) + et2(edge type 2) + end + + + subgraph edges + subgraph e2[edge 2] + direction LR + e2n0[0] -- 2 --> e2n2[2] + end + + subgraph e3[edge 3] + direction LR + e3n1[1] -- 1 --> e3n2[2] + end + end + + et1 -- first in --> e3 + et1 -- last in --> e3 + + et2 -- first in --> e2 + et2 -- last in --> e2 +``` + +Because node records are only created once per unique node id and edge type, +the maximum number of node records that will be stored is `n * e`, +where `n` is the number of nodes (the count of _node ids_ issued) +and `e` is the number of _unique_ edge types in the graph. + +
+ +### Node fields + +Node records occupy **24 bytes** of contiguous space, +labelled as the following **`uint32` (4 byte)** fields: + +- `NEXT`: The next node record with the same node id, but a different `TYPE` +- `TYPE`: The edge type for edges linked to or from the node via this record +- `FIRST_IN`: The address in `EdgeTypeMap` of the first edge of this type **to** this node +- `FIRST_OUT`: The address in `EdgeTypeMap` of the first edge of this type **from** this node +- `LAST_IN`: The address in `EdgeTypeMap` of the last edge of this type **to** this node +- `LAST_OUT`: The address in `EdgeTypeMap` of the last edge of this type **from** this node + +#### Sizing the NodeTypeMap + +As implied above, adding a node to `AdjacencyList` does not actually increment +the count of nodes in the `NodeTypeMap`, it only increments the `nodeId`. +This means that the number of nodes in the graph is _not the same as_ +the number of node records in the `AdjacencyList`. + +However, the capacity of `NodeTypeMap` is always _at least_ the larger of the total +number of node ids issued (`nextId`) or the number of node records in the map (`count`). + +The reasons that the capacity must consider both `nextId` and `count` are: + +- when a node is connected to an edge, the node id is used + as the hash in the `NodeTypeMap`, so the hash table must have + _at least_ enough capacity to fit the highest node id. +- when a node is connected to edges of more than one unique type, + the additional types require additional node records, which increase the `count` + and may require additional addressable space (e.g., if most nodes are already connected). 
+ +Recalling that the `Uint32Array` is partitioned into 3 subarrays: + +| header | hash table | addressable space | +| ------ | ---------- | ----------------- | + +Given a capacity of `c`: + +- The first 3 values in the `Uint32Array` are the header: `HEADER_SIZE = 3` + - `NEXT` and + - `TYPE` have the same meaning as in `SharedTypeMap` + - `NEXT_ID` tracks the number of unique node ids issued by `addNode` +- The hash table is always equal to the capacity: `c` +- The addressable space is equal to the capacity times the item size: `ITEM_SIZE * c` +- The item size is equal to the number of fields on a node record: `ITEM_SIZE = 6` + - these are the 6 fields described in [Node fields](#node-fields) + +The size of the `Uint32Array` is therefore `3 + 7c`. + +#### Resizing the NodeTypeMap + +There are two scenarios in which the `NodeTypeMap` is resized: + +- in `addNode`, if there is not enough capacity in the hash table + to accommodate the number of nodes implied by the next node id. + +- in `addEdge`, if the edge being added is the first of its type for + either node, _and_ there is not enough capacity in the addressable space + to accommodate the additional count of node records required. + +In either case, the resizing is accomplished by creating a new `NodeTypeMap` +with double the capacity of the current one, and then copying the data +in the hash table and addressable space directly into the new map. + +To account for the increase in the size of the hash table (remember that +the size of the hash table is always equal to the capacity), all pointers +into the addressable space are adjusted by the delta between +the old map capacity and the new capacity, e.g., if the old capacity was 4, +and now it's 8, all of the internal addresses would be shifted by 4. + +This works because the node ids are sequential and immutable, so no rehashing
The internal addresses can all be safely incremented by +the capacity delta, while the addresses to edges, which point to the `EdgeTypMap`, +don't need to change at all (as its size hasn't changed). + +## What AdjacencyList really looks like + + + +
given a graph like:AdjacencyList looks like:
+ +```mermaid +graph TD; + 0 --> |1| 1 + 0 --> |2| 1 + 1 --> |1| 2 + 0 --> |2| 2 +``` + +AdjacencyList looks like: + +```mermaid +graph LR + + subgraph nodes + subgraph n0[node 0] + na17[(type 1)] + na29[(type 2)] + na17 -- next --> na29 + end + + subgraph n1[node 1] + na11[(type 1)] + na23[(type 2)] + na11 -- next --> na23 + end + + subgraph n2[node 2] + na35[(type 2)] + na41[(type 1)] + na35 -- next --> na41 + end + + end + + subgraph edges + subgraph e0[hash 0] + ea7[(type 1)] + end + + subgraph e1[hash 1] + ea15[(type 2)] + end + + subgraph e2[hash 2] + ea31[(type 1)] + end + + subgraph e3[hash 3] + ea23[(type 2)] + end + + ea15 -- next out --> ea23 + ea23 -- prev out --> ea15 + end + + + ea7 -- from --> n0 + ea7 -- to --> n1 + ea15 -- from --> n0 + ea15 -- to --> n1 + ea23 -- from --> n0 + ea23 -- to --> n2 + ea31 -- from --> n1 + ea31 -- to --> n2 + + na11 -- first in --> ea7 + na11 -- first out --> ea31 + na11 -- last in --> ea7 + na11 -- last out --> ea31 + na17 -- first out --> ea7 + na17 -- last out --> ea7 + na23 -- first in --> ea15 + na23 -- last in --> ea15 + na29 -- first out --> ea15 + na29 -- last out --> ea23 + na35 -- first in --> ea23 + na35 -- last in --> ea23 + na41 -- first out --> ea31 + na41 -- last out --> ea31 +``` + +Or, as raw `Uint32Array` data: + +``` +nodes: + header hash table addressable space + 0 1 2 3 4 5 6 7 8 9 10 11 [ 23 1 7 31 7 31 ] + 8 6 3 17 11 35 0 0 0 0 0 17 [ 29 1 0 7 0 7 ] + 23 [ 0 2 15 0 15 0 ] + 29 [ 0 2 0 15 0 23 ] + 35 [ 41 2 23 0 23 0 ] + 41 [ 0 1 31 0 31 0 ] + 47 [ 0 0 0 0 0 0 ] + 53 [ 0 0 0 0 0 0 ] +edges: + header hash table addressable space + 0 1 2 3 4 5 6 7 [ 0 1 0 1 0 0 0 0 ] + 4 4 0 15 7 31 23 15 [ 0 2 0 1 0 0 23 0 ] + 23 [ 0 2 0 2 0 0 0 15 ] + 31 [ 0 1 1 2 0 0 0 0 ] + +``` diff --git a/packages/core/graph/src/AdjacencyList.js b/packages/core/graph/src/AdjacencyList.js index d4adb0657ce..6d88084122a 100644 --- a/packages/core/graph/src/AdjacencyList.js +++ b/packages/core/graph/src/AdjacencyList.js @@ 
-22,25 +22,111 @@ export type SerializedAdjacencyList = {| // eslint-disable-next-line no-unused-vars export type AdjacencyListOptions = {| - edgeCapacity?: number, - nodeCapacity?: number, + /** The initial number of edges to accommodate. */ + initialCapacity?: number, + /** The max amount by which to grow the capacity. */ + maxGrowFactor?: number, + /** The min amount by which to grow the capacity. */ + minGrowFactor?: number, + /** The size after which to grow the capacity by the minimum factor. */ + peakCapacity?: number, + /** The percentage of deleted edges above which the capacity should shrink. */ + unloadFactor?: number, + /** The amount by which to shrink the capacity. */ + shrinkFactor?: number, +|}; + +type AdjacencyListParams = {| + initialCapacity: number, + unloadFactor: number, + maxGrowFactor: number, + minGrowFactor: number, + peakCapacity: number, + shrinkFactor: number, |}; -/** The upper bound above which capacity should be increased. */ -const LOAD_FACTOR = 0.7; -/** The lower bound below which capacity should be decreased. */ -const UNLOAD_FACTOR = 0.3; -/** The max amount by which to grow the capacity. */ -const MAX_GROW_FACTOR = 8; -/** The min amount by which to grow the capacity. */ -const MIN_GROW_FACTOR = 2; -/** The amount by which to shrink the capacity. */ -const SHRINK_FACTOR = 0.5; +const DEFAULT_PARAMS: AdjacencyListParams = { + initialCapacity: 2, + unloadFactor: 0.3, + maxGrowFactor: 8, + minGrowFactor: 2, + peakCapacity: 2 ** 18, + shrinkFactor: 0.5, +}; + +/** + * An Enum representing the result of a call to `link`. 
+ * + * `EdgeAdded` = `0`: the edge was successfully linked + * `EdgeExists` = `1`: the edge already exists + * `EdgesOverloaded` = `2`: the edge map is overloaded + * `TooManyDeletes` = `3`: the edge map has too many deleted edges + * `NodesOverloaded` = `4`: the node map is overloaded + */ +const LinkResult: {| + /** The edge was successfully linked */ + EdgeAdded: 0, + /** The edge already exists */ + EdgeExists: 1, + /** The edge map is overloaded */ + EdgesOverloaded: 2, + /** The edge map has too many deleted edges */ + TooManyDeletes: 3, + /** The node map is overloaded */ + NodesOverloaded: 4, +|} = { + EdgeAdded: 0, + EdgeExists: 1, + EdgesOverloaded: 2, + TooManyDeletes: 3, + NodesOverloaded: 4, +}; + +/** + * Allow 3 attempts to link an edge before erroring. + * + * The three attempts correspond to the three possible inconclusive link results: + * - `LinkResult.EdgesOverloaded` + * - `LinkResult.TooManyDeletes` + * - `LinkResult.NodesOverloaded` + * + * If after 3 tries, the link result is still one of these, + * this is considered an error. + */ +const MAX_LINK_TRIES: 3 = 3; +/** + * `AdjacencyList` maps nodes to lists of their adjacent nodes. + * + * It is implemented as a hashmap of nodes, where each node has + * doubly linked lists of edges of each unique edge type. + * The edges are stored in a separate hashmap, where each edge has + * a pointer to the originating node, the terminating node, and + * the next and previous edges to and from adjacent nodes. + * + * The hash maps are each stored in a `Uint32Array` backed + * by a `SharedArrayBuffer`. See `SharedTypeMap` for more details. + * + * Its primary interface is through the `getNodeIdsConnectedFrom` + * and `getNodeIdsConnectedTo` methods, which return the list of + * nodes connected from or to a given node, respectively. + * + * It is also possible to get the lists of edges connected from or to + * a given node, using the `getOutboundEdgesByType` and + * `getInboundEdgesByType` methods. 
+ * + */ export default class AdjacencyList { #nodes /*: NodeTypeMap */; #edges /*: EdgeTypeMap */; + #params /*: AdjacencyListParams */; + + /** + * Create a new `AdjacencyList` in one of two ways: + * - with specified options, or + * - with data serialized from a previous `AdjacencyList`. + */ constructor( opts?: | SerializedAdjacencyList @@ -53,26 +139,28 @@ export default class AdjacencyList { ({nodes, edges} = opts); this.#nodes = new NodeTypeMap(nodes); this.#edges = new EdgeTypeMap(edges); + this.#params = {...DEFAULT_PARAMS, initialCapacity: this.#edges.capacity}; } else { - let { - nodeCapacity = NodeTypeMap.MIN_CAPACITY, - edgeCapacity = EdgeTypeMap.MIN_CAPACITY, - } = opts ?? {}; - assert( - nodeCapacity <= NodeTypeMap.MAX_CAPACITY, - 'Node capacity overflow!', - ); - assert( - edgeCapacity <= EdgeTypeMap.MAX_CAPACITY, - 'Edge capacity overflow!', - ); - this.#nodes = new NodeTypeMap(nodeCapacity); - this.#edges = new EdgeTypeMap(edgeCapacity); + this.#params = {...DEFAULT_PARAMS, ...opts}; + + let {initialCapacity} = this.#params; + + // TODO: Find a heuristic for right-sizing nodes. + // e.g., given an average ratio of `e` edges for every `n` nodes, + // init nodes with `capacity * n / e`. + let initialNodeCapacity = 2; + + NodeTypeMap.assertMaxCapacity(initialNodeCapacity); + EdgeTypeMap.assertMaxCapacity(initialCapacity); + + this.#nodes = new NodeTypeMap(initialNodeCapacity); + this.#edges = new EdgeTypeMap(initialCapacity); } } /** - * Create a new `AdjacencyList` from the given options. + * Create a new `AdjacencyList` with data serialized + * from another `AdjacencyList`. */ static deserialize( opts: SerializedAdjacencyList, @@ -81,7 +169,7 @@ export default class AdjacencyList { } /** - * Returns a serializable object of the nodes and edges in the graph. + * Returns a serializable object of the nodes and edges in the AdjacencyList. 
*/ serialize(): SerializedAdjacencyList { return { @@ -90,13 +178,14 @@ export default class AdjacencyList { }; } + /** Statistics about the current state of the `AdjacencyList`. */ get stats(): {| + /** The maximum number of edges the graph can contain. */ + capacity: number, /** The number of nodes in the graph. */ nodes: number, /** The number of edge types associated with nodes in the graph. */ nodeEdgeTypes: number, - /** The maximum number of nodes the graph can contain. */ - nodeCapacity: number, /** The size of the raw nodes buffer, in mb. */ nodeBufferSize: string, /** The current load on the nodes array. */ @@ -105,8 +194,8 @@ export default class AdjacencyList { edges: number, /** The number of edges deleted from the graph. */ deleted: number, - /** The maximum number of edges the graph can contain. */ - edgeCapacity: number, + /** The number of unique edge types in the graph. */ + edgeTypes: number, /** The size of the raw edges buffer, in mb. */ edgeBufferSize: string, /** The current load on the edges array, including deletes. */ @@ -119,9 +208,17 @@ export default class AdjacencyList { maxCollisions: number, /** The average number of collisions per hash. */ avgCollisions: number, - /** The likelihood of uniform distribution. ~1.0 indicates certainty. */ + /** + * The actual distribution of hashes vs. the expected (uniform) distribution. + * + * From: https://en.wikipedia.org/wiki/Hash_function#Testing_and_measurement + * + * > A ratio within one confidence interval (0.95 - 1.05) is indicative + * > that the hash function...has an expected uniform distribution. 
+ */ uniformity: number, |} { + let edgeTypes = new Set(); let buckets = new Map(); for (let {from, to, type} of this.getAllEdges()) { let hash = this.#edges.hash(from, to, type); @@ -130,11 +227,20 @@ export default class AdjacencyList { assert(!bucket.has(key), `Duplicate node detected: ${key}`); bucket.add(key); buckets.set(hash, bucket); + edgeTypes.add(type); } let maxCollisions = 0; let collisions = 0; let distribution = 0; + /** + * The expected distribution of hashes across available hash buckets. + * + * See: https://en.wikipedia.org/wiki/Hash_function#Testing_and_measurement + */ + let uniformDistribution = + (this.#edges.count / (2 * this.#edges.capacity)) * + (this.#edges.count + 2 * this.#edges.capacity - 1); for (let bucket of buckets.values()) { maxCollisions = Math.max(maxCollisions, bucket.size - 1); @@ -142,21 +248,17 @@ export default class AdjacencyList { distribution += (bucket.size * (bucket.size + 1)) / 2; } - let uniformity = - distribution / - ((this.#edges.count / (2 * this.#edges.capacity)) * - (this.#edges.count + 2 * this.#edges.capacity - 1)); - return { + capacity: this.#edges.capacity, + nodes: fromNodeId(this.#nodes.nextId), nodeEdgeTypes: this.#nodes.count, - nodeCapacity: this.#nodes.capacity, nodeLoad: `${Math.round(this.#nodes.load * 100)}%`, nodeBufferSize: this.#nodes.bufferSize, edges: this.#edges.count, deleted: this.#edges.deletes, - edgeCapacity: this.#edges.capacity, + edgeTypes: edgeTypes.size, edgeLoad: `${Math.round(this.#edges.load * 100)}%`, edgeLoadWithDeletes: `${Math.round( this.#edges.getLoad(this.#edges.count + this.#edges.deletes) * 100, @@ -165,16 +267,18 @@ export default class AdjacencyList { collisions, maxCollisions, - avgCollisions: Math.round((collisions / buckets.size) * 100) / 100 || 0, - uniformity: Math.round(uniformity * 100) / 100 || 0, + avgCollisions: + Math.round((collisions / this.#edges.count) * 100) / 100 || 0, + uniformity: + Math.round((distribution / uniformDistribution) * 100) / 100 || 0, 
}; } /** * Resize the internal nodes array. * - * This is used in `addNode` when the `numNodes` meets or exceeds - * the allocated size of the `nodes` array. + * This is used in `addNode` and in `addEdge` when + * the `nodes` array is at capacity, */ resizeNodes(size: number) { let nodes = this.#nodes; @@ -187,57 +291,72 @@ export default class AdjacencyList { /** * Resize the internal edges array. * - * This is used in `addEdge` when the `numEdges` meets or exceeds - * the allocated size of the `edges` array. + * This is used in `addEdge` when the `edges` array is at capacity. */ resizeEdges(size: number) { // Allocate the required space for new `nodes` and `edges` maps. - let copy = new AdjacencyList({ - nodeCapacity: this.#nodes.capacity, - edgeCapacity: size, - }); + let edges = new EdgeTypeMap(size); + let nodes = new NodeTypeMap(this.#nodes.capacity); // Copy the existing edges into the new array. - copy.#nodes.nextId = this.#nodes.nextId; + nodes.nextId = this.#nodes.nextId; this.#edges.forEach( edge => - void copy.addEdge( + void link( this.#edges.from(edge), this.#edges.to(edge), this.#edges.typeOf(edge), + edges, + nodes, + this.#params.unloadFactor, ), ); // We expect to preserve the same number of edges. assert( - this.#edges.count === copy.#edges.count, - `Edge mismatch! ${this.#edges.count} does not match ${ - copy.#edges.count - }.`, + this.#edges.count === edges.count, + `Edge mismatch! ${this.#edges.count} does not match ${edges.count}.`, ); // Finally, copy the new data arrays over to this graph. - this.#nodes = copy.#nodes; - this.#edges = copy.#edges; + this.#nodes = nodes; + this.#edges = edges; } /** * Adds a node to the graph. * + * Note that this method does not increment the node count + * (that only happens in `addEdge`), it _may_ preemptively resize + * the nodes array if it is at capacity, under the asumption that + * at least 1 edge to or from this new node will be added. + * * Returns the id of the added node. 
*/ addNode(): NodeId { let id = this.#nodes.getId(); - // If we're in danger of overflowing the `nodes` array, resize it. - if (this.#nodes.load > LOAD_FACTOR) { - this.resizeNodes(increaseNodeCapacity(this.#nodes.capacity)); + if (this.#nodes.getLoad() >= 1) { + this.resizeNodes( + increaseNodeCapacity(this.#nodes.capacity, this.#params), + ); } + return id; } /** * Adds an edge to the graph. * + * This method will increment the edge count, and it _may_ + * also increment the node count, if the originating or + * terminating node does not yet have any edges of the given type. + * + * If either the `nodes` or `edges` arrays are at capacity, + * this method will resize them before adding. + * + * Furthermore, if the `edges` array has a high number of + * deleted edges, it may reclaim the space before adding. + * * Returns `true` if the edge was added, * or `false` if the edge already exists. */ @@ -246,69 +365,54 @@ export default class AdjacencyList { to: NodeId, type: TEdgeType | NullEdgeType = 1, ): boolean { + assert(from < this.#nodes.nextId, `Node ${from} does not exist.`); + assert(to < this.#nodes.nextId, `Node ${to} does not exist.`); assert(type > 0, `Unsupported edge type ${type}`); - let hash = this.#edges.hash(from, to, type); - let edge = this.#edges.addressOf(hash, from, to, type); + let result; + let tries = 0; - // The edge is already in the graph; do nothing. - if (edge !== null) return false; - - let capacity = this.#edges.capacity; - // We add 1 to account for the edge we are adding. - let count = this.#edges.count + 1; - // Since the space occupied by deleted edges isn't reclaimed, - // we include them in our count to avoid overflowing the `edges` array. - let deletes = this.#edges.deletes; - let total = count + deletes; - // If we have enough space to keep adding edges, we can - // put off reclaiming the deleted space until the next resize. 
- if (this.#edges.getLoad(total) > LOAD_FACTOR) { - if (this.#edges.getLoad(deletes) > UNLOAD_FACTOR) { - // If we have a significant number of deletes, we compute our new - // capacity based on the current count, even though we decided to - // resize based on the sum total of count and deletes. - // In this case, resizing is more like a compaction. - this.resizeEdges( - getNextEdgeCapacity(capacity, count, this.#edges.getLoad(count)), - ); - } else { - this.resizeEdges( - getNextEdgeCapacity(capacity, total, this.#edges.getLoad(total)), - ); - } - // We must rehash because the capacity has changed. - hash = this.#edges.hash(from, to, type); - } - - let toNode = this.#nodes.addressOf(to, type); - let fromNode = this.#nodes.addressOf(from, type); - if (toNode === null || fromNode === null) { - // If we're in danger of overflowing the `nodes` array, resize it. - if (this.#nodes.load >= LOAD_FACTOR) { - this.resizeNodes(increaseNodeCapacity(this.#nodes.capacity)); - // We need to update our indices since the `nodes` array has changed. - toNode = this.#nodes.addressOf(to, type); - fromNode = this.#nodes.addressOf(from, type); - } - } - if (toNode === null) toNode = this.#nodes.add(to, type); - if (fromNode === null) fromNode = this.#nodes.add(from, type); - - // Add our new edge to its hash bucket. - edge = this.#edges.add(hash, from, to, type); + do { + assert(tries++ < MAX_LINK_TRIES, 'Failed to addEdge too many times!'); - // Link this edge to the node's list of incoming edges. - let prevIn = this.#nodes.linkIn(toNode, edge); - if (prevIn !== null) this.#edges.linkIn(prevIn, edge); + result = link( + from, + to, + type, + this.#edges, + this.#nodes, + this.#params.unloadFactor, + ); - // Link this edge to the node's list of outgoing edges. - let prevOut = this.#nodes.linkOut(fromNode, edge); - if (prevOut !== null) this.#edges.linkOut(prevOut, edge); + // Sometimes we need to resize before we can add. 
+ switch (result) { + case LinkResult.NodesOverloaded: { + this.resizeNodes( + increaseNodeCapacity(this.#nodes.capacity, this.#params), + ); + break; + } + case LinkResult.EdgesOverloaded: { + this.resizeEdges( + increaseEdgeCapacity(this.#edges.capacity, this.#params), + ); + break; + } + case LinkResult.TooManyDeletes: { + this.resizeEdges( + decreaseEdgeCapacity(this.#edges.capacity, this.#params), + ); + break; + } + } + } while (result > LinkResult.EdgeExists); - return true; + return result === LinkResult.EdgeAdded; } + /** + * Iterate over all edges in insertion order. + */ *getAllEdges(): Iterator<{| type: TEdgeType | NullEdgeType, from: NodeId, @@ -344,7 +448,12 @@ export default class AdjacencyList { } /** + * Remove an edge connecting the `from` and `to` nodes. + * + * Note that space for the deleted edge is not reclaimed + * until the `edges` array is resized. * + * This method will increment the edge delete count. */ removeEdge( from: NodeId, @@ -386,6 +495,13 @@ export default class AdjacencyList { this.#edges.delete(edge); } + /** + * Check if the given node has any edges incoming from other nodes. + * + * Essentially, this is an orphan check. If a node has no incoming edges, + * it (and its entire subgraph) is completely disconnected from the + * rest of the graph. + */ hasInboundEdges(to: NodeId): boolean { let node = this.#nodes.head(to); while (node !== null) { @@ -395,6 +511,10 @@ export default class AdjacencyList { return false; } + /** + * Get a list of every node (labeled `from`) connecting _to_ + * the given `to` node, along with the edge `type` connecting them. + */ getInboundEdgesByType( to: NodeId, ): {|type: TEdgeType | NullEdgeType, from: NodeId|}[] { @@ -413,6 +533,10 @@ export default class AdjacencyList { return edges; } + /** + * Get a list of every node (labeled `to`) connected _from_ + * the given `from` node, along with the edge `type` connecting them. 
+ */ getOutboundEdgesByType( from: NodeId, ): {|type: TEdgeType | NullEdgeType, to: NodeId|}[] { @@ -432,7 +556,11 @@ export default class AdjacencyList { } /** - * Get the list of nodes connected from this node. + * Get the list of node ids connected from this node. + * + * If `type` is specified, only return nodes connected by edges of that type. + * If `type` is an array, return nodes connected by edges of any of those types. + * If `type` is `AllEdgeTypes` (`-1`), return nodes connected by edges of any type. */ getNodeIdsConnectedFrom( from: NodeId, @@ -486,7 +614,11 @@ export default class AdjacencyList { } /** - * Get the list of nodes connected to this node. + * Get the list of node ids connected to this node. + * + * If `type` is specified, only return nodes connected by edges of that type. + * If `type` is an array, return nodes connected by edges of any of those types. + * If `type` is `AllEdgeTypes` (`-1`), return nodes connected by edges of any type. */ getNodeIdsConnectedTo( to: NodeId, @@ -549,7 +681,7 @@ export default class AdjacencyList { * └──┴──┴──┴───────┴──┴──┴──┴───────┴──┴──┘ * └──┬──┘ └─────────┬─────────┘ * header items - * (HEADER_SIZE) (capacity * ITEM_SIZE * BUCKET_SIZE) + * (HEADER_SIZE) (capacity * ITEM_SIZE) * * * An item is added with a hash key that fits within the range of the hash @@ -608,31 +740,47 @@ export class SharedTypeMap /** The offset at which an item's type is stored. */ static #TYPE: 1 = 1; - /** The number of items to accommodate per hash bucket. */ - static BUCKET_SIZE: number = 2; + /** The largest possible capacity. */ + static get MAX_CAPACITY(): number { + return Math.floor( + // https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Errors/Invalid_array_length#what_went_wrong + (2 ** 31 - 1 - this.HEADER_SIZE) / this.ITEM_SIZE, + ); + } + + /** Assert that the given `capacity` does not exceed `MAX_CAPACITY`. 
*/ + static assertMaxCapacity(capacity: number): void { + assert(capacity <= this.MAX_CAPACITY, `${this.name} capacity overflow!`); + } data: Uint32Array; + /** The total number of items that can fit in the map. */ get capacity(): number { return this.data[SharedTypeMap.#CAPACITY]; } + /** The number of items in the map. */ get count(): number { return this.data[SharedTypeMap.#COUNT]; } + /** The ratio of the count to the capacity. */ get load(): number { return this.getLoad(); } + /** The total length of the map, in bytes. */ get length(): number { return this.getLength(); } + /** The address of the first item in the map. */ get addressableLimit(): number { return this.constructor.HEADER_SIZE + this.capacity; } + /** The size of the map in mb, as a localized string. */ get bufferSize(): string { return `${(this.data.byteLength / 1024 / 1024).toLocaleString(undefined, { minimumFractionDigits: 2, @@ -640,6 +788,11 @@ export class SharedTypeMap })} mb`; } + /** + * Create a new `SharedTypeMap` in one of two ways: + * - with a capacity of `capacityOrData` if it is a number, + * - or with `capacityOrData` as its data, if it is a `Uint32Array`. + */ constructor(capacityOrData: number | Uint32Array) { if (typeof capacityOrData === 'number') { let {BYTES_PER_ELEMENT} = Uint32Array; @@ -655,6 +808,13 @@ export class SharedTypeMap } } + /** + * Overwrite the data in this map with the given `data`. + * + * The `data` is expected to conform to the same + * partitioning and schema as the data in this map, + * and is expected to be of equal or smaller capacity to this map. + */ set(data: Uint32Array): void { let {HEADER_SIZE, ITEM_SIZE} = this.constructor; let NEXT = SharedTypeMap.#NEXT; @@ -686,14 +846,26 @@ export class SharedTypeMap } } + /** + * Given a `count` (defaulting to `this.count`), + * get the load on the map. + * + * The load is the ratio of the `count` the capacity of the map. 
+ * + * If the load is `1`, it means the map is at capacity, and needs + * to be resized before adding more items. + */ getLoad(count: number = this.count): number { - let {BUCKET_SIZE} = this.constructor; - return count / (this.capacity * BUCKET_SIZE); + return count / this.capacity; } + /** + * Given a `capacity` (defaulting to `this.capacity`), + * get the length of the map, in bytes. + */ getLength(capacity: number = this.capacity): number { - let {HEADER_SIZE, ITEM_SIZE, BUCKET_SIZE} = this.constructor; - return capacity + HEADER_SIZE + ITEM_SIZE * BUCKET_SIZE * capacity; + let {HEADER_SIZE, ITEM_SIZE} = this.constructor; + return capacity + HEADER_SIZE + ITEM_SIZE * capacity; } /** Get the next available address in the map. */ @@ -714,10 +886,15 @@ export class SharedTypeMap return (this.data[(item: any) + NEXT]: any) || null; } + /** Get the type of the item at the given `item` address. */ typeOf(item: TAddress): TItemType { return (this.data[item + SharedTypeMap.#TYPE]: any); } + /** + * Store an item of `type` at the `item` address and + * link the address to the `hash` bucket. + */ link(hash: THash, item: TAddress, type: TItemType): void { let COUNT = SharedTypeMap.#COUNT; let NEXT = SharedTypeMap.#NEXT; @@ -741,6 +918,9 @@ export class SharedTypeMap this.data[COUNT]++; } + /** + * Remove the link to the `item` address from the `hash` bucket. 
+ */ unlink(hash: THash, item: TAddress): void { let COUNT = SharedTypeMap.#COUNT; let NEXT = SharedTypeMap.#NEXT; @@ -815,13 +995,13 @@ export class SharedTypeMap table: Uint32Array, data: Uint32Array, |} { - const {HEADER_SIZE, ITEM_SIZE, BUCKET_SIZE} = this.constructor; - let min = HEADER_SIZE + this.capacity; - let max = min + this.capacity * BUCKET_SIZE * ITEM_SIZE; + const {HEADER_SIZE} = this.constructor; + let min = this.addressableLimit; + return { header: this.data.subarray(0, HEADER_SIZE), table: this.data.subarray(HEADER_SIZE, min), - data: this.data.subarray(min, max), + data: this.data.subarray(min), }; } } @@ -829,7 +1009,17 @@ export class SharedTypeMap /** * Nodes are stored in a `SharedTypeMap`, keyed on node id plus an edge type. * This means that for any given unique node id, there may be `e` nodes in the - * map, where `e` is the number of possible edge types in the graph. + * map, where `e` is the number of unique edge types in the graph. + * + * The _hash_ for a node is simply the node id (as issued by `getId`), + * and forms the head of linked list of unique _edge types_ connected + * to or from the same node id. + * + * In addition to a unique edge type, each Node contains the heads and tails + * of doubly linked lists of incoming and outgoing edges of the same type. + * + * Note that the links in the doubly linked lists are Edges (not Nodes), + * which are stored in a corresponding `EdgeTypeMap`. */ export class NodeTypeMap extends SharedTypeMap< TEdgeType, @@ -849,6 +1039,15 @@ export class NodeTypeMap extends SharedTypeMap< * ┌──────────┬───────┬─────────┐ * │ CAPACITY │ COUNT │ NEXT_ID │ * └──────────┴───────┴─────────┘ + * + * The `nextId` is a count of the number of times `getId` has been called. + * This is distinct concept from the `count`, which tracks the number of times + * `add` has been called. 
+ * + * The reason for this distinction is that `getId` is called once per node + * (to issue a _unique_ id) and will _always increment_ the `nextId` counter, + * whereas `add` is called once per edge, and will only increment the `count` + * if the _type_ of edge is new for the given node. */ static HEADER_SIZE: number = 3; /** The offset from the header where the next available node id is stored. */ @@ -870,6 +1069,9 @@ export class NodeTypeMap extends SharedTypeMap< * ┌──────┬──────┬──────────┬───────────┬─────────┬──────────┐ * │ NEXT │ TYPE │ FIRST_IN │ FIRST_OUT │ LAST_IN │ LAST_OUT │ * └──────┴──────┴──────────┴───────────┴─────────┴──────────┘ + * + * The `Node` implicitly maps a node id (the hash the node was added with) + * to the first and last incoming and outgoing edges of the same _edge type_. */ static ITEM_SIZE: number = 6; /** The offset at which a node's first incoming edge of this type is stored. */ @@ -881,16 +1083,6 @@ export class NodeTypeMap extends SharedTypeMap< /** The offset at which a node's last outgoing edge of this type is stored. */ static #LAST_OUT = 5; - /** The smallest functional node map capacity. */ - static MIN_CAPACITY: number = 2; - /** The largest possible node map capacity. */ - static MAX_CAPACITY: number = Math.floor( - // https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Errors/Invalid_array_length#what_went_wrong - (2 ** 31 - 1 - NodeTypeMap.HEADER_SIZE) / - NodeTypeMap.ITEM_SIZE / - NodeTypeMap.BUCKET_SIZE, - ); - get nextId(): NodeId { return toNodeId(this.data[NodeTypeMap.#NEXT_ID]); } @@ -898,18 +1090,31 @@ export class NodeTypeMap extends SharedTypeMap< this.data[NodeTypeMap.#NEXT_ID] = fromNodeId(nextId); } - /** Get a unique node id. */ - getId(): NodeId { - return toNodeId(this.data[NodeTypeMap.#NEXT_ID]++); - } - - getLoad(count: number = this.count): number { + /** + * Get the load on the node map. 
+ * + * The load is the greater of either: + * - the ratio of the number of node ids to the capacity of the map, + * - or the ratio of the `count` to the capacity of the map. + * + * if `count` is not provided, the default is the number of items + * currently added to the map. + */ + getLoad(count?: number): number { return Math.max( fromNodeId(this.nextId) / this.capacity, super.getLoad(count), ); } + /** Increment the node counter to get a unique node id. */ + getId(): NodeId { + return toNodeId(this.data[NodeTypeMap.#NEXT_ID]++); + } + + /** + * Add new lists of edges of the given `type` to and from the given `node`. + */ add(node: NodeId, type: TEdgeType): NodeAddress { let index = fromNodeId(node); assert( @@ -921,6 +1126,10 @@ export class NodeTypeMap extends SharedTypeMap< return address; } + /** + * Get the address of the lists edges of the given `type` + * to and from the given `node`. + */ addressOf(node: NodeId, type: TEdgeType): NodeAddress | null { let address = this.head(node); while (address !== null) { @@ -932,22 +1141,45 @@ export class NodeTypeMap extends SharedTypeMap< return null; } + /** + * Given a `node` address, get the _head_ of the linked list + * of incoming edges of the same type to the same node. + */ firstIn(node: NodeAddress): EdgeAddress | null { return this.data[node + NodeTypeMap.#FIRST_IN] || null; } + /** + * Given a `node` address, get the _head_ of the linked list + * of outgoing edges of the same type from the same node. + */ firstOut(node: NodeAddress): EdgeAddress | null { return this.data[node + NodeTypeMap.#FIRST_OUT] || null; } + /** + * Given a `node` address, get the _tail_ of the linked list + * of incoming edges of the same type to the same node. + */ lastIn(node: NodeAddress): EdgeAddress | null { return this.data[node + NodeTypeMap.#LAST_IN] || null; } + /** + * Given a `node` address, get the _tail_ of the linked list + * of outgoing edges of the same type from the same node. 
+ */ lastOut(node: NodeAddress): EdgeAddress | null { return this.data[node + NodeTypeMap.#LAST_OUT] || null; } + /** + * Set `edge` as the last incoming edge to `node`. + * If `node` has no incoming edges, set `edge` + * as the first incoming edge, as well. + * + * Returns the address of the old last incoming edge, if any. + */ linkIn(node: NodeAddress, edge: EdgeAddress): EdgeAddress | null { let first = this.firstIn(node); let last = this.lastIn(node); @@ -956,6 +1188,13 @@ export class NodeTypeMap extends SharedTypeMap< return last; } + /** + * If `edge` is the last incoming edge to `node`, + * update the node's last incoming edge to `prev`. + * + * If `edge` is the first incoming edge to `node`, + * update the node's first incoming edge to `next`. + */ unlinkIn( node: NodeAddress, edge: EdgeAddress, @@ -972,6 +1211,13 @@ export class NodeTypeMap extends SharedTypeMap< } } + /** + * Set `edge` as the last outgoing edge from `node`. + * If `node` has no outgoing edges, set `edge` + * as the first outgoing edge, as well. + * + * Returns the address of the old last outgoing edge, if any. + */ linkOut(node: NodeAddress, edge: EdgeAddress): EdgeAddress | null { let first = this.firstOut(node); let last = this.lastOut(node); @@ -980,6 +1226,13 @@ export class NodeTypeMap extends SharedTypeMap< return last; } + /** + * If `edge` is the last outgoing edge from `node`, + * update the node's last outgoing edge to `prev`. + * + * If `edge` is the first outgoing edge from `node`, + * update the node's first outgoing edge to `next`. + */ unlinkOut( node: NodeAddress, edge: EdgeAddress, @@ -1000,6 +1253,14 @@ export class NodeTypeMap extends SharedTypeMap< /** * Edges are stored in a `SharedTypeMap`, * keyed on the 'from' and 'to' node ids, and the edge type. + * + * The _hash_ for an edge is a hash of the edge's `from`, `to`, and `type` values, + * and forms the head of linked list of edges with the same hash. 
+ * + * In addition to the `from`, `to` and `type` values, each Edge contains + * the next and previous links of doubly linked lists of the _adjacent_ edges + * of the same type, both incoming to the `to` node, and outgoing from + * the `from` node. */ export class EdgeTypeMap extends SharedTypeMap< TEdgeType, @@ -1019,6 +1280,13 @@ export class EdgeTypeMap extends SharedTypeMap< * ┌──────────┬───────┬─────────┐ * │ CAPACITY │ COUNT │ DELETES │ * └──────────┴───────┴─────────┘ + * + * Since new edges are always appended, the space for deleted edges + * is not reused. Instead, the `deletes` count is incremented when an + * edge is deleted. The next available address is calculated by + * adding the `count` and `deletes` values to the header size. + * + * The only way to reclaim the space used by deleted edges is to resize the map. */ static HEADER_SIZE: number = 3; /** The offset from the header where the delete count is stored. */ @@ -1042,6 +1310,10 @@ export class EdgeTypeMap extends SharedTypeMap< * ┌──────┬──────┬──────┬────┬─────────┬─────────┬──────────┬──────────┐ * │ NEXT │ TYPE │ FROM │ TO │ NEXT_IN │ PREV_IN │ NEXT_OUT │ PREV_OUT │ * └──────┴──────┴──────┴────┴─────────┴─────────┴──────────┴──────────┘ + * + * The `Edge` implicitly maps an edge hash (the hash of the edge's `FROM`, + * `TO`, and `TYPE` values) to the next and previous adjacent edges of the + * same _edge type_. */ static ITEM_SIZE: number = 8; /** The offset at which an edge's 'from' node id is stored. */ @@ -1057,27 +1329,21 @@ export class EdgeTypeMap extends SharedTypeMap< /** The offset at which the 'from' node's previous outgoing edge is stored. */ static #PREV_OUT = 7; - /** The smallest functional edge map capacity. */ - static MIN_CAPACITY: number = 2; - /** The largest possible edge map capacity. 
*/ - static MAX_CAPACITY: number = Math.floor( - // https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Errors/Invalid_array_length#what_went_wrong - (2 ** 31 - 1 - EdgeTypeMap.HEADER_SIZE) / - EdgeTypeMap.ITEM_SIZE / - EdgeTypeMap.BUCKET_SIZE, - ); - /** The size after which to grow the capacity by the minimum factor. */ - static PEAK_CAPACITY: number = 2 ** 18; - + /** The number of deleted edges currently occupying space in the map. */ get deletes(): number { return this.data[EdgeTypeMap.#DELETES]; } + /** Get the next available address in the map. */ getNextAddress(): EdgeAddress { let {ITEM_SIZE} = this.constructor; return this.addressableLimit + (this.count + this.deletes) * ITEM_SIZE; } + /** + * Add an edge of the given `type` between the `to` and `from` nodes + * and link the address to the `hash` bucket. + */ add(hash: EdgeHash, from: NodeId, to: NodeId, type: TEdgeType): EdgeAddress { assert( hash >= 0 && hash < this.capacity, @@ -1092,12 +1358,20 @@ export class EdgeTypeMap extends SharedTypeMap< return edge; } + /** + * Remove the `to` and `from` nodes for the given `edge` address + * and increment the `deletes` counter. + */ delete(edge: EdgeAddress): void { this.data[edge + EdgeTypeMap.#FROM] = 0; this.data[edge + EdgeTypeMap.#TO] = 0; this.data[EdgeTypeMap.#DELETES]++; } + /** + * Get the address of the edge with the given `hash`, `from` and `to` nodes, + * and edge `type`. + */ addressOf( hash: EdgeHash, from: NodeId, @@ -1118,27 +1392,44 @@ export class EdgeTypeMap extends SharedTypeMap< return null; } + /** Get the id of the 'from' node for the given `edge` address. */ from(edge: EdgeAddress): NodeId { return toNodeId(this.data[edge + EdgeTypeMap.#FROM]); } + /** Get the id of the 'to' node for the given `edge` address. */ to(edge: EdgeAddress): NodeId { return toNodeId(this.data[edge + EdgeTypeMap.#TO]); } + /** + * Get the address of the next edge _of the same type_ + * incoming _to the same node_ as the edge at the given address. 
+ */ nextIn(edge: EdgeAddress): EdgeAddress | null { return this.data[edge + EdgeTypeMap.#NEXT_IN] || null; } + /** + * Get the address of the previous edge _of the same type_ + * incoming _to the same node_ as the edge at the given address. + */ prevIn(edge: EdgeAddress): EdgeAddress | null { return this.data[edge + EdgeTypeMap.#PREV_IN] || null; } + /** Link two adjacent edges of the same type incoming to the same node. */ linkIn(edge: EdgeAddress, next: EdgeAddress) { + assert(this.typeOf(edge) === this.typeOf(next), 'Edge types must match.'); + assert(this.to(edge) === this.to(next), 'To nodes must match.'); this.data[edge + EdgeTypeMap.#NEXT_IN] = next; this.data[next + EdgeTypeMap.#PREV_IN] = edge; } + /** + * Unlink an edge from the doubly linked list of incoming edges + * to the same node. + */ unlinkIn(edge: EdgeAddress) { let next = this.nextIn(edge); let prev = this.prevIn(edge); @@ -1154,19 +1445,34 @@ export class EdgeTypeMap extends SharedTypeMap< } } + /** + * Get the address of the next edge _of the same type_ + * outgoing _from the same node_ as the edge at the given address. + */ nextOut(edge: EdgeAddress): EdgeAddress | null { return this.data[edge + EdgeTypeMap.#NEXT_OUT] || null; } + /** + * Get the address of the previous edge _of the same type_ + * outgoing _from the same node_ as the edge at the given address. + */ prevOut(edge: EdgeAddress): EdgeAddress | null { return this.data[edge + EdgeTypeMap.#PREV_OUT] || null; } + /** Link two adjacent edges of the same type outgoing from the same node. */ linkOut(edge: EdgeAddress, next: EdgeAddress) { + assert(this.typeOf(edge) === this.typeOf(next), 'Edge types must match.'); + assert(this.from(edge) === this.from(next), 'From nodes must match.'); this.data[edge + EdgeTypeMap.#NEXT_OUT] = next; this.data[next + EdgeTypeMap.#PREV_OUT] = edge; } + /** + * Unlink an edge from the doubly linked list of outgoing edges + * of the same type from the same node. 
+ */ unlinkOut(edge: EdgeAddress) { let next = this.nextOut(edge); let prev = this.prevOut(edge); @@ -1198,6 +1504,77 @@ export class EdgeTypeMap extends SharedTypeMap< } } +/** + * Links a node to another node with an edge of the given type. + * + * Returns one of the following numeric status codes: + * - `0` EdgeAdded: the edge was added + * - `1` EdgeExists: the edge already exists + * - `2` EdgesOverloaded: the edge map is overloaded + * - `3` TooManyDeletes: the edge map has too many deleted edges + * - `4` NodesOverloaded: the node map is overloaded + */ +function link( + from: NodeId, + to: NodeId, + type: TEdgeType | NullEdgeType, + edges: EdgeTypeMap, + nodes: NodeTypeMap, + unloadFactor: number = DEFAULT_PARAMS.unloadFactor, +): $Values { + let hash = edges.hash(from, to, type); + let edge = edges.addressOf(hash, from, to, type); + + // The edge is already in the graph; do nothing. + if (edge !== null) return LinkResult.EdgeExists; + + let toNode = nodes.addressOf(to, type); + let fromNode = nodes.addressOf(from, type); + + let nodeCount = nodes.count; + // add one for each node we must add. + if (toNode === null) nodeCount++; + if (fromNode === null) nodeCount++; + // If we're in danger of overflowing the `nodes` array, resize it. + if (nodes.getLoad(nodeCount) >= 1) { + return LinkResult.NodesOverloaded; + } + + // We add 1 to account for the edge we are adding. + let count = edges.count + 1; + // Since the space occupied by deleted edges isn't reclaimed, + // we include them in our count to avoid overflowing the `edges` array. + let deletes = edges.deletes; + let total = count + deletes; + if (edges.getLoad(total) >= 1) { + if ( + edges.getLoad(deletes) >= unloadFactor && + edges.getLoad(count) < unloadFactor + ) { + // If we have a significant number of deletes, reclaim the space. 
+ return LinkResult.TooManyDeletes; + } else { + return LinkResult.EdgesOverloaded; + } + } + + if (toNode === null) toNode = nodes.add(to, type); + if (fromNode === null) fromNode = nodes.add(from, type); + + // Add our new edge to its hash bucket. + edge = edges.add(hash, from, to, type); + + // Link this edge to the node's list of incoming edges. + let prevIn = nodes.linkIn(toNode, edge); + if (prevIn !== null) edges.linkIn(prevIn, edge); + + // Link this edge to the node's list of outgoing edges. + let prevOut = nodes.linkOut(fromNode, edge); + if (prevOut !== null) edges.linkOut(prevOut, edge); + + return LinkResult.EdgeAdded; +} + // From https://gist.github.com/badboy/6267743#32-bit-mix-functions function hash32shift(key: number): number { key = ~key + (key << 15); // key = (key << 15) - key - 1; @@ -1213,33 +1590,66 @@ function interpolate(x: number, y: number, t: number): number { return x + (y - x) * Math.min(1, Math.max(0, t)); } -function increaseNodeCapacity(nodeCapacity: number): number { - let {MIN_CAPACITY, MAX_CAPACITY} = NodeTypeMap; - let newCapacity = Math.round(nodeCapacity * MIN_GROW_FACTOR); - assert(newCapacity <= MAX_CAPACITY, 'Node capacity overflow!'); - return Math.max(MIN_CAPACITY, newCapacity); +function increaseNodeCapacity( + currentCapacity: number, + params: AdjacencyListParams, +): number { + let newCapacity = Math.max( + // Make sure we have room for at least 2 more nodes. 
+ currentCapacity + 2, + Math.ceil(currentCapacity * params.minGrowFactor), + ); + + if (newCapacity >= NodeTypeMap.MAX_CAPACITY) { + if (currentCapacity > NodeTypeMap.MAX_CAPACITY - 2) { + throw new Error('Node capacity overflow!'); + } + + return NodeTypeMap.MAX_CAPACITY; + } + + return newCapacity; } -function getNextEdgeCapacity( - capacity: number, - count: number, - load: number, +function increaseEdgeCapacity( + currentCapacity: number, + params: AdjacencyListParams, ): number { - let {MIN_CAPACITY, MAX_CAPACITY, PEAK_CAPACITY} = EdgeTypeMap; - let newCapacity = capacity; - if (load > LOAD_FACTOR) { - // This is intended to strike a balance between growing the edge capacity - // in too small increments, which causes a lot of resizing, and growing - // the edge capacity in too large increments, which results in a lot of - // wasted memory. - let pct = capacity / PEAK_CAPACITY; - let growFactor = interpolate(MAX_GROW_FACTOR, MIN_GROW_FACTOR, pct); - newCapacity = Math.round(capacity * growFactor); - } else if (load < UNLOAD_FACTOR) { - // In some cases, it may be possible to shrink the edge capacity, - // but this is only likely to occur when a lot of edges have been removed. - newCapacity = Math.round(capacity * SHRINK_FACTOR); - } - assert(newCapacity <= MAX_CAPACITY, 'Edge capacity overflow!'); - return Math.max(MIN_CAPACITY, newCapacity); + // This is intended to strike a balance between growing the edge capacity + // in too small increments, which causes a lot of resizing, and growing + // the edge capacity in too large increments, which results in a lot of + // wasted memory. + let pct = currentCapacity / params.peakCapacity; + let growFactor = interpolate(params.maxGrowFactor, params.minGrowFactor, pct); + + let newCapacity = Math.max( + // Make sure we have room for at least one more edge. 
+ currentCapacity + 1, + Math.ceil(currentCapacity * growFactor), + ); + + if (newCapacity >= EdgeTypeMap.MAX_CAPACITY) { + if (currentCapacity > EdgeTypeMap.MAX_CAPACITY - 1) { + throw new Error('Edge capacity overflow!'); + } + + return EdgeTypeMap.MAX_CAPACITY; + } + + return newCapacity; +} + +function decreaseEdgeCapacity( + currentCapacity: number, + params: AdjacencyListParams, +): number { + return Math.max( + // Make sure we don't shrink the capacity _below_ 2. + 2, + Math.min( + // Make sure we shrink the capacity by at least 1. + currentCapacity - 1, + Math.ceil(currentCapacity * params.shrinkFactor), + ), + ); } diff --git a/packages/core/graph/test/AdjacencyList.test.js b/packages/core/graph/test/AdjacencyList.test.js index 37f096b6fa3..9d320819381 100644 --- a/packages/core/graph/test/AdjacencyList.test.js +++ b/packages/core/graph/test/AdjacencyList.test.js @@ -19,18 +19,18 @@ describe('AdjacencyList', () => { let id = graph.addNode(); assert.equal(id, 0); assert.equal(graph.stats.nodes, 1); + let id2 = graph.addNode(); + assert.equal(id2, 1); + assert.equal(graph.stats.nodes, 2); }); - it('addNode should resize nodes array when necessary', () => { + it('addNode should resize nodes array', () => { let graph = new AdjacencyList(); let size = graph.serialize().nodes.byteLength; - let a = graph.addNode(); - let b = graph.addNode(); - assert(size < (size = graph.serialize().nodes.byteLength)); - graph.addEdge(a, b, 1); - graph.addEdge(a, b, 2); - graph.addEdge(a, b, 3); - graph.addEdge(a, b, 4); + graph.addNode(); + graph.addNode(); + graph.addNode(); + graph.addNode(); assert(size < graph.serialize().nodes.byteLength); }); @@ -168,6 +168,18 @@ describe('AdjacencyList', () => { assert.equal(graph.addEdge(a, b), false); }); + it('addEdge should resize nodes array when necessary', () => { + let graph = new AdjacencyList(); + let a = graph.addNode(); + let b = graph.addNode(); + let size = graph.serialize().nodes.byteLength; + graph.addEdge(a, b, 1); + 
graph.addEdge(a, b, 2); + graph.addEdge(a, b, 3); + graph.addEdge(a, b, 4); + assert(size < graph.serialize().nodes.byteLength); + }); + it('addEdge should resize edges array when necessary', () => { let graph = new AdjacencyList(); let size = graph.serialize().edges.byteLength; @@ -226,21 +238,23 @@ describe('AdjacencyList', () => { // $FlowFixMe[prop-missing] AdjacencyList.prototype.hash = () => 1; - let graph = new AdjacencyList(); - let n0 = graph.addNode(); - let n1 = graph.addNode(); - graph.addEdge(n0, n1, 2); - graph.removeEdge(n0, n1, 2); - assert(graph.addEdge(n0, n1, 2)); - assert(graph.stats.edges === 1); - assert(graph.stats.deleted === 1); - // Resize to reclaim deleted edge space. - graph.resizeEdges(4); - assert(graph.stats.edges === 1); - assert(graph.stats.deleted === 0); - - // $FlowFixMe[prop-missing] - AdjacencyList.prototype.hash = originalHash; + try { + let graph = new AdjacencyList({initialCapacity: 3}); + let n0 = graph.addNode(); + let n1 = graph.addNode(); + graph.addEdge(n0, n1, 2); + graph.removeEdge(n0, n1, 2); + assert(graph.addEdge(n0, n1, 2)); + assert(graph.stats.edges === 1); + assert(graph.stats.deleted === 1); + // Resize to reclaim deleted edge space. + graph.resizeEdges(2); + assert(graph.stats.edges === 1); + assert(graph.stats.deleted === 0); + } finally { + // $FlowFixMe[prop-missing] + AdjacencyList.prototype.hash = originalHash; + } }); it('hasEdge should accept an array of edge types', () => {