fix: sanitize foreign schemas (#1058)
Arrow-js uses brittle `instanceof` checks throughout the code base. These
fail unless the library instance that produced the object is exactly the
same instance that vectordb is using. At a minimum, this means that a user
on arrow version 15 (or any version that doesn't exactly match the version
vectordb uses) will get strange errors when they try to use vectordb.

However, there are even cases where the versions are perfectly identical
and the `instanceof` check still fails. One such example is when using
`vite` (e.g. vitejs/vite#3910)
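
For illustration, a minimal sketch of the failure mode (it assumes a second
copy of the library installed under an npm alias such as `apache-arrow-old`,
the trick the test setup below uses):

    // Two copies of apache-arrow loaded side by side. Each copy has its own
    // Schema class, so `instanceof` cannot bridge them.
    import { Schema } from 'apache-arrow'
    import { Schema as OldSchema, Field, Int32 } from 'apache-arrow-old'

    const foreign = new OldSchema([new Field('id', new Int32())])

    // Structurally this is a perfectly valid schema, yet the check below is
    // false because `foreign` came from a different library instance.
    console.log(foreign instanceof Schema) // false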

This PR solves the problem in a brute-force but workable fashion. If we
encounter a schema that does not pass the `instanceof` check, we attempt to
sanitize it by traversing the object and, if it has all the correct
properties, constructing an equivalent `Schema` instance via a deep clone.
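
As a rough sketch of the idea (the real sanitize.ts added by this PR handles
every Arrow data type; the `isSchemaLike` guard and the elided `sanitizeType`
helper here are illustrative simplifications):

    import { Schema, Field, type DataType } from 'apache-arrow'

    // Duck-type guard: does this object have the shape of a Schema?
    function isSchemaLike (value: any): boolean {
      return value != null && Array.isArray(value.fields)
    }

    // Rebuild a foreign schema by copying its properties into classes from
    // *our* copy of apache-arrow, so later `instanceof` checks succeed.
    function sanitizeSchemaSketch (schemaLike: any): Schema {
      if (schemaLike instanceof Schema) {
        return schemaLike // already ours, nothing to do
      }
      if (!isSchemaLike(schemaLike)) {
        throw new Error('expected a Schema or Schema-like object')
      }
      const fields = schemaLike.fields.map(
        (f: any) => new Field(f.name, sanitizeType(f.type), f.nullable)
      )
      return new Schema(fields, schemaLike.metadata)
    }

    // The real implementation deep-clones every DataType variant; elided here.
    declare function sanitizeType (typeLike: any): DataType
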
westonpace committed Apr 5, 2024
1 parent 785ecfa commit c60a193
Showing 11 changed files with 1,241 additions and 42 deletions.
5 changes: 5 additions & 0 deletions node/.eslintrc.js
@@ -13,5 +13,10 @@ module.exports = {
   },
   rules: {
     "@typescript-eslint/method-signature-style": "off",
+    "@typescript-eslint/quotes": "off",
+    "@typescript-eslint/semi": "off",
+    "@typescript-eslint/explicit-function-return-type": "off",
+    "@typescript-eslint/space-before-function-paren": "off",
+    "@typescript-eslint/indent": "off",
   }
 }
3 changes: 2 additions & 1 deletion node/package.json
@@ -41,6 +41,7 @@
     "@types/temp": "^0.9.1",
     "@types/uuid": "^9.0.3",
     "@typescript-eslint/eslint-plugin": "^5.59.1",
+    "apache-arrow-old": "npm:apache-arrow@13.0.0",
     "cargo-cp-artifact": "^0.1",
     "chai": "^4.3.7",
     "chai-as-promised": "^7.1.1",
@@ -93,4 +94,4 @@
     "@lancedb/vectordb-linux-x64-gnu": "0.4.11",
     "@lancedb/vectordb-win32-x64-msvc": "0.4.11"
   }
-}
+}
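
The `apache-arrow-old` entry is an npm alias that installs a second, older
copy of apache-arrow (13.0.0) under a different name, presumably so the test
suite can build schemas with a "foreign" library instance. A hypothetical
test using it (the import path of the module under test is an assumption):

    import { Schema, Field, Int32 } from 'apache-arrow-old'
    import { makeArrowTable } from '../src/arrow'

    // A schema built by the old copy of arrow-js. Before this fix it would
    // have tripped the instanceof checks inside makeArrowTable.
    const foreignSchema = new Schema([new Field('id', new Int32())])
    const table = makeArrowTable([{ id: 1 }, { id: 2 }], { schema: foreignSchema as any })
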
27 changes: 23 additions & 4 deletions node/src/arrow.ts
@@ -20,19 +20,20 @@ import {
   type Vector,
   FixedSizeList,
   vectorFromArray,
-  type Schema,
+  Schema,
   Table as ArrowTable,
   RecordBatchStreamWriter,
   List,
   RecordBatch,
   makeData,
   Struct,
-  type Float,
+  Float,
   DataType,
   Binary,
   Float32
 } from 'apache-arrow'
 import { type EmbeddingFunction } from './index'
+import { sanitizeSchema } from './sanitize'
 
 /*
  * Options to control how a column should be converted to a vector array
@@ -201,10 +202,13 @@ export function makeArrowTable (
   }
 
   const opt = new MakeArrowTableOptions(options !== undefined ? options : {})
+  if (opt.schema !== undefined && opt.schema !== null) {
+    opt.schema = sanitizeSchema(opt.schema)
+  }
   const columns: Record<string, Vector> = {}
   // TODO: sample dataset to find missing columns
   // Prefer the field ordering of the schema, if present
-  const columnNames = ((options?.schema) != null) ? (options?.schema?.names as string[]) : Object.keys(data[0])
+  const columnNames = ((opt.schema) != null) ? (opt.schema.names as string[]) : Object.keys(data[0])
   for (const colName of columnNames) {
     if (data.length !== 0 && !Object.prototype.hasOwnProperty.call(data[0], colName)) {
       // The field is present in the schema, but not in the data, skip it
@@ -329,6 +333,9 @@ async function applyEmbeddings<T> (table: ArrowTable, embeddings?: EmbeddingFunc
   if (embeddings == null) {
     return table
   }
+  if (schema !== undefined && schema !== null) {
+    schema = sanitizeSchema(schema)
+  }
 
   // Convert from ArrowTable to Record<String, Vector>
   const colEntries = [...Array(table.numCols).keys()].map((_, idx) => {
@@ -439,6 +446,9 @@ export async function fromRecordsToBuffer<T> (
   embeddings?: EmbeddingFunction<T>,
   schema?: Schema
 ): Promise<Buffer> {
+  if (schema !== undefined && schema !== null) {
+    schema = sanitizeSchema(schema)
+  }
   const table = await convertToTable(data, embeddings, { schema })
   const writer = RecordBatchFileWriter.writeAll(table)
   return Buffer.from(await writer.toUint8Array())
@@ -456,6 +466,9 @@ export async function fromRecordsToStreamBuffer<T> (
   embeddings?: EmbeddingFunction<T>,
   schema?: Schema
 ): Promise<Buffer> {
+  if (schema !== null && schema !== undefined) {
+    schema = sanitizeSchema(schema)
+  }
   const table = await convertToTable(data, embeddings, { schema })
   const writer = RecordBatchStreamWriter.writeAll(table)
   return Buffer.from(await writer.toUint8Array())
@@ -474,6 +487,9 @@ export async function fromTableToBuffer<T> (
   embeddings?: EmbeddingFunction<T>,
   schema?: Schema
 ): Promise<Buffer> {
+  if (schema !== null && schema !== undefined) {
+    schema = sanitizeSchema(schema)
+  }
   const tableWithEmbeddings = await applyEmbeddings(table, embeddings, schema)
   const writer = RecordBatchFileWriter.writeAll(tableWithEmbeddings)
   return Buffer.from(await writer.toUint8Array())
@@ -492,6 +508,9 @@ export async function fromTableToStreamBuffer<T> (
   embeddings?: EmbeddingFunction<T>,
   schema?: Schema
 ): Promise<Buffer> {
+  if (schema !== null && schema !== undefined) {
+    schema = sanitizeSchema(schema)
+  }
   const tableWithEmbeddings = await applyEmbeddings(table, embeddings, schema)
   const writer = RecordBatchStreamWriter.writeAll(tableWithEmbeddings)
   return Buffer.from(await writer.toUint8Array())
@@ -528,5 +547,5 @@ function alignTable (table: ArrowTable, schema: Schema): ArrowTable {
 
 // Creates an empty Arrow Table
 export function createEmptyTable (schema: Schema): ArrowTable {
-  return new ArrowTable(schema)
+  return new ArrowTable(sanitizeSchema(schema))
 }
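
The net effect: every code path that accepts a schema (makeArrowTable,
applyEmbeddings, the four from*ToBuffer helpers, and createEmptyTable) now
normalizes it before any instanceof-dependent code runs. For example (the
foreign-schema source here is again an assumption):

    import { createEmptyTable } from './arrow'
    import { Schema, Field, Utf8 } from 'apache-arrow-old'

    // The foreign schema is deep-cloned into a native Schema before the
    // ArrowTable constructor ever sees it.
    const empty = createEmptyTable(new Schema([new Field('name', new Utf8())]) as any)
    console.log(empty.numRows) // 0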
