From ea33b68c6c20caf327ff6e0677395520cf6d737e Mon Sep 17 00:00:00 2001 From: Weston Pace Date: Mon, 4 Mar 2024 13:06:36 -0800 Subject: [PATCH] fix: sanitize foreign schemas (#1058) Arrow-js uses brittle `instanceof` checks throughout the code base. These fail unless the library instance that produced the object is exactly the same instance that vectordb is using. At a minimum, this means that a user using arrow version 15 (or any version that doesn't exactly match the version that vectordb is using) will get strange errors when they try to use vectordb. However, there are even cases where the versions are perfectly identical and the `instanceof` check still fails. One such example is when using `vite` (e.g. https://github.com/vitejs/vite/issues/3910). This PR solves the problem in a rather brute-force, but workable, fashion. If we encounter a schema that does not pass the `instanceof` check, then we will attempt to sanitize that schema by traversing the object and, if it has all the correct properties, constructing an appropriate `Schema` instance via deep cloning.
--- node/.eslintrc.js | 5 + node/package-lock.json | 43 ++- node/package.json | 3 +- node/src/arrow.ts | 27 +- node/src/sanitize.ts | 501 +++++++++++++++++++++++++++++++++ node/src/test/arrow.test.ts | 42 ++- nodejs/__test__/arrow.test.ts | 70 ++++- nodejs/__test__/table.test.ts | 6 +- nodejs/lancedb/arrow.ts | 26 +- nodejs/lancedb/sanitize.ts | 507 ++++++++++++++++++++++++++++++++++ nodejs/package-lock.json | 93 +++++-- nodejs/package.json | 3 +- 12 files changed, 1280 insertions(+), 46 deletions(-) create mode 100644 node/src/sanitize.ts create mode 100644 nodejs/lancedb/sanitize.ts diff --git a/node/.eslintrc.js b/node/.eslintrc.js index 743f74f39..fe7d61e1c 100644 --- a/node/.eslintrc.js +++ b/node/.eslintrc.js @@ -13,5 +13,10 @@ module.exports = { }, rules: { "@typescript-eslint/method-signature-style": "off", + "@typescript-eslint/quotes": "off", + "@typescript-eslint/semi": "off", + "@typescript-eslint/explicit-function-return-type": "off", + "@typescript-eslint/space-before-function-paren": "off", + "@typescript-eslint/indent": "off", } } diff --git a/node/package-lock.json b/node/package-lock.json index 06adef3a4..7a7373455 100644 --- a/node/package-lock.json +++ b/node/package-lock.json @@ -18,9 +18,7 @@ "win32" ], "dependencies": { - "@apache-arrow/ts": "^14.0.2", "@neon-rs/load": "^0.0.74", - "apache-arrow": "^14.0.2", "axios": "^1.4.0" }, "devDependencies": { @@ -33,6 +31,7 @@ "@types/temp": "^0.9.1", "@types/uuid": "^9.0.3", "@typescript-eslint/eslint-plugin": "^5.59.1", + "apache-arrow-old": "npm:apache-arrow@13.0.0", "cargo-cp-artifact": "^0.1", "chai": "^4.3.7", "chai-as-promised": "^7.1.1", @@ -58,6 +57,10 @@ "@lancedb/vectordb-linux-arm64-gnu": "0.4.11", "@lancedb/vectordb-linux-x64-gnu": "0.4.11", "@lancedb/vectordb-win32-x64-msvc": "0.4.11" + }, + "peerDependencies": { + "@apache-arrow/ts": "^14.0.2", + "apache-arrow": "^14.0.2" } }, "node_modules/@75lb/deep-merge": { @@ -93,6 +96,7 @@ "version": "14.0.2", "resolved": 
"https://registry.npmjs.org/@apache-arrow/ts/-/ts-14.0.2.tgz", "integrity": "sha512-CtwAvLkK0CZv7xsYeCo91ml6PvlfzAmAJZkRYuz2GNBwfYufj5SVi0iuSMwIMkcU/szVwvLdzORSLa5PlF/2ug==", + "peer": true, "dependencies": { "@types/command-line-args": "5.2.0", "@types/command-line-usage": "5.0.2", @@ -109,7 +113,8 @@ "node_modules/@apache-arrow/ts/node_modules/@types/node": { "version": "20.3.0", "resolved": "https://registry.npmjs.org/@types/node/-/node-20.3.0.tgz", - "integrity": "sha512-cumHmIAf6On83X7yP+LrsEyUOf/YlociZelmpRYaGFydoaPdxdt80MAbu6vWerQT2COCp2nPvHdsbD7tHn/YlQ==" + "integrity": "sha512-cumHmIAf6On83X7yP+LrsEyUOf/YlociZelmpRYaGFydoaPdxdt80MAbu6vWerQT2COCp2nPvHdsbD7tHn/YlQ==", + "peer": true }, "node_modules/@cargo-messages/android-arm-eabi": { "version": "0.0.160", @@ -948,6 +953,7 @@ "version": "14.0.2", "resolved": "https://registry.npmjs.org/apache-arrow/-/apache-arrow-14.0.2.tgz", "integrity": "sha512-EBO2xJN36/XoY81nhLcwCJgFwkboDZeyNQ+OPsG7bCoQjc2BT0aTyH/MR6SrL+LirSNz+cYqjGRlupMMlP1aEg==", + "peer": true, "dependencies": { "@types/command-line-args": "5.2.0", "@types/command-line-usage": "5.0.2", @@ -964,10 +970,39 @@ "arrow2csv": "bin/arrow2csv.js" } }, + "node_modules/apache-arrow-old": { + "name": "apache-arrow", + "version": "13.0.0", + "resolved": "https://registry.npmjs.org/apache-arrow/-/apache-arrow-13.0.0.tgz", + "integrity": "sha512-3gvCX0GDawWz6KFNC28p65U+zGh/LZ6ZNKWNu74N6CQlKzxeoWHpi4CgEQsgRSEMuyrIIXi1Ea2syja7dwcHvw==", + "dev": true, + "dependencies": { + "@types/command-line-args": "5.2.0", + "@types/command-line-usage": "5.0.2", + "@types/node": "20.3.0", + "@types/pad-left": "2.1.1", + "command-line-args": "5.2.1", + "command-line-usage": "7.0.1", + "flatbuffers": "23.5.26", + "json-bignum": "^0.0.3", + "pad-left": "^2.1.0", + "tslib": "^2.5.3" + }, + "bin": { + "arrow2csv": "bin/arrow2csv.js" + } + }, + "node_modules/apache-arrow-old/node_modules/@types/node": { + "version": "20.3.0", + "resolved": 
"https://registry.npmjs.org/@types/node/-/node-20.3.0.tgz", + "integrity": "sha512-cumHmIAf6On83X7yP+LrsEyUOf/YlociZelmpRYaGFydoaPdxdt80MAbu6vWerQT2COCp2nPvHdsbD7tHn/YlQ==", + "dev": true + }, "node_modules/apache-arrow/node_modules/@types/node": { "version": "20.3.0", "resolved": "https://registry.npmjs.org/@types/node/-/node-20.3.0.tgz", - "integrity": "sha512-cumHmIAf6On83X7yP+LrsEyUOf/YlociZelmpRYaGFydoaPdxdt80MAbu6vWerQT2COCp2nPvHdsbD7tHn/YlQ==" + "integrity": "sha512-cumHmIAf6On83X7yP+LrsEyUOf/YlociZelmpRYaGFydoaPdxdt80MAbu6vWerQT2COCp2nPvHdsbD7tHn/YlQ==", + "peer": true }, "node_modules/arg": { "version": "4.1.3", diff --git a/node/package.json b/node/package.json index ec18d6293..3952a5df3 100644 --- a/node/package.json +++ b/node/package.json @@ -41,6 +41,7 @@ "@types/temp": "^0.9.1", "@types/uuid": "^9.0.3", "@typescript-eslint/eslint-plugin": "^5.59.1", + "apache-arrow-old": "npm:apache-arrow@13.0.0", "cargo-cp-artifact": "^0.1", "chai": "^4.3.7", "chai-as-promised": "^7.1.1", @@ -93,4 +94,4 @@ "@lancedb/vectordb-linux-x64-gnu": "0.4.11", "@lancedb/vectordb-win32-x64-msvc": "0.4.11" } -} \ No newline at end of file +} diff --git a/node/src/arrow.ts b/node/src/arrow.ts index bdf20cf49..792c68f28 100644 --- a/node/src/arrow.ts +++ b/node/src/arrow.ts @@ -20,19 +20,20 @@ import { type Vector, FixedSizeList, vectorFromArray, - type Schema, + Schema, Table as ArrowTable, RecordBatchStreamWriter, List, RecordBatch, makeData, Struct, - type Float, + Float, DataType, Binary, Float32 } from 'apache-arrow' import { type EmbeddingFunction } from './index' +import { sanitizeSchema } from './sanitize' /* * Options to control how a column should be converted to a vector array @@ -201,10 +202,13 @@ export function makeArrowTable ( } const opt = new MakeArrowTableOptions(options !== undefined ? 
options : {}) + if (opt.schema !== undefined && opt.schema !== null) { + opt.schema = sanitizeSchema(opt.schema) + } const columns: Record = {} // TODO: sample dataset to find missing columns // Prefer the field ordering of the schema, if present - const columnNames = ((options?.schema) != null) ? (options?.schema?.names as string[]) : Object.keys(data[0]) + const columnNames = ((opt.schema) != null) ? (opt.schema.names as string[]) : Object.keys(data[0]) for (const colName of columnNames) { if (data.length !== 0 && !Object.prototype.hasOwnProperty.call(data[0], colName)) { // The field is present in the schema, but not in the data, skip it @@ -329,6 +333,9 @@ async function applyEmbeddings (table: ArrowTable, embeddings?: EmbeddingFunc if (embeddings == null) { return table } + if (schema !== undefined && schema !== null) { + schema = sanitizeSchema(schema) + } // Convert from ArrowTable to Record const colEntries = [...Array(table.numCols).keys()].map((_, idx) => { @@ -439,6 +446,9 @@ export async function fromRecordsToBuffer ( embeddings?: EmbeddingFunction, schema?: Schema ): Promise { + if (schema !== undefined && schema !== null) { + schema = sanitizeSchema(schema) + } const table = await convertToTable(data, embeddings, { schema }) const writer = RecordBatchFileWriter.writeAll(table) return Buffer.from(await writer.toUint8Array()) @@ -456,6 +466,9 @@ export async function fromRecordsToStreamBuffer ( embeddings?: EmbeddingFunction, schema?: Schema ): Promise { + if (schema !== null && schema !== undefined) { + schema = sanitizeSchema(schema) + } const table = await convertToTable(data, embeddings, { schema }) const writer = RecordBatchStreamWriter.writeAll(table) return Buffer.from(await writer.toUint8Array()) @@ -474,6 +487,9 @@ export async function fromTableToBuffer ( embeddings?: EmbeddingFunction, schema?: Schema ): Promise { + if (schema !== null && schema !== undefined) { + schema = sanitizeSchema(schema) + } const tableWithEmbeddings = await 
applyEmbeddings(table, embeddings, schema) const writer = RecordBatchFileWriter.writeAll(tableWithEmbeddings) return Buffer.from(await writer.toUint8Array()) @@ -492,6 +508,9 @@ export async function fromTableToStreamBuffer ( embeddings?: EmbeddingFunction, schema?: Schema ): Promise { + if (schema !== null && schema !== undefined) { + schema = sanitizeSchema(schema) + } const tableWithEmbeddings = await applyEmbeddings(table, embeddings, schema) const writer = RecordBatchStreamWriter.writeAll(tableWithEmbeddings) return Buffer.from(await writer.toUint8Array()) @@ -528,5 +547,5 @@ function alignTable (table: ArrowTable, schema: Schema): ArrowTable { // Creates an empty Arrow Table export function createEmptyTable (schema: Schema): ArrowTable { - return new ArrowTable(schema) + return new ArrowTable(sanitizeSchema(schema)) } diff --git a/node/src/sanitize.ts b/node/src/sanitize.ts new file mode 100644 index 000000000..0788f41e8 --- /dev/null +++ b/node/src/sanitize.ts @@ -0,0 +1,501 @@ +// Copyright 2023 LanceDB Developers. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// The utilities in this file help sanitize data from the user's arrow +// library into the types expected by vectordb's arrow library. Node +// generally allows for mulitple versions of the same library (and sometimes +// even multiple copies of the same version) to be installed at the same +// time. 
However, arrow-js uses instanceof, which expects that the input +// comes from the exact same library instance. This is not always the case +// and so we must sanitize the input to ensure that it is compatible. + +import { + Field, + Utf8, + FixedSizeBinary, + FixedSizeList, + Schema, + List, + Struct, + Float, + Bool, + Date_, + Decimal, + DataType, + Dictionary, + Binary, + Float32, + Interval, + Map_, + Duration, + Union, + Time, + Timestamp, + Type, + Null, + Int, + type Precision, + type DateUnit, + Int8, + Int16, + Int32, + Int64, + Uint8, + Uint16, + Uint32, + Uint64, + Float16, + Float64, + DateDay, + DateMillisecond, + DenseUnion, + SparseUnion, + TimeNanosecond, + TimeMicrosecond, + TimeMillisecond, + TimeSecond, + TimestampNanosecond, + TimestampMicrosecond, + TimestampMillisecond, + TimestampSecond, + IntervalDayTime, + IntervalYearMonth, + DurationNanosecond, + DurationMicrosecond, + DurationMillisecond, + DurationSecond, +} from "apache-arrow"; +import type { IntBitWidth, TimeBitWidth } from "apache-arrow/type"; + +function sanitizeMetadata( + metadataLike?: unknown +): Map<string, string> | undefined { + if (metadataLike === undefined || metadataLike === null) { + return undefined; + } + if (!(metadataLike instanceof Map)) { + throw Error("Expected metadata, if present, to be a Map"); + } + for (const item of metadataLike) { // every key AND every value must be a string + if (typeof item[0] !== "string" || typeof item[1] !== "string") { + throw Error( + "Expected metadata, if present, to be a Map but it had non-string keys or values" + ); + } + } + return metadataLike as Map<string, string>; +} + +function sanitizeInt(typeLike: object) { + if ( + !("bitWidth" in typeLike) || + typeof typeLike.bitWidth !== "number" || + !("isSigned" in typeLike) || + typeof typeLike.isSigned !== "boolean" + ) { + throw Error( + "Expected an Int Type to have a `bitWidth` and `isSigned` property" + ); + } + return new Int(typeLike.isSigned, typeLike.bitWidth as IntBitWidth); +} + +function sanitizeFloat(typeLike: object) { + if
(!("precision" in typeLike) || typeof typeLike.precision !== "number") { + throw Error("Expected a Float Type to have a `precision` property"); + } + return new Float(typeLike.precision as Precision); +} + +function sanitizeDecimal(typeLike: object) { + if ( + !("scale" in typeLike) || + typeof typeLike.scale !== "number" || + !("precision" in typeLike) || + typeof typeLike.precision !== "number" || + !("bitWidth" in typeLike) || + typeof typeLike.bitWidth !== "number" + ) { + throw Error( + "Expected a Decimal Type to have `scale`, `precision`, and `bitWidth` properties" + ); + } + return new Decimal(typeLike.scale, typeLike.precision, typeLike.bitWidth); +} + +function sanitizeDate(typeLike: object) { + if (!("unit" in typeLike) || typeof typeLike.unit !== "number") { + throw Error("Expected a Date type to have a `unit` property"); + } + return new Date_(typeLike.unit as DateUnit); +} + +function sanitizeTime(typeLike: object) { + if ( + !("unit" in typeLike) || + typeof typeLike.unit !== "number" || + !("bitWidth" in typeLike) || + typeof typeLike.bitWidth !== "number" + ) { + throw Error( + "Expected a Time type to have `unit` and `bitWidth` properties" + ); + } + return new Time(typeLike.unit, typeLike.bitWidth as TimeBitWidth); +} + +function sanitizeTimestamp(typeLike: object) { + if (!("unit" in typeLike) || typeof typeLike.unit !== "number") { + throw Error("Expected a Timestamp type to have a `unit` property"); + } + let timezone = null; + if ("timezone" in typeLike && typeof typeLike.timezone === "string") { + timezone = typeLike.timezone; + } + return new Timestamp(typeLike.unit, timezone); +} + +function sanitizeTypedTimestamp( + typeLike: object, + Datatype: + | typeof TimestampNanosecond + | typeof TimestampMicrosecond + | typeof TimestampMillisecond + | typeof TimestampSecond +) { + let timezone = null; + if ("timezone" in typeLike && typeof typeLike.timezone === "string") { + timezone = typeLike.timezone; + } + return new Datatype(timezone); +} + 
+function sanitizeInterval(typeLike: object) { + if (!("unit" in typeLike) || typeof typeLike.unit !== "number") { + throw Error("Expected an Interval type to have a `unit` property"); + } + return new Interval(typeLike.unit); +} + +function sanitizeList(typeLike: object) { + if (!("children" in typeLike) || !Array.isArray(typeLike.children)) { + throw Error( + "Expected a List type to have an array-like `children` property" + ); + } + if (typeLike.children.length !== 1) { + throw Error("Expected a List type to have exactly one child"); + } + return new List(sanitizeField(typeLike.children[0])); +} + +function sanitizeStruct(typeLike: object) { + if (!("children" in typeLike) || !Array.isArray(typeLike.children)) { + throw Error( + "Expected a Struct type to have an array-like `children` property" + ); + } + return new Struct(typeLike.children.map((child) => sanitizeField(child))); +} + +function sanitizeUnion(typeLike: object) { + if ( + !("typeIds" in typeLike) || + !("mode" in typeLike) || + typeof typeLike.mode !== "number" + ) { + throw Error( + "Expected a Union type to have `typeIds` and `mode` properties" + ); + } + if (!("children" in typeLike) || !Array.isArray(typeLike.children)) { + throw Error( + "Expected a Union type to have an array-like `children` property" + ); + } + + return new Union( + typeLike.mode, + typeLike.typeIds as any, + typeLike.children.map((child) => sanitizeField(child)) + ); +} + +function sanitizeTypedUnion( + typeLike: object, + UnionType: typeof DenseUnion | typeof SparseUnion +) { + if (!("typeIds" in typeLike)) { + throw Error( + "Expected a DenseUnion/SparseUnion type to have a `typeIds` property" + ); + } + if (!("children" in typeLike) || !Array.isArray(typeLike.children)) { + throw Error( + "Expected a DenseUnion/SparseUnion type to have an array-like `children` property" + ); + } + + return new UnionType( + typeLike.typeIds as any, + typeLike.children.map((child) => sanitizeField(child)) + ); +} + +function 
sanitizeFixedSizeBinary(typeLike: object) { + if (!("byteWidth" in typeLike) || typeof typeLike.byteWidth !== "number") { + throw Error( + "Expected a FixedSizeBinary type to have a `byteWidth` property" + ); + } + return new FixedSizeBinary(typeLike.byteWidth); +} + +function sanitizeFixedSizeList(typeLike: object) { + if (!("listSize" in typeLike) || typeof typeLike.listSize !== "number") { + throw Error("Expected a FixedSizeList type to have a `listSize` property"); + } + if (!("children" in typeLike) || !Array.isArray(typeLike.children)) { + throw Error( + "Expected a FixedSizeList type to have an array-like `children` property" + ); + } + if (typeLike.children.length !== 1) { + throw Error("Expected a FixedSizeList type to have exactly one child"); + } + return new FixedSizeList( + typeLike.listSize, + sanitizeField(typeLike.children[0]) + ); +} + +function sanitizeMap(typeLike: object) { + if (!("children" in typeLike) || !Array.isArray(typeLike.children)) { + throw Error( + "Expected a Map type to have an array-like `children` property" + ); + } + if (!("keysSorted" in typeLike) || typeof typeLike.keysSorted !== "boolean") { + throw Error("Expected a Map type to have a `keysSorted` property"); + } + return new Map_( + typeLike.children.map((field) => sanitizeField(field)) as any, + typeLike.keysSorted + ); +} + +function sanitizeDuration(typeLike: object) { + if (!("unit" in typeLike) || typeof typeLike.unit !== "number") { + throw Error("Expected a Duration type to have a `unit` property"); + } + return new Duration(typeLike.unit); +} + +function sanitizeDictionary(typeLike: object) { + if (!("id" in typeLike) || typeof typeLike.id !== "number") { + throw Error("Expected a Dictionary type to have an `id` property"); + } + if (!("indices" in typeLike) || typeof typeLike.indices !== "object") { + throw Error("Expected a Dictionary type to have an `indices` property"); + } + if (!("dictionary" in typeLike) || typeof typeLike.dictionary !== "object") { + throw 
Error("Expected a Dictionary type to have an `dictionary` property"); + } + if (!("isOrdered" in typeLike) || typeof typeLike.isOrdered !== "boolean") { + throw Error("Expected a Dictionary type to have an `isOrdered` property"); + } + return new Dictionary( + sanitizeType(typeLike.dictionary), + sanitizeType(typeLike.indices) as any, + typeLike.id, + typeLike.isOrdered + ); +} + +function sanitizeType(typeLike: unknown): DataType { + if (typeof typeLike !== "object" || typeLike === null) { + throw Error("Expected a Type but object was null/undefined"); + } + if (!("typeId" in typeLike)) { // function and numeric typeIds are both accepted; the kind is validated just below + throw Error("Expected a Type to have a typeId function"); + } + let typeId: Type; + if (typeof typeLike.typeId === "function") { + typeId = (typeLike.typeId as () => unknown)() as Type; + } else if (typeof typeLike.typeId === "number") { + typeId = typeLike.typeId as Type; + } else { + throw Error("Type's typeId property was not a function or number"); + } + + switch (typeId) { + case Type.NONE: + throw Error("Received a Type with a typeId of NONE"); + case Type.Null: + return new Null(); + case Type.Int: + return sanitizeInt(typeLike); + case Type.Float: + return sanitizeFloat(typeLike); + case Type.Binary: + return new Binary(); + case Type.Utf8: + return new Utf8(); + case Type.Bool: + return new Bool(); + case Type.Decimal: + return sanitizeDecimal(typeLike); + case Type.Date: + return sanitizeDate(typeLike); + case Type.Time: + return sanitizeTime(typeLike); + case Type.Timestamp: + return sanitizeTimestamp(typeLike); + case Type.Interval: + return sanitizeInterval(typeLike); + case Type.List: + return sanitizeList(typeLike); + case Type.Struct: + return sanitizeStruct(typeLike); + case Type.Union: + return sanitizeUnion(typeLike); + case Type.FixedSizeBinary: + return sanitizeFixedSizeBinary(typeLike); + case Type.FixedSizeList: + return sanitizeFixedSizeList(typeLike); + case Type.Map: + return sanitizeMap(typeLike); + case
Type.Duration: + return sanitizeDuration(typeLike); + case Type.Dictionary: + return sanitizeDictionary(typeLike); + case Type.Int8: + return new Int8(); + case Type.Int16: + return new Int16(); + case Type.Int32: + return new Int32(); + case Type.Int64: + return new Int64(); + case Type.Uint8: + return new Uint8(); + case Type.Uint16: + return new Uint16(); + case Type.Uint32: + return new Uint32(); + case Type.Uint64: + return new Uint64(); + case Type.Float16: + return new Float16(); + case Type.Float32: + return new Float32(); + case Type.Float64: + return new Float64(); + case Type.DateMillisecond: + return new DateMillisecond(); + case Type.DateDay: + return new DateDay(); + case Type.TimeNanosecond: + return new TimeNanosecond(); + case Type.TimeMicrosecond: + return new TimeMicrosecond(); + case Type.TimeMillisecond: + return new TimeMillisecond(); + case Type.TimeSecond: + return new TimeSecond(); + case Type.TimestampNanosecond: + return sanitizeTypedTimestamp(typeLike, TimestampNanosecond); + case Type.TimestampMicrosecond: + return sanitizeTypedTimestamp(typeLike, TimestampMicrosecond); + case Type.TimestampMillisecond: + return sanitizeTypedTimestamp(typeLike, TimestampMillisecond); + case Type.TimestampSecond: + return sanitizeTypedTimestamp(typeLike, TimestampSecond); + case Type.DenseUnion: + return sanitizeTypedUnion(typeLike, DenseUnion); + case Type.SparseUnion: + return sanitizeTypedUnion(typeLike, SparseUnion); + case Type.IntervalDayTime: + return new IntervalDayTime(); + case Type.IntervalYearMonth: + return new IntervalYearMonth(); + case Type.DurationNanosecond: + return new DurationNanosecond(); + case Type.DurationMicrosecond: + return new DurationMicrosecond(); + case Type.DurationMillisecond: + return new DurationMillisecond(); + case Type.DurationSecond: + return new DurationSecond(); + } +} + +function sanitizeField(fieldLike: unknown): Field { + if (fieldLike instanceof Field) { + return fieldLike; + } + if (typeof fieldLike !== 
"object" || fieldLike === null) { + throw Error("Expected a Field but object was null/undefined"); + } + if ( + !("type" in fieldLike) || + !("name" in fieldLike) || + !("nullable" in fieldLike) + ) { + throw Error( + "The field passed in is missing a `type`/`name`/`nullable` property" + ); + } + const type = sanitizeType(fieldLike.type); + const name = fieldLike.name; + if (!(typeof name === "string")) { + throw Error("The field passed in had a non-string `name` property"); + } + const nullable = fieldLike.nullable; + if (!(typeof nullable === "boolean")) { + throw Error("The field passed in had a non-boolean `nullable` property"); + } + let metadata; + if ("metadata" in fieldLike) { + metadata = sanitizeMetadata(fieldLike.metadata); + } + return new Field(name, type, nullable, metadata); +} + +export function sanitizeSchema(schemaLike: unknown): Schema { + if (schemaLike instanceof Schema) { + return schemaLike; + } + if (typeof schemaLike !== "object" || schemaLike === null) { + throw Error("Expected a Schema but object was null/undefined"); + } + if (!("fields" in schemaLike)) { + throw Error( + "The schema passed in does not appear to be a schema (no 'fields' property)" + ); + } + let metadata; + if ("metadata" in schemaLike) { + metadata = sanitizeMetadata(schemaLike.metadata); + } + if (!Array.isArray(schemaLike.fields)) { + throw Error( + "The schema passed in had a 'fields' property but it was not an array" + ); + } + const sanitizedFields = schemaLike.fields.map((field) => + sanitizeField(field) + ); + return new Schema(sanitizedFields, metadata); +} diff --git a/node/src/test/arrow.test.ts b/node/src/test/arrow.test.ts index c356c9d8a..38005e6a7 100644 --- a/node/src/test/arrow.test.ts +++ b/node/src/test/arrow.test.ts @@ -34,8 +34,20 @@ import { List, DataType, Dictionary, - Int64 + Int64, + MetadataVersion } from 'apache-arrow' +import { + Dictionary as OldDictionary, + Field as OldField, + FixedSizeList as OldFixedSizeList, + Float32 as OldFloat32, + 
Int32 as OldInt32, + Struct as OldStruct, + Schema as OldSchema, + TimestampNanosecond as OldTimestampNanosecond, + Utf8 as OldUtf8 +} from 'apache-arrow-old' import { type EmbeddingFunction } from '../embedding/embedding_function' chaiUse(chaiAsPromised) @@ -318,3 +330,31 @@ describe('makeEmptyTable', function () { await checkTableCreation(async (_, __, schema) => makeEmptyTable(schema)) }) }) + +describe('when using two versions of arrow', function () { + it('can still import data', async function() { + const schema = new OldSchema([ + new OldField('id', new OldInt32()), + new OldField('vector', new OldFixedSizeList(1024, new OldField("item", new OldFloat32(), true))), + new OldField('struct', new OldStruct([ + new OldField('nested', new OldDictionary(new OldUtf8(), new OldInt32(), 1, true)), + new OldField('ts_with_tz', new OldTimestampNanosecond("some_tz")), + new OldField('ts_no_tz', new OldTimestampNanosecond(null)) + ])) + ]) as any + // We use arrow version 13 to emulate a "foreign arrow" and this version doesn't have metadataVersion + // In theory, this wouldn't matter. We don't rely on that property. 
However, it causes deepEqual to + // fail so we patch it back in + schema.metadataVersion = MetadataVersion.V5 + const table = makeArrowTable( + [], + { schema } + ) + + const buf = await fromTableToBuffer(table) + assert.isAbove(buf.byteLength, 0) + const actual = tableFromIPC(buf) + const actualSchema = actual.schema + assert.deepEqual(actualSchema, schema) + }) +}) diff --git a/nodejs/__test__/arrow.test.ts b/nodejs/__test__/arrow.test.ts index 66c8ccfb3..4a267c57e 100644 --- a/nodejs/__test__/arrow.test.ts +++ b/nodejs/__test__/arrow.test.ts @@ -38,9 +38,22 @@ import { Int64, Float, Precision, + MetadataVersion, } from "apache-arrow"; +import { + Dictionary as OldDictionary, + Field as OldField, + FixedSizeList as OldFixedSizeList, + Float32 as OldFloat32, + Int32 as OldInt32, + Struct as OldStruct, + Schema as OldSchema, + TimestampNanosecond as OldTimestampNanosecond, + Utf8 as OldUtf8, +} from "apache-arrow-old"; import { type EmbeddingFunction } from "../dist/embedding/embedding_function"; +// eslint-disable-next-line @typescript-eslint/no-explicit-any function sampleRecords(): Array> { return [ { @@ -57,8 +70,8 @@ function sampleRecords(): Array> { // Helper method to verify various ways to create a table async function checkTableCreation( tableCreationMethod: ( - records: any, - recordsReversed: any, + records: Record[], + recordsReversed: Record[], schema: Schema, ) => Promise, infersTypes: boolean, @@ -402,3 +415,56 @@ describe("makeEmptyTable", function () { ); }); }); + +describe("when using two versions of arrow", function () { + it("can still import data", async function () { + const schema = new OldSchema([ + new OldField("id", new OldInt32()), + new OldField( + "vector", + new OldFixedSizeList( + 1024, + new OldField("item", new OldFloat32(), true), + ), + ), + new OldField( + "struct", + new OldStruct([ + new OldField( + "nested", + new OldDictionary(new OldUtf8(), new OldInt32(), 1, true), + ), + new OldField("ts_with_tz", new 
OldTimestampNanosecond("some_tz")), + new OldField("ts_no_tz", new OldTimestampNanosecond(null)), + ]), + ), + // eslint-disable-next-line @typescript-eslint/no-explicit-any + ]) as any; + schema.metadataVersion = MetadataVersion.V5; + const table = makeArrowTable([], { schema }); + + const buf = await fromTableToBuffer(table); + expect(buf.byteLength).toBeGreaterThan(0); + const actual = tableFromIPC(buf); + const actualSchema = actual.schema; + expect(actualSchema.fields.length).toBe(3); + + // Deep equality gets hung up on some very minor unimportant differences + // between arrow version 13 and 15 which isn't really what we're testing for + // and so we do our own comparison that just checks name/type/nullability + function compareFields(lhs: Field, rhs: Field) { + expect(lhs.name).toEqual(rhs.name); + expect(lhs.nullable).toEqual(rhs.nullable); + expect(lhs.typeId).toEqual(rhs.typeId); + if ("children" in lhs.type && lhs.type.children !== null) { + const lhs_children = lhs.type.children as Field[]; + lhs_children.forEach((child: Field, idx) => { + compareFields(child, rhs.type.children[idx]); + }); + } + } + actualSchema.fields.forEach((field, idx) => { // compare the round-tripped schema with the foreign input schema, not with itself + compareFields(field, schema.fields[idx]); + }); + }); +}); diff --git a/nodejs/__test__/table.test.ts b/nodejs/__test__/table.test.ts index 74080fa5e..c06fccaee 100644 --- a/nodejs/__test__/table.test.ts +++ b/nodejs/__test__/table.test.ts @@ -103,12 +103,12 @@ describe("Test creating index", () => { // TODO: check index type.
// Search without specifying the column - let query_vector = data.toArray()[5].vec.toJSON(); - let rst = await tbl.query().nearestTo(query_vector).limit(2).toArrow(); + const query_vector = data.toArray()[5].vec.toJSON(); + const rst = await tbl.query().nearestTo(query_vector).limit(2).toArrow(); expect(rst.numRows).toBe(2); // Search with specifying the column - let rst2 = await tbl.search(query_vector, "vec").limit(2).toArrow(); + const rst2 = await tbl.search(query_vector, "vec").limit(2).toArrow(); expect(rst2.numRows).toBe(2); expect(rst.toString()).toEqual(rst2.toString()); }); diff --git a/nodejs/lancedb/arrow.ts b/nodejs/lancedb/arrow.ts index a86804ff1..21c84b53a 100644 --- a/nodejs/lancedb/arrow.ts +++ b/nodejs/lancedb/arrow.ts @@ -33,6 +33,7 @@ import { Float32, } from "apache-arrow"; import { type EmbeddingFunction } from "./embedding/embedding_function"; +import { sanitizeSchema } from "./sanitize"; /** Data type accepted by NodeJS SDK */ export type Data = Record[] | ArrowTable; @@ -208,13 +209,14 @@ export function makeArrowTable( } const opt = new MakeArrowTableOptions(options !== undefined ? options : {}); + if (opt.schema !== undefined && opt.schema !== null) { + opt.schema = sanitizeSchema(opt.schema); + } const columns: Record = {}; // TODO: sample dataset to find missing columns // Prefer the field ordering of the schema, if present const columnNames = - options?.schema != null - ? (options?.schema?.names as string[]) - : Object.keys(data[0]); + opt.schema != null ? 
(opt.schema.names as string[]) : Object.keys(data[0]); for (const colName of columnNames) { if ( data.length !== 0 && @@ -381,6 +383,10 @@ async function applyEmbeddings( return table; } + if (schema !== undefined && schema !== null) { + schema = sanitizeSchema(schema); + } + // Convert from ArrowTable to Record const colEntries = [...Array(table.numCols).keys()].map((_, idx) => { const name = table.schema.fields[idx].name; @@ -510,6 +516,9 @@ export async function fromRecordsToBuffer( embeddings?: EmbeddingFunction, schema?: Schema, ): Promise { + if (schema !== undefined && schema !== null) { + schema = sanitizeSchema(schema); + } const table = await convertToTable(data, embeddings, { schema }); const writer = RecordBatchFileWriter.writeAll(table); return Buffer.from(await writer.toUint8Array()); @@ -527,6 +536,9 @@ export async function fromRecordsToStreamBuffer( embeddings?: EmbeddingFunction, schema?: Schema, ): Promise { + if (schema !== undefined && schema !== null) { + schema = sanitizeSchema(schema); + } const table = await convertToTable(data, embeddings, { schema }); const writer = RecordBatchStreamWriter.writeAll(table); return Buffer.from(await writer.toUint8Array()); @@ -545,6 +557,9 @@ export async function fromTableToBuffer( embeddings?: EmbeddingFunction, schema?: Schema, ): Promise { + if (schema !== undefined && schema !== null) { + schema = sanitizeSchema(schema); + } const tableWithEmbeddings = await applyEmbeddings(table, embeddings, schema); const writer = RecordBatchFileWriter.writeAll(tableWithEmbeddings); return Buffer.from(await writer.toUint8Array()); @@ -555,6 +570,9 @@ export async function fromDataToBuffer( embeddings?: EmbeddingFunction, schema?: Schema, ): Promise { + if (schema !== undefined && schema !== null) { + schema = sanitizeSchema(schema); + } if (data instanceof ArrowTable) { return fromTableToBuffer(data, embeddings, schema); } else { @@ -612,5 +630,5 @@ function alignTable(table: ArrowTable, schema: Schema): ArrowTable { 
// Creates an empty Arrow Table export function createEmptyTable(schema: Schema): ArrowTable { - return new ArrowTable(schema); + return new ArrowTable(sanitizeSchema(schema)); } diff --git a/nodejs/lancedb/sanitize.ts b/nodejs/lancedb/sanitize.ts new file mode 100644 index 000000000..9a5face30 --- /dev/null +++ b/nodejs/lancedb/sanitize.ts @@ -0,0 +1,507 @@ +// Copyright 2023 LanceDB Developers. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// The utilities in this file help sanitize data from the user's arrow +// library into the types expected by vectordb's arrow library. Node +// generally allows for multiple versions of the same library (and sometimes +// even multiple copies of the same version) to be installed at the same +// time. However, arrow-js uses instanceof, which expects that the input +// comes from the exact same library instance. This is not always the case +// and so we must sanitize the input to ensure that it is compatible.
+
+import {
+  Field,
+  Utf8,
+  FixedSizeBinary,
+  FixedSizeList,
+  Schema,
+  List,
+  Struct,
+  Float,
+  Bool,
+  Date_,
+  Decimal,
+  DataType,
+  Dictionary,
+  Binary,
+  Float32,
+  Interval,
+  Map_,
+  Duration,
+  Union,
+  Time,
+  Timestamp,
+  Type,
+  Null,
+  Int,
+  type Precision,
+  type DateUnit,
+  Int8,
+  Int16,
+  Int32,
+  Int64,
+  Uint8,
+  Uint16,
+  Uint32,
+  Uint64,
+  Float16,
+  Float64,
+  DateDay,
+  DateMillisecond,
+  DenseUnion,
+  SparseUnion,
+  TimeNanosecond,
+  TimeMicrosecond,
+  TimeMillisecond,
+  TimeSecond,
+  TimestampNanosecond,
+  TimestampMicrosecond,
+  TimestampMillisecond,
+  TimestampSecond,
+  IntervalDayTime,
+  IntervalYearMonth,
+  DurationNanosecond,
+  DurationMicrosecond,
+  DurationMillisecond,
+  DurationSecond,
+} from "apache-arrow";
+import type { IntBitWidth, TKeys, TimeBitWidth } from "apache-arrow/type";
+
+/**
+ * Validates schema/field metadata supplied by the caller.
+ *
+ * @param metadataLike - candidate metadata; `undefined`/`null` mean "absent".
+ * @returns the same Map instance when every key and value is a string, or
+ *   `undefined` when no metadata was supplied.
+ * @throws Error when the value is not a Map, or when any key or any value
+ *   is not a string.
+ */
+function sanitizeMetadata(
+  metadataLike?: unknown,
+): Map<string, string> | undefined {
+  if (metadataLike === undefined || metadataLike === null) {
+    return undefined;
+  }
+  if (!(metadataLike instanceof Map)) {
+    throw Error("Expected metadata, if present, to be a Map");
+  }
+  for (const [key, value] of metadataLike) {
+    // Reject the entry if EITHER side is not a string. The previous
+    // condition had a misplaced parenthesis and let non-string values pass.
+    if (typeof key !== "string" || typeof value !== "string") {
+      throw Error(
+        "Expected metadata, if present, to be a Map but it had non-string keys or values",
+      );
+    }
+  }
+  return metadataLike as Map<string, string>;
+}
+
+// Rebuilds an Int type from duck-typed `bitWidth` (number) and `isSigned`
+// (boolean) properties; throws if either is missing or of the wrong type.
+function sanitizeInt(typeLike: object) {
+  if (
+    !("bitWidth" in typeLike) ||
+    typeof typeLike.bitWidth !== "number" ||
+    !("isSigned" in typeLike) ||
+    typeof typeLike.isSigned !== "boolean"
+  ) {
+    throw Error(
+      "Expected an Int Type to have a `bitWidth` and `isSigned` property",
+    );
+  }
+  return new Int(typeLike.isSigned, typeLike.bitWidth as IntBitWidth);
+}
+
+// Rebuilds a Float type from a duck-typed numeric `precision` property;
+// throws if it is missing or not a number.
+function sanitizeFloat(typeLike: object) {
+  if (!("precision" in typeLike) || typeof typeLike.precision !== "number") {
+    throw Error("Expected a Float Type to have a `precision` property");
+  }
+  return new Float(typeLike.precision as Precision);
+}
+
+// Rebuilds a Decimal type from duck-typed numeric `scale`, `precision`, and
+// `bitWidth` properties; throws if any is missing or not a number.
+function sanitizeDecimal(typeLike: object) {
+  if (
+    !("scale" in typeLike) ||
+    typeof typeLike.scale !== "number" ||
+    !("precision" in typeLike) ||
+    typeof typeLike.precision !== "number" ||
+    !("bitWidth" in typeLike) ||
+    typeof typeLike.bitWidth !== "number"
+  ) {
+    throw Error(
+      "Expected a Decimal Type to have `scale`, `precision`, and `bitWidth` properties",
+    );
+  }
+  return new Decimal(typeLike.scale, typeLike.precision, typeLike.bitWidth);
+}
+
+// Rebuilds a Date type from a duck-typed numeric `unit` property.
+function sanitizeDate(typeLike: object) {
+  if (!("unit" in typeLike) || typeof typeLike.unit !== "number") {
+    throw Error("Expected a Date type to have a `unit` property");
+  }
+  return new Date_(typeLike.unit as DateUnit);
+}
+
+// Rebuilds a Time type from duck-typed numeric `unit` and `bitWidth`
+// properties.
+function sanitizeTime(typeLike: object) {
+  if (
+    !("unit" in typeLike) ||
+    typeof typeLike.unit !== "number" ||
+    !("bitWidth" in typeLike) ||
+    typeof typeLike.bitWidth !== "number"
+  ) {
+    throw Error(
+      "Expected a Time type to have `unit` and `bitWidth` properties",
+    );
+  }
+  return new Time(typeLike.unit, typeLike.bitWidth as TimeBitWidth);
+}
+
+// Rebuilds a Timestamp type from a numeric `unit`; a string `timezone` is
+// carried over, anything else becomes `null`.
+function sanitizeTimestamp(typeLike: object) {
+  if (!("unit" in typeLike) || typeof typeLike.unit !== "number") {
+    throw Error("Expected a Timestamp type to have a `unit` property");
+  }
+  let timezone = null;
+  if ("timezone" in typeLike && typeof typeLike.timezone === "string") {
+    timezone = typeLike.timezone;
+  }
+  return new Timestamp(typeLike.unit, timezone);
+}
+
+// Rebuilds one of the unit-specific Timestamp subclasses (passed in as
+// `Datatype`), carrying over a string `timezone` if one is present.
+function sanitizeTypedTimestamp(
+  typeLike: object,
+  Datatype:
+    | typeof TimestampNanosecond
+    | typeof TimestampMicrosecond
+    | typeof TimestampMillisecond
+    | typeof TimestampSecond,
+) {
+  let timezone = null;
+  if ("timezone" in typeLike && typeof typeLike.timezone === "string") {
+    timezone = typeLike.timezone;
+  }
+  return new Datatype(timezone);
+}
+
+// Rebuilds an Interval type from a duck-typed numeric `unit` property.
+function sanitizeInterval(typeLike: object) {
+  if (!("unit" in typeLike) || typeof typeLike.unit !== "number") {
+    throw Error("Expected an Interval type to have a `unit` property");
+  }
+  return new Interval(typeLike.unit);
+}
+
+// Rebuilds a List type; requires `children` to be an array holding exactly
+// one field-like element, which is sanitized recursively.
+function sanitizeList(typeLike: object) {
+  if (!("children" in typeLike) || !Array.isArray(typeLike.children)) {
+    throw Error(
+      "Expected a List type to have an array-like `children` property",
+    );
+  }
+  if (typeLike.children.length !== 1) {
+    throw Error("Expected a List type to have exactly one child");
+  }
+  return new List(sanitizeField(typeLike.children[0]));
+}
+
+// Rebuilds a Struct type, sanitizing every entry of its `children` array.
+function sanitizeStruct(typeLike: object) {
+  if (!("children" in typeLike) || !Array.isArray(typeLike.children)) {
+    throw Error(
+      "Expected a Struct type to have an array-like `children` property",
+    );
+  }
+  return new Struct(typeLike.children.map((child) => sanitizeField(child)));
+}
+
+// Rebuilds a Union type from `mode` (number), `typeIds` (passed through
+// unvalidated), and a `children` array that is sanitized recursively.
+function sanitizeUnion(typeLike: object) {
+  if (
+    !("typeIds" in typeLike) ||
+    !("mode" in typeLike) ||
+    typeof typeLike.mode !== "number"
+  ) {
+    throw Error(
+      "Expected a Union type to have `typeIds` and `mode` properties",
+    );
+  }
+  if (!("children" in typeLike) || !Array.isArray(typeLike.children)) {
+    throw Error(
+      "Expected a Union type to have an array-like `children` property",
+    );
+  }
+
+  return new Union(
+    typeLike.mode,
+    // eslint-disable-next-line @typescript-eslint/no-explicit-any
+    typeLike.typeIds as any,
+    typeLike.children.map((child) => sanitizeField(child)),
+  );
+}
+
+// Rebuilds a DenseUnion or SparseUnion (constructor passed as `UnionType`)
+// from `typeIds` and a recursively sanitized `children` array.
+function sanitizeTypedUnion(
+  typeLike: object,
+  UnionType: typeof DenseUnion | typeof SparseUnion,
+) {
+  if (!("typeIds" in typeLike)) {
+    throw Error(
+      "Expected a DenseUnion/SparseUnion type to have a `typeIds` property",
+    );
+  }
+  if (!("children" in typeLike) || !Array.isArray(typeLike.children)) {
+    throw Error(
+      "Expected a DenseUnion/SparseUnion type to have an array-like `children` property",
+    );
+  }
+
+  return new UnionType(
+    typeLike.typeIds as Int32Array | number[],
+    typeLike.children.map((child) => sanitizeField(child)),
+  );
+}
+
+// Rebuilds a FixedSizeBinary type from a numeric `byteWidth` property.
+function sanitizeFixedSizeBinary(typeLike: object) {
+  if (!("byteWidth" in typeLike) || typeof typeLike.byteWidth !== "number") {
+    throw Error(
+      "Expected a FixedSizeBinary type to have a `byteWidth` property",
+    );
+  }
+  return new FixedSizeBinary(typeLike.byteWidth);
+}
+
+// Rebuilds a FixedSizeList type from a numeric `listSize` and a `children`
+// array holding exactly one field-like element.
+function sanitizeFixedSizeList(typeLike: object) {
+  if (!("listSize" in typeLike) || typeof typeLike.listSize !== "number") {
+    throw Error("Expected a FixedSizeList type to have a `listSize` property");
+  }
+  if (!("children" in typeLike) || !Array.isArray(typeLike.children)) {
+    throw Error(
+      "Expected a FixedSizeList type to have an array-like `children` property",
+    );
+  }
+  if (typeLike.children.length !== 1) {
+    throw Error("Expected a FixedSizeList type to have exactly one child");
+  }
+  return new FixedSizeList(
+    typeLike.listSize,
+    sanitizeField(typeLike.children[0]),
+  );
+}
+
+// Rebuilds a Map type from a recursively sanitized `children` array and a
+// boolean `keysSorted` property.
+function sanitizeMap(typeLike: object) {
+  if (!("children" in typeLike) || !Array.isArray(typeLike.children)) {
+    throw Error(
+      "Expected a Map type to have an array-like `children` property",
+    );
+  }
+  if (!("keysSorted" in typeLike) || typeof typeLike.keysSorted !== "boolean") {
+    throw Error("Expected a Map type to have a `keysSorted` property");
+  }
+
+  return new Map_(
+    // eslint-disable-next-line @typescript-eslint/no-explicit-any
+    typeLike.children.map((field) => sanitizeField(field)) as any,
+    typeLike.keysSorted,
+  );
+}
+
+// Rebuilds a Duration type from a duck-typed numeric `unit` property.
+function sanitizeDuration(typeLike: object) {
+  if (!("unit" in typeLike) || typeof typeLike.unit !== "number") {
+    throw Error("Expected a Duration type to have a `unit` property");
+  }
+  return new Duration(typeLike.unit);
+}
+
+// Rebuilds a Dictionary type from `id` (number), `isOrdered` (boolean), and
+// the `dictionary` / `indices` types, both sanitized recursively.
+function sanitizeDictionary(typeLike: object) {
+  if (!("id" in typeLike) || typeof typeLike.id !== "number") {
+    throw Error("Expected a Dictionary type to have an `id` property");
+  }
+  if (!("indices" in typeLike) || typeof typeLike.indices !== "object") {
+    throw Error("Expected a Dictionary type to have an `indices` property");
+  }
+  if (!("dictionary" in typeLike) || typeof typeLike.dictionary !== "object") {
+    throw Error("Expected a Dictionary type to have an `dictionary` property");
+  }
+  if (!("isOrdered" in typeLike) || typeof typeLike.isOrdered !== "boolean") {
+    throw Error("Expected a Dictionary type to have an `isOrdered` property");
+  }
+  return new Dictionary(
+    sanitizeType(typeLike.dictionary),
+    sanitizeType(typeLike.indices) as TKeys,
+    typeLike.id,
+    typeLike.isOrdered,
+  );
+}
+
+/**
+ * Converts a type-like object (possibly produced by a different copy of the
+ * arrow library) into a DataType built by this module's arrow import.
+ *
+ * The kind of type is read from `typeId`, which may be either a number or a
+ * function returning one; the matching constructor or `sanitize*` helper is
+ * then used to rebuild the type.
+ *
+ * @param typeLike - the candidate type object.
+ * @returns a freshly constructed DataType.
+ * @throws Error when `typeLike` is not an object, has no usable `typeId`,
+ *   has a `typeId` of NONE, or has an unrecognized `typeId`.
+ */
+// eslint-disable-next-line @typescript-eslint/no-explicit-any
+function sanitizeType(typeLike: unknown): DataType {
+  if (typeof typeLike !== "object" || typeLike === null) {
+    throw Error("Expected a Type but object was null/undefined");
+  }
+  // Only check for presence here. The previous guard also threw whenever
+  // `typeId` WAS a function, which made the function branch below dead code.
+  if (!("typeId" in typeLike)) {
+    throw Error("Expected a Type to have a typeId property");
+  }
+  let typeId: Type;
+  if (typeof typeLike.typeId === "function") {
+    // Invoke with `this` bound in case typeId is a method reading instance state.
+    typeId = (typeLike.typeId as () => unknown).call(typeLike) as Type;
+  } else if (typeof typeLike.typeId === "number") {
+    typeId = typeLike.typeId as Type;
+  } else {
+    throw Error("Type's typeId property was not a function or number");
+  }
+
+  switch (typeId) {
+    case Type.NONE:
+      throw Error("Received a Type with a typeId of NONE");
+    case Type.Null:
+      return new Null();
+    case Type.Int:
+      return sanitizeInt(typeLike);
+    case Type.Float:
+      return sanitizeFloat(typeLike);
+    case Type.Binary:
+      return new Binary();
+    case Type.Utf8:
+      return new Utf8();
+    case Type.Bool:
+      return new Bool();
+    case Type.Decimal:
+      return sanitizeDecimal(typeLike);
+    case Type.Date:
+      return sanitizeDate(typeLike);
+    case Type.Time:
+      return sanitizeTime(typeLike);
+    case Type.Timestamp:
+      return sanitizeTimestamp(typeLike);
+    case Type.Interval:
+      return sanitizeInterval(typeLike);
+    case Type.List:
+      return sanitizeList(typeLike);
+    case Type.Struct:
+      return sanitizeStruct(typeLike);
+    case Type.Union:
+      return sanitizeUnion(typeLike);
+    case Type.FixedSizeBinary:
+      return sanitizeFixedSizeBinary(typeLike);
+    case Type.FixedSizeList:
+      return sanitizeFixedSizeList(typeLike);
+    case Type.Map:
+      return sanitizeMap(typeLike);
+    case Type.Duration:
+      return sanitizeDuration(typeLike);
+    case Type.Dictionary:
+      return sanitizeDictionary(typeLike);
+    case Type.Int8:
+      return new Int8();
+    case Type.Int16:
+      return new Int16();
+    case Type.Int32:
+      return new Int32();
+    case Type.Int64:
+      return new Int64();
+    case Type.Uint8:
+      return new Uint8();
+    case Type.Uint16:
+      return new Uint16();
+    case Type.Uint32:
+      return new Uint32();
+    case Type.Uint64:
+      return new Uint64();
+    case Type.Float16:
+      return new Float16();
+    case Type.Float32:
+      return new Float32();
+    case Type.Float64:
+      return new Float64();
+    case Type.DateMillisecond:
+      return new DateMillisecond();
+    case Type.DateDay:
+      return new DateDay();
+    case Type.TimeNanosecond:
+      return new TimeNanosecond();
+    case Type.TimeMicrosecond:
+      return new TimeMicrosecond();
+    case Type.TimeMillisecond:
+      return new TimeMillisecond();
+    case Type.TimeSecond:
+      return new TimeSecond();
+    case Type.TimestampNanosecond:
+      return sanitizeTypedTimestamp(typeLike, TimestampNanosecond);
+    case Type.TimestampMicrosecond:
+      return sanitizeTypedTimestamp(typeLike, TimestampMicrosecond);
+    case Type.TimestampMillisecond:
+      return sanitizeTypedTimestamp(typeLike, TimestampMillisecond);
+    case Type.TimestampSecond:
+      return sanitizeTypedTimestamp(typeLike, TimestampSecond);
+    case Type.DenseUnion:
+      return sanitizeTypedUnion(typeLike, DenseUnion);
+    case Type.SparseUnion:
+      return sanitizeTypedUnion(typeLike, SparseUnion);
+    case Type.IntervalDayTime:
+      return new IntervalDayTime();
+    case Type.IntervalYearMonth:
+      return new IntervalYearMonth();
+    case Type.DurationNanosecond:
+      return new DurationNanosecond();
+    case Type.DurationMicrosecond:
+      return new DurationMicrosecond();
+    case Type.DurationMillisecond:
+      return new DurationMillisecond();
+    case Type.DurationSecond:
+      return new DurationSecond();
+    default:
+      throw new Error("Unrecoginized type id in schema: " + typeId);
+  }
+}
+
+/**
+ * Converts a field-like object into a Field built by this module's arrow
+ * import. Values that already pass `instanceof Field` are returned as-is.
+ *
+ * @param fieldLike - candidate with `type`, `name`, `nullable`, and
+ *   optionally `metadata` properties.
+ * @throws Error when a required property is missing or has the wrong type,
+ *   or when the nested type or metadata fails sanitization.
+ */
+function sanitizeField(fieldLike: unknown): Field {
+  if (fieldLike instanceof Field) {
+    return fieldLike;
+  }
+  if (typeof fieldLike !== "object" || fieldLike === null) {
+    throw Error("Expected a Field but object was null/undefined");
+  }
+  if (
+    !("type" in fieldLike) ||
+    !("name" in fieldLike) ||
+    !("nullable" in fieldLike)
+  ) {
+    throw Error(
+      "The field passed in is missing a `type`/`name`/`nullable` property",
+    );
+  }
+  const type = sanitizeType(fieldLike.type);
+  const name = fieldLike.name;
+  if (!(typeof name === "string")) {
+    throw Error("The field passed in had a non-string `name` property");
+  }
+  const nullable = fieldLike.nullable;
+  if (!(typeof nullable === "boolean")) {
+    throw Error("The field passed in had a non-boolean `nullable` property");
+  }
+  let metadata;
+  if ("metadata" in fieldLike) {
+    metadata = sanitizeMetadata(fieldLike.metadata);
+  }
+  return new Field(name, type, nullable, metadata);
+}
+
+/**
+ * Converts a schema-like object into a Schema built by this module's arrow
+ * import. Values that already pass `instanceof Schema` are returned as-is;
+ * otherwise every entry of `fields` is sanitized and a new Schema is built.
+ *
+ * @param schemaLike - candidate with a `fields` array and optional `metadata`.
+ * @throws Error when the value is not an object, has no `fields` array, or
+ *   when any field or the metadata fails sanitization.
+ */
+export function sanitizeSchema(schemaLike: unknown): Schema {
+  if (schemaLike instanceof Schema) {
+    return schemaLike;
+  }
+  if (typeof schemaLike !== "object" || schemaLike === null) {
+    throw Error("Expected a Schema but object was null/undefined");
+  }
+  if (!("fields" in schemaLike)) {
+    throw Error(
+      "The schema passed in does not appear to be a schema (no 'fields' property)",
+    );
+  }
+  let metadata;
+  if ("metadata" in schemaLike) {
+    metadata = sanitizeMetadata(schemaLike.metadata);
+  }
+  if (!Array.isArray(schemaLike.fields)) {
+    throw Error(
+      "The schema passed in had a 'fields' property but it was not an array",
+    );
+  }
+  const sanitizedFields = schemaLike.fields.map((field) =>
+    sanitizeField(field),
+  );
+  return new Schema(sanitizedFields, metadata);
+}
diff --git a/nodejs/package-lock.json b/nodejs/package-lock.json index 3fc7eb2ff..6f3938cd0 100644 --- a/nodejs/package-lock.json +++ b/nodejs/package-lock.json @@ -23,6 +23,7 @@ "@types/tmp": "^0.2.6", "@typescript-eslint/eslint-plugin": "^6.19.0", "@typescript-eslint/parser": "^6.19.0", +
"apache-arrow-old": "npm:apache-arrow@13.0.0", "eslint": "^8.57.0", "eslint-config-prettier": "^9.1.0", "jest": "^29.7.0", @@ -52,7 +53,6 @@ "version": "1.1.1", "resolved": "https://registry.npmjs.org/@75lb/deep-merge/-/deep-merge-1.1.1.tgz", "integrity": "sha512-xvgv6pkMGBA6GwdyJbNAnDmfAIR/DfWhrj9jgWh3TY7gRm3KO46x/GPjRg6wJ0nOepwqrNxFfojebh0Df4h4Tw==", - "peer": true, "dependencies": { "lodash.assignwith": "^4.2.0", "typical": "^7.1.1" @@ -65,7 +65,6 @@ "version": "7.1.1", "resolved": "https://registry.npmjs.org/typical/-/typical-7.1.1.tgz", "integrity": "sha512-T+tKVNs6Wu7IWiAce5BgMd7OZfNYUndHwc5MknN+UHOudi7sGZzuHdCadllRuqJ3fPtgFtIH9+lt9qRv6lmpfA==", - "peer": true, "engines": { "node": ">=12.17" } @@ -1437,8 +1436,7 @@ "node_modules/@types/command-line-usage": { "version": "5.0.2", "resolved": "https://registry.npmjs.org/@types/command-line-usage/-/command-line-usage-5.0.2.tgz", - "integrity": "sha512-n7RlEEJ+4x4TS7ZQddTmNSxP+zziEG0TNsMfiRIxcIVXt71ENJ9ojeXmGO3wPoTdn7pJcU2xc3CJYMktNT6DPg==", - "peer": true + "integrity": "sha512-n7RlEEJ+4x4TS7ZQddTmNSxP+zziEG0TNsMfiRIxcIVXt71ENJ9ojeXmGO3wPoTdn7pJcU2xc3CJYMktNT6DPg==" }, "node_modules/@types/graceful-fs": { "version": "4.1.9", @@ -1507,6 +1505,12 @@ "form-data": "^4.0.0" } }, + "node_modules/@types/pad-left": { + "version": "2.1.1", + "resolved": "https://registry.npmjs.org/@types/pad-left/-/pad-left-2.1.1.tgz", + "integrity": "sha512-Xd22WCRBydkGSApl5Bw0PhAOHKSVjNL3E3AwzKaps96IMraPqy5BvZIsBVK6JLwdybUzjHnuWVwpDd0JjTfHXA==", + "dev": true + }, "node_modules/@types/semver": { "version": "7.5.6", "resolved": "https://registry.npmjs.org/@types/semver/-/semver-7.5.6.tgz", @@ -1910,6 +1914,40 @@ "arrow2csv": "bin/arrow2csv.cjs" } }, + "node_modules/apache-arrow-old": { + "name": "apache-arrow", + "version": "13.0.0", + "resolved": "https://registry.npmjs.org/apache-arrow/-/apache-arrow-13.0.0.tgz", + "integrity": "sha512-3gvCX0GDawWz6KFNC28p65U+zGh/LZ6ZNKWNu74N6CQlKzxeoWHpi4CgEQsgRSEMuyrIIXi1Ea2syja7dwcHvw==", + "dev": 
true, + "dependencies": { + "@types/command-line-args": "5.2.0", + "@types/command-line-usage": "5.0.2", + "@types/node": "20.3.0", + "@types/pad-left": "2.1.1", + "command-line-args": "5.2.1", + "command-line-usage": "7.0.1", + "flatbuffers": "23.5.26", + "json-bignum": "^0.0.3", + "pad-left": "^2.1.0", + "tslib": "^2.5.3" + }, + "bin": { + "arrow2csv": "bin/arrow2csv.js" + } + }, + "node_modules/apache-arrow-old/node_modules/@types/command-line-args": { + "version": "5.2.0", + "resolved": "https://registry.npmjs.org/@types/command-line-args/-/command-line-args-5.2.0.tgz", + "integrity": "sha512-UuKzKpJJ/Ief6ufIaIzr3A/0XnluX7RvFgwkV89Yzvm77wCh1kFaFmqN8XEnGcN62EuHdedQjEMb8mYxFLGPyA==", + "dev": true + }, + "node_modules/apache-arrow-old/node_modules/@types/node": { + "version": "20.3.0", + "resolved": "https://registry.npmjs.org/@types/node/-/node-20.3.0.tgz", + "integrity": "sha512-cumHmIAf6On83X7yP+LrsEyUOf/YlociZelmpRYaGFydoaPdxdt80MAbu6vWerQT2COCp2nPvHdsbD7tHn/YlQ==", + "dev": true + }, "node_modules/argparse": { "version": "1.0.10", "resolved": "https://registry.npmjs.org/argparse/-/argparse-1.0.10.tgz", @@ -1923,7 +1961,6 @@ "version": "3.1.0", "resolved": "https://registry.npmjs.org/array-back/-/array-back-3.1.0.tgz", "integrity": "sha512-TkuxA4UCOvxuDK6NZYXCalszEzj+TLszyASooky+i742l9TqsOdYCMJJupxRic61hwquNtppB3hgcuq9SVSH1Q==", - "peer": true, "engines": { "node": ">=6" } @@ -2200,7 +2237,6 @@ "version": "0.4.0", "resolved": "https://registry.npmjs.org/chalk-template/-/chalk-template-0.4.0.tgz", "integrity": "sha512-/ghrgmhfY8RaSdeo43hNXxpoHAtxdbskUHjPpfqUWGttFgycUhYPGx3YZBCnUCvOa7Doivn1IZec3DEGFoMgLg==", - "peer": true, "dependencies": { "chalk": "^4.1.2" }, @@ -2297,7 +2333,6 @@ "version": "5.2.1", "resolved": "https://registry.npmjs.org/command-line-args/-/command-line-args-5.2.1.tgz", "integrity": "sha512-H4UfQhZyakIjC74I9d34fGYDwk3XpSr17QhEd0Q3I9Xq1CETHo4Hcuo87WyWHpAF1aSLjLRf5lD9ZGX2qStUvg==", - "peer": true, "dependencies": { "array-back": "^3.1.0", 
"find-replace": "^3.0.0", @@ -2312,7 +2347,6 @@ "version": "7.0.1", "resolved": "https://registry.npmjs.org/command-line-usage/-/command-line-usage-7.0.1.tgz", "integrity": "sha512-NCyznE//MuTjwi3y84QVUGEOT+P5oto1e1Pk/jFPVdPPfsG03qpTIl3yw6etR+v73d0lXsoojRpvbru2sqePxQ==", - "peer": true, "dependencies": { "array-back": "^6.2.2", "chalk-template": "^0.4.0", @@ -2327,7 +2361,6 @@ "version": "6.2.2", "resolved": "https://registry.npmjs.org/array-back/-/array-back-6.2.2.tgz", "integrity": "sha512-gUAZ7HPyb4SJczXAMUXMGAvI976JoK3qEx9v1FTmeYuJj0IBiaKttG1ydtGKdkfqWkIkouke7nG8ufGy77+Cvw==", - "peer": true, "engines": { "node": ">=12.17" } @@ -2336,7 +2369,6 @@ "version": "7.1.1", "resolved": "https://registry.npmjs.org/typical/-/typical-7.1.1.tgz", "integrity": "sha512-T+tKVNs6Wu7IWiAce5BgMd7OZfNYUndHwc5MknN+UHOudi7sGZzuHdCadllRuqJ3fPtgFtIH9+lt9qRv6lmpfA==", - "peer": true, "engines": { "node": ">=12.17" } @@ -2872,7 +2904,6 @@ "version": "3.0.0", "resolved": "https://registry.npmjs.org/find-replace/-/find-replace-3.0.0.tgz", "integrity": "sha512-6Tb2myMioCAgv5kfvP5/PkZZ/ntTpVK39fHY7WkWBgvbeE+VHd/tZuZ4mrC+bxh4cfOZeYKVPaJIZtZXV7GNCQ==", - "peer": true, "dependencies": { "array-back": "^3.0.1" }, @@ -2913,8 +2944,7 @@ "node_modules/flatbuffers": { "version": "23.5.26", "resolved": "https://registry.npmjs.org/flatbuffers/-/flatbuffers-23.5.26.tgz", - "integrity": "sha512-vE+SI9vrJDwi1oETtTIFldC/o9GsVKRM+s6EL0nQgxXlYV1Vc4Tk30hj4xGICftInKQKj1F3up2n8UbIVobISQ==", - "peer": true + "integrity": "sha512-vE+SI9vrJDwi1oETtTIFldC/o9GsVKRM+s6EL0nQgxXlYV1Vc4Tk30hj4xGICftInKQKj1F3up2n8UbIVobISQ==" }, "node_modules/flatted": { "version": "3.2.9", @@ -4158,7 +4188,6 @@ "version": "0.0.3", "resolved": "https://registry.npmjs.org/json-bignum/-/json-bignum-0.0.3.tgz", "integrity": "sha512-2WHyXj3OfHSgNyuzDbSxI1w2jgw5gkWSWhS7Qg4bWXx1nLk3jnbwfUeS0PSba3IzpTUWdHxBieELUzXRjQB2zg==", - "peer": true, "engines": { "node": ">=0.8" } @@ -4269,14 +4298,12 @@ "node_modules/lodash.assignwith": { "version": 
"4.2.0", "resolved": "https://registry.npmjs.org/lodash.assignwith/-/lodash.assignwith-4.2.0.tgz", - "integrity": "sha512-ZznplvbvtjK2gMvnQ1BR/zqPFZmS6jbK4p+6Up4xcRYA7yMIwxHCfbTcrYxXKzzqLsQ05eJPVznEW3tuwV7k1g==", - "peer": true + "integrity": "sha512-ZznplvbvtjK2gMvnQ1BR/zqPFZmS6jbK4p+6Up4xcRYA7yMIwxHCfbTcrYxXKzzqLsQ05eJPVznEW3tuwV7k1g==" }, "node_modules/lodash.camelcase": { "version": "4.3.0", "resolved": "https://registry.npmjs.org/lodash.camelcase/-/lodash.camelcase-4.3.0.tgz", - "integrity": "sha512-TwuEnCnxbc3rAvhf/LbG7tJUDzhqXyFnv3dtzLOPgCG/hODL7WFnsbwktkD7yUV0RrreP/l1PALq/YSg6VvjlA==", - "peer": true + "integrity": "sha512-TwuEnCnxbc3rAvhf/LbG7tJUDzhqXyFnv3dtzLOPgCG/hODL7WFnsbwktkD7yUV0RrreP/l1PALq/YSg6VvjlA==" }, "node_modules/lodash.memoize": { "version": "4.1.2", @@ -4588,6 +4615,18 @@ "node": ">=6" } }, + "node_modules/pad-left": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/pad-left/-/pad-left-2.1.0.tgz", + "integrity": "sha512-HJxs9K9AztdIQIAIa/OIazRAUW/L6B9hbQDxO4X07roW3eo9XqZc2ur9bn1StH9CnbbI9EgvejHQX7CBpCF1QA==", + "dev": true, + "dependencies": { + "repeat-string": "^1.5.4" + }, + "engines": { + "node": ">=0.10.0" + } + }, "node_modules/parent-module": { "version": "1.0.1", "resolved": "https://registry.npmjs.org/parent-module/-/parent-module-1.0.1.tgz", @@ -4865,6 +4904,15 @@ "integrity": "sha512-xWGDIW6x921xtzPkhiULtthJHoJvBbF3q26fzloPCK0hsvxtPVelvftw3zjbHWSkR2km9Z+4uxbDDK/6Zw9B8w==", "dev": true }, + "node_modules/repeat-string": { + "version": "1.6.1", + "resolved": "https://registry.npmjs.org/repeat-string/-/repeat-string-1.6.1.tgz", + "integrity": "sha512-PV0dzCYDNfRi1jCDbJzpW7jNNDRuCOG/jI5ctQcGKt/clZD+YcPS3yIlWuTJMmESC8aevCFmWJy5wjAFgNqN6w==", + "dev": true, + "engines": { + "node": ">=0.10" + } + }, "node_modules/require-directory": { "version": "2.1.1", "resolved": "https://registry.npmjs.org/require-directory/-/require-directory-2.1.1.tgz", @@ -5088,7 +5136,6 @@ "version": "3.0.1", "resolved": 
"https://registry.npmjs.org/stream-read-all/-/stream-read-all-3.0.1.tgz", "integrity": "sha512-EWZT9XOceBPlVJRrYcykW8jyRSZYbkb/0ZK36uLEmoWVO5gxBOnntNTseNzfREsqxqdfEGQrD8SXQ3QWbBmq8A==", - "peer": true, "engines": { "node": ">=10" } @@ -5180,7 +5227,6 @@ "version": "3.0.2", "resolved": "https://registry.npmjs.org/table-layout/-/table-layout-3.0.2.tgz", "integrity": "sha512-rpyNZYRw+/C+dYkcQ3Pr+rLxW4CfHpXjPDnG7lYhdRoUcZTUt+KEsX+94RGp/aVp/MQU35JCITv2T/beY4m+hw==", - "peer": true, "dependencies": { "@75lb/deep-merge": "^1.1.1", "array-back": "^6.2.2", @@ -5201,7 +5247,6 @@ "version": "6.2.2", "resolved": "https://registry.npmjs.org/array-back/-/array-back-6.2.2.tgz", "integrity": "sha512-gUAZ7HPyb4SJczXAMUXMGAvI976JoK3qEx9v1FTmeYuJj0IBiaKttG1ydtGKdkfqWkIkouke7nG8ufGy77+Cvw==", - "peer": true, "engines": { "node": ">=12.17" } @@ -5210,7 +5255,6 @@ "version": "7.1.1", "resolved": "https://registry.npmjs.org/typical/-/typical-7.1.1.tgz", "integrity": "sha512-T+tKVNs6Wu7IWiAce5BgMd7OZfNYUndHwc5MknN+UHOudi7sGZzuHdCadllRuqJ3fPtgFtIH9+lt9qRv6lmpfA==", - "peer": true, "engines": { "node": ">=12.17" } @@ -5335,8 +5379,7 @@ "node_modules/tslib": { "version": "2.6.2", "resolved": "https://registry.npmjs.org/tslib/-/tslib-2.6.2.tgz", - "integrity": "sha512-AEYxH93jGFPn/a2iVAwW87VuUIkR1FVUKB77NwMF7nBTDkDrrT/Hpt/IrCJ0QXhW27jTBDcf5ZY7w6RiqTMw2Q==", - "peer": true + "integrity": "sha512-AEYxH93jGFPn/a2iVAwW87VuUIkR1FVUKB77NwMF7nBTDkDrrT/Hpt/IrCJ0QXhW27jTBDcf5ZY7w6RiqTMw2Q==" }, "node_modules/type-check": { "version": "0.4.0", @@ -5672,7 +5715,6 @@ "version": "4.0.0", "resolved": "https://registry.npmjs.org/typical/-/typical-4.0.0.tgz", "integrity": "sha512-VAH4IvQ7BDFYglMd7BPRDfLgxZZX4O4TFcRDA6EN5X7erNJJq+McIEp8np9aVtxrCJ6qx4GTYVfOWNjcqwZgRw==", - "peer": true, "engines": { "node": ">=8" } @@ -5819,7 +5861,6 @@ "version": "5.1.0", "resolved": "https://registry.npmjs.org/wordwrapjs/-/wordwrapjs-5.1.0.tgz", "integrity": 
"sha512-JNjcULU2e4KJwUNv6CHgI46UvDGitb6dGryHajXTDiLgg1/RiGoPSDw4kZfYnwGtEXf2ZMeIewDQgFGzkCB2Sg==", - "peer": true, "engines": { "node": ">=12.17" } diff --git a/nodejs/package.json b/nodejs/package.json index 0d2cecb64..e023e3560 100644 --- a/nodejs/package.json +++ b/nodejs/package.json @@ -22,6 +22,7 @@ "@types/tmp": "^0.2.6", "@typescript-eslint/eslint-plugin": "^6.19.0", "@typescript-eslint/parser": "^6.19.0", + "apache-arrow-old": "npm:apache-arrow@13.0.0", "eslint": "^8.57.0", "eslint-config-prettier": "^9.1.0", "jest": "^29.7.0", @@ -55,7 +56,7 @@ "build": "npm run build:debug && tsc -b", "chkformat": "prettier . --check", "docs": "typedoc --plugin typedoc-plugin-markdown lancedb/index.ts", - "lint": "eslint lancedb", + "lint": "eslint lancedb && eslint __test__", "prepublishOnly": "napi prepublish -t npm", "test": "npm run build && jest --verbose", "universal": "napi universal",