diff --git a/README.md b/README.md index 960dcf9..f374783 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# Unfurl +# Unfurl A metadata scraper with support for oEmbed, Twitter Cards and Open Graph Protocol for Node.js (>=v8.0.0) @@ -26,10 +26,10 @@ npm install unfurl.js --- #### opts - `object` of: - `oembed?: boolean` - support retrieving oembed metadata -- `timeout? number` - req/res timeout in ms, it resets on redirect. 0 to disable (OS limit applies) +- `timeout?: number` - req/res timeout in ms, it resets on redirect. 0 to disable (OS limit applies) - `follow?: number` - maximum redirect count. 0 to not follow redirect -- `compress?: boolean` - support gzip/deflate content encoding -- `size?: number` - maximum response body size in bytes. 0 to disable +- `compress?: boolean` - support gzip/deflate content encoding +- `size?: number` - maximum response body size in bytes. 0 to disable - `headers?: Headers | Record<string, string> | Iterable<readonly [string, string]> | Iterable<Iterable<string>>` - map of request headers, overrides the defaults Default headers: @@ -55,6 +55,7 @@ type Metadata = { favicon?: string author?: string theme_color?: string + canonical_url?: string oEmbed?: { type: 'photo' | 'video' | 'link' | 'rich' version?: string diff --git a/src/index.ts b/src/index.ts index c3ac0e1..173e200 100644 --- a/src/index.ts +++ b/src/index.ts @@ -14,6 +14,7 @@ type ParserContext = { text: string; title?: string; tagName?: string; + canonical_url?: string; }; const defaultHeaders = { @@ -268,6 +269,13 @@ function getMetadata(url: string, opts: Opts) { ]); } + if (parserContext.canonical_url) { + metadata.push([ + "canonical_url", + new URL(parserContext.canonical_url, url).href, + ]); + } + resolve({ oembed, metadata }); }, @@ -315,6 +323,14 @@ function getMetadata(url: string, opts: Opts) { parserContext.favicon = attribs.href; } + if ( + tagname === "link" && + attribs.href && + attribs.rel === "canonical" + ) { + parserContext.canonical_url = attribs.href; + } + let pair: [string, string | string[]]; if 
(tagname === "meta") { @@ -448,7 +464,7 @@ function parse(url: string) { } } - // some fields map to the same name so once nicwe have one stick with it + // some fields map to the same name so once we have one stick with it target[item.name] || (target[item.name] = metaValue); } diff --git a/test/basic/basic-body.html b/test/basic/basic-body.html index cb69ae0..35963ea 100644 --- a/test/basic/basic-body.html +++ b/test/basic/basic-body.html @@ -8,7 +8,7 @@ ccc - + I'm a SVG diff --git a/test/basic/basic.html b/test/basic/basic.html index d133b76..cb91de5 100644 --- a/test/basic/basic.html +++ b/test/basic/basic.html @@ -8,6 +8,7 @@ + diff --git a/test/basic/test.ts b/test/basic/test.ts index d42a1a2..3940e89 100644 --- a/test/basic/test.ts +++ b/test/basic/test.ts @@ -13,7 +13,7 @@ test("should handle content which is escaped badly", async () => { expect(result.description).toEqual('"'); }); -test("should detect title, description and keywords", async () => { +test("should detect title, description, keywords and canonical URL", async () => { nock("http://localhost") .get("/html/basic") .replyWithFile(200, __dirname + "/basic.html", { @@ -29,12 +29,13 @@ test("should detect title, description and keywords", async () => { keywords: ["a", "b", "c"], title: "ccc", theme_color: "#ff00ff", + canonical_url: "https://ccc.website.test/basic/", }; expect(result).toEqual(expected); }); -test("should detect title, description and keywords even when they are in the body", async () => { +test("should detect title, description, keywords and canonical URL even when they are in the body", async () => { nock("http://localhost") .get("/html/basic-body") .replyWithFile(200, __dirname + "/basic-body.html", { @@ -48,6 +49,7 @@ test("should detect title, description and keywords even when they are in the bo description: "aaa", keywords: ["a", "b", "c"], title: "ccc", + canonical_url: "http://ccc.website.test/basic/", }; expect(result).toEqual(expected);