Skip to content

Commit

Permalink
feat: adding canonical_url information (#106)
Browse files Browse the repository at this point in the history
  • Loading branch information
adam-ismael committed Mar 28, 2023
1 parent 608b61d commit 47d2baa
Show file tree
Hide file tree
Showing 5 changed files with 28 additions and 8 deletions.
9 changes: 5 additions & 4 deletions README.md
@@ -1,4 +1,4 @@
# Unfurl
# Unfurl

A metadata scraper with support for oEmbed, Twitter Cards and Open Graph Protocol for Node.js (>=v8.0.0)

Expand Down Expand Up @@ -26,10 +26,10 @@ npm install unfurl.js
---
#### opts - `object` of:
- `oembed?: boolean` - support retrieving oembed metadata
- `timeout?: number` - req/res timeout in ms, it resets on redirect. 0 to disable (OS limit applies)
- `timeout?: number` - req/res timeout in ms, it resets on redirect. 0 to disable (OS limit applies)
- `follow?: number` - maximum redirect count. 0 to not follow redirect
- `compress?: boolean` - support gzip/deflate content encoding
- `size?: number` - maximum response body size in bytes. 0 to disable
- `compress?: boolean` - support gzip/deflate content encoding
- `size?: number` - maximum response body size in bytes. 0 to disable
- `headers?: Headers | Record<string, string> | Iterable<readonly [string, string]> | Iterable<Iterable<string>>` - map of request headers, overrides the defaults

Default headers:
Expand All @@ -55,6 +55,7 @@ type Metadata = {
favicon?: string
author?: string
theme_color?: string
canonical_url?: string
oEmbed?: {
type: 'photo' | 'video' | 'link' | 'rich'
version?: string
Expand Down
18 changes: 17 additions & 1 deletion src/index.ts
Expand Up @@ -14,6 +14,7 @@ type ParserContext = {
text: string;
title?: string;
tagName?: string;
canonical_url?: string;
};

const defaultHeaders = {
Expand Down Expand Up @@ -268,6 +269,13 @@ function getMetadata(url: string, opts: Opts) {
]);
}

if (parserContext.canonical_url) {
metadata.push([
"canonical_url",
new URL(parserContext.canonical_url, url).href,
]);
}

resolve({ oembed, metadata });
},

Expand Down Expand Up @@ -315,6 +323,14 @@ function getMetadata(url: string, opts: Opts) {
parserContext.favicon = attribs.href;
}

if (
tagname === "link" &&
attribs.href &&
attribs.rel === "canonical"
) {
parserContext.canonical_url = attribs.href;
}

let pair: [string, string | string[]];

if (tagname === "meta") {
Expand Down Expand Up @@ -448,7 +464,7 @@ function parse(url: string) {
}
}

// some fields map to the same name so once nicwe have one stick with it
// some fields map to the same name so once we have one stick with it
target[item.name] || (target[item.name] = metaValue);
}

Expand Down
2 changes: 1 addition & 1 deletion test/basic/basic-body.html
Expand Up @@ -8,7 +8,7 @@
<title>ccc</title>
<meta name="description" content="aaa" />
<meta name="keywords" content="a, b, c" />

<link rel="canonical" href="//ccc.website.test/basic/" />
<svg viewBox="0 0 20 10" xmlns="http://www.w3.org/2000/svg">
<title>I'm a SVG</title>

Expand Down
1 change: 1 addition & 0 deletions test/basic/basic.html
Expand Up @@ -8,6 +8,7 @@
<meta name="description" content="aaa" />
<meta name="keywords" content="a, b, c" />
<meta name="theme-color" content="#ff00ff" />
<link rel="canonical" href="https://ccc.website.test/basic/" />
</head>
<body></body>
</html>
Expand Down
6 changes: 4 additions & 2 deletions test/basic/test.ts
Expand Up @@ -13,7 +13,7 @@ test("should handle content which is escaped badly", async () => {
expect(result.description).toEqual('"');
});

test("should detect title, description and keywords", async () => {
test("should detect title, description, keywords and canonical URL", async () => {
nock("http://localhost")
.get("/html/basic")
.replyWithFile(200, __dirname + "/basic.html", {
Expand All @@ -29,12 +29,13 @@ test("should detect title, description and keywords", async () => {
keywords: ["a", "b", "c"],
title: "ccc",
theme_color: "#ff00ff",
canonical_url: "https://ccc.website.test/basic/",
};

expect(result).toEqual(expected);
});

test("should detect title, description and keywords even when they are in the body", async () => {
test("should detect title, description, keywords and canonical URL even when they are in the body", async () => {
nock("http://localhost")
.get("/html/basic-body")
.replyWithFile(200, __dirname + "/basic-body.html", {
Expand All @@ -48,6 +49,7 @@ test("should detect title, description and keywords even when they are in the bo
description: "aaa",
keywords: ["a", "b", "c"],
title: "ccc",
canonical_url: "http://ccc.website.test/basic/",
};

expect(result).toEqual(expected);
Expand Down

0 comments on commit 47d2baa

Please sign in to comment.