/
markdownUtils.ts
356 lines (325 loc) · 10.4 KB
/
markdownUtils.ts
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
import logger from '@docusaurus/logger';
import matter from 'gray-matter';
import {createSlugger, type Slugger, type SluggerOptions} from './slugger';
// Some utilities for parsing Markdown content. These things are only used on
// server-side when we infer metadata like `title` and `description` from the
// content. Most parsing is still done in MDX through the mdx-loader.
/**
 * Parses a custom ID from the end of a heading. The ID can contain any
 * characters except `{#` and `}`.
 *
 * Only a trailing `{#id}` is recognized: the last character of `heading` must
 * be `}`. Whitespace immediately before the `{#id}` part is dropped from the
 * returned text.
 *
 * @param heading e.g. `## Some heading {#some-heading}`
 * @returns The heading text without the ID part, and the ID itself (or
 * `undefined`, with the heading returned untouched, when no trailing ID is
 * found).
 */
export function parseMarkdownHeadingId(heading: string): {
  /**
   * The heading content sans the ID part, right-trimmed. e.g. `## Some heading`
   */
  text: string;
  /** The heading ID. e.g. `some-heading` */
  id: string | undefined;
} {
  const customHeadingIdRegex = /\s*\{#(?<id>(?:.(?!\{#|\}))*.)\}$/;
  const matches = customHeadingIdRegex.exec(heading);
  if (!matches) {
    return {text: heading, id: undefined};
  }
  return {
    // The pattern is anchored with `$`, so the match always extends to the end
    // of the heading. Cutting at the match index removes exactly the matched
    // ID; a string-based `replace` would instead remove the first occurrence
    // of the same text, which is wrong when the heading contains that text
    // earlier (e.g. `## a {#x} b {#x}`).
    text: heading.slice(0, matches.index),
    id: matches.groups!.id!,
  };
}
// TODO: Find a better way to do so, possibly by compiling the Markdown content,
// stripping out HTML tags and obtaining the first line.
/**
 * Builds a plain-text excerpt from the raw content of a Markdown file.
 *
 * The content is scanned line by line and the following are skipped:
 *
 * - a setext-style title (`Title` followed by `===`) at the very beginning
 * - blank lines
 * - import/export declarations, up to the next blank line
 * - fenced code blocks (a fence only closes a block when it is at least as
 *   long as the fence that opened it)
 *
 * Every other line goes through a series of clean-up substitutions that strip
 * most Markdown syntax (HTML tags, headings, emphasis, links, inline code,
 * etc.). The first line that is still non-empty after the clean-up is
 * returned.
 *
 * @param fileString The Markdown content to extract an excerpt from.
 * @returns The cleaned-up first contentful line, or `undefined` if there is
 * none.
 */
export function createExcerpt(fileString: string): string | undefined {
  // Clean-up substitutions, applied to a candidate line in this exact order.
  const cleanupRules: ReadonlyArray<readonly [RegExp, string]> = [
    // Remove HTML tags.
    [/<[^>]*>/g, ''],
    // Remove Title headers
    [/^#[^#]+#?/gm, ''],
    // Remove Markdown + ATX-style headers
    [/^#{1,6}\s*(?<text>[^#]*)\s*#{0,6}/gm, '$1'],
    // Remove emphasis.
    [/(?<opening>[*_]{1,3})(?<text>.*?)\1/g, '$2'],
    // Remove strikethroughs.
    [/~~(?<text>\S.*\S)~~/g, '$1'],
    // Remove images.
    [/!\[(?<alt>.*?)\][[(].*?[\])]/g, '$1'],
    // Remove footnotes.
    [/\[\^.+?\](?:: .*$)?/g, ''],
    // Remove inline links.
    [/\[(?<alt>.*?)\][[(].*?[\])]/g, '$1'],
    // Remove inline code.
    [/`(?<text>.+?)`/g, '$1'],
    // Remove blockquotes.
    [/^\s{0,3}>\s?/g, ''],
    // Remove admonition definition.
    [/:::.*/, ''],
    // Remove Emoji names within colons include preceding whitespace.
    [/\s?:(?:::|[^:\n])+:/g, ''],
    // Remove custom Markdown heading id.
    [/\{#*[\w-]+\}/, ''],
  ];

  const lines = fileString
    .trimStart()
    // Remove Markdown alternate title
    .replace(/^[^\r\n]*\r?\n[=]+/g, '')
    .split(/\r?\n/);

  let insideCodeBlock = false;
  let insideImportBlock = false;
  let openingFence = '';

  for (const rawLine of lines) {
    const trimmed = rawLine.trim();

    // A blank line is never contentful, and it also ends an import block.
    if (trimmed === '') {
      insideImportBlock = false;
      continue;
    }

    // Import/export declarations (and their continuation lines) are skipped,
    // unless we are inside a code block.
    if (
      !insideCodeBlock &&
      (insideImportBlock || /^(?:import|export)\s.*/.test(rawLine))
    ) {
      insideImportBlock = true;
      continue;
    }

    if (trimmed.startsWith('```')) {
      const fence = /^`+/.exec(trimmed)![0]!;
      if (!insideCodeBlock) {
        insideCodeBlock = true;
        openingFence = fence;
      } else if (fence.length >= openingFence.length) {
        // Inside a ````-fenced block, a shorter ``` is plain text rather than
        // a closing fence, so only a long enough fence ends the block.
        insideCodeBlock = false;
      }
      continue;
    }

    if (insideCodeBlock) {
      continue;
    }

    const cleaned = cleanupRules
      .reduce(
        (text, [pattern, replacement]) => text.replace(pattern, replacement),
        rawLine,
      )
      .trim();
    if (cleaned) {
      return cleaned;
    }
  }

  return undefined;
}
/**
 * Splits raw Markdown file content into its front matter and its body, using
 * gray-matter. Note that gray-matter also accepts TOML and other markup
 * languages for the front matter.
 *
 * @param markdownFileContent The full content of the Markdown file.
 * @returns The parsed front matter and the remaining content, trimmed.
 * @throws Throws when gray-matter throws. e.g.:
 * ```md
 * ---
 * foo: : bar
 * ---
 * ```
 */
export function parseFrontMatter(markdownFileContent: string): {
  /** Front matter as parsed by gray-matter. */
  frontMatter: {[key: string]: unknown};
  /** The remaining content, trimmed. */
  content: string;
} {
  const parsed = matter(markdownFileContent);
  const trimmedContent = parsed.content.trim();
  return {frontMatter: parsed.data, content: trimmedContent};
}
/**
 * Converts a Markdown title to plain text by unwrapping inline code spans,
 * e.g. `` `config.js` file `` becomes `config.js file`.
 */
function toTextContentTitle(contentTitle: string): string {
  const inlineCodePattern = /`(?<text>[^`]*)`/g;
  const unwrappedTitle = contentTitle.replace(inlineCodePattern, '$<text>');
  return unwrappedTitle;
}
/**
* Options accepted by `parseMarkdownContentTitle` and `parseMarkdownString`.
*/
type ParseMarkdownContentTitleOptions = {
/**
* If `true`, the matching title will be removed from the returned content.
* At least one empty line is guaranteed to be left between the content before
* and after the title, but you shouldn't make too many assumptions about
* what else is left. When omitted, `parseMarkdownContentTitle` treats it as
* `false`.
*/
removeContentTitle?: boolean;
};
/**
 * Looks for an h1 title (ATX `# Title` or setext `Title` + `===`) at the top
 * of raw Markdown content (front matter already removed), to be used as
 * metadata.
 *
 * Leading `import` blocks are ignored, and the title is only recognized when
 * it is the first thing after them.
 *
 * The title is converted to reasonable text on a limited basis, since it's
 * only a fallback for when `frontMatter.title` is not provided: inline code is
 * unwrapped (``# `config.js` `` => `config.js`), and a trailing custom heading
 * ID or closing `#` sequence is dropped.
 *
 * @param contentUntrimmed The Markdown content, without front matter.
 * @param options See `removeContentTitle`; defaults to keeping the title.
 */
export function parseMarkdownContentTitle(
  contentUntrimmed: string,
  options?: ParseMarkdownContentTitleOptions,
): {
  /** The content, optionally without the content title. */
  content: string;
  /** The title, trimmed and without the `#`. */
  contentTitle: string | undefined;
} {
  const shouldStripTitle = options?.removeContentTitle ?? false;
  const content = contentUntrimmed.trim();

  // We only need to detect import statements that will be parsed by MDX as
  // `import` nodes, as broken syntax can't render anyways. That means any block
  // that has `import` at the very beginning and surrounded by empty lines.
  const importBlocksPattern =
    /^(?:import\s(?:.|\r?\n(?!\r?\n))*(?:\r?\n){2,})*/;
  const body = content.replace(importBlocksPattern, '').trim();

  const atxMatch = /^#[ \t]+(?<title>[^ \t].*)(?:\r?\n|$)/.exec(body);
  const setextMatch = /^(?<title>.*)\r?\n=+(?:\r?\n|$)/.exec(body);

  // The ATX form takes precedence over the setext form.
  const matched = atxMatch ?? setextMatch;
  if (matched === null) {
    return {content, contentTitle: undefined};
  }

  const remainingContent = shouldStripTitle
    ? content.replace(matched[0]!, '')
    : content;

  // What may trail the title text depends on the heading flavor: a custom ID
  // or closing hashes for ATX, stray `=` characters for setext.
  const trailingMarkup = atxMatch ? /\s*(?:\{#*[\w-]+\}|#+)$/ : /\s*=+$/;
  const rawTitle = matched.groups!.title!.trim().replace(trailingMarkup, '');

  return {
    content: remainingContent.trim(),
    contentTitle: toTextContentTitle(rawTitle).trim(),
  };
}
/**
 * Runs the full parsing pipeline on a Markdown file: front matter, then
 * content title, then excerpt.
 *
 * Any error raised along the way is logged through the logger with a hint
 * about front matter syntax, and then rethrown.
 *
 * @param markdownFileContent The full content of the Markdown file.
 * @param options Forwarded to `parseMarkdownContentTitle`.
 * @throws Throws when `parseFrontMatter` throws, usually because of invalid
 * syntax.
 */
export function parseMarkdownString(
  markdownFileContent: string,
  options?: ParseMarkdownContentTitleOptions,
): {
  /** @see {@link parseFrontMatter} */
  frontMatter: {[key: string]: unknown};
  /** @see {@link parseMarkdownContentTitle} */
  contentTitle: string | undefined;
  /** @see {@link createExcerpt} */
  excerpt: string | undefined;
  /**
   * Content without front matter and (optionally) without title, depending on
   * the `removeContentTitle` option.
   */
  content: string;
} {
  try {
    const frontMatterResult = parseFrontMatter(markdownFileContent);
    const titleResult = parseMarkdownContentTitle(
      frontMatterResult.content,
      options,
    );
    // The excerpt is computed from the content as returned by the title
    // parsing step, i.e. possibly with the title already removed.
    return {
      frontMatter: frontMatterResult.frontMatter,
      content: titleResult.content,
      contentTitle: titleResult.contentTitle,
      excerpt: createExcerpt(titleResult.content),
    };
  } catch (err) {
    logger.error(`Error while parsing Markdown front matter.
This can happen if you use special characters in front matter values (try using double quotes around that value).`);
    throw err;
  }
}
/**
 * Replaces every inline Markdown link (`[text](target)`) in a line with its
 * link text.
 */
function unwrapMarkdownLinks(line: string): string {
  const inlineLinkPattern = /\[(?<alt>[^\]]+)\]\([^)]+\)/g;
  const keepLinkText = (_wholeLink: string, linkText: string): string =>
    linkText;
  return line.replace(inlineLinkPattern, keepLinkText);
}
/**
 * Appends a `{#slug}` custom ID to a heading line.
 *
 * The slug is generated by the given slugger from the heading text, with
 * Markdown links unwrapped to their text. The returned line keeps the leading
 * `#` characters and the heading text (right-trimmed) as they are.
 *
 * @param line The heading line, without any existing custom ID.
 * @param slugger The slugger used to generate the ID.
 * @param maintainCase Forwarded to the slugger as the `maintainCase` option.
 */
function addHeadingId(
  line: string,
  slugger: Slugger,
  maintainCase: boolean,
): string {
  // This pattern always matches, possibly with an empty string when the line
  // has no leading `#`.
  const hashes = /^#*/.exec(line)![0]!;
  const title = line.slice(hashes.length).trimEnd();
  const slugSource = unwrapMarkdownLinks(title).trim();
  const slug = slugger.slug(slugSource, {maintainCase});
  return `${hashes}${title} {#${slug}}`;
}
/**
* Options for `writeMarkdownHeadingId`: the slugger options, plus a flag
* controlling what happens to headings that already have an ID.
*/
export type WriteHeadingIDOptions = SluggerOptions & {
/**
* Overwrite existing heading IDs. `writeMarkdownHeadingId` treats an omitted
* value as `false`.
*/
overwrite?: boolean;
};
/**
 * Takes Markdown content and returns new content in which headings carry an
 * explicit `{#id}`.
 *
 * - Only lines starting with `##` are processed; h1 headings are left alone,
 *   as no anchor links are created for those.
 * - Lines inside code blocks are left alone. A line starting with three
 *   backticks toggles the "inside a code block" state.
 * - Existing IDs are respected unless `overwrite` is `true`. When they are
 *   respected, they are registered in the slugger up front so that generated
 *   IDs never collide with them.
 *
 * @param content The Markdown content; lines are split and joined on `\n`.
 * @param options `maintainCase` and `overwrite` both default to `false`.
 */
export function writeMarkdownHeadingId(
  content: string,
  options: WriteHeadingIDOptions = {maintainCase: false, overwrite: false},
): string {
  const {maintainCase = false, overwrite = false} = options;
  const lines = content.split('\n');
  const slugger = createSlugger();

  // If we can't overwrite existing slugs, make sure other headings don't
  // generate colliding slugs by first marking these slugs as occupied
  if (!overwrite) {
    for (const line of lines) {
      const {id: existingId} = parseMarkdownHeadingId(line);
      if (existingId) {
        slugger.slug(existingId);
      }
    }
  }

  const rewrittenLines: string[] = [];
  let insideCodeBlock = false;
  for (const line of lines) {
    if (line.startsWith('```')) {
      insideCodeBlock = !insideCodeBlock;
      rewrittenLines.push(line);
      continue;
    }
    // Ignore h1 headings, as we don't create anchor links for those
    if (insideCodeBlock || !line.startsWith('##')) {
      rewrittenLines.push(line);
      continue;
    }
    const {text, id} = parseMarkdownHeadingId(line);
    // Do not process if id is already there
    if (id && !overwrite) {
      rewrittenLines.push(line);
      continue;
    }
    rewrittenLines.push(addHeadingId(text, slugger, maintainCase));
  }
  return rewrittenLines.join('\n');
}