-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathmarkdown.ts
More file actions
91 lines (72 loc) · 2.83 KB
/
markdown.ts
File metadata and controls
91 lines (72 loc) · 2.83 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
import TurndownService from 'turndown';
import { tables } from 'turndown-plugin-gfm';
import type { ArticleMeta } from './types';
import { cleanArticle } from './article-dom';
import { collectImageUrls } from './images';
import { extractCodeLanguage } from './text-utils';
/**
 * Converts an article element to Markdown and collects its image URLs.
 *
 * Steps, in order:
 *  1. `cleanArticle(articleEl)` is called (any return value is ignored).
 *  2. `meta.url` is parsed as the base URL and passed to `collectImageUrls`.
 *  3. The element is rendered by a Turndown instance configured for fenced
 *     code blocks, ATX headings and `_` emphasis, with the GFM `tables` plugin
 *     and four custom rules (list paragraphs, table cells, fenced code blocks,
 *     iframes).
 *
 * @param articleEl - Root element of the article to convert.
 * @param meta - Article metadata; only `meta.url` is read here.
 * @returns An object with the generated `markdown` string and the `imageUrls`
 *   value returned by `collectImageUrls`.
 * @throws TypeError when `meta.url` cannot be parsed by the `URL` constructor.
 */
export function prepareMarkdown(articleEl: Element, meta: ArticleMeta) {
  cleanArticle(articleEl);
  const baseUrl = new URL(meta.url);
  const imageUrls = collectImageUrls(articleEl, baseUrl);

  const turndown = new TurndownService({
    codeBlockStyle: 'fenced',
    fence: '```',
    headingStyle: 'atx',
    emDelimiter: '_',
  });
  turndown.use(tables);

  // Paragraphs that sit directly inside a list item: collapse blank lines so
  // the item stays compact, and add a single newline only when more content
  // follows inside the same item.
  turndown.addRule('listParagraph', {
    filter: (node: HTMLElement) =>
      node.nodeName === 'P' && node.parentNode?.nodeName === 'LI',
    replacement: (content: string, node: HTMLElement) => {
      const trimmed = content.replace(/\n{2,}/g, '\n').trim();
      return node.nextSibling ? `${trimmed}\n` : trimmed;
    },
  });

  // Table cells: flatten the cell content onto one line, then wrap it in pipes.
  turndown.addRule('tableCell', {
    filter: ['th', 'td'],
    replacement: (content: string, node: HTMLElement) => {
      const normalized = normalizeTableCellContent(content);
      return formatTableCell(normalized, node);
    },
  });

  // <pre><code>…</code></pre>: emit the raw text content (not the converted
  // Markdown) inside a fence, tagged with the language derived from the
  // <code> element's class attribute.
  turndown.addRule('fencedCodeBlock', {
    filter: (node: HTMLElement) =>
      node.nodeName === 'PRE' &&
      node.firstChild instanceof HTMLElement &&
      node.firstChild.nodeName === 'CODE',
    replacement: (_content: string, node: HTMLElement) => {
      const codeNode = node.firstChild as HTMLElement;
      const code = codeNode.textContent ?? '';
      const className = codeNode.getAttribute('class') ?? '';
      const language = extractCodeLanguage(className);
      // A fixed ``` fence would be closed prematurely by code that itself
      // contains ``` (e.g. Markdown samples), so size the fence to the content.
      const fence = pickCodeFence(code);
      return `\n\n${fence}${language}\n${code}\n${fence}\n\n`;
    },
  });

  // Iframes cannot be represented in Markdown; replace them with a link to
  // their source, or drop them when there is no `src`.
  turndown.addRule('iframeToLink', {
    filter: (node: HTMLElement) => node.nodeName === 'IFRAME',
    replacement: (_content: string, node: HTMLElement) => {
      const src = node.getAttribute('src');
      if (!src) return '';
      return `\n\n[Embedded content](${src})\n\n`;
    },
  });

  const markdown = turndown.turndown(articleEl as HTMLElement);
  return { markdown, imageUrls };
}

/**
 * Returns a backtick fence that cannot be terminated by the given code.
 *
 * The fence is at least three backticks long and always one backtick longer
 * than the longest run of three or more backticks found anywhere in `code`.
 */
function pickCodeFence(code: string): string {
  let size = 3;
  for (const run of code.matchAll(/`{3,}/g)) {
    size = Math.max(size, run[0].length + 1);
  }
  return '`'.repeat(size);
}
/**
 * Flattens converted cell Markdown so it fits on a single GFM table row.
 *
 * - Splits on LF / CRLF, trims every line, and drops empty lines.
 * - Joins the remaining lines with a literal `<br>`.
 * - Escapes literal pipes as `\|` so cell text cannot introduce extra columns.
 *   A pipe preceded by an odd number of backslashes is already escaped and is
 *   left untouched.
 *
 * @param content - Markdown produced for the cell's children.
 * @returns The single-line, pipe-safe cell text (empty string for blank input).
 */
function normalizeTableCellContent(content: string): string {
  return content
    .split(/\r?\n/)
    .map((line) => line.trim())
    .filter((line) => line.length > 0)
    .join('<br>')
    .replace(/(\\*)\|/g, (match: string, slashes: string) =>
      // Even backslash count => the pipe itself is unescaped.
      slashes.length % 2 === 0 ? `${slashes}\\|` : match,
    );
}
/**
 * Wraps already-normalized cell text in table pipe delimiters.
 *
 * Every cell ends with ` |`. Only the cell that is the very first child node
 * of its parent also gets the opening `| `; all other cells (and cells with no
 * parent) start with a single space.
 *
 * @param content - Cell text to place between the delimiters.
 * @param node - The `th`/`td` node being rendered; only its parent's child
 *   list is inspected.
 * @returns The delimited cell fragment.
 */
function formatTableCell(content: string, node: HTMLElement) {
  const opensRow = node.parentNode?.childNodes[0] === node;
  const lead = opensRow ? '| ' : ' ';
  return lead + content + ' |';
}