Skip to content

Commit f2c2ee3

Browse files
authored
fix(core): preserve whitespace edge cases but collapse html formatting newlines (BLO-1065) (#2551)
* fix(core): preserve whitespace edge cases but collapse html formatting newlines (BLO-1065) - Added a targeted DOM preprocessing step before ProseMirror parsing. - Explicitly replicates CSS white-space: normal behavior internally to fix MS Word line breaks. - Retains preserveWhitespace: true in PM to satisfy AI diffing constraints from PR #2230. - Skips Notion HTML to preserve intentional hard breaks. * delete file
1 parent e519f9c commit f2c2ee3

File tree

6 files changed

+285
-9
lines changed

6 files changed

+285
-9
lines changed

packages/core/src/api/parsers/html/parseHTML.ts

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,13 +8,15 @@ import {
88
import { Block } from "../../../blocks/defaultBlocks.js";
99
import { nodeToBlock } from "../../nodeConversions/nodeToBlock.js";
1010
import { nestedListsToBlockNoteStructure } from "./util/nestedLists.js";
11+
import { preprocessHTMLWhitespace } from "./util/normalizeWhitespace.js";
1112

1213
export function HTMLToBlocks<
1314
BSchema extends BlockSchema,
1415
I extends InlineContentSchema,
1516
S extends StyleSchema,
1617
>(html: string, pmSchema: Schema): Block<BSchema, I, S>[] {
1718
const htmlNode = nestedListsToBlockNoteStructure(html);
19+
preprocessHTMLWhitespace(htmlNode);
1820
const parser = DOMParser.fromSchema(pmSchema);
1921

2022
// Other approach might be to use
Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
/** Numeric value of `NodeFilter.SHOW_COMMENT`. */
const SHOW_COMMENT_NODES = 128;

/**
 * Matches the text of Notion's clipboard marker comment
 * (`<!-- notionvc: UUID -->`), allowing leading whitespace.
 * Hoisted so the pattern is not re-created for every comment node visited.
 */
const NOTION_MARKER_PATTERN = /^\s*notionvc:/;

/**
 * Checks if the given HTML element contains markers indicating it was
 * generated by Notion. Notion uses `\n` in text nodes to represent hard
 * breaks, which is non-standard but intentional.
 *
 * Detected by the `<!-- notionvc: UUID -->` comment that Notion places
 * on the clipboard.
 *
 * @param element - Root of the subtree whose comment nodes are inspected.
 * @returns `true` as soon as one comment node matches the Notion marker,
 * `false` if the walker is exhausted without a match.
 */
function isNotionHTML(element: HTMLElement): boolean {
  const walker = element.ownerDocument.createTreeWalker(
    element,
    SHOW_COMMENT_NODES,
  );

  for (let node = walker.nextNode(); node; node = walker.nextNode()) {
    // A missing nodeValue is treated as an empty comment, which never matches.
    if (NOTION_MARKER_PATTERN.test(node.nodeValue ?? "")) {
      return true;
    }
  }

  return false;
}
25+
26+
/** Numeric value of `NodeFilter.SHOW_TEXT`. */
const SHOW_TEXT_NODES = 4;
/** Numeric value of `NodeFilter.FILTER_ACCEPT`. */
const FILTER_ACCEPT_NODE = 1;
/** Numeric value of `NodeFilter.FILTER_REJECT`. */
const FILTER_REJECT_NODE = 2;

/** Tag names (as reported by `tagName`) whose text content is left untouched. */
const WHITESPACE_PRESERVING_TAGS: ReadonlySet<string> = new Set([
  "PRE",
  "CODE",
]);

/** Only text nodes containing a line break are rewritten. */
const LINE_BREAK_PATTERN = /[\r\n]/;
/** A run of spaces, tabs, line breaks or form feeds. */
const WHITESPACE_RUN_PATTERN = /[ \t\r\n\f]+/g;

/**
 * Normalizes whitespace in text nodes by collapsing runs of whitespace
 * (including newlines) to single spaces, matching CSS white-space:normal
 * behavior.
 *
 * This is needed because ProseMirror's DOMParser, when `linebreakReplacement`
 * is set in the schema (as BlockNote does for hard breaks), converts `\n`
 * characters in text nodes to hard break nodes instead of collapsing them.
 * This causes HTML source line wrapping (e.g. from MS Word) to create
 * visible line breaks in the editor.
 *
 * Text nodes that contain no `\r` or `\n` are left exactly as they are, so
 * runs of plain spaces without a line break are preserved.
 *
 * Skips text inside `<pre>` and `<code>` elements, where whitespace should be
 * preserved. This includes the case where `element` itself is a `<pre>` or
 * `<code>`; ancestors above `element` are not inspected.
 *
 * @param element - Root of the subtree whose text nodes are normalized in place.
 */
function normalizeTextNodeWhitespace(element: HTMLElement) {
  const walker = element.ownerDocument.createTreeWalker(
    element,
    SHOW_TEXT_NODES,
    {
      acceptNode(node) {
        // Walk up from the text node to the root (inclusive) looking for a
        // whitespace-preserving ancestor.
        for (
          let ancestor = node.parentElement;
          ancestor;
          ancestor = ancestor.parentElement
        ) {
          if (WHITESPACE_PRESERVING_TAGS.has(ancestor.tagName)) {
            return FILTER_REJECT_NODE;
          }
          if (ancestor === element) {
            // Do not look beyond the subtree that was handed to us.
            break;
          }
        }
        return FILTER_ACCEPT_NODE;
      },
    },
  );

  // Gather the accepted text nodes first; the rewrite happens in a second pass.
  const textNodes: Text[] = [];
  for (let node = walker.nextNode(); node; node = walker.nextNode()) {
    textNodes.push(node as Text);
  }

  for (const textNode of textNodes) {
    const value = textNode.nodeValue;
    if (value && LINE_BREAK_PATTERN.test(value)) {
      textNode.nodeValue = value.replace(WHITESPACE_RUN_PATTERN, " ");
    }
  }
}
77+
78+
/**
79+
* Normalizes whitespace in HTML text nodes to match standard CSS
80+
* white-space:normal behavior. Skipped for Notion HTML which intentionally
81+
* uses `\n` for hard breaks.
82+
*/
83+
export function preprocessHTMLWhitespace(element: HTMLElement) {
84+
if (!isNotionHTML(element)) {
85+
normalizeTextNodeWhitespace(element);
86+
}
87+
}

tests/src/unit/core/clipboard/paste/pasteTestInstances.ts

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,17 @@
11
import { TextSelection } from "@tiptap/pm/state";
22

3-
import {
4-
TestBlockSchema,
5-
TestInlineContentSchema,
6-
TestStyleSchema,
7-
} from "../../testSchema.js";
83
import { PasteTestCase } from "../../../shared/clipboard/paste/pasteTestCase.js";
94
import {
105
testPasteHTML,
116
testPasteMarkdown,
127
} from "../../../shared/clipboard/paste/pasteTestExecutors.js";
138
import { getPosOfTextNode } from "../../../shared/testUtil.js";
149
import { TestInstance } from "../../../types.js";
10+
import {
11+
TestBlockSchema,
12+
TestInlineContentSchema,
13+
TestStyleSchema,
14+
} from "../../testSchema.js";
1515

1616
export const pasteTestInstancesHTML: TestInstance<
1717
PasteTestCase<TestBlockSchema, TestInlineContentSchema, TestStyleSchema>,

tests/src/unit/core/formatConversion/parse/__snapshots__/html/mixedTextTableCell.json

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -15,10 +15,7 @@
1515
{
1616
"styles": {},
1717
"text": "Table Cell
18-
Table Cell
19-
20-
Table Cell
21-
",
18+
Table Cell Table Cell",
2219
"type": "text",
2320
},
2421
],
Lines changed: 126 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,126 @@
1+
[
2+
{
3+
"children": [],
4+
"content": [
5+
{
6+
"styles": {
7+
"bold": true,
8+
"underline": true,
9+
},
10+
"text": "Que se passe-t-il si je réponds tard à un message chat et que l'utilisateur n'est plus en ligne :",
11+
"type": "text",
12+
},
13+
],
14+
"id": "1",
15+
"props": {
16+
"backgroundColor": "default",
17+
"textAlignment": "left",
18+
"textColor": "default",
19+
},
20+
"type": "paragraph",
21+
},
22+
{
23+
"children": [],
24+
"content": [
25+
{
26+
"styles": {},
27+
"text": "Lorsque vous envoyez un message à un utilisateur dans une conversation chat, et qu'il est encore en ligne, il recevra le message sur sa bulle chatbot.",
28+
"type": "text",
29+
},
30+
],
31+
"id": "2",
32+
"props": {
33+
"backgroundColor": "default",
34+
"textAlignment": "left",
35+
"textColor": "default",
36+
},
37+
"type": "paragraph",
38+
},
39+
{
40+
"children": [],
41+
"content": [
42+
{
43+
"styles": {},
44+
"text": "Cependant S'il n'est plus en ligne, votre message sera envoyé par email si :",
45+
"type": "text",
46+
},
47+
],
48+
"id": "3",
49+
"props": {
50+
"backgroundColor": "default",
51+
"textAlignment": "left",
52+
"textColor": "default",
53+
},
54+
"type": "paragraph",
55+
},
56+
{
57+
"children": [],
58+
"content": [
59+
{
60+
"styles": {},
61+
"text": ". l'utilisateur n'a pas lu votre réponse après 2 minutes",
62+
"type": "text",
63+
},
64+
],
65+
"id": "4",
66+
"props": {
67+
"backgroundColor": "default",
68+
"textAlignment": "left",
69+
"textColor": "default",
70+
},
71+
"type": "paragraph",
72+
},
73+
{
74+
"children": [],
75+
"content": [
76+
{
77+
"styles": {},
78+
"text": ". l'utilisateur n'est plus présent sur votre site web",
79+
"type": "text",
80+
},
81+
],
82+
"id": "5",
83+
"props": {
84+
"backgroundColor": "default",
85+
"textAlignment": "left",
86+
"textColor": "default",
87+
},
88+
"type": "paragraph",
89+
},
90+
{
91+
"children": [],
92+
"content": [
93+
{
94+
"styles": {},
95+
"text": " ",
96+
"type": "text",
97+
},
98+
],
99+
"id": "6",
100+
"props": {
101+
"backgroundColor": "default",
102+
"textAlignment": "left",
103+
"textColor": "default",
104+
},
105+
"type": "paragraph",
106+
},
107+
{
108+
"children": [],
109+
"content": [
110+
{
111+
"styles": {},
112+
"text": "Cela se fait automatiquement donc, lorsque nous répondons par chat, si l'utilisateur n'est plus là, Crisp renvoie le message alors par email et le canal de discussion se transforme en canal de discussion email.
113+
114+
Il est possible aussi de créer une conversation email directement le profil de l'utilisateur (bouton bleu en haut à droite de la conversation)",
115+
"type": "text",
116+
},
117+
],
118+
"id": "7",
119+
"props": {
120+
"backgroundColor": "default",
121+
"textAlignment": "left",
122+
"textColor": "default",
123+
},
124+
"type": "paragraph",
125+
},
126+
]

tests/src/unit/core/formatConversion/parse/parseTestInstances.ts

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1039,6 +1039,70 @@ console.log("Third Line")</code></pre>`,
10391039
},
10401040
executeTest: testParseHTML,
10411041
},
1042+
{
1043+
testCase: {
1044+
name: "msWordPaste",
1045+
content: `<html xmlns:o="urn:schemas-microsoft-com:office:office"
1046+
xmlns:w="urn:schemas-microsoft-com:office:word"
1047+
xmlns:m="http://schemas.microsoft.com/office/2004/12/omml"
1048+
xmlns="http://www.w3.org/TR/REC-html40">
1049+
1050+
<head>
1051+
<meta http-equiv=Content-Type content="text/html; charset=utf-8">
1052+
<meta name=ProgId content=Word.Document>
1053+
<meta name=Generator content="Microsoft Word 15">
1054+
<meta name=Originator content="Microsoft Word 15">
1055+
<style>
1056+
<!--
1057+
/* Style Definitions */
1058+
p.MsoNormal, li.MsoNormal, div.MsoNormal
1059+
\t{margin-top:0cm;
1060+
\tmargin-right:0cm;
1061+
\tmargin-bottom:8.0pt;
1062+
\tmargin-left:0cm;
1063+
\tline-height:107%;
1064+
\tfont-size:11.0pt;
1065+
\tfont-family:"Calibri",sans-serif;}
1066+
-->
1067+
</style>
1068+
</head>
1069+
1070+
<body lang=en-NL style='tab-interval:36.0pt;word-wrap:break-word'>
1071+
<!--StartFragment-->
1072+
1073+
<p class=MsoNormal><b><u><span lang=FR>Que se passe-t-il si je réponds tard à
1074+
un message chat et que l'utilisateur n'est plus en ligne&nbsp;:<o:p></o:p></span></u></b></p>
1075+
1076+
<p class=MsoNormal><span lang=FR>Lorsque vous envoyez un message à un
1077+
utilisateur dans une conversation chat, et qu'il est encore en ligne, il
1078+
recevra le message sur sa bulle chatbot.<o:p></o:p></span></p>
1079+
1080+
<p class=MsoNormal style='margin-bottom:0cm;line-height:normal'><span lang=FR>Cependant
1081+
S'il n'est plus en ligne, votre message sera envoyé par email si :<o:p></o:p></span></p>
1082+
1083+
<p class=MsoNormal style='margin-bottom:0cm;line-height:normal'><span lang=FR>.
1084+
l'utilisateur n'a pas lu votre réponse après 2 minutes<o:p></o:p></span></p>
1085+
1086+
<p class=MsoNormal style='margin-bottom:0cm;line-height:normal'><span lang=FR>.
1087+
l'utilisateur n'est plus présent sur votre site web<o:p></o:p></span></p>
1088+
1089+
<p class=MsoNormal><span lang=FR><o:p>&nbsp;</o:p></span></p>
1090+
1091+
<p class=MsoNormal><span lang=FR>Cela se fait automatiquement donc, lorsque
1092+
nous répondons par chat, si l'utilisateur n'est plus là, Crisp renvoie le
1093+
message alors par email et le canal de discussion se transforme en canal de
1094+
discussion email.<br>
1095+
<br>
1096+
Il est possible aussi de créer une conversation email directement le profil de
1097+
l'utilisateur (bouton bleu en haut à droite de la conversation)<o:p></o:p></span></p>
1098+
1099+
<!--EndFragment-->
1100+
</body>
1101+
1102+
</html>`,
1103+
},
1104+
executeTest: testParseHTML,
1105+
},
10421106
];
10431107

10441108
export const parseTestInstancesMarkdown: TestInstance<

0 commit comments

Comments
 (0)