diff --git a/src/paste-markdown-html.ts b/src/paste-markdown-html.ts
index b860883..7644fc7 100644
--- a/src/paste-markdown-html.ts
+++ b/src/paste-markdown-html.ts
@@ -49,6 +49,10 @@ function onPaste(event: ClipboardEvent) {
const parser = new DOMParser()
const doc = parser.parseFromString(textHTMLClean, 'text/html')
+ // Replace all line-break elements with line break characters that will appear in `textContent`
+ for (const br of doc.querySelectorAll('br')) br.replaceWith('\n')
+ doc.normalize()
+
const markdown = convertToMarkdown(plaintext, doc)
// If no changes made by transforming
@@ -163,11 +167,20 @@ function hasHTML(transfer: DataTransfer): boolean {
/** Collapse whitespace in HTML to normalize it with the plain-text representation. Also convert nbsp into regular space. */
function normalizeHtmlWhitespace(text: string): string {
- // Collapse regular whitespace characters but preserve non-breaking spaces without collapsing
- return text
- .replace(/[\t\n\r ]+/g, ' ')
- .trim()
- .replace(/[\u00A0\uC2A0]/g, ' ')
+ // The problem is that the HTML is not actually rendered onto the page, so the browser does not do the normal
+ // whitespace normalizing. This means textContent and innerText both just return the raw text of the node, ignoring
+ // `br` tags. So to be able to compare the parsed HTML with the plain-text variant, we need to make the whitespace
+ // in the HTML match what it would look like when rendered.
+
+ // We don't need to handle block breaks like p tags since we will work across those as separate nodes.
+ return (
+ text
+ // Collapse whitespace that would be collapsed if rendered
+ .replace(/[\t\n\r ]+/g, ' ')
+ // Replace non-breaking space (nbsp, U+00A0) with regular space; U+C2A0 is a Hangul syllable and must not be touched
+ .replace(/\u00A0/g, ' ')
+ .trim()
+ )
}
// Makes markdown link from a link element, avoiding special GitHub links