From 0ffe10cbef3a2bd0babc99f0a1491c582fc54082 Mon Sep 17 00:00:00 2001
From: Yilei Yang <hi@mangoumbrella.com>
Date: Sun, 2 Feb 2025 00:23:42 -0800
Subject: [PATCH] Fix a bug where markdown headers in later lines weren't
 stripped.

---
 markdown/extractor.go      | 2 +-
 markdown/extractor_test.go | 1 +
 markdown/tag.go            | 6 ++++--
 parser_test.go             | 9 +++++----
 4 files changed, 11 insertions(+), 7 deletions(-)
diff --git a/markdown/extractor.go b/markdown/extractor.go
index bff5965..89cf913 100644
--- a/markdown/extractor.go
+++ b/markdown/extractor.go
@@ -33,7 +33,7 @@ func (e *Extractor) PlainText(input string) (*string, error) {
 		for _, fullTag := range listFullTag {
 			var plain = fullTag
 			if tag.StartRegex != nil {
-				plain = tag.StartRegex.ReplaceAll(plain, nil)
+				plain = tag.StartRegex.ReplaceAll(plain, tag.StartReplacement)
 			}
 			if tag.EndRegex != nil {
 				plain = tag.EndRegex.ReplaceAll(plain, nil)
diff --git a/markdown/extractor_test.go b/markdown/extractor_test.go
index 2f9d670..f4adabe 100644
--- a/markdown/extractor_test.go
+++ b/markdown/extractor_test.go
@@ -17,6 +17,7 @@ func TestExtract(t *testing.T) {
 		{"~~strikethrough~~", "strikethrough"},
 		{"# H1 \n*italic* **bold** `code` `not code [link](https://example.com) ![image](https://image.com/image.png) ~~strikethrough~~", "H1 \nitalic bold code `not code link image strikethrough"},
 		{"# H1 \n new line", "H1 \n new line"},
+		{"# H1 \n new line \n## H2 \n new line", "H1 \n new line \nH2 \n new line"},
 		{"*italic*", "italic"},
 		{"**bold**", "bold"},
 		{"`code`", "code"},
diff --git a/markdown/tag.go b/markdown/tag.go
index 37b707b..0f0695f 100644
--- a/markdown/tag.go
+++ b/markdown/tag.go
@@ -6,14 +6,16 @@ type Tag struct {
 	Name       string
 	FullRegex  *regexp.Regexp
 	StartRegex *regexp.Regexp
+	StartReplacement []byte
 	EndRegex   *regexp.Regexp
 }
 
 var tags = []Tag{
 	{
 		Name:       "Header",
-		FullRegex:  regexp.MustCompile(`^#{1,6}\s+(.*)`),
-		StartRegex: regexp.MustCompile(`^#{1,6}\s+`),
+		FullRegex:  regexp.MustCompile(`(^|\n)#{1,6}\s+(.*)`),
+		StartRegex: regexp.MustCompile(`(^|\n)#{1,6}\s+`),
+		StartReplacement: []byte("$1"),
 		EndRegex:   nil,
 	},
 	{
diff --git a/parser_test.go b/parser_test.go
index fcbaec0..1464d4e 100644
--- a/parser_test.go
+++ b/parser_test.go
@@ -1,10 +1,11 @@
 package plaintext
 
 import (
+	"testing"
+
 	"github.com/huantt/plaintext-extractor/html"
 	"github.com/huantt/plaintext-extractor/markdown"
 	"github.com/stretchr/testify/assert"
-	"testing"
 )
 
 func TestParseHtml(t *testing.T) {
@@ -12,7 +13,7 @@ func TestParseHtml(t *testing.T) {
 		input    string
 		expected string
 	}{
-		{`<div>This is a <a href="https://example.com">link</a></div>`, "This is a link"},
+		{`<div>This is a <a href="https://example.com">link</a></div>`, "This is a link\n"},
 	}
 	for _, test := range tests {
 		extractor := NewHtmlExtractor()
@@ -44,8 +45,8 @@ func TestMultipleExtractors(t *testing.T) {
 		input    string
 		expected string
 	}{
-		{"<div> html </div> *markdown*", "html markdown"},
-		{"<div> *markdown in html* </div>", "markdown in html"},
+		{"<div> html </div> *markdown*", "html\nmarkdown"},
+		{"<div> *markdown in html* </div>", "markdown in html\n"},
 	}
 	for _, test := range tests {
 		markdownExtractor := markdown.NewExtractor()