huantt · yilei · Feb 2, 2025
diff --git a/markdown/extractor.go b/markdown/extractor.go
@@ -33,7 +33,7 @@ func (e *Extractor) PlainText(input string) (*string, error) {
 		for _, fullTag := range listFullTag {
 			var plain = fullTag
 			if tag.StartRegex != nil {
-				plain = tag.StartRegex.ReplaceAll(plain, nil)
+				plain = tag.StartRegex.ReplaceAll(plain, tag.StartReplacement)
 			}
 			if tag.EndRegex != nil {
 				plain = tag.EndRegex.ReplaceAll(plain, nil)

diff --git a/markdown/extractor_test.go b/markdown/extractor_test.go
@@ -17,6 +17,7 @@ func TestExtract(t *testing.T) {
 		{"~~strikethrough~~", "strikethrough"},
 		{"# H1 \n*italic* **bold** `code` `not code [link](https://example.com) ![image](https://image.com/image.png) ~~strikethrough~~", "H1 \nitalic bold code `not code link image strikethrough"},
 		{"# H1 \n new line", "H1 \n new line"},
+		{"# H1 \n new line \n## H2 \n new line", "H1 \n new line \nH2 \n new line"},
 		{"*italic*", "italic"},
 		{"**bold**", "bold"},
 		{"`code`", "code"},

diff --git a/markdown/tag.go b/markdown/tag.go
@@ -6,14 +6,16 @@ type Tag struct {
 	Name       string
 	FullRegex  *regexp.Regexp
 	StartRegex *regexp.Regexp
+	StartReplacement []byte
 	EndRegex   *regexp.Regexp
 }
 
 var tags = []Tag{
 	{
 		Name:       "Header",
-		FullRegex:  regexp.MustCompile(`^#{1,6}\s+(.*)`),
-		StartRegex: regexp.MustCompile(`^#{1,6}\s+`),
+		FullRegex:  regexp.MustCompile(`(^|\n)#{1,6}\s+(.*)`),
+		StartRegex: regexp.MustCompile(`(^|\n)#{1,6}\s+`),
+		StartReplacement: []byte("$1"),
 		EndRegex:   nil,
 	},
 	{

diff --git a/parser_test.go b/parser_test.go
@@ -1,18 +1,19 @@
 package plaintext
 
 import (
+	"testing"
+
 	"github.com/huantt/plaintext-extractor/html"
 	"github.com/huantt/plaintext-extractor/markdown"
 	"github.com/stretchr/testify/assert"
-	"testing"
 )
 
 func TestParseHtml(t *testing.T) {
 	tests := []struct {
 		input    string
 		expected string
 	}{
-		{`<div>This is a <a href="https://example.com">link</a></div>`, "This is a link"},
+		{`<div>This is a <a href="https://example.com">link</a></div>`, "This is a link\n"},
 	}
 	for _, test := range tests {
 		extractor := NewHtmlExtractor()
@@ -44,8 +45,8 @@ func TestMultipleExtractors(t *testing.T) {
 		input    string
 		expected string
 	}{
-		{"<div> html </div> *markdown*", "html markdown"},
-		{"<div> *markdown in html* </div>", "markdown in html"},
+		{"<div> html </div> *markdown*", "html\nmarkdown"},
+		{"<div> *markdown in html* </div>", "markdown in html\n"},
 	}
 	for _, test := range tests {
 		markdownExtractor := markdown.NewExtractor()