From 0ffe10cbef3a2bd0babc99f0a1491c582fc54082 Mon Sep 17 00:00:00 2001 From: Yilei Yang Date: Sun, 2 Feb 2025 00:23:42 -0800 Subject: [PATCH] Fix a bug where markdown headers in later lines weren't stripped. --- markdown/extractor.go | 2 +- markdown/extractor_test.go | 1 + markdown/tag.go | 6 ++++-- parser_test.go | 9 +++++---- 4 files changed, 11 insertions(+), 7 deletions(-) diff --git a/markdown/extractor.go b/markdown/extractor.go index bff5965..89cf913 100644 --- a/markdown/extractor.go +++ b/markdown/extractor.go @@ -33,7 +33,7 @@ func (e *Extractor) PlainText(input string) (*string, error) { for _, fullTag := range listFullTag { var plain = fullTag if tag.StartRegex != nil { - plain = tag.StartRegex.ReplaceAll(plain, nil) + plain = tag.StartRegex.ReplaceAll(plain, tag.StartReplacement) } if tag.EndRegex != nil { plain = tag.EndRegex.ReplaceAll(plain, nil) diff --git a/markdown/extractor_test.go b/markdown/extractor_test.go index 2f9d670..f4adabe 100644 --- a/markdown/extractor_test.go +++ b/markdown/extractor_test.go @@ -17,6 +17,7 @@ func TestExtract(t *testing.T) { {"~~strikethrough~~", "strikethrough"}, {"# H1 \n*italic* **bold** `code` `not code [link](https://example.com) ![image](https://image.com/image.png) ~~strikethrough~~", "H1 \nitalic bold code `not code link image strikethrough"}, {"# H1 \n new line", "H1 \n new line"}, + {"# H1 \n new line \n## H2 \n new line", "H1 \n new line \nH2 \n new line"}, {"*italic*", "italic"}, {"**bold**", "bold"}, {"`code`", "code"}, diff --git a/markdown/tag.go b/markdown/tag.go index 37b707b..0f0695f 100644 --- a/markdown/tag.go +++ b/markdown/tag.go @@ -6,14 +6,16 @@ type Tag struct { Name string FullRegex *regexp.Regexp StartRegex *regexp.Regexp + StartReplacement []byte EndRegex *regexp.Regexp } var tags = []Tag{ { Name: "Header", - FullRegex: regexp.MustCompile(`^#{1,6}\s+(.*)`), - StartRegex: regexp.MustCompile(`^#{1,6}\s+`), + FullRegex: regexp.MustCompile(`(^|\n)#{1,6}\s+(.*)`), + StartRegex: regexp.MustCompile(`(^|\n)#{1,6}\s+`), + StartReplacement: []byte("$1"), EndRegex: nil, }, { diff --git a/parser_test.go b/parser_test.go index fcbaec0..1464d4e 100644 --- a/parser_test.go +++ b/parser_test.go @@ -1,10 +1,11 @@ package plaintext import ( + "testing" + "github.com/huantt/plaintext-extractor/html" "github.com/huantt/plaintext-extractor/markdown" "github.com/stretchr/testify/assert" - "testing" ) func TestParseHtml(t *testing.T) { @@ -12,7 +13,7 @@ func TestParseHtml(t *testing.T) { input string expected string }{ - {`
This is a link
`, "This is a link"}, + {`
This is a link
`, "This is a link\n"}, } for _, test := range tests { extractor := NewHtmlExtractor() @@ -44,8 +45,8 @@ func TestMultipleExtractors(t *testing.T) { input string expected string }{ - {"
html
*markdown*", "html markdown"}, - {"
*markdown in html*
", "markdown in html"}, + {"
html
*markdown*", "html\nmarkdown"}, + {"
*markdown in html*
", "markdown in html\n"}, } for _, test := range tests { markdownExtractor := markdown.NewExtractor()