diff --git a/.gitignore b/.gitignore index 62c8935..a888a45 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,2 @@ -.idea/ \ No newline at end of file +.idea/ +.tool-versions \ No newline at end of file diff --git a/html/extractor.go b/html/extractor.go index 25a3e4c..a862656 100644 --- a/html/extractor.go +++ b/html/extractor.go @@ -1,9 +1,11 @@ package html import ( - "golang.org/x/net/html" + "fmt" "regexp" "strings" + + "golang.org/x/net/html" ) // Extractor represents an HTML-specific plain text extractor. @@ -32,7 +34,7 @@ func (e *Extractor) PlainText(input string) (*string, error) { } var plainText strings.Builder - e.extractText(&plainText, doc) + e.extractText(&plainText, doc, 0) output := plainText.String() output = string(regexp.MustCompile("\n+\\s+").ReplaceAll([]byte(output), []byte("\n"))) @@ -40,7 +42,14 @@ func (e *Extractor) PlainText(input string) (*string, error) { } // Recursively extract plain text from the HTML nodes. -func (e *Extractor) extractText(plainText *strings.Builder, node *html.Node) { +func (e *Extractor) extractText(plainText *strings.Builder, node *html.Node, idx int) { + liType := e.listItemType(node) + if liType == OrderedListItem { + plainText.WriteString(fmt.Sprintf("%d.", idx)) + } else if liType == UnorderedListItem { + plainText.WriteString("-") + } + if node.Type == html.TextNode { // Trim and append the text content text := strings.TrimSpace(node.Data) @@ -56,10 +65,40 @@ func (e *Extractor) extractText(plainText *strings.Builder, node *html.Node) { return } + i := 0 + var isList bool = node.DataAtom.String() == "ul" || node.DataAtom.String() == "ol" for child := node.FirstChild; child != nil; child = child.NextSibling { - e.extractText(plainText, child) + if isList { + i++ + } + e.extractText(plainText, child, i) } if found := e.blockTags[node.DataAtom.String()]; found { plainText.WriteString("\n") } } + +type ListItemType int + +const ( + Unknown ListItemType = iota + UnorderedListItem ListItemType = 1 + OrderedListItem ListItemType = 2 +) + +func (e *Extractor) listItemType(node *html.Node) ListItemType { + if node.DataAtom.String() != "li" { + return Unknown + } + + for p := node.Parent; p != nil; p = p.Parent { + if p.DataAtom.String() == "ul" { + return UnorderedListItem + } + if p.DataAtom.String() == "ol" { + return OrderedListItem + } + } + + return Unknown +} diff --git a/html/extractor_test.go b/html/extractor_test.go index b23e07e..02aa3ab 100644 --- a/html/extractor_test.go +++ b/html/extractor_test.go @@ -1,9 +1,9 @@ package html import ( - _ "embed" - "github.com/stretchr/testify/assert" "testing" + + "github.com/stretchr/testify/assert" ) func TestExtract(t *testing.T) { @@ -16,7 +16,8 @@ func TestExtract(t *testing.T) { {`a

b

`, "a\nb\n"}, {`link`, "link"}, {`
This is a link
`, "This is a link\n"}, - {"

Heading 1

Heading 2

", "Heading 1\nHeading 2\nItem 1\nItem 2\n"}, + {"

Heading 1

Heading 2

", "Heading 1\nHeading 2\n- Item 1\n- Item 2\n"}, + {"

Heading 1

Heading 2

  1. Item 1
  2. Item 2
", "Heading 1\nHeading 2\n1. Item 1\n2. Item 2\n"}, {"

ab

c", "a b\nc"}, {"a\n \nb", "a\nb"}, } diff --git a/parser_test.go b/parser_test.go index fcbaec0..1464d4e 100644 --- a/parser_test.go +++ b/parser_test.go @@ -1,10 +1,11 @@ package plaintext import ( + "testing" + "github.com/huantt/plaintext-extractor/html" "github.com/huantt/plaintext-extractor/markdown" "github.com/stretchr/testify/assert" - "testing" ) func TestParseHtml(t *testing.T) { @@ -12,7 +13,7 @@ func TestParseHtml(t *testing.T) { input string expected string }{ - {`
This is a link
`, "This is a link"}, + {`
This is a link
`, "This is a link\n"}, } for _, test := range tests { extractor := NewHtmlExtractor() @@ -44,8 +45,8 @@ func TestMultipleExtractors(t *testing.T) { input string expected string }{ - {"
html
*markdown*", "html markdown"}, - {"
*markdown in html*
", "markdown in html"}, + {"
html
*markdown*", "html\nmarkdown"}, + {"
*markdown in html*
", "markdown in html\n"}, } for _, test := range tests { markdownExtractor := markdown.NewExtractor()