huantt · armando-swarm · Jan 6, 2025
diff --git a/.gitignore b/.gitignore
@@ -1 +1,2 @@
-.idea/
+.idea/
+.tool-versions
diff --git a/html/extractor.go b/html/extractor.go
@@ -1,9 +1,11 @@
 package html
 
 import (
-	"golang.org/x/net/html"
+	"fmt"
 	"regexp"
 	"strings"
+
+	"golang.org/x/net/html"
 )
 
 // Extractor represents an HTML-specific plain text extractor.
@@ -32,15 +34,22 @@ func (e *Extractor) PlainText(input string) (*string, error) {
 	}
 
 	var plainText strings.Builder
-	e.extractText(&plainText, doc)
+	e.extractText(&plainText, doc, 0)
 
 	output := plainText.String()
 	output = string(regexp.MustCompile("\n+\\s+").ReplaceAll([]byte(output), []byte("\n")))
 	return &output, nil
 }
 
 // Recursively extract plain text from the HTML nodes.
-func (e *Extractor) extractText(plainText *strings.Builder, node *html.Node) {
+func (e *Extractor) extractText(plainText *strings.Builder, node *html.Node, idx int) {
+	liType := e.listItemType(node)
+	if liType == OrderedListItem {
+		plainText.WriteString(fmt.Sprintf("%d.", idx))
+	} else if liType == UnorderedListItem {
+		plainText.WriteString("-")
+	}
+
 	if node.Type == html.TextNode {
 		// Trim and append the text content
 		text := strings.TrimSpace(node.Data)
@@ -56,10 +65,40 @@ func (e *Extractor) extractText(plainText *strings.Builder, node *html.Node) {
 		return
 	}
 
+	i := 0 // number of <li> children seen so far; handed to each child as its list index
+	isList := node.DataAtom.String() == "ul" || node.DataAtom.String() == "ol"
 	for child := node.FirstChild; child != nil; child = child.NextSibling {
-		e.extractText(plainText, child)
+		if isList && child.DataAtom.String() == "li" {
+			i++ // count only <li> children, so whitespace text nodes between items do not skip numbers
+		}
+		e.extractText(plainText, child, i)
 	}
 	if found := e.blockTags[node.DataAtom.String()]; found {
 		plainText.WriteString("\n")
 	}
 }
+
+// ListItemType identifies which kind of list, if any, encloses an <li> element.
+type ListItemType int
+const (
+	Unknown ListItemType = iota // not an <li>, or no enclosing <ul>/<ol>
+	UnorderedListItem           // <li> whose nearest list ancestor is a <ul>
+	OrderedListItem             // <li> whose nearest list ancestor is an <ol>
+)
+
+// listItemType classifies node: for an <li> element it returns the list kind of
+// its nearest <ul> or <ol> ancestor; for anything else it returns Unknown.
+func (e *Extractor) listItemType(node *html.Node) ListItemType {
+	if node.DataAtom.String() != "li" {
+		return Unknown
+	}
+	for ancestor := node.Parent; ancestor != nil; ancestor = ancestor.Parent {
+		switch ancestor.DataAtom.String() {
+		case "ul":
+			return UnorderedListItem
+		case "ol":
+			return OrderedListItem
+		}
+	}
+	return Unknown
+}
diff --git a/html/extractor_test.go b/html/extractor_test.go
@@ -1,9 +1,9 @@
 package html
 
 import (
-	_ "embed"
-	"github.com/stretchr/testify/assert"
 	"testing"
+
+	"github.com/stretchr/testify/assert"
 )
 
 func TestExtract(t *testing.T) {
@@ -16,7 +16,8 @@ func TestExtract(t *testing.T) {
 		{`a<br><h1>b</h1>`, "a\nb\n"},
 		{`<a href="https://example.com">link</a>`, "link"},
 		{`<div>This is a <a href="https://example.com">link</a></div>`, "This is a link\n"},
-		{"<div><h1>Heading 1</h1><h2>Heading 2</h2><ul><li>Item 1</li><li>Item 2</li></ul></div>", "Heading 1\nHeading 2\nItem 1\nItem 2\n"},
+		{"<div><h1>Heading 1</h1><h2>Heading 2</h2><ul><li>Item 1</li><li>Item 2</li></ul></div>", "Heading 1\nHeading 2\n- Item 1\n- Item 2\n"},
+		{"<div><h1>Heading 1</h1><h2>Heading 2</h2><ol><li>Item 1</li><li>Item 2</li></ol></div>", "Heading 1\nHeading 2\n1. Item 1\n2. Item 2\n"},
 		{"<p><span>a</span><span>b</span></p> c", "a b\nc"},
 		{"a\n \nb", "a\nb"},
 	}

diff --git a/parser_test.go b/parser_test.go
@@ -1,18 +1,19 @@
 package plaintext
 
 import (
+	"testing"
+
 	"github.com/huantt/plaintext-extractor/html"
 	"github.com/huantt/plaintext-extractor/markdown"
 	"github.com/stretchr/testify/assert"
-	"testing"
 )
 
 func TestParseHtml(t *testing.T) {
 	tests := []struct {
 		input    string
 		expected string
 	}{
-		{`<div>This is a <a href="https://example.com">link</a></div>`, "This is a link"},
+		{`<div>This is a <a href="https://example.com">link</a></div>`, "This is a link\n"},
 	}
 	for _, test := range tests {
 		extractor := NewHtmlExtractor()
@@ -44,8 +45,8 @@ func TestMultipleExtractors(t *testing.T) {
 		input    string
 		expected string
 	}{
-		{"<div> html </div> *markdown*", "html markdown"},
-		{"<div> *markdown in html* </div>", "markdown in html"},
+		{"<div> html </div> *markdown*", "html\nmarkdown"},
+		{"<div> *markdown in html* </div>", "markdown in html\n"},
 	}
 	for _, test := range tests {
 		markdownExtractor := markdown.NewExtractor()