Skip to content

Commit 55cf0d4

Browse files
authored
Improve cell read performance by optimizing XML parsing (qax-os#2116)
- Rows iterator is about 20% faster, and memory allocation is reduced by about 10% - Update unit tests - Extend the test timeout to 50 minutes in the GitHub Action to make TestZip64 stable
1 parent ce9061f commit 55cf0d4

File tree

3 files changed

+120
-2
lines changed

3 files changed

+120
-2
lines changed

.github/workflows/go.yml

+1-1
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ jobs:
2929
run: go build -v .
3030

3131
- name: Test
32-
run: env GO111MODULE=on go test -v -timeout 30m -race ./... -coverprofile='coverage.txt' -covermode=atomic
32+
run: env GO111MODULE=on go test -v -timeout 50m -race ./... -coverprofile='coverage.txt' -covermode=atomic
3333

3434
- name: Codecov
3535
uses: codecov/codecov-action@v5

rows.go

+58-1
Original file line numberDiff line numberDiff line change
@@ -231,7 +231,7 @@ func (rows *Rows) rowXMLHandler(rowIterator *rowXMLIterator, xmlElement *xml.Sta
231231
if rowIterator.inElement == "c" {
232232
rowIterator.cellCol++
233233
colCell := xlsxC{}
234-
_ = rows.decoder.DecodeElement(&colCell, xmlElement)
234+
colCell.cellXMLHandler(rows.decoder, xmlElement)
235235
if colCell.R != "" {
236236
if rowIterator.cellCol, _, rowIterator.err = CellNameToCoordinates(colCell.R); rowIterator.err != nil {
237237
return
@@ -244,6 +244,63 @@ func (rows *Rows) rowXMLHandler(rowIterator *rowXMLIterator, xmlElement *xml.Sta
244244
}
245245
}
246246

247+
// cellXMLAttrHandler parse the cell XML element attributes of the worksheet.
248+
func (cell *xlsxC) cellXMLAttrHandler(start *xml.StartElement) error {
249+
for _, attr := range start.Attr {
250+
switch attr.Name.Local {
251+
case "r":
252+
cell.R = attr.Value
253+
case "s":
254+
val, err := strconv.ParseInt(attr.Value, 10, 64)
255+
if err != nil {
256+
return err
257+
}
258+
if math.MinInt <= val && val <= math.MaxInt {
259+
cell.S = int(val)
260+
}
261+
case "t":
262+
cell.T = attr.Value
263+
default:
264+
}
265+
}
266+
return nil
267+
}
268+
269+
// cellXMLHandler parse the cell XML element of the worksheet.
270+
func (cell *xlsxC) cellXMLHandler(decoder *xml.Decoder, start *xml.StartElement) error {
271+
cell.XMLName = start.Name
272+
err := cell.cellXMLAttrHandler(start)
273+
if err != nil {
274+
return err
275+
}
276+
for {
277+
tok, err := decoder.Token()
278+
if err != nil {
279+
return err
280+
}
281+
var se xml.StartElement
282+
switch el := tok.(type) {
283+
case xml.StartElement:
284+
se = el
285+
switch se.Name.Local {
286+
case "v":
287+
err = decoder.DecodeElement(&cell.V, &se)
288+
case "f":
289+
err = decoder.DecodeElement(&cell.F, &se)
290+
case "is":
291+
err = decoder.DecodeElement(&cell.IS, &se)
292+
}
293+
if err != nil {
294+
return err
295+
}
296+
case xml.EndElement:
297+
if el == start.End() {
298+
return nil
299+
}
300+
}
301+
}
302+
}
303+
247304
// Rows returns a rows iterator, used for streaming reading data for a
248305
// worksheet with a large data. This function is concurrency safe. For
249306
// example:

rows_test.go

+61
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ import (
55
"encoding/xml"
66
"fmt"
77
"path/filepath"
8+
"strconv"
89
"testing"
910

1011
"github.com/stretchr/testify/assert"
@@ -1157,6 +1158,66 @@ func TestNumberFormats(t *testing.T) {
11571158
assert.Equal(t, "2019/3/19", result, "A1")
11581159
}
11591160

1161+
func TestCellXMLHandler(t *testing.T) {
1162+
var (
1163+
content = []byte(fmt.Sprintf(`<worksheet xmlns="%s"><sheetData><row r="1"><c r="A1" t="s"><v>10</v></c><c r="B1"><is><t>String</t></is></c></row><row r="2"><c r="A2" s="4" t="str"><f>2*A1</f><v>0</v></c><c r="C2" s="1"><f>A3</f><v>2422.3000000000002</v></c><c r="D2" t="d"><v>2022-10-22T15:05:29Z</v></c><c r="F2"></c><c r="G2"></c></row></sheetData></worksheet>`, NameSpaceSpreadSheet.Value))
1164+
expected, ws xlsxWorksheet
1165+
row *xlsxRow
1166+
)
1167+
assert.NoError(t, xml.Unmarshal(content, &expected))
1168+
decoder := xml.NewDecoder(bytes.NewReader(content))
1169+
rows := Rows{decoder: decoder}
1170+
for {
1171+
token, _ := decoder.Token()
1172+
if token == nil {
1173+
break
1174+
}
1175+
switch element := token.(type) {
1176+
case xml.StartElement:
1177+
if element.Name.Local == "row" {
1178+
r, err := strconv.Atoi(element.Attr[0].Value)
1179+
assert.NoError(t, err)
1180+
ws.SheetData.Row = append(ws.SheetData.Row, xlsxRow{R: r})
1181+
row = &ws.SheetData.Row[len(ws.SheetData.Row)-1]
1182+
}
1183+
if element.Name.Local == "c" {
1184+
colCell := xlsxC{}
1185+
assert.NoError(t, colCell.cellXMLHandler(rows.decoder, &element))
1186+
row.C = append(row.C, colCell)
1187+
}
1188+
}
1189+
}
1190+
assert.Equal(t, expected.SheetData.Row, ws.SheetData.Row)
1191+
1192+
for _, rowXML := range []string{
1193+
`<row spans="1:17" r="1"><c r="A1" t="s" s="A"><v>10</v></c></row></sheetData></worksheet>`, // s need number
1194+
`<row spans="1:17" r="1"><c r="A1"><v>10</v> </row></sheetData></worksheet>`, // missing </c>
1195+
`<row spans="1:17" r="1"><c r="B1"><is><t>`, // incorrect data
1196+
} {
1197+
ws := xlsxWorksheet{}
1198+
content := []byte(fmt.Sprintf(`<worksheet xmlns="%s"><sheetData>%s</sheetData></worksheet>`, NameSpaceSpreadSheet.Value, rowXML))
1199+
expected := xml.Unmarshal(content, &ws)
1200+
assert.Error(t, expected)
1201+
decoder := xml.NewDecoder(bytes.NewReader(content))
1202+
rows := Rows{decoder: decoder}
1203+
for {
1204+
token, _ := decoder.Token()
1205+
if token == nil {
1206+
break
1207+
}
1208+
switch element := token.(type) {
1209+
case xml.StartElement:
1210+
if element.Name.Local == "c" {
1211+
colCell := xlsxC{}
1212+
err := colCell.cellXMLHandler(rows.decoder, &element)
1213+
assert.Error(t, err)
1214+
assert.Equal(t, expected, err)
1215+
}
1216+
}
1217+
}
1218+
}
1219+
}
1220+
11601221
func BenchmarkRows(b *testing.B) {
11611222
f, _ := OpenFile(filepath.Join("test", "Book1.xlsx"))
11621223
for i := 0; i < b.N; i++ {

0 commit comments

Comments
 (0)