// LineIndex records where the newlines fall in a JSON document so that
// byte offsets (for example those reported by Get) can be translated
// into line, byte-column and rune-column positions in the original text.
//
// JsonBlob must be UTF-8 text. NewlinePos is derived from JsonBlob by
// FindNewlines; if JsonBlob is replaced or modified, call FindNewlines
// again before using OffsetToLineCol.
type LineIndex struct {
	// JsonBlob is the indexed document. It is referenced, not copied.
	JsonBlob []byte
	// NewlinePos holds, in ascending order, the byte offset of every
	// '\n' found in JsonBlob.
	NewlinePos []int
}

// NewLineIndex returns a new LineIndex for json whose NewlinePos member
// contains the byte-based locations of all newlines in the UTF-8 json.
func NewLineIndex(json []byte) *LineIndex {
	li := &LineIndex{JsonBlob: json}
	li.FindNewlines()
	return li
}

// FindNewlines (re)builds li.NewlinePos from li.JsonBlob.
func (li *LineIndex) FindNewlines() {
	// Start from a fresh, non-nil slice on every scan so the result never
	// aliases a slice handed out by an earlier scan.
	found := []int{}

	// '\n' (0x0A) is a single-byte code point and can never appear inside
	// a multi-byte UTF-8 sequence, so a plain byte scan finds exactly the
	// positions a rune-wise scan would, without copying the blob.
	for i, b := range li.JsonBlob {
		if b == '\n' {
			found = append(found, i)
		}
	}
	li.NewlinePos = found
}

// OffsetToLineCol returns the line and column for a given byte offset
// into li.JsonBlob, provided that li.NewlinePos is valid (as it is after
// NewLineIndex or FindNewlines).
//
// bytecol is the byte index of the offset within its line, while runecol
// is the UTF-8 rune index within its line. Lines and columns are numbered
// from 0, so offset 0 is at line 0, col 0. A newline character is
// reported as the final column of the line it terminates.
//
// If offset is out of bounds (negative, or >= len(li.JsonBlob)), all
// three results are -1.
//
// The line is located by binary search over li.NewlinePos, which costs
// O(log q) for q newlines; computing runecol additionally costs time
// proportional to the length of the line up to offset.
func (li *LineIndex) OffsetToLineCol(offset int) (line int, bytecol int, runecol int) {
	if offset < 0 || offset >= len(li.JsonBlob) {
		return -1, -1, -1
	}

	// The zero-based line number equals the number of newlines strictly
	// before offset. sort.Search yields the smallest i whose newline is at
	// or after offset, which is exactly that count. It is 0 for the first
	// line and len(NewlinePos) for the last.
	line = sort.Search(len(li.NewlinePos), func(i int) bool {
		return li.NewlinePos[i] >= offset
	})

	// The first line starts at byte 0; every other line starts one byte
	// past the newline that ends the previous line.
	lineStart := 0
	if line > 0 {
		lineStart = li.NewlinePos[line-1] + 1
	}
	return line, offset - lineStart, li.bytePosToRunePos(line, offset)
}

// bytePosToRunePos expects linenoz to be the zero-based number of the
// line on which offset falls, i.e. li.NewlinePos[linenoz-1] < offset and
// offset <= li.NewlinePos[linenoz] whenever those entries exist.
//
// It returns the character (UTF-8 rune) position of offset on that line:
// the number of runes in the bytes from the start of the line through
// offset inclusive, minus one.
//
// Since it must decode the bytes of the line, its running time is
// proportional to the length of the line up to offset.
func (li *LineIndex) bytePosToRunePos(linenoz int, offset int) int {
	var beg int
	if linenoz > 0 {
		beg = li.NewlinePos[linenoz-1] + 1
	}
	// Count on the byte slice directly; converting to a string first would
	// only add a copy.
	return utf8.RuneCount(li.JsonBlob[beg:offset+1]) - 1
}

// DebugDump prints every entry of li.NewlinePos to standard output,
// preceded and followed by a blank line.
func (li *LineIndex) DebugDump() {
	fmt.Println()
	for i := range li.NewlinePos {
		fmt.Printf("li.NewlinePos[i=%v]: %v\n", i, li.NewlinePos[i])
	}
	fmt.Println()
}

// The two functions below belong to linecol_test.go in the same patch.

// TestGetLineCol checks that NewLineIndex records the byte offset of
// every newline in its input.
func TestGetLineCol(t *testing.T) {
	runLineColTest(t, []byte("abc"), []int{})
	runLineColTest(t, []byte("\n"), []int{0})
	runLineColTest(t, []byte("\na\nb\n"), []int{0, 2, 4})
}

// runLineColTest indexes input and reports a test error if the resulting
// NewlinePos differs from expected in length or in any element.
func runLineColTest(t *testing.T, input []byte, expected []int) {
	li := NewLineIndex(input)
	obs := li.NewlinePos
	if len(expected) != len(obs) {
		t.Errorf("runLineColTest failed at pos len(observed)==%v, "+
			"len(expected)=%v; obs='%#v'; expected='%#v'",
			len(obs), len(expected), obs, expected)
		return
	}
	for i := range expected {
		if obs[i] != expected[i] {
			t.Errorf("runLineColTest failed at pos %v, observed='%#v', expected='%#v'",
				i, obs, expected)
		}
	}
}
+func TestOffsetToLineCol(t *testing.T) { + + runOffsetToLineColTest(t, []byte(`{"a":"b"}`), []string{`a`}, []byte(`b`), 0, 5, 5, String) + runOffsetToLineColTest(t, []byte("\n"+`{"a":"b"}`), []string{`a`}, []byte(`b`), 1, 5, 5, String) + runOffsetToLineColTest(t, []byte("\n"+`{"a":"b"}`+"\n"), []string{`a`}, []byte(`b`), 1, 5, 5, String) + runOffsetToLineColTest(t, []byte("\n\n"+`{"a":"b"}`+"\n"), []string{`a`}, []byte(`b`), 2, 5, 5, String) + runOffsetToLineColTest(t, []byte("\n\n"+`{"a":"b"}`+"\n\n"), []string{`a`}, []byte(`b`), 2, 5, 5, String) + runOffsetToLineColTest(t, []byte("\n\n"+`{"a":`+"\n"+`"b"}`+"\n\n"), []string{`a`}, []byte(`b`), 3, 0, 0, String) + runOffsetToLineColTest(t, []byte("\n\n"+`{`+"\n"+`"a":`+"\n"+`"b"}`+"\n\n"), []string{`a`}, []byte(`b`), 4, 0, 0, String) + runOffsetToLineColTest(t, []byte(`{`+"\n"+`"a":`+"\n"+`"b"}`), []string{`a`}, []byte(`b`), 2, 0, 0, String) + runOffsetToLineColTest(t, []byte(`{`+"\n"+`"a":`+`"b"}`), []string{`a`}, []byte(`b`), 1, 4, 4, String) + + // multiline value + runOffsetToLineColTest(t, []byte(`{`+"\n"+`"a":"b`+"\n"+`ye"}`), []string{`a`}, []byte(`b`+"\n"+`ye`), 1, 4, 4, String) + + // multi-byte characters + runOffsetToLineColTest(t, []byte(`{"世界":"世界"}`), []string{`世界`}, []byte(`世界`), 0, 10, 6, String) + runOffsetToLineColTest(t, []byte(`{"世界":`+"\n"+`"世界"}`), []string{`世界`}, []byte(`世界`), 1, 0, 0, String) + +} + +func runOffsetToLineColTest(t *testing.T, input []byte, searchPath []string, + expectedValue []byte, + expectedLine, expectedByteCol, expectedRuneCol int, expectedDataType ValueType) { + + li := NewLineIndex(input) + obs, obsDataType, offs, err := Get(input, searchPath...) + + //fmt.Printf("\n Get(input='%s', searchPath='%#v') returned obs='%#v', obsDataType='%s', offs=%v, err=%v. 
len(obs)=%v\n", string(input), searchPath, string(obs), obsDataType, offs, err, len(obs)) + + // account for the double quotes around strings in their position + lenObs := len(obs) + if obsDataType == String { + lenObs += 2 + } + + if err != nil { + panic(err) + } + if bytes.Compare(obs, expectedValue) != 0 { + t.Errorf("runOffsetToLineColTest failed, obs != expectedValue, observed='%#v', expected='%#v'", + obs, expectedValue) + } + if obsDataType != expectedDataType { + t.Errorf("runOffsetToLineColTest failed, obsDataType != expectedDataType, observed='%#v', expected='%#v'", + obsDataType, expectedDataType) + } + + // the main event: the call to li.OffsetToLineCol() + // + // Note offs is where the key value *ends*, per the jsonparser.Get() docs. + // Hence we subtract the len(obs) to get the byte offset of the + // beginning of the value. + // + obsLine, obsByteCol, obsRuneCol := li.OffsetToLineCol(offs - lenObs) + + //fmt.Printf("li.OffsetToLineCol(offs=%#v) returned obsLine=%v, obsByteCol=%v, obsRuneCol=%v. len(obs)=%v\n", offs, obsLine, obsByteCol, obsRuneCol, len(obs)) + + if obsLine != expectedLine { + t.Errorf("runOffsetToLineColTest failed, obsLine != expectedLine, observed='%#v', expected='%#v'", + obsLine, expectedLine) + } + if obsByteCol != expectedByteCol { + t.Errorf("runOffsetToLineColTest failed, obsByteCol != expectedByteCol, observed='%#v', expected='%#v'", + obsByteCol, expectedByteCol) + } + + if obsRuneCol != expectedRuneCol { + t.Errorf("runOffsetToLineColTest failed, obsRuneCol != expectedRuneCol, observed='%#v', expected='%#v'", + obsRuneCol, expectedRuneCol) + } + +}