Skip to content

Commit 12ff730

Browse files
feat: Speed up symbol parsing by minimizing allocations (#258)
Changes the symbol parsing logic to minimize allocations. In particular, when we only care about validating symbols (e.g. during document canonicalization when ingesting uploads), there is essentially no need to allocate any strings at all. Validation and parsing share most of the underlying code -- the only difference is that we create "writer" types which discard writes (and hence avoid any internal buffer growth) when we're only in validation mode. For parsing mode, it is now possible to pass in pre-allocated values which will be overwritten where possible (unless the symbol is very large, in which case it still needs to allocate new Descriptors). See PR for benchmarks.
1 parent cf2bf08 commit 12ff730

26 files changed

+3645
-2410
lines changed

.github/workflows/golang.yml

+1-1
Original file line numberDiff line numberDiff line change
@@ -18,4 +18,4 @@ jobs:
1818
- uses: ./.github/actions/asdf
1919
with:
2020
golang: true
21-
- run: go test ./... -v
21+
- run: go test ./... -v -tags asserts

.prettierignore

+1
Original file line numberDiff line numberDiff line change
@@ -2,4 +2,5 @@ reprolang/target
22
reprolang/src/grammar.json
33
reprolang/src/node-types.json
44
bindings/typescript
5+
docs/scip.md
56
.bin

.tool-versions

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
golang 1.20.14
1+
golang 1.22.0
22
nodejs 16.20.2
33
shellcheck 0.7.1
44
yarn 1.22.22

Development.md

+19
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
- [Project structure](#project-structure)
44
- [Code generation](#code-generation)
55
- [Debugging](#debugging)
6+
- [Benchmarking](#benchmarking)
67
- [Testing and adding new SCIP semantics](#testing-and-adding-new-scip-semantics)
78
- [Release a new version](#release-a-new-version)
89

@@ -69,6 +70,24 @@ and is not recommended for use in other settings.
6970
scip lint /path/to/index.scip
7071
```
7172

73+
## Benchmarking
74+
75+
For benchmarks, one can put test SCIP indexes under `dev/sample_indexes`.
76+
77+
Sourcegraph teammates can download several large indexes
78+
from this [Google Drive folder](https://drive.google.com/drive/folders/1z62Se7eHaa5T89a16-y7s0Z1qbRY4VCg).
79+
80+
After that you can run:
81+
82+
```bash
83+
go run ./bindings/go/scip/speedtest
84+
```
85+
86+
to see the results.
87+
88+
Make sure to share benchmark results when making changes to
89+
the symbol parsing logic.
90+
7291
## Testing and adding new SCIP semantics
7392

7493
It is helpful to use reprolang to check the existing code navigation behavior,

bindings/go/scip/assertions.go

+9
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
//go:build asserts
2+
3+
package scip
4+
5+
// assert panics with msg when cond is false and does nothing otherwise.
//
// This variant is only compiled when the `asserts` build tag is set.
func assert(cond bool, msg string) {
	if cond {
		return
	}
	panic(msg)
}

bindings/go/scip/assertions_noop.go

+6
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
//go:build !asserts
2+
3+
package scip
4+
5+
// assert does nothing in builds without the `asserts` build tag (i.e. release
// builds); both arguments are ignored. The checking implementation lives in
// assertions.go.
func assert(cond bool, msg string) {
	// Intentionally empty.
}
+72
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
package internal
2+
3+
import (
4+
"io"
5+
"os"
6+
"path/filepath"
7+
"sync/atomic"
8+
"testing"
9+
10+
"github.com/sourcegraph/beaut"
11+
"github.com/sourcegraph/beaut/lib/knownwf"
12+
conciter "github.com/sourcegraph/conc/iter"
13+
"github.com/sourcegraph/scip/bindings/go/scip"
14+
"github.com/sourcegraph/scip/bindings/go/scip/internal/shared"
15+
"github.com/stretchr/testify/require"
16+
"google.golang.org/protobuf/proto"
17+
)
18+
19+
func TestParseCompat(t *testing.T) {
20+
for _, path := range shared.SampleIndexes() {
21+
t.Run(filepath.Base(path), func(t *testing.T) {
22+
t.Parallel()
23+
scipReader, err := os.Open(path)
24+
require.Nil(t, err)
25+
scipBytes, err := io.ReadAll(scipReader)
26+
require.Nil(t, err)
27+
scipIndex := scip.Index{}
28+
require.NoError(t, proto.Unmarshal(scipBytes, &scipIndex))
29+
var total atomic.Int64
30+
conciter.ForEach(scipIndex.Documents, func(docPtr **scip.Document) {
31+
document := *docPtr
32+
if total.Load() > 1000*1000 {
33+
return
34+
}
35+
total.Add(int64(len(document.Occurrences)))
36+
var newSym scip.Symbol
37+
for i := 0; i < len(document.Occurrences); i++ {
38+
occ := document.Occurrences[i]
39+
oldSym, oldErr := ParsePartialSymbolV1ToBeDeleted(occ.Symbol, true)
40+
var newErr error
41+
require.NotPanics(t, func() {
42+
str := beaut.NewUTF8StringUnchecked(occ.Symbol, knownwf.UTF8DeserializedFromProtobufString)
43+
newErr = scip.ParseSymbolUTF8With(str, scip.ParseSymbolOptions{
44+
IncludeDescriptors: true,
45+
RecordOutput: &newSym,
46+
})
47+
}, "panic for symbol: %q", occ.Symbol)
48+
if oldErr != nil {
49+
require.Error(t, newErr,
50+
"old parser gave error %v but parse was successful with new parser (symbol: %q)",
51+
oldErr.Error(), occ.Symbol)
52+
continue
53+
} else if newErr != nil {
54+
require.NoError(t, newErr,
55+
"new parser gave error %v but parse was successful with old parser (symbol: %q)",
56+
newErr.Error(), occ.Symbol)
57+
}
58+
require.Equal(t, oldSym.Scheme, newSym.Scheme)
59+
require.Equal(t, oldSym.Package, newSym.Package)
60+
require.Equalf(t, len(oldSym.Descriptors), len(newSym.Descriptors), "symbol: %v, d1: %+v, d2: %+v", occ.Symbol,
61+
oldSym.Descriptors, newSym.Descriptors)
62+
for i, d := range oldSym.Descriptors {
63+
dnew := newSym.Descriptors[i]
64+
require.Equal(t, d.Name, dnew.Name, "symbol: %v", occ.Symbol)
65+
require.Equal(t, d.Suffix, dnew.Suffix, "symbol: %v", occ.Symbol)
66+
require.Equal(t, d.Disambiguator, dnew.Disambiguator, "symbol: %v", occ.Symbol)
67+
}
68+
}
69+
})
70+
})
71+
}
72+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,238 @@
1+
package internal
2+
3+
import (
4+
"fmt"
5+
"strings"
6+
7+
"github.com/cockroachdb/errors"
8+
"github.com/sourcegraph/scip/bindings/go/scip"
9+
"github.com/sourcegraph/scip/bindings/go/scip/internal/shared"
10+
)
11+
12+
func tryParseLocalSymbol(symbol string) (string, error) {
13+
if !strings.HasPrefix(symbol, "local ") {
14+
return "", nil
15+
}
16+
suffix := symbol[6:]
17+
if len(suffix) > 0 && shared.IsSimpleIdentifier(suffix) {
18+
return suffix, nil
19+
}
20+
return "", errors.Newf("expected format 'local <simple-identifier>' but got: %v", symbol)
21+
}
22+
23+
// ParsePartialSymbolV1ToBeDeleted parses an SCIP string into the Symbol message
24+
// with the option to exclude the `.Descriptor` field.
25+
//
26+
// Nov 30 2024: This is currently only present for benchmarking + compatibility
27+
// reasons. We can remove this in the future once we're confident that the new
28+
// parser handles everything correctly.
29+
func ParsePartialSymbolV1ToBeDeleted(symbol string, includeDescriptors bool) (*scip.Symbol, error) {
30+
local, err := tryParseLocalSymbol(symbol)
31+
if err != nil {
32+
return nil, err
33+
}
34+
if local != "" {
35+
return &scip.Symbol{
36+
Scheme: "local",
37+
Descriptors: []*scip.Descriptor{
38+
{Name: local, Suffix: scip.Descriptor_Local},
39+
},
40+
}, nil
41+
}
42+
s := newSymbolParser(symbol)
43+
scheme, err := s.acceptSpaceEscapedIdentifier("scheme")
44+
if err != nil {
45+
return nil, err
46+
}
47+
manager, err := s.acceptSpaceEscapedIdentifier("package manager")
48+
if err != nil {
49+
return nil, err
50+
}
51+
if manager == "." {
52+
manager = ""
53+
}
54+
packageName, err := s.acceptSpaceEscapedIdentifier("package name")
55+
if err != nil {
56+
return nil, err
57+
}
58+
if packageName == "." {
59+
packageName = ""
60+
}
61+
packageVersion, err := s.acceptSpaceEscapedIdentifier("package version")
62+
if err != nil {
63+
return nil, err
64+
}
65+
if packageVersion == "." {
66+
packageVersion = ""
67+
}
68+
var descriptors []*scip.Descriptor
69+
if includeDescriptors {
70+
descriptors, err = s.parseDescriptors()
71+
}
72+
return &scip.Symbol{
73+
Scheme: scheme,
74+
Package: &scip.Package{
75+
Manager: manager,
76+
Name: packageName,
77+
Version: packageVersion,
78+
},
79+
Descriptors: descriptors,
80+
}, err
81+
}
82+
83+
// symbolParser is the cursor state used while parsing one symbol string.
type symbolParser struct {
	// Symbol holds the runes of the symbol being parsed.
	Symbol []rune
	// index is the position in Symbol of the rune to be read next.
	index int
	// SymbolString is the original symbol text; it is echoed in error messages.
	SymbolString string
}
88+
89+
func newSymbolParser(symbol string) *symbolParser {
90+
return &symbolParser{
91+
SymbolString: symbol,
92+
Symbol: []rune(symbol),
93+
index: 0,
94+
}
95+
}
96+
97+
func (s *symbolParser) error(message string) error {
98+
return errors.Newf("%s\n%s\n%s^", message, s.SymbolString, strings.Repeat("_", s.index))
99+
}
100+
101+
func (s *symbolParser) current() rune {
102+
if s.index < len(s.Symbol) {
103+
return s.Symbol[s.index]
104+
}
105+
return '\x00'
106+
}
107+
108+
func (s *symbolParser) peekNext() rune {
109+
if s.index+1 < len(s.Symbol) {
110+
return s.Symbol[s.index]
111+
}
112+
return 0
113+
}
114+
115+
func (s *symbolParser) parseDescriptors() ([]*scip.Descriptor, error) {
116+
var result []*scip.Descriptor
117+
for s.index < len(s.Symbol) {
118+
descriptor, err := s.parseDescriptor()
119+
if err != nil {
120+
return nil, err
121+
}
122+
result = append(result, descriptor)
123+
}
124+
return result, nil
125+
}
126+
127+
func (s *symbolParser) parseDescriptor() (*scip.Descriptor, error) {
128+
start := s.index
129+
switch s.peekNext() {
130+
case '(':
131+
s.index++
132+
name, err := s.acceptIdentifier("parameter name")
133+
if err != nil {
134+
return nil, err
135+
}
136+
return &scip.Descriptor{Name: name, Suffix: scip.Descriptor_Parameter}, s.acceptCharacter(')', "closing parameter name")
137+
case '[':
138+
s.index++
139+
name, err := s.acceptIdentifier("type parameter name")
140+
if err != nil {
141+
return nil, err
142+
}
143+
return &scip.Descriptor{Name: name, Suffix: scip.Descriptor_TypeParameter}, s.acceptCharacter(']', "closing type parameter name")
144+
default:
145+
name, err := s.acceptIdentifier("descriptor name")
146+
if err != nil {
147+
return nil, err
148+
}
149+
suffix := s.current()
150+
s.index++
151+
switch suffix {
152+
case '(':
153+
disambiguator := ""
154+
if s.peekNext() != ')' {
155+
disambiguator, err = s.acceptIdentifier("method disambiguator")
156+
if err != nil {
157+
return nil, err
158+
}
159+
}
160+
err = s.acceptCharacter(')', "closing method")
161+
if err != nil {
162+
return nil, err
163+
}
164+
return &scip.Descriptor{Name: name, Disambiguator: disambiguator, Suffix: scip.Descriptor_Method}, s.acceptCharacter('.', "closing method")
165+
case '/':
166+
return &scip.Descriptor{Name: name, Suffix: scip.Descriptor_Namespace}, nil
167+
case '.':
168+
return &scip.Descriptor{Name: name, Suffix: scip.Descriptor_Term}, nil
169+
case '#':
170+
return &scip.Descriptor{Name: name, Suffix: scip.Descriptor_Type}, nil
171+
case ':':
172+
return &scip.Descriptor{Name: name, Suffix: scip.Descriptor_Meta}, nil
173+
case '!':
174+
return &scip.Descriptor{Name: name, Suffix: scip.Descriptor_Macro}, nil
175+
default:
176+
}
177+
}
178+
179+
end := s.index
180+
if s.index > len(s.Symbol) {
181+
end = len(s.Symbol)
182+
}
183+
return nil, errors.Newf("unrecognized descriptor %q", string(s.Symbol[start:end]))
184+
}
185+
186+
func (s *symbolParser) acceptIdentifier(what string) (string, error) {
187+
if s.current() == '`' {
188+
s.index++
189+
return s.acceptBacktickEscapedIdentifier(what)
190+
}
191+
start := s.index
192+
for s.index < len(s.Symbol) && shared.IsSimpleIdentifierCharacter(s.current()) {
193+
s.index++
194+
}
195+
if start == s.index {
196+
return "", s.error("empty identifier")
197+
}
198+
return string(s.Symbol[start:s.index]), nil
199+
}
200+
201+
func (s *symbolParser) acceptSpaceEscapedIdentifier(what string) (string, error) {
202+
return s.acceptEscapedIdentifier(what, ' ')
203+
}
204+
205+
func (s *symbolParser) acceptBacktickEscapedIdentifier(what string) (string, error) {
206+
return s.acceptEscapedIdentifier(what, '`')
207+
}
208+
209+
func (s *symbolParser) acceptEscapedIdentifier(what string, escapeCharacter rune) (string, error) {
210+
builder := strings.Builder{}
211+
for s.index < len(s.Symbol) {
212+
ch := s.current()
213+
if ch == escapeCharacter {
214+
s.index++
215+
if s.index >= len(s.Symbol) {
216+
break
217+
}
218+
if s.current() == escapeCharacter {
219+
// Escaped space character.
220+
builder.WriteRune(ch)
221+
} else {
222+
return builder.String(), nil
223+
}
224+
} else {
225+
builder.WriteRune(ch)
226+
}
227+
s.index++
228+
}
229+
return "", s.error(fmt.Sprintf("reached end of symbol while parsing <%s>, expected a '%v' character", what, string(escapeCharacter)))
230+
}
231+
232+
func (s *symbolParser) acceptCharacter(r rune, what string) error {
233+
if s.current() == r {
234+
s.index++
235+
return nil
236+
}
237+
return s.error(fmt.Sprintf("expected '%v', obtained '%v', while parsing %v", string(r), string(s.current()), what))
238+
}

0 commit comments

Comments
 (0)