Skip to content

Commit e1f9847

Browse files
committed
Rebootstrap support - Phase 1
Signed-off-by: Kevin Fox <[email protected]>
1 parent 75b9a6a commit e1f9847

File tree

9 files changed

+482
-165
lines changed

9 files changed

+482
-165
lines changed

cmd/spire-agent/cli/run/run.go

+26-114
Original file line numberDiff line numberDiff line change
@@ -2,13 +2,11 @@ package run
22

33
import (
44
"context"
5-
"crypto/x509"
65
"errors"
76
"flag"
87
"fmt"
98
"io"
109
"net"
11-
"net/http"
1210
"net/url"
1311
"os"
1412
"os/signal"
@@ -26,18 +24,16 @@ import (
2624
"github.com/imdario/mergo"
2725
"github.com/mitchellh/cli"
2826
"github.com/sirupsen/logrus"
29-
"github.com/spiffe/go-spiffe/v2/spiffeid"
3027
"github.com/spiffe/spire/pkg/agent"
28+
"github.com/spiffe/spire/pkg/agent/trustbundlesources"
3129
"github.com/spiffe/spire/pkg/agent/workloadkey"
32-
"github.com/spiffe/spire/pkg/common/bundleutil"
3330
"github.com/spiffe/spire/pkg/common/catalog"
3431
common_cli "github.com/spiffe/spire/pkg/common/cli"
3532
"github.com/spiffe/spire/pkg/common/config"
3633
"github.com/spiffe/spire/pkg/common/fflag"
3734
"github.com/spiffe/spire/pkg/common/health"
3835
"github.com/spiffe/spire/pkg/common/idutil"
3936
"github.com/spiffe/spire/pkg/common/log"
40-
"github.com/spiffe/spire/pkg/common/pemutil"
4137
"github.com/spiffe/spire/pkg/common/telemetry"
4238
"github.com/spiffe/spire/pkg/common/tlspolicy"
4339
)
@@ -75,6 +71,8 @@ type agentConfig struct {
7571
AdminSocketPath string `hcl:"admin_socket_path"`
7672
InsecureBootstrap bool `hcl:"insecure_bootstrap"`
7773
RetryBootstrap bool `hcl:"retry_bootstrap"`
74+
Rebootstrap bool `hcl:"rebootstrap"`
75+
RebootstrapDelay string `hcl:"rebootstrap_delay"`
7876
JoinToken string `hcl:"join_token"`
7977
LogFile string `hcl:"log_file"`
8078
LogFormat string `hcl:"log_format"`
@@ -353,6 +351,8 @@ func parseFlags(name string, args []string, output io.Writer) (*agentConfig, err
353351
flags.BoolVar(&c.AllowUnauthenticatedVerifiers, "allowUnauthenticatedVerifiers", false, "If true, the agent permits the retrieval of X509 certificate bundles by unregistered clients")
354352
flags.BoolVar(&c.InsecureBootstrap, "insecureBootstrap", false, "If true, the agent bootstraps without verifying the server's identity")
355353
flags.BoolVar(&c.RetryBootstrap, "retryBootstrap", false, "If true, the agent retries bootstrap with backoff")
354+
flags.BoolVar(&c.Rebootstrap, "rebootstrap", false, "If true, the agent will retry bootstrapping after seeing an x509 cert mismatch from the server")
355+
flags.StringVar(&c.RebootstrapDelay, "rebootstrapDelay", "10m", "The time to delay after seeing a x509 cert mismatch from the server before rebootstrapping")
356356
flags.BoolVar(&c.ExpandEnv, "expandEnv", false, "Expand environment variables in SPIRE config file")
357357

358358
c.addOSFlags(flags)
@@ -387,103 +387,6 @@ func mergeInput(fileInput *Config, cliInput *agentConfig) (*Config, error) {
387387
return c, nil
388388
}
389389

390-
func parseTrustBundle(bundleBytes []byte, trustBundleContentType string) ([]*x509.Certificate, error) {
391-
switch trustBundleContentType {
392-
case bundleFormatPEM:
393-
bundle, err := pemutil.ParseCertificates(bundleBytes)
394-
if err != nil {
395-
return nil, err
396-
}
397-
return bundle, nil
398-
case bundleFormatSPIFFE:
399-
bundle, err := bundleutil.Unmarshal(spiffeid.TrustDomain{}, bundleBytes)
400-
if err != nil {
401-
return nil, fmt.Errorf("unable to parse SPIFFE trust bundle: %w", err)
402-
}
403-
return bundle.X509Authorities(), nil
404-
}
405-
406-
return nil, fmt.Errorf("unknown trust bundle format: %s", trustBundleContentType)
407-
}
408-
409-
func downloadTrustBundle(trustBundleURL string, trustBundleUnixSocket string) ([]byte, error) {
410-
var req *http.Request
411-
client := &http.Client{}
412-
if trustBundleUnixSocket != "" {
413-
client = &http.Client{
414-
Transport: &http.Transport{
415-
DialContext: func(_ context.Context, _, _ string) (net.Conn, error) {
416-
return net.Dial("unix", trustBundleUnixSocket)
417-
},
418-
},
419-
}
420-
}
421-
req, err := http.NewRequest("GET", trustBundleURL, nil)
422-
if err != nil {
423-
return nil, err
424-
}
425-
426-
// Download the trust bundle URL from the user specified URL
427-
// We use gosec -- the annotation below will disable a security check that URLs are not tainted
428-
/* #nosec G107 */
429-
resp, err := client.Do(req)
430-
if err != nil {
431-
return nil, fmt.Errorf("unable to fetch trust bundle URL %s: %w", trustBundleURL, err)
432-
}
433-
434-
defer resp.Body.Close()
435-
436-
if resp.StatusCode != http.StatusOK {
437-
return nil, fmt.Errorf("error downloading trust bundle: %s", resp.Status)
438-
}
439-
pemBytes, err := io.ReadAll(resp.Body)
440-
if err != nil {
441-
return nil, fmt.Errorf("unable to read from trust bundle URL %s: %w", trustBundleURL, err)
442-
}
443-
444-
return pemBytes, nil
445-
}
446-
447-
func setupTrustBundle(ac *agent.Config, c *Config) error {
448-
// Either download the trust bundle if TrustBundleURL is set, or read it
449-
// from disk if TrustBundlePath is set
450-
ac.InsecureBootstrap = c.Agent.InsecureBootstrap
451-
452-
var bundleBytes []byte
453-
var err error
454-
455-
switch {
456-
case c.Agent.TrustBundleURL != "":
457-
bundleBytes, err = downloadTrustBundle(c.Agent.TrustBundleURL, c.Agent.TrustBundleUnixSocket)
458-
if err != nil {
459-
return err
460-
}
461-
case c.Agent.TrustBundlePath != "":
462-
bundleBytes, err = loadTrustBundle(c.Agent.TrustBundlePath)
463-
if err != nil {
464-
return fmt.Errorf("could not parse trust bundle: %w", err)
465-
}
466-
default:
467-
// If InsecureBootstrap is configured, the bundle is not required
468-
if ac.InsecureBootstrap {
469-
return nil
470-
}
471-
}
472-
473-
bundle, err := parseTrustBundle(bundleBytes, c.Agent.TrustBundleFormat)
474-
if err != nil {
475-
return err
476-
}
477-
478-
if len(bundle) == 0 {
479-
return errors.New("no certificates found in trust bundle")
480-
}
481-
482-
ac.TrustBundle = bundle
483-
484-
return nil
485-
}
486-
487390
func NewAgentConfig(c *Config, logOptions []log.Option, allowUnknownConfig bool) (*agent.Config, error) {
488391
ac := &agent.Config{}
489392

@@ -492,6 +395,16 @@ func NewAgentConfig(c *Config, logOptions []log.Option, allowUnknownConfig bool)
492395
}
493396

494397
ac.RetryBootstrap = c.Agent.RetryBootstrap
398+
if c.Agent.Rebootstrap { // rebootstrap is opt-in; the delay is only parsed when it is enabled
399+
delay, err := time.ParseDuration(c.Agent.RebootstrapDelay)
400+
if err != nil {
401+
return nil, fmt.Errorf("could not parse rebootstrap delay %q: %w", c.Agent.RebootstrapDelay, err) // %w keeps the parse error in the chain
402+
}
403+
ac.RebootstrapDelay = &delay // a nil RebootstrapDelay is what the agent checks to see that rebootstrap is disabled
404+
if !ac.RetryBootstrap {
405+
return nil, fmt.Errorf("RetryBootstrap needs to be true to support rebootstrapping")
406+
}
407+
}
495408

496409
if c.Agent.Experimental.SyncInterval != "" {
497410
var err error
@@ -575,11 +488,19 @@ func NewAgentConfig(c *Config, logOptions []log.Option, allowUnknownConfig bool)
575488
}
576489
ac.DisableSPIFFECertValidation = c.Agent.SDS.DisableSPIFFECertValidation
577490

578-
err = setupTrustBundle(ac, c)
579-
if err != nil {
580-
return nil, err
491+
ts := &trustbundlesources.Config{ // trust bundle settings copied from the agent configuration
492+
InsecureBootstrap: c.Agent.InsecureBootstrap,
493+
TrustBundleFormat: c.Agent.TrustBundleFormat,
494+
TrustBundlePath: c.Agent.TrustBundlePath,
495+
TrustBundleURL: c.Agent.TrustBundleURL,
496+
TrustBundleUnixSocket: c.Agent.TrustBundleUnixSocket, // was wrongly set to TrustBundleURL
497+
TrustDomain: c.Agent.TrustDomain,
498+
ServerAddress: c.Agent.ServerAddress,
499+
ServerPort: c.Agent.ServerPort,
581500
}
582501

502+
ac.TrustBundleSources = trustbundlesources.New(ts, ac.Log.WithField("Logger", "TrustBundleSources"))
503+
583504
ac.WorkloadKeyType = workloadkey.ECP256
584505
if c.Agent.WorkloadX509SVIDKeyType != "" {
585506
ac.WorkloadKeyType, err = workloadkey.KeyTypeFromString(c.Agent.WorkloadX509SVIDKeyType)
@@ -737,12 +658,3 @@ func defaultConfig() *Config {
737658

738659
return c
739660
}
740-
741-
func loadTrustBundle(path string) ([]byte, error) {
742-
bundleBytes, err := os.ReadFile(path)
743-
if err != nil {
744-
return nil, err
745-
}
746-
747-
return bundleBytes, nil
748-
}

pkg/agent/agent.go

+94-21
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ package agent
22

33
import (
44
"context"
5+
"crypto/x509"
56
"errors"
67
"fmt"
78
"net/http"
@@ -33,15 +34,17 @@ import (
3334
"github.com/spiffe/spire/pkg/common/uptime"
3435
"github.com/spiffe/spire/pkg/common/util"
3536
"github.com/spiffe/spire/pkg/common/version"
37+
"github.com/spiffe/spire/pkg/common/x509util"
3638
_ "golang.org/x/net/trace" // registers handlers on the DefaultServeMux
3739
"google.golang.org/grpc"
3840
"google.golang.org/grpc/codes"
3941
"google.golang.org/grpc/status"
4042
)
4143

4244
const (
43-
bootstrapBackoffInterval = 5 * time.Second
44-
bootstrapBackoffMaxElapsedTime = 1 * time.Minute
45+
bootstrapBackoffInterval = 5 * time.Second
46+
// FIXME(KMF): decide the final value; raised from 1 minute to 24 hours in this change, presumably so retries can outlast the rebootstrap delay - confirm.
47+
bootstrapBackoffMaxElapsedTime = 24 * time.Hour // previously 1 * time.Minute; passed to backoff.WithMaxElapsedTime
4548
)
4649

4750
type Agent struct {
@@ -105,22 +108,60 @@ func (a *Agent) Run(ctx context.Context) error {
105108

106109
var as *node_attestor.AttestationResult
107110

111+
err = a.c.TrustBundleSources.SetStorage(sto)
112+
if err != nil {
113+
return err
114+
}
115+
108116
if a.c.RetryBootstrap {
109117
attBackoffClock := clock.New()
110118
attBackoff := backoff.NewBackoff(
111119
attBackoffClock,
112120
bootstrapBackoffInterval,
113121
backoff.WithMaxElapsedTime(bootstrapBackoffMaxElapsedTime),
122+
// FIXME KMF how to ignore max time if rebootstrapping
114123
)
115124

116125
for {
117-
as, err = a.attest(ctx, sto, cat, metrics, nodeAttestor)
118-
if err == nil {
119-
break
126+
InsecureBootstrap := false
127+
BootstrapTrustBundle, err := sto.LoadBundle()
128+
if errors.Is(err, storage.ErrNotCached) {
129+
BootstrapTrustBundle, InsecureBootstrap, err = a.c.TrustBundleSources.GetBundle()
120130
}
121-
122-
if status.Code(err) == codes.PermissionDenied {
123-
return err
131+
if err == nil {
132+
as, err = a.attest(ctx, sto, cat, metrics, nodeAttestor, BootstrapTrustBundle, InsecureBootstrap)
133+
if err == nil {
134+
err = a.c.TrustBundleSources.SetSuccess()
135+
if err != nil {
136+
return err
137+
}
138+
break
139+
}
140+
141+
if x509util.IsUnknownAuthorityError(err) { // attestation failed because the server certificate did not verify against the bundle in use
142+
if a.c.TrustBundleSources.IsBootstrap() {
143+
a.c.Log.Warn("Trust bundle and server don't agree; bootstrapping again")
144+
} else if a.c.RebootstrapDelay != nil {
145+
startTime, err := a.c.TrustBundleSources.GetStartTime() // shadows err on purpose: the attest error is still checked after this block
146+
if err != nil {
147+
return err // was `return nil`, which made Run report success and dropped the error
148+
}
149+
elapsed := time.Since(startTime)
150+
if elapsed < *a.c.RebootstrapDelay {
151+
a.c.Log.Warnf("Trust bundle and server don't agree; ignoring for now. Rebootstrap timeout left: %s", *a.c.RebootstrapDelay-elapsed)
152+
} else {
153+
a.c.Log.Warn("Trust bundle and server don't agree; rebootstrapping")
154+
err = sto.StoreBundle(nil) // presumably clears the stored bundle so the next LoadBundle falls back to TrustBundleSources - confirm
155+
if err != nil {
156+
return err
157+
}
158+
}
159+
}
160+
}
161+
162+
if status.Code(err) == codes.PermissionDenied {
163+
return err
164+
}
124165
}
125166

126167
nextDuration := attBackoff.NextBackOff()
@@ -141,7 +182,15 @@ func (a *Agent) Run(ctx context.Context) error {
141182
}
142183
}
143184
} else {
144-
as, err = a.attest(ctx, sto, cat, metrics, nodeAttestor)
185+
InsecureBootstrap := false
186+
BootstrapTrustBundle, err := sto.LoadBundle()
187+
if errors.Is(err, storage.ErrNotCached) {
188+
BootstrapTrustBundle, InsecureBootstrap, err = a.c.TrustBundleSources.GetBundle()
189+
}
190+
if err != nil {
191+
return err
192+
}
193+
as, err = a.attest(ctx, sto, cat, metrics, nodeAttestor, BootstrapTrustBundle, InsecureBootstrap)
145194
if err != nil {
146195
return err
147196
}
@@ -249,19 +298,19 @@ func (a *Agent) setupProfiling(ctx context.Context) (stop func()) {
249298
}
250299
}
251300

252-
func (a *Agent) attest(ctx context.Context, sto storage.Storage, cat catalog.Catalog, metrics telemetry.Metrics, na nodeattestor.NodeAttestor) (*node_attestor.AttestationResult, error) {
301+
func (a *Agent) attest(ctx context.Context, sto storage.Storage, cat catalog.Catalog, metrics telemetry.Metrics, na nodeattestor.NodeAttestor, bootstrapTrustBundle []*x509.Certificate, insecureBootstrap bool) (*node_attestor.AttestationResult, error) { // builds a node attestor config and runs one attestation attempt
253302
config := node_attestor.Config{
254-
Catalog: cat,
255-
Metrics: metrics,
256-
JoinToken: a.c.JoinToken,
257-
TrustDomain: a.c.TrustDomain,
258-
TrustBundle: a.c.TrustBundle,
259-
InsecureBootstrap: a.c.InsecureBootstrap,
260-
Storage: sto,
261-
Log: a.c.Log.WithField(telemetry.SubsystemName, telemetry.Attestor),
262-
ServerAddress: a.c.ServerAddress,
263-
NodeAttestor: na,
264-
TLSPolicy: a.c.TLSPolicy,
303+
Catalog: cat,
304+
Metrics: metrics,
305+
JoinToken: a.c.JoinToken,
306+
TrustDomain: a.c.TrustDomain,
307+
BootstrapTrustBundle: bootstrapTrustBundle, // supplied by the caller instead of being read from the agent config
308+
InsecureBootstrap: insecureBootstrap, // supplied by the caller instead of being read from the agent config
309+
Storage: sto,
310+
Log: a.c.Log.WithField(telemetry.SubsystemName, telemetry.Attestor),
311+
ServerAddress: a.c.ServerAddress,
312+
NodeAttestor: na,
313+
TLSPolicy: a.c.TLSPolicy,
265314
}
266315
return node_attestor.New(&config).Attest(ctx)
267316
}
@@ -279,6 +328,8 @@ func (a *Agent) newManager(ctx context.Context, sto storage.Storage, cat catalog
279328
Metrics: metrics,
280329
WorkloadKeyType: a.c.WorkloadKeyType,
281330
Storage: sto,
331+
TrustBundleSources: a.c.TrustBundleSources,
332+
RebootstrapDelay: a.c.RebootstrapDelay,
282333
SyncInterval: a.c.SyncInterval,
283334
UseSyncAuthorizedEntries: a.c.UseSyncAuthorizedEntries,
284335
X509SVIDCacheMaxSize: a.c.X509SVIDCacheMaxSize,
@@ -296,13 +347,35 @@ func (a *Agent) newManager(ctx context.Context, sto storage.Storage, cat catalog
296347
initBackoffClock,
297348
bootstrapBackoffInterval,
298349
backoff.WithMaxElapsedTime(bootstrapBackoffMaxElapsedTime),
350+
// FIXME KMF how to ignore max time if rebootstrapping
299351
)
300352

301353
for {
302354
err := mgr.Initialize(ctx)
303355
if err == nil {
356+
err = a.c.TrustBundleSources.SetSuccessIfRunning()
357+
if err != nil {
358+
return nil, err
359+
}
304360
return mgr, nil
305361
}
362+
if x509util.IsUnknownAuthorityError(err) && a.c.RebootstrapDelay != nil { // only reached when rebootstrap is configured
363+
startTime, err := a.c.TrustBundleSources.GetStartTime() // shadows err on purpose: the Initialize error is still checked after this block
364+
if err != nil {
365+
return nil, err
366+
}
367+
elapsed := time.Since(startTime)
368+
if elapsed < *a.c.RebootstrapDelay {
369+
a.c.Log.Warnf("Trust bundle and server don't agree; ignoring for now. Rebootstrap timeout left: %s", *a.c.RebootstrapDelay-elapsed)
370+
} else {
371+
a.c.Log.Warn("Trust bundle and server don't agree; rebootstrapping")
372+
err = a.c.TrustBundleSources.SetForceRebootstrap()
373+
if err != nil {
374+
return nil, err
375+
}
376+
return nil, errors.New("Agent needs to rebootstrap. shutting down") // manager creation is abandoned once a rebootstrap has been forced
377+
}
378+
}
306379

307380
if nodeutil.ShouldAgentReattest(err) || nodeutil.ShouldAgentShutdown(err) {
308381
return nil, err

0 commit comments

Comments
 (0)