Skip to content

Commit fdb0a20

Browse files
[processor/redaction] Support hashing instead of masking values (#38161)
<!--Ex. Fixing a bug - Describe the bug and how this fixes the issue. Ex. Adding a feature - Explain what this achieves.--> #### Description - added `hash_function` parameter to support hashing the values (or substrings of values) instead of masking them with a fixed string. By default fixed string masking is performed <!-- Issue number (e.g. #1234) or full URL to issue, if applicable. --> #### Link to tracking issue Fixes #35830 #### Depends on #37664 --------- Signed-off-by: odubajDT <[email protected]> Co-authored-by: Evan Bradley <[email protected]>
1 parent 4c72228 commit fdb0a20

File tree

10 files changed

+252
-3
lines changed

10 files changed

+252
-3
lines changed

.chloggen/redaction-hash.yaml

+27
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
# Use this changelog template to create an entry for release notes.
2+
3+
# One of 'breaking', 'deprecation', 'new_component', 'enhancement', 'bug_fix'
4+
change_type: enhancement
5+
6+
# The name of the component, or a single word describing the area of concern, (e.g. filelogreceiver)
7+
component: processor/redaction
8+
9+
# A brief description of the change. Surround your text with quotes ("") if it needs to start with a backtick (`).
10+
note: "Support hashing instead of masking values via 'hash_function' parameter"
11+
12+
# Mandatory: One or more tracking issues related to the change. You can use the PR number here if no issue exists.
13+
issues: [35830]
14+
15+
# (Optional) One or more lines of additional information to render under the primary note.
16+
# These lines will be padded with 2 spaces and then inserted directly into the document.
17+
# Use pipe (|) for multiline entries.
18+
subtext:
19+
20+
# If your change doesn't affect end users or the exported elements of any package,
21+
# you should instead start your pull request title with [chore] or use the "Skip Changelog" label.
22+
# Optional: The change log or logs in which this entry should be included.
23+
# e.g. '[user]' or '[user, api]'
24+
# Include 'user' if the change is relevant to end users.
25+
# Include 'api' if there is a change to a library API.
26+
# Default: '[user]'
27+
change_logs: []

processor/redactionprocessor/README.md

+9
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,10 @@ processors:
8787
# blocked span attributes. Values that match are not masked.
8888
allowed_values:
8989
90+
# hash_function defines the function for hashing the values instead of
91+
# masking them with a fixed string. By default, no hash function is used
92+
# and masking with a fixed string is performed.
93+
hash_function: md5
9094
# summary controls the verbosity level of the diagnostic attributes that
9195
# the processor adds to the spans/logs/datapoints when it redacts or masks other
9296
# attributes. In some contexts a list of redacted attributes leaks
@@ -119,6 +123,11 @@ part of the value is masked with a fixed length of asterisks.
119123
`blocked_key_patterns` applies to the values of the keys matching one of the patterns.
120124
The value is then masked according to the configuration.
121125

126+
`hash_function` defines the function for hashing values of matched keys or matches in values
127+
instead of masking them with a fixed string. By default, no hash function is used
128+
and masking with a fixed string is performed. The supported hash functions
129+
are `md5`, `sha1` and `sha3` (SHA3-256).
130+
122131
For example, if `notes` is on the list of allowed keys, then the `notes`
123132
attribute is retained. However, if there is a value such as a credit card
124133
number in the `notes` field that matched a regular expression on the list of

processor/redactionprocessor/config.go

+51
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,24 @@
33

44
package redactionprocessor // import "github.com/open-telemetry/opentelemetry-collector-contrib/processor/redactionprocessor"
55

6+
import (
7+
"encoding"
8+
"errors"
9+
"fmt"
10+
"strings"
11+
)
12+
13+
var _ encoding.TextUnmarshaler = (*HashFunction)(nil)
14+
15+
// HashFunction names the hash algorithm used to replace redacted values
// instead of masking them with a fixed string.
type HashFunction string
16+
17+
// Supported values of the hash_function setting.
const (
18+
// None selects no hash function; matches are masked with a fixed string.
None HashFunction = ""
19+
// SHA1 selects SHA-1.
SHA1 HashFunction = "sha1"
20+
// SHA3 selects SHA3-256.
SHA3 HashFunction = "sha3"
21+
// MD5 selects MD5.
MD5 HashFunction = "md5"
22+
)
23+
624
type Config struct {
725
// AllowAllKeys is a flag to allow all span attribute keys. Setting this
826
// to true disables the AllowedKeys list. The list of BlockedValues is
@@ -18,6 +36,11 @@ type Config struct {
1836
// matching the regexes on the list are masked.
1937
BlockedKeyPatterns []string `mapstructure:"blocked_key_patterns"`
2038

39+
// HashFunction defines the function for hashing the values instead of
40+
// masking them with a fixed string. By default, no hash function is used
41+
// and masking with a fixed string is performed.
42+
HashFunction HashFunction `mapstructure:"hash_function"`
43+
2144
// IgnoredKeys is a list of span attribute keys that are not redacted.
2245
// Span attributes in this list are allowed to pass through the filter
2346
// without being changed or removed.
@@ -38,3 +61,31 @@ type Config struct {
3861
// configuration. Possible values are `debug`, `info`, and `silent`.
3962
Summary string `mapstructure:"summary"`
4063
}
64+
65+
// String returns the hash function name as a plain string.
func (u HashFunction) String() string {
66+
return string(u)
67+
}
68+
69+
// UnmarshalText unmarshals text into a HashFunction.
// The name is matched case-insensitively, and empty text selects None.
// It returns an error if the receiver is nil or the name is not one of the
// supported hash functions.
70+
func (u *HashFunction) UnmarshalText(text []byte) error {
71+
if u == nil {
72+
return errors.New("cannot unmarshal to a nil *HashFunction")
73+
}
74+
75+
// Lower-case the input so the comparison below ignores case.
str := strings.ToLower(string(text))
76+
switch str {
77+
case strings.ToLower(SHA1.String()):
78+
*u = SHA1
79+
return nil
80+
case strings.ToLower(MD5.String()):
81+
*u = MD5
82+
return nil
83+
case strings.ToLower(SHA3.String()):
84+
*u = SHA3
85+
return nil
86+
case strings.ToLower(None.String()):
87+
*u = None
88+
return nil
89+
}
90+
return fmt.Errorf("unknown HashFunction %s, allowed functions are %s, %s and %s", str, SHA1, SHA3, MD5)
91+
}

processor/redactionprocessor/config_test.go

+37
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
package redactionprocessor
55

66
import (
7+
"errors"
78
"path/filepath"
89
"testing"
910

@@ -31,6 +32,7 @@ func TestLoadConfig(t *testing.T) {
3132
IgnoredKeys: []string{"safe_attribute"},
3233
BlockedValues: []string{"4[0-9]{12}(?:[0-9]{3})?", "(5[1-5][0-9]{14})"},
3334
BlockedKeyPatterns: []string{".*token.*", ".*api_key.*"},
35+
HashFunction: MD5,
3436
AllowedValues: []string{"[email protected]"},
3537
Summary: debug,
3638
},
@@ -58,3 +60,38 @@ func TestLoadConfig(t *testing.T) {
5860
})
5961
}
6062
}
63+
64+
// TestValidateConfig checks that HashFunction.UnmarshalText accepts a
// supported name and the empty name, and rejects an unknown name.
// For the rejected case only the presence of an error is asserted; the
// error message in the table is not compared.
func TestValidateConfig(t *testing.T) {
65+
tests := []struct {
66+
name string
67+
hash HashFunction
68+
expected error
69+
}{
70+
{
71+
name: "valid",
72+
hash: MD5,
73+
},
74+
{
75+
name: "empty",
76+
hash: None,
77+
},
78+
{
79+
name: "invalid",
80+
hash: "hash",
81+
expected: errors.New("unknown HashFunction hash, allowed functions are sha1, sha3 and md5"),
82+
},
83+
}
84+
85+
for _, tt := range tests {
86+
t.Run(tt.name, func(t *testing.T) {
87+
var h HashFunction
88+
err := h.UnmarshalText([]byte(tt.hash))
89+
if tt.expected != nil {
90+
assert.Error(t, err)
91+
} else {
92+
assert.NoError(t, err)
93+
assert.Equal(t, tt.hash, h)
94+
}
95+
})
96+
}
97+
}

processor/redactionprocessor/factory_test.go

+1
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ func TestDefaultConfiguration(t *testing.T) {
2020
assert.Empty(t, c.BlockedValues)
2121
assert.Empty(t, c.AllowedValues)
2222
assert.Empty(t, c.BlockedKeyPatterns)
23+
assert.Empty(t, c.HashFunction)
2324
}
2425

2526
func TestCreateTestProcessor(t *testing.T) {

processor/redactionprocessor/go.mod

+1
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ require (
1515
go.opentelemetry.io/collector/processor/processortest v0.121.0
1616
go.uber.org/goleak v1.3.0
1717
go.uber.org/zap v1.27.0
18+
golang.org/x/crypto v0.31.0
1819
)
1920

2021
require (

processor/redactionprocessor/go.sum

+2
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

processor/redactionprocessor/processor.go

+34-3
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,14 @@
33

44
package redactionprocessor // import "github.com/open-telemetry/opentelemetry-collector-contrib/processor/redactionprocessor"
55

6+
//nolint:gosec
67
import (
78
"context"
9+
"crypto/md5"
10+
"crypto/sha1"
11+
"encoding/hex"
812
"fmt"
13+
"hash"
914
"regexp"
1015
"sort"
1116
"strings"
@@ -15,6 +20,7 @@ import (
1520
"go.opentelemetry.io/collector/pdata/pmetric"
1621
"go.opentelemetry.io/collector/pdata/ptrace"
1722
"go.uber.org/zap"
23+
"golang.org/x/crypto/sha3"
1824
)
1925

2026
const attrValuesSeparator = ","
@@ -30,6 +36,8 @@ type redaction struct {
3036
allowRegexList map[string]*regexp.Regexp
3137
// Attribute keys blocked in a span
3238
blockKeyRegexList map[string]*regexp.Regexp
39+
// Hash function to hash blocked values
40+
hashFunction HashFunction
3341
// Redaction processor configuration
3442
config *Config
3543
// Logger
@@ -63,6 +71,7 @@ func newRedaction(ctx context.Context, config *Config, logger *zap.Logger) (*red
6371
blockRegexList: blockRegexList,
6472
allowRegexList: allowRegexList,
6573
blockKeyRegexList: blockKeysRegexList,
74+
hashFunction: config.HashFunction,
6675
config: config,
6776
logger: logger,
6877
}, nil
@@ -226,7 +235,7 @@ func (s *redaction) processAttrs(_ context.Context, attributes pcommon.Map) {
226235
for _, compiledRE := range s.blockKeyRegexList {
227236
if match := compiledRE.MatchString(k); match {
228237
toBlock = append(toBlock, k)
229-
maskedValue := compiledRE.ReplaceAllString(strVal, "****")
238+
maskedValue := s.maskValue(strVal, regexp.MustCompile(".*"))
230239
value.SetStr(maskedValue)
231240
return true
232241
}
@@ -235,13 +244,13 @@ func (s *redaction) processAttrs(_ context.Context, attributes pcommon.Map) {
235244
// Mask any blocked values for the other attributes
236245
var matched bool
237246
for _, compiledRE := range s.blockRegexList {
238-
if match := compiledRE.MatchString(strVal); match {
247+
if compiledRE.MatchString(strVal) {
239248
if !matched {
240249
matched = true
241250
toBlock = append(toBlock, k)
242251
}
243252

244-
maskedValue := compiledRE.ReplaceAllString(strVal, "****")
253+
maskedValue := s.maskValue(strVal, compiledRE)
245254
value.SetStr(maskedValue)
246255
strVal = maskedValue
247256
}
@@ -260,6 +269,28 @@ func (s *redaction) processAttrs(_ context.Context, attributes pcommon.Map) {
260269
s.addMetaAttrs(ignoring, attributes, "", ignoredKeyCount)
261270
}
262271

272+
// maskValue returns val with every match of regex replaced.
// When the configured hash function is SHA1, SHA3 or MD5, each match is
// replaced by the hex-encoded digest of the matched text (SHA3 uses
// SHA3-256). Otherwise each match is replaced by the fixed string "****".
// NOTE(review): the nolint directive presumably silences gosec findings for
// the MD5 and SHA-1 usage below; confirm.
//nolint:gosec
273+
func (s *redaction) maskValue(val string, regex *regexp.Regexp) string {
274+
// hashFunc is invoked once per regex match with the matched substring.
hashFunc := func(match string) string {
275+
switch s.hashFunction {
276+
case SHA1:
277+
return hashString(match, sha1.New())
278+
case SHA3:
279+
return hashString(match, sha3.New256())
280+
case MD5:
281+
return hashString(match, md5.New())
282+
default:
283+
return "****"
284+
}
285+
}
286+
return regex.ReplaceAllStringFunc(val, hashFunc)
287+
}
288+
289+
// hashString writes input to hasher and returns the resulting digest as a
// lowercase hexadecimal string.
func hashString(input string, hasher hash.Hash) string {
290+
// The error from Write is discarded; the hash.Hash interface documents
// that Write never returns an error.
hasher.Write([]byte(input))
291+
return hex.EncodeToString(hasher.Sum(nil))
292+
}
293+
263294
// addMetaAttrs adds diagnostic information about redacted or masked attribute keys
264295
func (s *redaction) addMetaAttrs(redactedAttrs []string, attributes pcommon.Map, valuesAttr, countAttr string) {
265296
redactedCount := int64(len(redactedAttrs))

processor/redactionprocessor/processor_test.go

+86
Original file line numberDiff line numberDiff line change
@@ -271,6 +271,92 @@ func TestRedactSummaryDebug(t *testing.T) {
271271
}
272272
}
273273

274+
// TestRedactSummaryDebugHashMD5 runs the processor with HashFunction set to
// MD5 and the "debug" summary level over traces, logs and each metric type.
// It asserts that redacted keys are removed, that the summary attributes are
// present, and that masked values hold the expected hash strings rather than
// a fixed mask.
func TestRedactSummaryDebugHashMD5(t *testing.T) {
275+
testConfig := TestConfig{
276+
config: &Config{
277+
AllowedKeys: []string{"id", "group", "name", "group.id", "member (id)", "token_some", "api_key_some", "email"},
278+
BlockedValues: []string{"4[0-9]{12}(?:[0-9]{3})?"},
279+
HashFunction: MD5,
280+
IgnoredKeys: []string{"safe_attribute"},
281+
BlockedKeyPatterns: []string{".*token.*", ".*api_key.*"},
282+
Summary: "debug",
283+
},
284+
allowed: map[string]pcommon.Value{
285+
"id": pcommon.NewValueInt(5),
286+
"group.id": pcommon.NewValueStr("some.valid.id"),
287+
"member (id)": pcommon.NewValueStr("some other valid id"),
288+
},
289+
masked: map[string]pcommon.Value{
290+
"name": pcommon.NewValueStr("placeholder 4111111111111111"),
291+
},
292+
ignored: map[string]pcommon.Value{
293+
"safe_attribute": pcommon.NewValueStr("harmless 4111111111111112"),
294+
},
295+
redacted: map[string]pcommon.Value{
296+
"credit_card": pcommon.NewValueStr("4111111111111111"),
297+
},
298+
blockedKeys: map[string]pcommon.Value{
299+
"token_some": pcommon.NewValueStr("tokenize"),
300+
"api_key_some": pcommon.NewValueStr("apinize"),
301+
},
302+
allowedValues: map[string]pcommon.Value{
303+
"email": pcommon.NewValueStr("[email protected]"),
304+
},
305+
}
306+
307+
outTraces := runTest(t, testConfig)
308+
outLogs := runLogsTest(t, testConfig)
309+
outMetricsGauge := runMetricsTest(t, testConfig, pmetric.MetricTypeGauge)
310+
outMetricsSum := runMetricsTest(t, testConfig, pmetric.MetricTypeSum)
311+
outMetricsHistogram := runMetricsTest(t, testConfig, pmetric.MetricTypeHistogram)
312+
outMetricsExponentialHistogram := runMetricsTest(t, testConfig, pmetric.MetricTypeExponentialHistogram)
313+
outMetricsSummary := runMetricsTest(t, testConfig, pmetric.MetricTypeSummary)
314+
315+
// Collect the attribute map of the first record of every signal so the
// same assertions run against each of them.
attrs := []pcommon.Map{
316+
outTraces.ResourceSpans().At(0).ScopeSpans().At(0).Spans().At(0).Attributes(),
317+
outLogs.ResourceLogs().At(0).ScopeLogs().At(0).LogRecords().At(0).Attributes(),
318+
outMetricsGauge.ResourceMetrics().At(0).ScopeMetrics().At(0).Metrics().At(0).Gauge().DataPoints().At(0).Attributes(),
319+
outMetricsSum.ResourceMetrics().At(0).ScopeMetrics().At(0).Metrics().At(0).Sum().DataPoints().At(0).Attributes(),
320+
outMetricsHistogram.ResourceMetrics().At(0).ScopeMetrics().At(0).Metrics().At(0).Histogram().DataPoints().At(0).Attributes(),
321+
outMetricsExponentialHistogram.ResourceMetrics().At(0).ScopeMetrics().At(0).Metrics().At(0).ExponentialHistogram().DataPoints().At(0).Attributes(),
322+
outMetricsSummary.ResourceMetrics().At(0).ScopeMetrics().At(0).Metrics().At(0).Summary().DataPoints().At(0).Attributes(),
323+
}
324+
325+
for _, attr := range attrs {
326+
deleted := make([]string, 0, len(testConfig.redacted))
327+
for k := range testConfig.redacted {
328+
_, ok := attr.Get(k)
329+
assert.False(t, ok)
330+
deleted = append(deleted, k)
331+
}
332+
maskedKeys, ok := attr.Get(redactedKeys)
333+
assert.True(t, ok)
334+
sort.Strings(deleted)
335+
assert.Equal(t, strings.Join(deleted, ","), maskedKeys.Str())
336+
maskedKeyCount, ok := attr.Get(redactedKeyCount)
337+
assert.True(t, ok)
338+
assert.Equal(t, int64(len(deleted)), maskedKeyCount.Int())
339+
340+
ignoredKeyCount, ok := attr.Get(ignoredKeyCount)
341+
assert.True(t, ok)
342+
assert.Equal(t, int64(len(testConfig.ignored)), ignoredKeyCount.Int())
343+
344+
blockedKeys := []string{"api_key_some", "name", "token_some"}
345+
maskedValues, ok := attr.Get(maskedValues)
346+
assert.True(t, ok)
347+
assert.Equal(t, strings.Join(blockedKeys, ","), maskedValues.Str())
348+
maskedValueCount, ok := attr.Get(maskedValueCount)
349+
assert.True(t, ok)
350+
assert.Equal(t, int64(3), maskedValueCount.Int())
351+
// "name" matches a blocked value pattern, so only the matching
// substring is expected to be replaced; the two blocked-key attributes
// are expected to have their whole value replaced.
value, _ := attr.Get("name")
352+
assert.Equal(t, "placeholder 5910f4ea0062a0e29afd3dccc741e3ce", value.Str())
353+
value, _ = attr.Get("api_key_some")
354+
assert.Equal(t, "93a699237950bde9eb9d25c7ead025f3", value.Str())
355+
value, _ = attr.Get("token_some")
356+
assert.Equal(t, "77e9ef3680c5518785ef0121d3884c3d", value.Str())
357+
}
358+
}
359+
274360
// TestRedactSummaryInfo validates that the processor writes a verbose summary
275361
// of any attributes it deleted to the new redaction.redacted.count span
276362
// attribute (but not to redaction.redacted.keys) when set to the info level

processor/redactionprocessor/testdata/config.yaml

+4
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,10 @@ redaction:
3030
# blocked span attributes. Values that match are not masked.
3131
allowed_values:
3232
33+
# hash_function defines the function for hashing the values instead of
34+
# masking them with a fixed string. By default, no hash function is used
35+
# and masking with a fixed string is performed.
36+
hash_function: md5
3337
# Summary controls the verbosity level of the diagnostic attributes that
3438
# the processor adds to the spans when it redacts or masks other
3539
# attributes. In some contexts a list of redacted attributes leaks

0 commit comments

Comments
 (0)