Skip to content

Commit af72c08

Browse files
authored
Merge pull request #16306 from github/nickrolfe/js-sensitive
JS: do fewer regexp matches in SensitiveActions
2 parents de58ee5 + 003d208 commit af72c08

File tree

5 files changed

+117
-27
lines changed

5 files changed

+117
-27
lines changed

javascript/ql/lib/semmle/javascript/security/SensitiveActions.qll

+25-27
Original file line numberDiff line numberDiff line change
@@ -86,39 +86,37 @@ private predicate writesProperty(DataFlow::Node node, string name) {
8686

8787
/** A write to a variable or property that might contain sensitive data. */
8888
private class BasicSensitiveWrite extends SensitiveWrite {
89-
SensitiveDataClassification classification;
89+
string name;
9090

9191
BasicSensitiveWrite() {
92-
exists(string name |
93-
/*
94-
* PERFORMANCE OPTIMISATION:
95-
* `nameIndicatesSensitiveData` performs a `regexpMatch` on `name`.
96-
* To carry out a regex match, we must first compute the Cartesian product
97-
* of all possible `name`s and regexes, then match.
98-
* To keep this product as small as possible,
99-
* we want to filter `name` as much as possible before the product.
100-
*
101-
* Do this by factoring out a helper predicate containing the filtering
102-
* logic that restricts `name`. This helper predicate will get picked first
103-
* in the join order, since it is the only call here that binds `name`.
104-
*/
105-
106-
writesProperty(this, name) and
107-
nameIndicatesSensitiveData(name, classification)
108-
)
92+
/*
93+
* PERFORMANCE OPTIMISATION:
94+
* `nameIndicatesSensitiveData` performs a `regexpMatch` on `name`.
95+
* To carry out a regex match, we must first compute the Cartesian product
96+
* of all possible `name`s and regexes, then match.
97+
* To keep this product as small as possible,
98+
* we want to filter `name` as much as possible before the product.
99+
*
100+
* Do this by factoring out a helper predicate containing the filtering
101+
* logic that restricts `name`. This helper predicate will get picked first
102+
* in the join order, since it is the only call here that binds `name`.
103+
*/
104+
105+
writesProperty(this, name) and
106+
nameIndicatesSensitiveData(name)
109107
}
110108

111109
/** Gets a classification of the kind of sensitive data the write might handle. */
112-
SensitiveDataClassification getClassification() { result = classification }
110+
SensitiveDataClassification getClassification() { nameIndicatesSensitiveData(name, result) }
113111
}
114112

115113
/** An access to a variable or property that might contain sensitive data. */
116114
private class BasicSensitiveVariableAccess extends SensitiveVariableAccess {
117-
SensitiveDataClassification classification;
118-
119-
BasicSensitiveVariableAccess() { nameIndicatesSensitiveData(name, classification) }
115+
BasicSensitiveVariableAccess() { nameIndicatesSensitiveData(name) }
120116

121-
override SensitiveDataClassification getClassification() { result = classification }
117+
override SensitiveDataClassification getClassification() {
118+
nameIndicatesSensitiveData(name, result)
119+
}
122120
}
123121

124122
/** A function name that suggests it may be sensitive. */
@@ -138,11 +136,11 @@ abstract class SensitiveDataFunctionName extends SensitiveFunctionName {
138136

139137
/** A method that might return sensitive data, based on the name. */
140138
class CredentialsFunctionName extends SensitiveDataFunctionName {
141-
SensitiveDataClassification classification;
142-
143-
CredentialsFunctionName() { nameIndicatesSensitiveData(this, classification) }
139+
CredentialsFunctionName() { nameIndicatesSensitiveData(this) }
144140

145-
override SensitiveDataClassification getClassification() { result = classification }
141+
override SensitiveDataClassification getClassification() {
142+
nameIndicatesSensitiveData(this, result)
143+
}
146144
}
147145

148146
/**

javascript/ql/lib/semmle/javascript/security/internal/SensitiveDataHeuristics.qll

+23
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,25 @@ module HeuristicNames {
106106
"(?is).*([^\\w$.-]|redact|censor|obfuscate|hash|md5|sha|random|((?<!un)(en))?(crypt|(?<!pass)code)|certain|concert|secretar|accountant|accountab).*"
107107
}
108108

109+
/**
110+
* Holds if `name` may indicate the presence of sensitive data, and `name` does not indicate that
111+
* the data is in fact non-sensitive (for example since it is hashed or encrypted).
112+
*
113+
* That is, one of the regexps from `maybeSensitiveRegexp` matches `name` (with the given
114+
* classification), and none of the regexps from `notSensitiveRegexp` matches `name`.
115+
*/
116+
bindingset[name]
117+
predicate nameIndicatesSensitiveData(string name) {
118+
exists(string combinedRegexp |
119+
// Combine all the maybe-sensitive regexps into one using non-capturing groups and |.
120+
combinedRegexp =
121+
"(?:" + strictconcat(string r | r = maybeSensitiveRegexp(_) | r, ")|(?:") + ")"
122+
|
123+
name.regexpMatch(combinedRegexp)
124+
) and
125+
not name.regexpMatch(notSensitiveRegexp())
126+
}
127+
109128
/**
110129
* Holds if `name` may indicate the presence of sensitive data, and
111130
* `name` does not indicate that the data is in fact non-sensitive (for example since
@@ -115,6 +134,10 @@ module HeuristicNames {
115134
* That is, one of the regexps from `maybeSensitiveRegexp` matches `name` (with the
116135
* given classification), and none of the regexps from `notSensitiveRegexp` matches
117136
* `name`.
137+
*
138+
* When the set of names is large, it's worth using `nameIndicatesSensitiveData/1` as a first
139+
* pass, since that combines all the regexps into one, and should be faster. Then call this
140+
* predicate to get the classification(s).
118141
*/
119142
bindingset[name]
120143
predicate nameIndicatesSensitiveData(string name, SensitiveDataClassification classification) {

python/ql/lib/semmle/python/security/internal/SensitiveDataHeuristics.qll

+23
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,25 @@ module HeuristicNames {
106106
"(?is).*([^\\w$.-]|redact|censor|obfuscate|hash|md5|sha|random|((?<!un)(en))?(crypt|(?<!pass)code)|certain|concert|secretar|accountant|accountab).*"
107107
}
108108

109+
/**
110+
* Holds if `name` may indicate the presence of sensitive data, and `name` does not indicate that
111+
* the data is in fact non-sensitive (for example since it is hashed or encrypted).
112+
*
113+
* That is, one of the regexps from `maybeSensitiveRegexp` matches `name` (with the given
114+
* classification), and none of the regexps from `notSensitiveRegexp` matches `name`.
115+
*/
116+
bindingset[name]
117+
predicate nameIndicatesSensitiveData(string name) {
118+
exists(string combinedRegexp |
119+
// Combine all the maybe-sensitive regexps into one using non-capturing groups and |.
120+
combinedRegexp =
121+
"(?:" + strictconcat(string r | r = maybeSensitiveRegexp(_) | r, ")|(?:") + ")"
122+
|
123+
name.regexpMatch(combinedRegexp)
124+
) and
125+
not name.regexpMatch(notSensitiveRegexp())
126+
}
127+
109128
/**
110129
* Holds if `name` may indicate the presence of sensitive data, and
111130
* `name` does not indicate that the data is in fact non-sensitive (for example since
@@ -115,6 +134,10 @@ module HeuristicNames {
115134
* That is, one of the regexps from `maybeSensitiveRegexp` matches `name` (with the
116135
* given classification), and none of the regexps from `notSensitiveRegexp` matches
117136
* `name`.
137+
*
138+
* When the set of names is large, it's worth using `nameIndicatesSensitiveData/1` as a first
139+
* pass, since that combines all the regexps into one, and should be faster. Then call this
140+
* predicate to get the classification(s).
118141
*/
119142
bindingset[name]
120143
predicate nameIndicatesSensitiveData(string name, SensitiveDataClassification classification) {

ruby/ql/lib/codeql/ruby/security/internal/SensitiveDataHeuristics.qll

+23
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,25 @@ module HeuristicNames {
106106
"(?is).*([^\\w$.-]|redact|censor|obfuscate|hash|md5|sha|random|((?<!un)(en))?(crypt|(?<!pass)code)|certain|concert|secretar|accountant|accountab).*"
107107
}
108108

109+
/**
110+
* Holds if `name` may indicate the presence of sensitive data, and `name` does not indicate that
111+
* the data is in fact non-sensitive (for example since it is hashed or encrypted).
112+
*
113+
* That is, one of the regexps from `maybeSensitiveRegexp` matches `name` (with the given
114+
* classification), and none of the regexps from `notSensitiveRegexp` matches `name`.
115+
*/
116+
bindingset[name]
117+
predicate nameIndicatesSensitiveData(string name) {
118+
exists(string combinedRegexp |
119+
// Combine all the maybe-sensitive regexps into one using non-capturing groups and |.
120+
combinedRegexp =
121+
"(?:" + strictconcat(string r | r = maybeSensitiveRegexp(_) | r, ")|(?:") + ")"
122+
|
123+
name.regexpMatch(combinedRegexp)
124+
) and
125+
not name.regexpMatch(notSensitiveRegexp())
126+
}
127+
109128
/**
110129
* Holds if `name` may indicate the presence of sensitive data, and
111130
* `name` does not indicate that the data is in fact non-sensitive (for example since
@@ -115,6 +134,10 @@ module HeuristicNames {
115134
* That is, one of the regexps from `maybeSensitiveRegexp` matches `name` (with the
116135
* given classification), and none of the regexps from `notSensitiveRegexp` matches
117136
* `name`.
137+
*
138+
* When the set of names is large, it's worth using `nameIndicatesSensitiveData/1` as a first
139+
* pass, since that combines all the regexps into one, and should be faster. Then call this
140+
* predicate to get the classification(s).
118141
*/
119142
bindingset[name]
120143
predicate nameIndicatesSensitiveData(string name, SensitiveDataClassification classification) {

swift/ql/lib/codeql/swift/security/internal/SensitiveDataHeuristics.qll

+23
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,25 @@ module HeuristicNames {
106106
"(?is).*([^\\w$.-]|redact|censor|obfuscate|hash|md5|sha|random|((?<!un)(en))?(crypt|(?<!pass)code)|certain|concert|secretar|accountant|accountab).*"
107107
}
108108

109+
/**
110+
* Holds if `name` may indicate the presence of sensitive data, and `name` does not indicate that
111+
* the data is in fact non-sensitive (for example since it is hashed or encrypted).
112+
*
113+
* That is, one of the regexps from `maybeSensitiveRegexp` matches `name` (with the given
114+
* classification), and none of the regexps from `notSensitiveRegexp` matches `name`.
115+
*/
116+
bindingset[name]
117+
predicate nameIndicatesSensitiveData(string name) {
118+
exists(string combinedRegexp |
119+
// Combine all the maybe-sensitive regexps into one using non-capturing groups and |.
120+
combinedRegexp =
121+
"(?:" + strictconcat(string r | r = maybeSensitiveRegexp(_) | r, ")|(?:") + ")"
122+
|
123+
name.regexpMatch(combinedRegexp)
124+
) and
125+
not name.regexpMatch(notSensitiveRegexp())
126+
}
127+
109128
/**
110129
* Holds if `name` may indicate the presence of sensitive data, and
111130
* `name` does not indicate that the data is in fact non-sensitive (for example since
@@ -115,6 +134,10 @@ module HeuristicNames {
115134
* That is, one of the regexps from `maybeSensitiveRegexp` matches `name` (with the
116135
* given classification), and none of the regexps from `notSensitiveRegexp` matches
117136
* `name`.
137+
*
138+
* When the set of names is large, it's worth using `nameIndicatesSensitiveData/1` as a first
139+
* pass, since that combines all the regexps into one, and should be faster. Then call this
140+
* predicate to get the classification(s).
118141
*/
119142
bindingset[name]
120143
predicate nameIndicatesSensitiveData(string name, SensitiveDataClassification classification) {

0 commit comments

Comments
 (0)