Merge pull request #16306 from github/nickrolfe/js-sensitive

nickrolfe · web-flow · commit af72c0848eb5 · 2024-04-24T09:49:44.000+01:00
JS: do fewer regexp matches in SensitiveActions
diff --git a/javascript/ql/lib/semmle/javascript/security/SensitiveActions.qll b/javascript/ql/lib/semmle/javascript/security/SensitiveActions.qll
@@ -86,39 +86,37 @@ private predicate writesProperty(DataFlow::Node node, string name) {
 
 /** A write to a variable or property that might contain sensitive data. */
 private class BasicSensitiveWrite extends SensitiveWrite {
-  SensitiveDataClassification classification;
+  string name;
 
   BasicSensitiveWrite() {
-    exists(string name |
-      /*
-       * PERFORMANCE OPTIMISATION:
-       * `nameIndicatesSensitiveData` performs a `regexpMatch` on `name`.
-       * To carry out a regex match, we must first compute the Cartesian product
-       * of all possible `name`s and regexes, then match.
-       * To keep this product as small as possible,
-       * we want to filter `name` as much as possible before the product.
-       *
-       * Do this by factoring out a helper predicate containing the filtering
-       * logic that restricts `name`. This helper predicate will get picked first
-       * in the join order, since it is the only call here that binds `name`.
-       */
-
-      writesProperty(this, name) and
-      nameIndicatesSensitiveData(name, classification)
-    )
+    /*
+     * PERFORMANCE OPTIMISATION:
+     * `nameIndicatesSensitiveData` performs a `regexpMatch` on `name`.
+     * To carry out a regex match, we must first compute the Cartesian product
+     * of all possible `name`s and regexes, then match.
+     * To keep this product as small as possible,
+     * we want to filter `name` as much as possible before the product.
+     *
+     * Do this by factoring out a helper predicate containing the filtering
+     * logic that restricts `name`. This helper predicate will get picked first
+     * in the join order, since it is the only call here that binds `name`.
+     */
+
+    writesProperty(this, name) and
+    nameIndicatesSensitiveData(name)
   }
 
   /** Gets a classification of the kind of sensitive data the write might handle. */
-  SensitiveDataClassification getClassification() { result = classification }
+  SensitiveDataClassification getClassification() { nameIndicatesSensitiveData(name, result) }
 }
 
 /** An access to a variable or property that might contain sensitive data. */
 private class BasicSensitiveVariableAccess extends SensitiveVariableAccess {
-  SensitiveDataClassification classification;
-
-  BasicSensitiveVariableAccess() { nameIndicatesSensitiveData(name, classification) }
+  BasicSensitiveVariableAccess() { nameIndicatesSensitiveData(name) }
 
-  override SensitiveDataClassification getClassification() { result = classification }
+  override SensitiveDataClassification getClassification() {
+    nameIndicatesSensitiveData(name, result)
+  }
 }
 
 /** A function name that suggests it may be sensitive. */
@@ -138,11 +136,11 @@ abstract class SensitiveDataFunctionName extends SensitiveFunctionName {
 
 /** A method that might return sensitive data, based on the name. */
 class CredentialsFunctionName extends SensitiveDataFunctionName {
-  SensitiveDataClassification classification;
-
-  CredentialsFunctionName() { nameIndicatesSensitiveData(this, classification) }
+  CredentialsFunctionName() { nameIndicatesSensitiveData(this) }
 
-  override SensitiveDataClassification getClassification() { result = classification }
+  override SensitiveDataClassification getClassification() {
+    nameIndicatesSensitiveData(this, result)
+  }
 }
 
 /**
diff --git a/javascript/ql/lib/semmle/javascript/security/internal/SensitiveDataHeuristics.qll b/javascript/ql/lib/semmle/javascript/security/internal/SensitiveDataHeuristics.qll
@@ -106,6 +106,25 @@ module HeuristicNames {
       "(?is).*([^\\w$.-]|redact|censor|obfuscate|hash|md5|sha|random|((?<!un)(en))?(crypt|(?<!pass)code)|certain|concert|secretar|accountant|accountab).*"
   }
 
+  /**
+   * Holds if `name` may indicate the presence of sensitive data, and `name` does not indicate that
+   * the data is in fact non-sensitive (for example since it is hashed or encrypted).
+   *
+   * That is, one of the regexps from `maybeSensitiveRegexp` matches `name` (for any
+   * classification), and none of the regexps from `notSensitiveRegexp` matches `name`.
+   */
+  bindingset[name]
+  predicate nameIndicatesSensitiveData(string name) {
+    exists(string combinedRegexp |
+      // Combine all the maybe-sensitive regexps into one using non-capturing groups and |.
+      combinedRegexp =
+        "(?:" + strictconcat(string r | r = maybeSensitiveRegexp(_) | r, ")|(?:") + ")"
+    |
+      name.regexpMatch(combinedRegexp)
+    ) and
+    not name.regexpMatch(notSensitiveRegexp())
+  }
+
   /**
    * Holds if `name` may indicate the presence of sensitive data, and
    * `name` does not indicate that the data is in fact non-sensitive (for example since
@@ -115,6 +134,10 @@ module HeuristicNames {
    * That is, one of the regexps from `maybeSensitiveRegexp` matches `name` (with the
    * given classification), and none of the regexps from `notSensitiveRegexp` matches
    * `name`.
+   *
+   * When the set of names is large, it's worth using `nameIndicatesSensitiveData/1` as a first
+   * pass, since that combines all the regexps into one, and should be faster. Then call this
+   * predicate to get the classification(s).
    */
   bindingset[name]
   predicate nameIndicatesSensitiveData(string name, SensitiveDataClassification classification) {
diff --git a/python/ql/lib/semmle/python/security/internal/SensitiveDataHeuristics.qll b/python/ql/lib/semmle/python/security/internal/SensitiveDataHeuristics.qll
@@ -106,6 +106,25 @@ module HeuristicNames {
       "(?is).*([^\\w$.-]|redact|censor|obfuscate|hash|md5|sha|random|((?<!un)(en))?(crypt|(?<!pass)code)|certain|concert|secretar|accountant|accountab).*"
   }
 
+  /**
+   * Holds if `name` may indicate the presence of sensitive data, and `name` does not indicate that
+   * the data is in fact non-sensitive (for example since it is hashed or encrypted).
+   *
+   * That is, one of the regexps from `maybeSensitiveRegexp` matches `name` (for any
+   * classification), and none of the regexps from `notSensitiveRegexp` matches `name`.
+   */
+  bindingset[name]
+  predicate nameIndicatesSensitiveData(string name) {
+    exists(string combinedRegexp |
+      // Combine all the maybe-sensitive regexps into one using non-capturing groups and |.
+      combinedRegexp =
+        "(?:" + strictconcat(string r | r = maybeSensitiveRegexp(_) | r, ")|(?:") + ")"
+    |
+      name.regexpMatch(combinedRegexp)
+    ) and
+    not name.regexpMatch(notSensitiveRegexp())
+  }
+
   /**
    * Holds if `name` may indicate the presence of sensitive data, and
    * `name` does not indicate that the data is in fact non-sensitive (for example since
@@ -115,6 +134,10 @@ module HeuristicNames {
    * That is, one of the regexps from `maybeSensitiveRegexp` matches `name` (with the
    * given classification), and none of the regexps from `notSensitiveRegexp` matches
    * `name`.
+   *
+   * When the set of names is large, it's worth using `nameIndicatesSensitiveData/1` as a first
+   * pass, since that combines all the regexps into one, and should be faster. Then call this
+   * predicate to get the classification(s).
    */
   bindingset[name]
   predicate nameIndicatesSensitiveData(string name, SensitiveDataClassification classification) {
diff --git a/ruby/ql/lib/codeql/ruby/security/internal/SensitiveDataHeuristics.qll b/ruby/ql/lib/codeql/ruby/security/internal/SensitiveDataHeuristics.qll
@@ -106,6 +106,25 @@ module HeuristicNames {
       "(?is).*([^\\w$.-]|redact|censor|obfuscate|hash|md5|sha|random|((?<!un)(en))?(crypt|(?<!pass)code)|certain|concert|secretar|accountant|accountab).*"
   }
 
+  /**
+   * Holds if `name` may indicate the presence of sensitive data, and `name` does not indicate that
+   * the data is in fact non-sensitive (for example since it is hashed or encrypted).
+   *
+   * That is, one of the regexps from `maybeSensitiveRegexp` matches `name` (for any
+   * classification), and none of the regexps from `notSensitiveRegexp` matches `name`.
+   */
+  bindingset[name]
+  predicate nameIndicatesSensitiveData(string name) {
+    exists(string combinedRegexp |
+      // Combine all the maybe-sensitive regexps into one using non-capturing groups and |.
+      combinedRegexp =
+        "(?:" + strictconcat(string r | r = maybeSensitiveRegexp(_) | r, ")|(?:") + ")"
+    |
+      name.regexpMatch(combinedRegexp)
+    ) and
+    not name.regexpMatch(notSensitiveRegexp())
+  }
+
   /**
    * Holds if `name` may indicate the presence of sensitive data, and
    * `name` does not indicate that the data is in fact non-sensitive (for example since
@@ -115,6 +134,10 @@ module HeuristicNames {
    * That is, one of the regexps from `maybeSensitiveRegexp` matches `name` (with the
    * given classification), and none of the regexps from `notSensitiveRegexp` matches
    * `name`.
+   *
+   * When the set of names is large, it's worth using `nameIndicatesSensitiveData/1` as a first
+   * pass, since that combines all the regexps into one, and should be faster. Then call this
+   * predicate to get the classification(s).
    */
   bindingset[name]
   predicate nameIndicatesSensitiveData(string name, SensitiveDataClassification classification) {
diff --git a/swift/ql/lib/codeql/swift/security/internal/SensitiveDataHeuristics.qll b/swift/ql/lib/codeql/swift/security/internal/SensitiveDataHeuristics.qll
@@ -106,6 +106,25 @@ module HeuristicNames {
       "(?is).*([^\\w$.-]|redact|censor|obfuscate|hash|md5|sha|random|((?<!un)(en))?(crypt|(?<!pass)code)|certain|concert|secretar|accountant|accountab).*"
   }
 
+  /**
+   * Holds if `name` may indicate the presence of sensitive data, and `name` does not indicate that
+   * the data is in fact non-sensitive (for example since it is hashed or encrypted).
+   *
+   * That is, one of the regexps from `maybeSensitiveRegexp` matches `name` (for any
+   * classification), and none of the regexps from `notSensitiveRegexp` matches `name`.
+   */
+  bindingset[name]
+  predicate nameIndicatesSensitiveData(string name) {
+    exists(string combinedRegexp |
+      // Combine all the maybe-sensitive regexps into one using non-capturing groups and |.
+      combinedRegexp =
+        "(?:" + strictconcat(string r | r = maybeSensitiveRegexp(_) | r, ")|(?:") + ")"
+    |
+      name.regexpMatch(combinedRegexp)
+    ) and
+    not name.regexpMatch(notSensitiveRegexp())
+  }
+
   /**
    * Holds if `name` may indicate the presence of sensitive data, and
    * `name` does not indicate that the data is in fact non-sensitive (for example since
@@ -115,6 +134,10 @@ module HeuristicNames {
    * That is, one of the regexps from `maybeSensitiveRegexp` matches `name` (with the
    * given classification), and none of the regexps from `notSensitiveRegexp` matches
    * `name`.
+   *
+   * When the set of names is large, it's worth using `nameIndicatesSensitiveData/1` as a first
+   * pass, since that combines all the regexps into one, and should be faster. Then call this
+   * predicate to get the classification(s).
    */
   bindingset[name]
   predicate nameIndicatesSensitiveData(string name, SensitiveDataClassification classification) {