add inverted index for faster cpe search #92

muchdogesec · Jan 23, 2025 · 02b1c91 · 02b1c91
1 parent 5f79f39
commit 02b1c91
Show file tree

Hide file tree

Showing 3 changed files with 37 additions and 22 deletions.
diff --git a/vulmatch/server/arango_helpers.py b/vulmatch/server/arango_helpers.py
@@ -123,6 +123,7 @@
 
 CPE_RELATIONSHIP_TYPES = {"vulnerable-to": "exploits", "in-pattern": "relies-on"}
 CPE_REL_SORT_FIELDS = ["modified_descending", "modified_ascending", "created_descending", "created_ascending"]
+CPE_SORT_FIELDS = ['part_descending', 'part_ascending', 'vendor_descending', 'vendor_ascending', 'product_ascending', 'product_descending', 'version_ascending', 'version_descending']
 CVE_BUNDLE_TYPES = set([
   "vulnerability",
   "indicator",
@@ -176,7 +177,10 @@ def query_as_bool(self, key, default=True):
             return default
         return query_str.lower() == 'true'
 
-
+    @classmethod
+    def like_string(cls, string: str) -> str:
+        """Build an AQL ``LIKE`` pattern matching *string* as a literal substring.
+
+        ``%`` and ``_`` are wildcards in AQL ``LIKE`` and backslash is the
+        escape character, so all three are escaped before the value is
+        wrapped in ``%``. Without the escaping, a search term such as
+        ``windows_10`` would also match ``windows-10``.
+
+        Args:
+            string: Raw search term taken from the request query.
+
+        Returns:
+            The escaped term surrounded by ``%`` on both sides.
+        """
+        # Escape the backslash first so the escapes added below are not doubled.
+        escaped = string.replace('\\', '\\\\').replace('%', '\\%').replace('_', '\\_')
+        return '%' + escaped + '%'
+
     @classmethod
     def get_page_params(cls, request):
         kwargs = request.GET.copy()
@@ -524,29 +528,22 @@ def get_softwares(self):
                 "FILTER doc.id in @ids"
             )
 
-        if value := self.query_as_array('cpe_match_string'):
-            bind_vars['cpe_match_string'] = value
+        if value := self.query.get('cpe_match_string'):
+            bind_vars['cpe_match_string'] = self.like_string(value)
             filters.append(
-                "FILTER @cpe_match_string[? ANY FILTER CONTAINS(doc.cpe, CURRENT)]"
+                "FILTER doc.cpe LIKE @cpe_match_string"
             )
 
         struct_match = {}
         if value := self.query.get('product_type'):
             struct_match['part'] = value[0]
             filters.append('FILTER doc.x_cpe_struct.part == @struct_match.part')
 
-        if value := self.query.get('product'):
-            struct_match['product'] = value.lower()
-            filters.append('FILTER CONTAINS(doc.x_cpe_struct.product, @struct_match.product)')
-
-        if value := self.query.get('vendor'):
-            struct_match['vendor'] = value
-            filters.append('FILTER CONTAINS(doc.x_cpe_struct.vendor, @struct_match.vendor)')
 
-        for k in ['version', 'update', 'edition', 'language', 'sw_edition', 'target_sw', 'target_hw', 'other']:
+        for k in ['product', 'vendor', 'version', 'update', 'edition', 'language', 'sw_edition', 'target_sw', 'target_hw', 'other']:
             if v := self.query.get(k):
-                struct_match[k] = v
-                filters.append(f'FILTER CONTAINS(doc.x_cpe_struct.{k}, @struct_match.{k})')
+                struct_match[k] = self.like_string(v)
+                filters.append(f'FILTER doc.x_cpe_struct.{k} LIKE @struct_match.{k}')
 
         if struct_match:
             bind_vars['struct_match'] = struct_match
@@ -568,14 +565,17 @@ def get_softwares(self):
             filters.append('FILTER CONTAINS(doc.name, @name)')
 
         query = """
-            FOR doc in @@collection
-            FILTER doc.type == 'software' AND doc._is_latest
+            FOR doc in @@collection OPTIONS {indexHint: "cpe_search_inv", forceIndexHint: true}
+            FILTER doc.type == 'software' AND doc._is_latest == TRUE
             LET cve_matches = (FOR d in nvd_cve_edge_collection FILTER d._to == doc._id AND d.relationship_type IN ['exploits', 'relies-on'] RETURN [d.relationship_type, d.external_references[0].external_id])
 
             @filters
+            @sort_stmt
             LIMIT @offset, @count
             RETURN KEEP(doc, KEYS(doc, true))
-        """.replace('@filters', '\n'.join(filters))
+        """.replace('@filters', '\n'.join(filters))\
+            .replace('@sort_stmt', self.get_sort_stmt(CPE_SORT_FIELDS, doc_name='doc.x_cpe_struct'))
+        # return HttpResponse(f"""{query}\n// {json.dumps(bind_vars)}""")
         return self.execute_query(query, bind_vars=bind_vars)
 
     def get_relationships(self, docs_query, binds):

diff --git a/vulmatch/server/views.py b/vulmatch/server/views.py
@@ -2,7 +2,7 @@
 from django.shortcuts import render
 from rest_framework import viewsets, filters, status, decorators
 
-from vulmatch.server.arango_helpers import ATLAS_TYPES, CPE_REL_SORT_FIELDS, CPE_RELATIONSHIP_TYPES, CVE_BUNDLE_TYPES, CVE_SORT_FIELDS, LOCATION_TYPES, TLP_TYPES, ArangoDBHelper, ATTACK_TYPES, CWE_TYPES, SOFTWARE_TYPES, CAPEC_TYPES
+from vulmatch.server.arango_helpers import ATLAS_TYPES, CPE_REL_SORT_FIELDS, CPE_RELATIONSHIP_TYPES, CPE_SORT_FIELDS, CVE_BUNDLE_TYPES, CVE_SORT_FIELDS, LOCATION_TYPES, TLP_TYPES, ArangoDBHelper, ATTACK_TYPES, CWE_TYPES, SOFTWARE_TYPES, CAPEC_TYPES
 from vulmatch.server.autoschema import DEFAULT_400_ERROR
 from vulmatch.server.utils import Pagination, Response, Ordering, split_mitre_version
 from vulmatch.worker.tasks import new_task
@@ -394,7 +394,13 @@ class filterset_class(FilterSet):
         target_sw = CharFilter(help_text='Characterises the software computing environment within which the product operates (this is the 10th value in the CPE URI).')
         target_hw = CharFilter(help_text='Characterises the instruction set architecture (e.g., x86) on which the product being described or identified operates (this is the 11th value in the CPE URI).')
         other = CharFilter(help_text='Capture any other general descriptive or identifying information which is vendor- or product-specific and which does not logically fit in any other attribute value (this is the 12th value in the CPE URI).')
-
+
+
+    @extend_schema(
+            parameters=[
+                OpenApiParameter('sort', enum=CPE_SORT_FIELDS, description="Sort results by"),
+            ]
+    )
     @decorators.action(methods=['GET'], url_path="objects", detail=False)
     def list_objects(self, request, *args, **kwargs):
         return ArangoDBHelper('', request).get_softwares()

diff --git a/vulmatch/worker/populate_dbs.py b/vulmatch/worker/populate_dbs.py
@@ -34,9 +34,18 @@ def create_indexes(db: StandardDatabase):
     time = int(datetime.now().timestamp())
     for sorter in "created modified name cpe".split():
         vertex_collection.add_index(dict(type='persistent', fields=["type", "_is_latest", sorter], inBackground=True, name=f"vulmatch_cve_sort_{sorter}_{time}"))
-    vertex_collection.add_index(dict(type='persistent', fields=["cpe"], storedValues=["id"], inBackground=True, name=f"vulmatch_cpe"))
-    vertex_collection.add_index(dict(type='persistent', fields=["name"], inBackground=True, name=f"vulmatch_name"))    
-
+    vertex_collection.add_index(dict(type='persistent', fields=["cpe"], storedValues=["id"], inBackground=True, name=f"vulmatch_cpe", sparse=True))
+    vertex_collection.add_index(dict(type='persistent', fields=["type", "cpe"], storedValues=["id"], inBackground=True, name=f"vulmatch_type_cpe"))
+    vertex_collection.add_index(dict(type='persistent', fields=["name"], inBackground=True, name=f"vulmatch_name"))
+    db.create_analyzer('norm_en', analyzer_type='norm', properties={ "locale": "en", "accent": False, "case": "lower" })
+    vertex_collection.add_index(dict(type='inverted', name='cpe_search_inv', fields=[
+        dict(name='cpe', analyzer='norm_en'),
+        "id",
+        "type",
+        *[dict(name=f'x_cpe_struct.{name}', analyzer='norm_en') for name in ['product', 'vendor', 'version', 'update', 'edition', 'language', 'sw_edition', 'target_sw', 'target_hw', 'other']],
+        "x_cpe_struct.part"
+    ], inBackground=True))
+
 
 def create_collections():
     #create db/collections