From 25fbbeb076cf06958f0b33254c80cd311ecc1821 Mon Sep 17 00:00:00 2001 From: Adam Douglass Date: Thu, 27 Feb 2025 16:39:51 +0000 Subject: [PATCH 1/6] minor cleanup --- assemblyline/datastore/collection.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/assemblyline/datastore/collection.py b/assemblyline/datastore/collection.py index cb0838443..43ebfd3a2 100644 --- a/assemblyline/datastore/collection.py +++ b/assemblyline/datastore/collection.py @@ -15,7 +15,7 @@ from datetime import datetime from enum import Enum from os import environ -from typing import Dict, Any, Union, TypeVar, Generic +from typing import Dict, Any, Union, TypeVar, Generic, Optional import elasticsearch import elasticsearch.helpers @@ -222,8 +222,8 @@ def __init__(self, datastore: ESStore, name, model_class=None, validate=True, ar if field.store: self.stored_fields[name] = field - def is_archive_index(self, index): - return self.archive_name and index.startswith(self.archive_name) + def is_archive_index(self, index) -> bool: + return bool(self.archive_name and index.startswith(self.archive_name)) def get_index_list(self, index_type): # Default value @@ -2032,17 +2032,17 @@ def __get_possible_fields(self, field): return field_types - def _check_fields(self, model=None): + def _check_fields(self, target_model: Optional[odm.Model] = None): if not self.validate: return - if model is None: + if target_model is None: if self.model_class: return self._check_fields(self.model_class) return fields = self.fields() - model = self.model_class.flat_fields(skip_mappings=True) + model = target_model.flat_fields(skip_mappings=True) missing = set(model.keys()) - set(fields.keys()) if missing: @@ -2071,7 +2071,7 @@ def _ensure_collection(self): index = f"{alias}_hot" # Create HOT index if not self.with_retries(self.datastore.client.indices.exists, index=alias): - log.debug(f"Index {alias.upper()} does not exists. Creating it now...") + log.debug("Index %s does not exists. 
Creating it now...", alias.upper()) try: self.with_retries(self.datastore.client.indices.create, index=index, mappings=self._get_index_mappings(), @@ -2079,7 +2079,7 @@ def _ensure_collection(self): except elasticsearch.exceptions.RequestError as e: if "resource_already_exists_exception" not in str(e): raise - log.warning(f"Tried to create an index template that already exists: {alias.upper()}") + log.warning("Tried to create an index template that already exists: %s", alias.upper()) self.with_retries(self.datastore.client.indices.put_alias, index=index, name=alias) elif not self.with_retries(self.datastore.client.indices.exists, index=index) and \ From 24f728f64bfc6ca35697fd59387f1edb55814624 Mon Sep 17 00:00:00 2001 From: Adam Douglass Date: Thu, 27 Feb 2025 18:00:50 +0000 Subject: [PATCH 2/6] support wildcard and long the way we want --- assemblyline/datastore/support/build.py | 8 ++- assemblyline/odm/__init__.py | 7 ++- assemblyline/odm/base.py | 61 +++++++++--------- test/test_odm_mapping.py | 83 +++++++++++++++++++++++++ 4 files changed, 126 insertions(+), 33 deletions(-) create mode 100644 test/test_odm_mapping.py diff --git a/assemblyline/datastore/support/build.py b/assemblyline/datastore/support/build.py index 4f713100b..1b0af6f5c 100644 --- a/assemblyline/datastore/support/build.py +++ b/assemblyline/datastore/support/build.py @@ -1,5 +1,5 @@ from assemblyline.odm.base import _Field -from assemblyline.odm import Keyword, Text, List, Compound, Date, Integer, Long, \ +from assemblyline.odm import Keyword, Wildcard, Text, List, Compound, Date, Integer, Long, \ Float, Boolean, Mapping, Classification, Enum, Any, UUID, Optional, IP, Domain, URI, URIPath, MAC, PhoneNumber, \ SSDeepHash, SHA1, SHA256, MD5, Platform, Processor, ClassificationString, FlattenedObject, Email, UpperKeyword, \ Json, ValidatedKeyword, UNCPath @@ -7,6 +7,7 @@ # Simple types can be resolved by a direct mapping __type_mapping = { Keyword: 'keyword', + Wildcard: 'wildcard', Boolean: 
'boolean', Integer: 'integer', Long: 'long', @@ -111,6 +112,11 @@ def set_mapping(temp_field: _Field, body): "analyzer": __analyzer_mapping[field.__class__] }) + elif isinstance(field, Wildcard): + es_data_type = __type_mapping[field.__class__] + data = {'type': es_data_type} + mappings[name.strip(".")] = data + elif isinstance(field, Keyword): es_data_type = __type_mapping[field.__class__] data = {'type': es_data_type} diff --git a/assemblyline/odm/__init__.py b/assemblyline/odm/__init__.py index 88bef276a..6a99ec36c 100644 --- a/assemblyline/odm/__init__.py +++ b/assemblyline/odm/__init__.py @@ -5,7 +5,8 @@ # Imports that have the same effect as some part of the one above so that # type checking can use this file properly. -from assemblyline.odm.base import Keyword, Optional, Boolean, Integer, List, Compound, Mapping, Date, Long, Enum +from assemblyline.odm.base import Keyword, Optional, Boolean, Integer, List, Compound, Mapping, \ + Date, Long, Enum, Wildcard from datetime import datetime _InnerType = typing.TypeVar("_InnerType") @@ -27,6 +28,10 @@ def keyword(*args, **kwargs) -> str: return typing.cast(str, Keyword(*args, **kwargs)) +def wildcard(*args, **kwargs) -> str: + return typing.cast(str, Wildcard(*args, **kwargs)) + + def date(*args, **kwargs) -> datetime: return typing.cast(datetime, Date(*args, **kwargs)) diff --git a/assemblyline/odm/base.py b/assemblyline/odm/base.py index 6a3ff54ff..7b5928626 100644 --- a/assemblyline/odm/base.py +++ b/assemblyline/odm/base.py @@ -19,6 +19,7 @@ import sys import unicodedata from datetime import datetime +import typing from typing import Any as _Any from typing import Dict, Tuple, Union @@ -290,6 +291,31 @@ def check(self, value, **kwargs): return str(value) +class Wildcard(Keyword): + """ + A keyword with enhanced indexing to support more complex queries. 
+ """ + + def check(self, value, **kwargs): + if self.optional and value is None: + return None + + # We have a special case for bytes here due to how often strings and bytes + # get mixed up in python apis + if isinstance(value, bytes): + raise ValueError(f"[{self.name or self.parent_name}] Keyword doesn't accept bytes values") + + if value == '' or value is None: + if self.default_set: + value = self.default + else: + raise ValueError(f"[{self.name or self.parent_name}] Empty strings are not allowed without defaults") + + if value is None: + return None + + return str(value) + class EmptyableKeyword(_Field): """ A keyword which allow to differentiate between empty and None values. @@ -638,9 +664,9 @@ def check(self, value, **kwargs): class Integer(_Field): - """A field storing an integer value.""" + """A field storing a signed 32 bit integer value.""" - def __init__(self, max: int = None, min: int = None, *args, **kwargs): + def __init__(self, max: typing.Optional[int] = None, min: typing.Optional[int] = None, *args, **kwargs): super().__init__(*args, **kwargs) self.max = max self.min = min @@ -668,35 +694,8 @@ def check(self, value, **kwargs): return ret_val -class Long(_Field): - """A field storing an integer value.""" - - def __init__(self, max: int = None, min: int = None, *args, **kwargs): - super().__init__(*args, **kwargs) - self.max = max - self.min = min - - def check(self, value, **kwargs): - if self.optional and value is None: - return None - - if value is None or value == "": - if self.default_set: - ret_val = self.default - else: - raise ValueError(f"[{self.name or self.parent_name}] No value provided and no default value set.") - else: - ret_val = int(value) - - # Test min/max - if self.max is not None and ret_val > self.max: - raise ValueError( - f"[{self.name or self.parent_name}] Value bigger then the max value. 
({value} > {self.max})") - if self.min is not None and ret_val < self.min: - raise ValueError( - f"[{self.name or self.parent_name}] Value smaller then the min value. ({value} < {self.max})") - - return ret_val +class Long(Integer): + """A field storing a signed 64 bit integer value.""" class Float(_Field): diff --git a/test/test_odm_mapping.py b/test/test_odm_mapping.py new file mode 100644 index 000000000..236047163 --- /dev/null +++ b/test/test_odm_mapping.py @@ -0,0 +1,83 @@ +from assemblyline import odm +from assemblyline.datastore.collection import ESCollection +from assemblyline.datastore.support.build import build_mapping + + +@odm.model(index=True) +class OdmTestMapping1(odm.Model): + stable_text_field = odm.keyword() + swapped_text_field = odm.keyword() + stable_number_field = odm.integer() + swapped_number_field = odm.integer() + + +@odm.model(index=True) +class OdmTestMapping2(odm.Model): + stable_text_field = odm.keyword() + swapped_text_field = odm.wildcard() + stable_number_field = odm.integer() + swapped_number_field = odm.long() + + +def test_example_mapping_type(): + """Test that the example models produce the expected mapping types""" + properties, dynamic = build_mapping(OdmTestMapping1.fields().values()) + + # There should be no dynamic mappings, just one rule forbidding implicit mappings + assert len(dynamic) == 1 + assert 'refuse_all_implicit_mappings' in dynamic[0] + + # Check that the static fields have the mapping type we want + assert len(properties) == 4 + assert properties['stable_text_field']['type'] == 'keyword' + assert properties['swapped_text_field']['type'] == 'keyword' + assert properties['stable_number_field']['type'] == 'integer' + assert properties['swapped_number_field']['type'] == 'integer' + + properties, dynamic = build_mapping(OdmTestMapping2.fields().values()) + + # There should be no dynamic mappings, just one rule forbidding implicit mappings + assert len(dynamic) == 1 + assert 'refuse_all_implicit_mappings' in 
dynamic[0] + + # Check that the static fields have the mapping type we want + assert len(properties) == 4 + assert properties['stable_text_field']['type'] == 'keyword' + assert properties['swapped_text_field']['type'] == 'wildcard' + assert properties['stable_number_field']['type'] == 'integer' + assert properties['swapped_number_field']['type'] == 'long' + + +def test_field_upgrade_ok(datastore_connection): + """Test that changing a field from keyword to wildcard doesn't break anything.""" + # Clean up from any previous runs + collection = ESCollection(datastore_connection.ds, "testmapping", OdmTestMapping1, validate=False) + collection.wipe(recreate=False) + + # Create the collection in elastic + collection = ESCollection(datastore_connection.ds, "testmapping", OdmTestMapping1, validate=True) + properties = collection.fields() + assert properties['stable_text_field']['type'] == 'keyword' + assert properties['swapped_text_field']['type'] == 'keyword' + assert properties['stable_number_field']['type'] == 'integer' + assert properties['swapped_number_field']['type'] == 'integer' + + # Open that same collection using the new mapping + collection = ESCollection(datastore_connection.ds, "testmapping", OdmTestMapping2, validate=True) + + # Check that the fields haven't changed + properties = collection.fields() + assert properties['stable_text_field']['type'] == 'keyword' + assert properties['swapped_text_field']['type'] == 'keyword' + assert properties['stable_number_field']['type'] == 'integer' + assert properties['swapped_number_field']['type'] == 'integer' + + # Reindex + collection.reindex() + + # Check that the fields match the new model + properties = collection.fields() + assert properties['stable_text_field']['type'] == 'keyword' + assert properties['swapped_text_field']['type'] == 'wildcard' + assert properties['stable_number_field']['type'] == 'integer' + assert properties['swapped_number_field']['type'] == 'long' From 27ba72975d4e7715de906acfb41a318882bd2606 
Mon Sep 17 00:00:00 2001 From: Adam Douglass Date: Thu, 27 Feb 2025 18:08:13 +0000 Subject: [PATCH 3/6] switch file sizes to long --- assemblyline/odm/messages/task.py | 2 +- assemblyline/odm/models/alert.py | 2 +- assemblyline/odm/models/badlist.py | 2 +- assemblyline/odm/models/file.py | 2 +- assemblyline/odm/models/safelist.py | 2 +- assemblyline/odm/models/submission.py | 4 ++-- 6 files changed, 7 insertions(+), 7 deletions(-) diff --git a/assemblyline/odm/messages/task.py b/assemblyline/odm/messages/task.py index 27e63253e..253471b33 100644 --- a/assemblyline/odm/messages/task.py +++ b/assemblyline/odm/messages/task.py @@ -15,7 +15,7 @@ class FileInfo(odm.Model): mime = odm.Optional(odm.Keyword(), description="The libmagic mime type") sha1 = odm.SHA1(description="SHA1 hash of the file") sha256 = odm.SHA256(description="SHA256 hash of the file") - size = odm.Integer(description="Size of the file in bytes") + size = odm.long(description="Size of the file in bytes") ssdeep = odm.Optional(odm.SSDeepHash(description="SSDEEP hash of the file")) tlsh = odm.Optional(odm.Keyword(description="TLSH hash of the file")) type = odm.Keyword(description="Type of file as identified by Assemblyline") diff --git a/assemblyline/odm/models/alert.py b/assemblyline/odm/models/alert.py index affb0a8a6..904646cbf 100644 --- a/assemblyline/odm/models/alert.py +++ b/assemblyline/odm/models/alert.py @@ -115,7 +115,7 @@ class File(odm.Model): name = odm.Keyword(copyto="__text__", description="The original name of the file as submitted.") sha1 = odm.SHA1(copyto="__text__", description="The SHA1 hash of the file.") sha256 = odm.SHA256(copyto="__text__", description="The SHA256 hash of the file.") - size = odm.Integer(store=False, description="The size of the file in bytes.") + size = odm.long(store=False, description="The size of the file in bytes.") type = odm.Keyword(copyto="__text__", description=" The file type as identified by Assemblyline's analysis.") screenshots = 
odm.List(odm.Compound(Screenshot), default=[], description="Screenshots taken of the file during analysis, if applicable.") diff --git a/assemblyline/odm/models/badlist.py b/assemblyline/odm/models/badlist.py index 117f6c08b..a557bfd5e 100644 --- a/assemblyline/odm/models/badlist.py +++ b/assemblyline/odm/models/badlist.py @@ -30,7 +30,7 @@ class Hashes(odm.Model): class File(odm.Model): name = odm.List(odm.Keyword(store=True, copyto="__text__"), default=[], description="List of names seen for that file") - size = odm.Optional(odm.Integer(), description="Size of the file in bytes") + size = odm.Optional(odm.long(), description="Size of the file in bytes") type = odm.Optional(odm.Keyword(), description="Type of file as identified by Assemblyline") diff --git a/assemblyline/odm/models/file.py b/assemblyline/odm/models/file.py index aa433b086..1948a6e00 100644 --- a/assemblyline/odm/models/file.py +++ b/assemblyline/odm/models/file.py @@ -102,7 +102,7 @@ class File(odm.Model): seen = odm.Compound(Seen, default={}, description="Records the frequency and timestamps of when the file was encountered.", ai=False) sha1 = odm.SHA1(copyto="__text__", description="The SHA1 hash of the file, providing a more secure alternative to MD5 for integrity checks.", ai=False) sha256 = odm.SHA256(copyto="__text__", description="The SHA256 hash of the file, offering a high level of security for integrity verification.") - size = odm.Integer(description="Size of the file in bytes.") + size = odm.long(description="Size of the file in bytes.") ssdeep = odm.SSDeepHash(store=False, description="The fuzzy hash of the file using SSDEEP, which is useful for identifying similar files.", ai=False) type = odm.Keyword(copyto="__text__", description="The file type as determined by the AssemblyLine file type identification service.") tlsh = odm.Optional(odm.Keyword(copyto="__text__"), description="A locality-sensitive hash (TLSH) of the file's content, useful for similarity comparisons.", ai=False) 
diff --git a/assemblyline/odm/models/safelist.py b/assemblyline/odm/models/safelist.py index 0d6329efe..81900e8d0 100644 --- a/assemblyline/odm/models/safelist.py +++ b/assemblyline/odm/models/safelist.py @@ -17,7 +17,7 @@ class Hashes(odm.Model): class File(odm.Model): name = odm.List(odm.Keyword(store=True, copyto="__text__"), default=[], description="List of names seen for that file") - size = odm.Optional(odm.Integer(), description="Size of the file in bytes") + size = odm.Optional(odm.long(), description="Size of the file in bytes") type = odm.Optional(odm.Keyword(), description="Type of file as identified by Assemblyline") diff --git a/assemblyline/odm/models/submission.py b/assemblyline/odm/models/submission.py index 23615a4a0..66bceebb3 100644 --- a/assemblyline/odm/models/submission.py +++ b/assemblyline/odm/models/submission.py @@ -12,7 +12,7 @@ @odm.model(index=True, store=False, description="File Model of Submission") class File(odm.Model): name = odm.Keyword(copyto="__text__", description="Name of the file") - size = odm.Optional(odm.Integer(), description="Size of the file in bytes") + size = odm.Optional(odm.long(), description="Size of the file in bytes") sha256 = odm.SHA256(copyto="__text__", description="SHA256 hash of the file") @@ -142,7 +142,7 @@ class Submission(odm.Model): max_score = odm.Integer(description="Maximum score of all the files in the scan") metadata = odm.FlattenedObject(store=False, description="Metadata associated to the submission") params: SubmissionParams = odm.Compound(SubmissionParams, description="Submission parameter details", ai=False) - results: list[str] = odm.List(odm.Keyword(), store=False, description="List of result keys", ai=False) + results: list[str] = odm.List(odm.wildcard(), store=False, description="List of result keys", ai=False) sid: str = odm.UUID(copyto="__text__", description="Submission ID") state = odm.Enum(values=SUBMISSION_STATES, description="Status of the submission", ai=False) to_be_deleted = 
odm.Boolean( From f1815b5cb1ed298c3d30ee29ccf245eb5a7a4f09 Mon Sep 17 00:00:00 2001 From: Adam Douglass Date: Thu, 27 Feb 2025 19:16:10 +0000 Subject: [PATCH 4/6] also file size config --- assemblyline/odm/models/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/assemblyline/odm/models/config.py b/assemblyline/odm/models/config.py index b9d767a1f..25f3fced0 100644 --- a/assemblyline/odm/models/config.py +++ b/assemblyline/odm/models/config.py @@ -2014,7 +2014,7 @@ class Submission(odm.Model): emptyresult_dtl: int = odm.Integer(min=0, description="Number of days emptyresult will remain in the system") max_dtl: int = odm.Integer(min=0, description="Maximum number of days submissions will remain in the system") max_extraction_depth: int = odm.Integer(description="Maximum files extraction depth") - max_file_size: int = odm.Integer(description="Maximum size for files submitted in the system") + max_file_size: int = odm.long(description="Maximum size for files submitted in the system") max_metadata_length: int = odm.Integer(description="Maximum length for each metadata values") max_temp_data_length: int = odm.Integer(description="Maximum length for each temporary data values") metadata: MetadataConfig = odm.Compound(MetadataConfig, default=DEFAULT_METADATA_CONFIGURATION, From 9a6ec30dfa0610c5e73e40c8d090600dba90a0b5 Mon Sep 17 00:00:00 2001 From: Adam Douglass Date: Fri, 28 Feb 2025 15:34:06 +0000 Subject: [PATCH 5/6] remove model changes --- assemblyline/datastore/collection.py | 8 ++++---- assemblyline/odm/messages/task.py | 2 +- assemblyline/odm/models/alert.py | 2 +- assemblyline/odm/models/badlist.py | 2 +- assemblyline/odm/models/file.py | 2 +- assemblyline/odm/models/safelist.py | 2 +- assemblyline/odm/models/submission.py | 4 ++-- 7 files changed, 11 insertions(+), 11 deletions(-) diff --git a/assemblyline/datastore/collection.py b/assemblyline/datastore/collection.py index 43ebfd3a2..4f64dd78a 100644 --- 
a/assemblyline/datastore/collection.py +++ b/assemblyline/datastore/collection.py @@ -8,19 +8,19 @@ import typing import warnings -from copy import deepcopy -from assemblyline.common.isotime import now_as_iso -from datemath import dm -from datemath.helpers import DateMathException from datetime import datetime from enum import Enum from os import environ from typing import Dict, Any, Union, TypeVar, Generic, Optional +from copy import deepcopy +from datemath import dm +from datemath.helpers import DateMathException import elasticsearch import elasticsearch.helpers from assemblyline import odm +from assemblyline.common.isotime import now_as_iso from assemblyline.common.dict_utils import recursive_update from assemblyline.datastore.bulk import ElasticBulkPlan from assemblyline.datastore.exceptions import ( diff --git a/assemblyline/odm/messages/task.py b/assemblyline/odm/messages/task.py index 253471b33..27e63253e 100644 --- a/assemblyline/odm/messages/task.py +++ b/assemblyline/odm/messages/task.py @@ -15,7 +15,7 @@ class FileInfo(odm.Model): mime = odm.Optional(odm.Keyword(), description="The libmagic mime type") sha1 = odm.SHA1(description="SHA1 hash of the file") sha256 = odm.SHA256(description="SHA256 hash of the file") - size = odm.long(description="Size of the file in bytes") + size = odm.Integer(description="Size of the file in bytes") ssdeep = odm.Optional(odm.SSDeepHash(description="SSDEEP hash of the file")) tlsh = odm.Optional(odm.Keyword(description="TLSH hash of the file")) type = odm.Keyword(description="Type of file as identified by Assemblyline") diff --git a/assemblyline/odm/models/alert.py b/assemblyline/odm/models/alert.py index 904646cbf..affb0a8a6 100644 --- a/assemblyline/odm/models/alert.py +++ b/assemblyline/odm/models/alert.py @@ -115,7 +115,7 @@ class File(odm.Model): name = odm.Keyword(copyto="__text__", description="The original name of the file as submitted.") sha1 = odm.SHA1(copyto="__text__", description="The SHA1 hash of the file.") 
sha256 = odm.SHA256(copyto="__text__", description="The SHA256 hash of the file.") - size = odm.long(store=False, description="The size of the file in bytes.") + size = odm.Integer(store=False, description="The size of the file in bytes.") type = odm.Keyword(copyto="__text__", description=" The file type as identified by Assemblyline's analysis.") screenshots = odm.List(odm.Compound(Screenshot), default=[], description="Screenshots taken of the file during analysis, if applicable.") diff --git a/assemblyline/odm/models/badlist.py b/assemblyline/odm/models/badlist.py index a557bfd5e..117f6c08b 100644 --- a/assemblyline/odm/models/badlist.py +++ b/assemblyline/odm/models/badlist.py @@ -30,7 +30,7 @@ class Hashes(odm.Model): class File(odm.Model): name = odm.List(odm.Keyword(store=True, copyto="__text__"), default=[], description="List of names seen for that file") - size = odm.Optional(odm.long(), description="Size of the file in bytes") + size = odm.Optional(odm.Integer(), description="Size of the file in bytes") type = odm.Optional(odm.Keyword(), description="Type of file as identified by Assemblyline") diff --git a/assemblyline/odm/models/file.py b/assemblyline/odm/models/file.py index 1948a6e00..aa433b086 100644 --- a/assemblyline/odm/models/file.py +++ b/assemblyline/odm/models/file.py @@ -102,7 +102,7 @@ class File(odm.Model): seen = odm.Compound(Seen, default={}, description="Records the frequency and timestamps of when the file was encountered.", ai=False) sha1 = odm.SHA1(copyto="__text__", description="The SHA1 hash of the file, providing a more secure alternative to MD5 for integrity checks.", ai=False) sha256 = odm.SHA256(copyto="__text__", description="The SHA256 hash of the file, offering a high level of security for integrity verification.") - size = odm.long(description="Size of the file in bytes.") + size = odm.Integer(description="Size of the file in bytes.") ssdeep = odm.SSDeepHash(store=False, description="The fuzzy hash of the file using SSDEEP, 
which is useful for identifying similar files.", ai=False) type = odm.Keyword(copyto="__text__", description="The file type as determined by the AssemblyLine file type identification service.") tlsh = odm.Optional(odm.Keyword(copyto="__text__"), description="A locality-sensitive hash (TLSH) of the file's content, useful for similarity comparisons.", ai=False) diff --git a/assemblyline/odm/models/safelist.py b/assemblyline/odm/models/safelist.py index 81900e8d0..0d6329efe 100644 --- a/assemblyline/odm/models/safelist.py +++ b/assemblyline/odm/models/safelist.py @@ -17,7 +17,7 @@ class Hashes(odm.Model): class File(odm.Model): name = odm.List(odm.Keyword(store=True, copyto="__text__"), default=[], description="List of names seen for that file") - size = odm.Optional(odm.long(), description="Size of the file in bytes") + size = odm.Optional(odm.Integer(), description="Size of the file in bytes") type = odm.Optional(odm.Keyword(), description="Type of file as identified by Assemblyline") diff --git a/assemblyline/odm/models/submission.py b/assemblyline/odm/models/submission.py index 66bceebb3..23615a4a0 100644 --- a/assemblyline/odm/models/submission.py +++ b/assemblyline/odm/models/submission.py @@ -12,7 +12,7 @@ @odm.model(index=True, store=False, description="File Model of Submission") class File(odm.Model): name = odm.Keyword(copyto="__text__", description="Name of the file") - size = odm.Optional(odm.long(), description="Size of the file in bytes") + size = odm.Optional(odm.Integer(), description="Size of the file in bytes") sha256 = odm.SHA256(copyto="__text__", description="SHA256 hash of the file") @@ -142,7 +142,7 @@ class Submission(odm.Model): max_score = odm.Integer(description="Maximum score of all the files in the scan") metadata = odm.FlattenedObject(store=False, description="Metadata associated to the submission") params: SubmissionParams = odm.Compound(SubmissionParams, description="Submission parameter details", ai=False) - results: list[str] = 
odm.List(odm.wildcard(), store=False, description="List of result keys", ai=False) + results: list[str] = odm.List(odm.Keyword(), store=False, description="List of result keys", ai=False) sid: str = odm.UUID(copyto="__text__", description="Submission ID") state = odm.Enum(values=SUBMISSION_STATES, description="Status of the submission", ai=False) to_be_deleted = odm.Boolean( From 4f8c6be0b7031e836eefb2af8595f94341d9e235 Mon Sep 17 00:00:00 2001 From: Adam Douglass Date: Wed, 5 Mar 2025 21:23:02 +0000 Subject: [PATCH 6/6] add changes to support metadata remapping --- assemblyline/datastore/collection.py | 2 +- assemblyline/datastore/support/build.py | 8 +++-- test/test_odm_mapping.py | 42 +++++++++++++++++++++++++ 3 files changed, 49 insertions(+), 3 deletions(-) diff --git a/assemblyline/datastore/collection.py b/assemblyline/datastore/collection.py index 4f64dd78a..8df4a8555 100644 --- a/assemblyline/datastore/collection.py +++ b/assemblyline/datastore/collection.py @@ -2004,7 +2004,7 @@ def _get_index_mappings(self) -> dict: if self.model_class: mappings['properties'], mappings['dynamic_templates'] = \ build_mapping(self.model_class.fields().values()) - mappings['dynamic_templates'].insert(0, default_dynamic_strings) + mappings['dynamic_templates'].append(default_dynamic_strings) else: mappings['dynamic_templates'] = deepcopy(default_dynamic_templates) diff --git a/assemblyline/datastore/support/build.py b/assemblyline/datastore/support/build.py index 1b0af6f5c..2f377a877 100644 --- a/assemblyline/datastore/support/build.py +++ b/assemblyline/datastore/support/build.py @@ -204,14 +204,18 @@ def build_templates(name, field, nested_template=False, index=True) -> list: return [{f"nested_{name}": main_template}] else: + mapping = __type_mapping[field.__class__] field_template = { "path_match": name, "mapping": { - "type": __type_mapping[field.__class__], + "type": mapping, } } - field_template['mapping']['index'] = field.index + # Wildcard doesn't support setting 
index, its _always_ indexed + if mapping != 'wildcard': + field_template['mapping']['index'] = field.index + if field.copyto: assert len(field.copyto) == 1 field_template['mapping']['copy_to'] = field.copyto[0] diff --git a/test/test_odm_mapping.py b/test/test_odm_mapping.py index 236047163..425a6cee8 100644 --- a/test/test_odm_mapping.py +++ b/test/test_odm_mapping.py @@ -81,3 +81,45 @@ def test_field_upgrade_ok(datastore_connection): assert properties['swapped_text_field']['type'] == 'wildcard' assert properties['stable_number_field']['type'] == 'integer' assert properties['swapped_number_field']['type'] == 'long' + + +def test_metadata_indexing(datastore_connection): + + @odm.model(index=True) + class TestMapping(odm.Model): + metadata = odm.Mapping(odm.wildcard(copyto='__text__')) + + # Clean up from any previous runs + collection = ESCollection(datastore_connection.ds, "test_metadata_indexing", TestMapping, validate=False) + collection.wipe(recreate=False) + + print(build_mapping(TestMapping.fields().values())) + + # Create with new mapping configuration + collection = ESCollection(datastore_connection.ds, "test_metadata_indexing", TestMapping, validate=True) + + # Insert data to trigger dynamic field creation + collection.save("1", {"metadata": {'field1': 123}}) + collection.save("2", {"metadata": {'field2': "123"}}) + collection.save("3", {"metadata": {'field3': {'subfield': "cat dog cat"}}}) + collection.save("4", {"metadata": {'address': "https://cyber.gc.ca"}}) + collection.commit() + + # Check if those fields are the type and config we want + fields = collection.fields() + fields.pop('id') + + assert len(fields) == 4 + for field_name, field in fields.items(): + assert field['type'] == 'wildcard', (field_name, field) + assert field['indexed'] + assert field['default'], (field_name, field) + + # Check that copyto and regex work + search = collection.search("cyber.gc.ca") + assert search['total'] == 1 + assert search['items'][0].id == "4" + + search = 
collection.search("address: /http[s]://cyber\\.(gc\\.ca|com)/") + assert search['total'] == 1 + assert search['items'][0].id == "4"