From 25fbbeb076cf06958f0b33254c80cd311ecc1821 Mon Sep 17 00:00:00 2001
From: Adam Douglass <Adam.Douglass@cyber.gc.ca>
Date: Thu, 27 Feb 2025 16:39:51 +0000
Subject: [PATCH 1/6] minor cleanup

---
 assemblyline/datastore/collection.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/assemblyline/datastore/collection.py b/assemblyline/datastore/collection.py
index cb0838443..43ebfd3a2 100644
--- a/assemblyline/datastore/collection.py
+++ b/assemblyline/datastore/collection.py
@@ -15,7 +15,7 @@
 from datetime import datetime
 from enum import Enum
 from os import environ
-from typing import Dict, Any, Union, TypeVar, Generic
+from typing import Dict, Any, Union, TypeVar, Generic, Optional
 
 import elasticsearch
 import elasticsearch.helpers
@@ -222,8 +222,8 @@ def __init__(self, datastore: ESStore, name, model_class=None, validate=True, ar
                 if field.store:
                     self.stored_fields[name] = field
 
-    def is_archive_index(self, index):
-        return self.archive_name and index.startswith(self.archive_name)
+    def is_archive_index(self, index) -> bool:
+        return bool(self.archive_name and index.startswith(self.archive_name))
 
     def get_index_list(self, index_type):
         # Default value
@@ -2032,17 +2032,17 @@ def __get_possible_fields(self, field):
 
         return field_types
 
-    def _check_fields(self, model=None):
+    def _check_fields(self, target_model: Optional[odm.Model] = None):
         if not self.validate:
             return
 
-        if model is None:
+        if target_model is None:
             if self.model_class:
                 return self._check_fields(self.model_class)
             return
 
         fields = self.fields()
-        model = self.model_class.flat_fields(skip_mappings=True)
+        model = target_model.flat_fields(skip_mappings=True)
 
         missing = set(model.keys()) - set(fields.keys())
         if missing:
@@ -2071,7 +2071,7 @@ def _ensure_collection(self):
             index = f"{alias}_hot"
             # Create HOT index
             if not self.with_retries(self.datastore.client.indices.exists, index=alias):
-                log.debug(f"Index {alias.upper()} does not exists. Creating it now...")
+                log.debug("Index %s does not exists. Creating it now...", alias.upper())
                 try:
                     self.with_retries(self.datastore.client.indices.create, index=index,
                                       mappings=self._get_index_mappings(),
@@ -2079,7 +2079,7 @@ def _ensure_collection(self):
                 except elasticsearch.exceptions.RequestError as e:
                     if "resource_already_exists_exception" not in str(e):
                         raise
-                    log.warning(f"Tried to create an index template that already exists: {alias.upper()}")
+                    log.warning("Tried to create an index template that already exists: %s", alias.upper())
 
                 self.with_retries(self.datastore.client.indices.put_alias, index=index, name=alias)
             elif not self.with_retries(self.datastore.client.indices.exists, index=index) and \

From 24f728f64bfc6ca35697fd59387f1edb55814624 Mon Sep 17 00:00:00 2001
From: Adam Douglass <Adam.Douglass@cyber.gc.ca>
Date: Thu, 27 Feb 2025 18:00:50 +0000
Subject: [PATCH 2/6] support wildcard and long the way we want

---
 assemblyline/datastore/support/build.py |  8 ++-
 assemblyline/odm/__init__.py            |  7 ++-
 assemblyline/odm/base.py                | 61 +++++++++---------
 test/test_odm_mapping.py                | 83 +++++++++++++++++++++++++
 4 files changed, 126 insertions(+), 33 deletions(-)
 create mode 100644 test/test_odm_mapping.py

diff --git a/assemblyline/datastore/support/build.py b/assemblyline/datastore/support/build.py
index 4f713100b..1b0af6f5c 100644
--- a/assemblyline/datastore/support/build.py
+++ b/assemblyline/datastore/support/build.py
@@ -1,5 +1,5 @@
 from assemblyline.odm.base import _Field
-from assemblyline.odm import Keyword, Text, List, Compound, Date, Integer, Long, \
+from assemblyline.odm import Keyword, Wildcard, Text, List, Compound, Date, Integer, Long, \
     Float, Boolean, Mapping, Classification, Enum, Any, UUID, Optional, IP, Domain, URI, URIPath, MAC, PhoneNumber, \
     SSDeepHash, SHA1, SHA256, MD5, Platform, Processor, ClassificationString, FlattenedObject, Email, UpperKeyword, \
     Json, ValidatedKeyword, UNCPath
@@ -7,6 +7,7 @@
 # Simple types can be resolved by a direct mapping
 __type_mapping = {
     Keyword: 'keyword',
+    Wildcard: 'wildcard',
     Boolean: 'boolean',
     Integer: 'integer',
     Long: 'long',
@@ -111,6 +112,11 @@ def set_mapping(temp_field: _Field, body):
                 "analyzer": __analyzer_mapping[field.__class__]
             })
 
+        elif isinstance(field, Wildcard):
+            es_data_type = __type_mapping[field.__class__]
+            data = {'type': es_data_type}
+            mappings[name.strip(".")] = data
+
         elif isinstance(field, Keyword):
             es_data_type = __type_mapping[field.__class__]
             data = {'type': es_data_type}
diff --git a/assemblyline/odm/__init__.py b/assemblyline/odm/__init__.py
index 88bef276a..6a99ec36c 100644
--- a/assemblyline/odm/__init__.py
+++ b/assemblyline/odm/__init__.py
@@ -5,7 +5,8 @@
 
 # Imports that have the same effect as some part of the one above so that
 # type checking can use this file properly.
-from assemblyline.odm.base import Keyword, Optional, Boolean, Integer, List, Compound, Mapping, Date, Long, Enum
+from assemblyline.odm.base import Keyword, Optional, Boolean, Integer, List, Compound, Mapping, \
+    Date, Long, Enum, Wildcard
 from datetime import datetime
 
 _InnerType = typing.TypeVar("_InnerType")
@@ -27,6 +28,10 @@ def keyword(*args, **kwargs) -> str:
     return typing.cast(str, Keyword(*args, **kwargs))
 
 
+def wildcard(*args, **kwargs) -> str:
+    return typing.cast(str, Wildcard(*args, **kwargs))
+
+
 def date(*args, **kwargs) -> datetime:
     return typing.cast(datetime, Date(*args, **kwargs))
 
diff --git a/assemblyline/odm/base.py b/assemblyline/odm/base.py
index 6a3ff54ff..7b5928626 100644
--- a/assemblyline/odm/base.py
+++ b/assemblyline/odm/base.py
@@ -19,6 +19,7 @@
 import sys
 import unicodedata
 from datetime import datetime
+import typing
 from typing import Any as _Any
 from typing import Dict, Tuple, Union
 
@@ -290,6 +291,31 @@ def check(self, value, **kwargs):
         return str(value)
 
 
+class Wildcard(Keyword):
+    """A keyword mapped to the Elasticsearch ``wildcard`` type to support more complex queries."""
+
+    def check(self, value, **kwargs):
+        """Validate ``value`` and return it as a ``str`` (or ``None``); raise ``ValueError`` for bytes
+        values and for empty/missing values when no default is set."""
+        if self.optional and value is None:
+            return None
+
+        # Bytes are rejected explicitly because strings and bytes get mixed up so often
+        # in Python APIs; str(value) below would otherwise silently produce "b'...'"
+        if isinstance(value, bytes):
+            raise ValueError(f"[{self.name or self.parent_name}] Wildcard doesn't accept bytes values")
+
+        if value == '' or value is None:
+            if self.default_set:
+                value = self.default
+            else:
+                raise ValueError(f"[{self.name or self.parent_name}] Empty strings are not allowed without defaults")
+
+        if value is None:
+            return None
+
+        return str(value)
+
 class EmptyableKeyword(_Field):
     """
     A keyword which allow to differentiate between empty and None values.
@@ -638,9 +664,9 @@ def check(self, value, **kwargs):
 
 
 class Integer(_Field):
-    """A field storing an integer value."""
+    """A field storing a signed 32-bit integer value."""
 
-    def __init__(self, max: int = None, min: int = None, *args, **kwargs):
+    def __init__(self, max: typing.Optional[int] = None, min: typing.Optional[int] = None, *args, **kwargs):
         super().__init__(*args, **kwargs)
         self.max = max
         self.min = min
@@ -668,35 +694,8 @@ def check(self, value, **kwargs):
         return ret_val
 
 
-class Long(_Field):
-    """A field storing an integer value."""
-
-    def __init__(self, max: int = None, min: int = None, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        self.max = max
-        self.min = min
-
-    def check(self, value, **kwargs):
-        if self.optional and value is None:
-            return None
-
-        if value is None or value == "":
-            if self.default_set:
-                ret_val = self.default
-            else:
-                raise ValueError(f"[{self.name or self.parent_name}] No value provided and no default value set.")
-        else:
-            ret_val = int(value)
-
-        # Test min/max
-        if self.max is not None and ret_val > self.max:
-            raise ValueError(
-                f"[{self.name or self.parent_name}] Value bigger then the max value. ({value} > {self.max})")
-        if self.min is not None and ret_val < self.min:
-            raise ValueError(
-                f"[{self.name or self.parent_name}] Value smaller then the min value. ({value} < {self.max})")
-
-        return ret_val
+class Long(Integer):
+    """A field storing a signed 64-bit integer value."""
 
 
 class Float(_Field):
diff --git a/test/test_odm_mapping.py b/test/test_odm_mapping.py
new file mode 100644
index 000000000..236047163
--- /dev/null
+++ b/test/test_odm_mapping.py
@@ -0,0 +1,83 @@
+from assemblyline import odm
+from assemblyline.datastore.collection import ESCollection
+from assemblyline.datastore.support.build import build_mapping
+
+
+@odm.model(index=True)
+class OdmTestMapping1(odm.Model):
+    stable_text_field = odm.keyword()
+    swapped_text_field = odm.keyword()
+    stable_number_field = odm.integer()
+    swapped_number_field = odm.integer()
+
+
+@odm.model(index=True)
+class OdmTestMapping2(odm.Model):
+    stable_text_field = odm.keyword()
+    swapped_text_field = odm.wildcard()
+    stable_number_field = odm.integer()
+    swapped_number_field = odm.long()
+
+
+def test_example_mapping_type():
+    """Test that the example models produce the expected mapping types"""
+    properties, dynamic = build_mapping(OdmTestMapping1.fields().values())
+
+    # There should be no dynamic mappings, just one rule forbidding implicit mappings
+    assert len(dynamic) == 1
+    assert 'refuse_all_implicit_mappings' in dynamic[0]
+
+    # Check that the static fields have the mapping type we want
+    assert len(properties) == 4
+    assert properties['stable_text_field']['type'] == 'keyword'
+    assert properties['swapped_text_field']['type'] == 'keyword'
+    assert properties['stable_number_field']['type'] == 'integer'
+    assert properties['swapped_number_field']['type'] == 'integer'
+
+    properties, dynamic = build_mapping(OdmTestMapping2.fields().values())
+
+    # There should be no dynamic mappings, just one rule forbidding implicit mappings
+    assert len(dynamic) == 1
+    assert 'refuse_all_implicit_mappings' in dynamic[0]
+
+    # Check that the static fields have the mapping type we want
+    assert len(properties) == 4
+    assert properties['stable_text_field']['type'] == 'keyword'
+    assert properties['swapped_text_field']['type'] == 'wildcard'
+    assert properties['stable_number_field']['type'] == 'integer'
+    assert properties['swapped_number_field']['type'] == 'long'
+
+
+def test_field_upgrade_ok(datastore_connection):
+    """Test that changing a field from keyword to wildcard doesn't break anything."""
+    # Clean up from any previous runs
+    collection = ESCollection(datastore_connection.ds, "testmapping", OdmTestMapping1, validate=False)
+    collection.wipe(recreate=False)
+
+    # Create the collection in Elasticsearch
+    collection = ESCollection(datastore_connection.ds, "testmapping", OdmTestMapping1, validate=True)
+    properties = collection.fields()
+    assert properties['stable_text_field']['type'] == 'keyword'
+    assert properties['swapped_text_field']['type'] == 'keyword'
+    assert properties['stable_number_field']['type'] == 'integer'
+    assert properties['swapped_number_field']['type'] == 'integer'
+
+    # Open that same collection using the new mapping
+    collection = ESCollection(datastore_connection.ds, "testmapping", OdmTestMapping2, validate=True)
+
+    # Check that the fields haven't changed
+    properties = collection.fields()
+    assert properties['stable_text_field']['type'] == 'keyword'
+    assert properties['swapped_text_field']['type'] == 'keyword'
+    assert properties['stable_number_field']['type'] == 'integer'
+    assert properties['swapped_number_field']['type'] == 'integer'
+
+    # Reindex
+    collection.reindex()
+
+    # Check that the fields match the new model
+    properties = collection.fields()
+    assert properties['stable_text_field']['type'] == 'keyword'
+    assert properties['swapped_text_field']['type'] == 'wildcard'
+    assert properties['stable_number_field']['type'] == 'integer'
+    assert properties['swapped_number_field']['type'] == 'long'

From 27ba72975d4e7715de906acfb41a318882bd2606 Mon Sep 17 00:00:00 2001
From: Adam Douglass <Adam.Douglass@cyber.gc.ca>
Date: Thu, 27 Feb 2025 18:08:13 +0000
Subject: [PATCH 3/6] switch file sizes to long

---
 assemblyline/odm/messages/task.py     | 2 +-
 assemblyline/odm/models/alert.py      | 2 +-
 assemblyline/odm/models/badlist.py    | 2 +-
 assemblyline/odm/models/file.py       | 2 +-
 assemblyline/odm/models/safelist.py   | 2 +-
 assemblyline/odm/models/submission.py | 4 ++--
 6 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/assemblyline/odm/messages/task.py b/assemblyline/odm/messages/task.py
index 27e63253e..253471b33 100644
--- a/assemblyline/odm/messages/task.py
+++ b/assemblyline/odm/messages/task.py
@@ -15,7 +15,7 @@ class FileInfo(odm.Model):
     mime = odm.Optional(odm.Keyword(), description="The libmagic mime type")
     sha1 = odm.SHA1(description="SHA1 hash of the file")
     sha256 = odm.SHA256(description="SHA256 hash of the file")
-    size = odm.Integer(description="Size of the file in bytes")
+    size = odm.long(description="Size of the file in bytes")
     ssdeep = odm.Optional(odm.SSDeepHash(description="SSDEEP hash of the file"))
     tlsh = odm.Optional(odm.Keyword(description="TLSH hash of the file"))
     type = odm.Keyword(description="Type of file as identified by Assemblyline")
diff --git a/assemblyline/odm/models/alert.py b/assemblyline/odm/models/alert.py
index affb0a8a6..904646cbf 100644
--- a/assemblyline/odm/models/alert.py
+++ b/assemblyline/odm/models/alert.py
@@ -115,7 +115,7 @@ class File(odm.Model):
     name = odm.Keyword(copyto="__text__", description="The original name of the file as submitted.")
     sha1 = odm.SHA1(copyto="__text__", description="The SHA1 hash of the file.")
     sha256 = odm.SHA256(copyto="__text__", description="The SHA256 hash of the file.")
-    size = odm.Integer(store=False, description="The size of the file in bytes.")
+    size = odm.long(store=False, description="The size of the file in bytes.")
     type = odm.Keyword(copyto="__text__", description="	The file type as identified by Assemblyline's analysis.")
     screenshots = odm.List(odm.Compound(Screenshot), default=[], description="Screenshots taken of the file during analysis, if applicable.")
 
diff --git a/assemblyline/odm/models/badlist.py b/assemblyline/odm/models/badlist.py
index 117f6c08b..a557bfd5e 100644
--- a/assemblyline/odm/models/badlist.py
+++ b/assemblyline/odm/models/badlist.py
@@ -30,7 +30,7 @@ class Hashes(odm.Model):
 class File(odm.Model):
     name = odm.List(odm.Keyword(store=True, copyto="__text__"), default=[],
                     description="List of names seen for that file")
-    size = odm.Optional(odm.Integer(), description="Size of the file in bytes")
+    size = odm.Optional(odm.long(), description="Size of the file in bytes")
     type = odm.Optional(odm.Keyword(), description="Type of file as identified by Assemblyline")
 
 
diff --git a/assemblyline/odm/models/file.py b/assemblyline/odm/models/file.py
index aa433b086..1948a6e00 100644
--- a/assemblyline/odm/models/file.py
+++ b/assemblyline/odm/models/file.py
@@ -102,7 +102,7 @@ class File(odm.Model):
     seen = odm.Compound(Seen, default={}, description="Records the frequency and timestamps of when the file was encountered.", ai=False)
     sha1 = odm.SHA1(copyto="__text__", description="The SHA1 hash of the file, providing a more secure alternative to MD5 for integrity checks.", ai=False)
     sha256 = odm.SHA256(copyto="__text__", description="The SHA256 hash of the file, offering a high level of security for integrity verification.")
-    size = odm.Integer(description="Size of the file in bytes.")
+    size = odm.long(description="Size of the file in bytes.")
     ssdeep = odm.SSDeepHash(store=False, description="The fuzzy hash of the file using SSDEEP, which is useful for identifying similar files.", ai=False)
     type = odm.Keyword(copyto="__text__", description="The file type as determined by the AssemblyLine file type identification service.")
     tlsh = odm.Optional(odm.Keyword(copyto="__text__"), description="A locality-sensitive hash (TLSH) of the file's content, useful for similarity comparisons.", ai=False)
diff --git a/assemblyline/odm/models/safelist.py b/assemblyline/odm/models/safelist.py
index 0d6329efe..81900e8d0 100644
--- a/assemblyline/odm/models/safelist.py
+++ b/assemblyline/odm/models/safelist.py
@@ -17,7 +17,7 @@ class Hashes(odm.Model):
 class File(odm.Model):
     name = odm.List(odm.Keyword(store=True, copyto="__text__"), default=[],
                     description="List of names seen for that file")
-    size = odm.Optional(odm.Integer(), description="Size of the file in bytes")
+    size = odm.Optional(odm.long(), description="Size of the file in bytes")
     type = odm.Optional(odm.Keyword(), description="Type of file as identified by Assemblyline")
 
 
diff --git a/assemblyline/odm/models/submission.py b/assemblyline/odm/models/submission.py
index 23615a4a0..66bceebb3 100644
--- a/assemblyline/odm/models/submission.py
+++ b/assemblyline/odm/models/submission.py
@@ -12,7 +12,7 @@
 @odm.model(index=True, store=False, description="File Model of Submission")
 class File(odm.Model):
     name = odm.Keyword(copyto="__text__", description="Name of the file")
-    size = odm.Optional(odm.Integer(), description="Size of the file in bytes")
+    size = odm.Optional(odm.long(), description="Size of the file in bytes")
     sha256 = odm.SHA256(copyto="__text__", description="SHA256 hash of the file")
 
 
@@ -142,7 +142,7 @@ class Submission(odm.Model):
     max_score = odm.Integer(description="Maximum score of all the files in the scan")
     metadata = odm.FlattenedObject(store=False, description="Metadata associated to the submission")
     params: SubmissionParams = odm.Compound(SubmissionParams, description="Submission parameter details", ai=False)
-    results: list[str] = odm.List(odm.Keyword(), store=False, description="List of result keys", ai=False)
+    results: list[str] = odm.List(odm.wildcard(), store=False, description="List of result keys", ai=False)
     sid: str = odm.UUID(copyto="__text__", description="Submission ID")
     state = odm.Enum(values=SUBMISSION_STATES, description="Status of the submission", ai=False)
     to_be_deleted = odm.Boolean(

From f1815b5cb1ed298c3d30ee29ccf245eb5a7a4f09 Mon Sep 17 00:00:00 2001
From: Adam Douglass <Adam.Douglass@cyber.gc.ca>
Date: Thu, 27 Feb 2025 19:16:10 +0000
Subject: [PATCH 4/6] also file size config

---
 assemblyline/odm/models/config.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/assemblyline/odm/models/config.py b/assemblyline/odm/models/config.py
index b9d767a1f..25f3fced0 100644
--- a/assemblyline/odm/models/config.py
+++ b/assemblyline/odm/models/config.py
@@ -2014,7 +2014,7 @@ class Submission(odm.Model):
     emptyresult_dtl:  int = odm.Integer(min=0, description="Number of days emptyresult will remain in the system")
     max_dtl: int = odm.Integer(min=0, description="Maximum number of days submissions will remain in the system")
     max_extraction_depth: int = odm.Integer(description="Maximum files extraction depth")
-    max_file_size: int = odm.Integer(description="Maximum size for files submitted in the system")
+    max_file_size: int = odm.long(description="Maximum size for files submitted in the system")
     max_metadata_length: int = odm.Integer(description="Maximum length for each metadata values")
     max_temp_data_length: int = odm.Integer(description="Maximum length for each temporary data values")
     metadata: MetadataConfig = odm.Compound(MetadataConfig, default=DEFAULT_METADATA_CONFIGURATION,

From 9a6ec30dfa0610c5e73e40c8d090600dba90a0b5 Mon Sep 17 00:00:00 2001
From: Adam Douglass <Adam.Douglass@cyber.gc.ca>
Date: Fri, 28 Feb 2025 15:34:06 +0000
Subject: [PATCH 5/6] remove model changes

---
 assemblyline/datastore/collection.py  | 8 ++++----
 assemblyline/odm/messages/task.py     | 2 +-
 assemblyline/odm/models/alert.py      | 2 +-
 assemblyline/odm/models/badlist.py    | 2 +-
 assemblyline/odm/models/file.py       | 2 +-
 assemblyline/odm/models/safelist.py   | 2 +-
 assemblyline/odm/models/submission.py | 4 ++--
 7 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/assemblyline/datastore/collection.py b/assemblyline/datastore/collection.py
index 43ebfd3a2..4f64dd78a 100644
--- a/assemblyline/datastore/collection.py
+++ b/assemblyline/datastore/collection.py
@@ -8,19 +8,19 @@
 import typing
 import warnings
 
-from copy import deepcopy
-from assemblyline.common.isotime import now_as_iso
-from datemath import dm
-from datemath.helpers import DateMathException
 from datetime import datetime
 from enum import Enum
 from os import environ
 from typing import Dict, Any, Union, TypeVar, Generic, Optional
+from copy import deepcopy
 
+from datemath import dm
+from datemath.helpers import DateMathException
 import elasticsearch
 import elasticsearch.helpers
 
 from assemblyline import odm
+from assemblyline.common.isotime import now_as_iso
 from assemblyline.common.dict_utils import recursive_update
 from assemblyline.datastore.bulk import ElasticBulkPlan
 from assemblyline.datastore.exceptions import (
diff --git a/assemblyline/odm/messages/task.py b/assemblyline/odm/messages/task.py
index 253471b33..27e63253e 100644
--- a/assemblyline/odm/messages/task.py
+++ b/assemblyline/odm/messages/task.py
@@ -15,7 +15,7 @@ class FileInfo(odm.Model):
     mime = odm.Optional(odm.Keyword(), description="The libmagic mime type")
     sha1 = odm.SHA1(description="SHA1 hash of the file")
     sha256 = odm.SHA256(description="SHA256 hash of the file")
-    size = odm.long(description="Size of the file in bytes")
+    size = odm.Integer(description="Size of the file in bytes")
     ssdeep = odm.Optional(odm.SSDeepHash(description="SSDEEP hash of the file"))
     tlsh = odm.Optional(odm.Keyword(description="TLSH hash of the file"))
     type = odm.Keyword(description="Type of file as identified by Assemblyline")
diff --git a/assemblyline/odm/models/alert.py b/assemblyline/odm/models/alert.py
index 904646cbf..affb0a8a6 100644
--- a/assemblyline/odm/models/alert.py
+++ b/assemblyline/odm/models/alert.py
@@ -115,7 +115,7 @@ class File(odm.Model):
     name = odm.Keyword(copyto="__text__", description="The original name of the file as submitted.")
     sha1 = odm.SHA1(copyto="__text__", description="The SHA1 hash of the file.")
     sha256 = odm.SHA256(copyto="__text__", description="The SHA256 hash of the file.")
-    size = odm.long(store=False, description="The size of the file in bytes.")
+    size = odm.Integer(store=False, description="The size of the file in bytes.")
     type = odm.Keyword(copyto="__text__", description="	The file type as identified by Assemblyline's analysis.")
     screenshots = odm.List(odm.Compound(Screenshot), default=[], description="Screenshots taken of the file during analysis, if applicable.")
 
diff --git a/assemblyline/odm/models/badlist.py b/assemblyline/odm/models/badlist.py
index a557bfd5e..117f6c08b 100644
--- a/assemblyline/odm/models/badlist.py
+++ b/assemblyline/odm/models/badlist.py
@@ -30,7 +30,7 @@ class Hashes(odm.Model):
 class File(odm.Model):
     name = odm.List(odm.Keyword(store=True, copyto="__text__"), default=[],
                     description="List of names seen for that file")
-    size = odm.Optional(odm.long(), description="Size of the file in bytes")
+    size = odm.Optional(odm.Integer(), description="Size of the file in bytes")
     type = odm.Optional(odm.Keyword(), description="Type of file as identified by Assemblyline")
 
 
diff --git a/assemblyline/odm/models/file.py b/assemblyline/odm/models/file.py
index 1948a6e00..aa433b086 100644
--- a/assemblyline/odm/models/file.py
+++ b/assemblyline/odm/models/file.py
@@ -102,7 +102,7 @@ class File(odm.Model):
     seen = odm.Compound(Seen, default={}, description="Records the frequency and timestamps of when the file was encountered.", ai=False)
     sha1 = odm.SHA1(copyto="__text__", description="The SHA1 hash of the file, providing a more secure alternative to MD5 for integrity checks.", ai=False)
     sha256 = odm.SHA256(copyto="__text__", description="The SHA256 hash of the file, offering a high level of security for integrity verification.")
-    size = odm.long(description="Size of the file in bytes.")
+    size = odm.Integer(description="Size of the file in bytes.")
     ssdeep = odm.SSDeepHash(store=False, description="The fuzzy hash of the file using SSDEEP, which is useful for identifying similar files.", ai=False)
     type = odm.Keyword(copyto="__text__", description="The file type as determined by the AssemblyLine file type identification service.")
     tlsh = odm.Optional(odm.Keyword(copyto="__text__"), description="A locality-sensitive hash (TLSH) of the file's content, useful for similarity comparisons.", ai=False)
diff --git a/assemblyline/odm/models/safelist.py b/assemblyline/odm/models/safelist.py
index 81900e8d0..0d6329efe 100644
--- a/assemblyline/odm/models/safelist.py
+++ b/assemblyline/odm/models/safelist.py
@@ -17,7 +17,7 @@ class Hashes(odm.Model):
 class File(odm.Model):
     name = odm.List(odm.Keyword(store=True, copyto="__text__"), default=[],
                     description="List of names seen for that file")
-    size = odm.Optional(odm.long(), description="Size of the file in bytes")
+    size = odm.Optional(odm.Integer(), description="Size of the file in bytes")
     type = odm.Optional(odm.Keyword(), description="Type of file as identified by Assemblyline")
 
 
diff --git a/assemblyline/odm/models/submission.py b/assemblyline/odm/models/submission.py
index 66bceebb3..23615a4a0 100644
--- a/assemblyline/odm/models/submission.py
+++ b/assemblyline/odm/models/submission.py
@@ -12,7 +12,7 @@
 @odm.model(index=True, store=False, description="File Model of Submission")
 class File(odm.Model):
     name = odm.Keyword(copyto="__text__", description="Name of the file")
-    size = odm.Optional(odm.long(), description="Size of the file in bytes")
+    size = odm.Optional(odm.Integer(), description="Size of the file in bytes")
     sha256 = odm.SHA256(copyto="__text__", description="SHA256 hash of the file")
 
 
@@ -142,7 +142,7 @@ class Submission(odm.Model):
     max_score = odm.Integer(description="Maximum score of all the files in the scan")
     metadata = odm.FlattenedObject(store=False, description="Metadata associated to the submission")
     params: SubmissionParams = odm.Compound(SubmissionParams, description="Submission parameter details", ai=False)
-    results: list[str] = odm.List(odm.wildcard(), store=False, description="List of result keys", ai=False)
+    results: list[str] = odm.List(odm.Keyword(), store=False, description="List of result keys", ai=False)
     sid: str = odm.UUID(copyto="__text__", description="Submission ID")
     state = odm.Enum(values=SUBMISSION_STATES, description="Status of the submission", ai=False)
     to_be_deleted = odm.Boolean(

From 4f8c6be0b7031e836eefb2af8595f94341d9e235 Mon Sep 17 00:00:00 2001
From: Adam Douglass <Adam.Douglass@cyber.gc.ca>
Date: Wed, 5 Mar 2025 21:23:02 +0000
Subject: [PATCH 6/6] add changes to support metadata remapping

---
 assemblyline/datastore/collection.py    |  2 +-
 assemblyline/datastore/support/build.py |  8 +++--
 test/test_odm_mapping.py                | 42 +++++++++++++++++++++++++
 3 files changed, 49 insertions(+), 3 deletions(-)

diff --git a/assemblyline/datastore/collection.py b/assemblyline/datastore/collection.py
index 4f64dd78a..8df4a8555 100644
--- a/assemblyline/datastore/collection.py
+++ b/assemblyline/datastore/collection.py
@@ -2004,7 +2004,7 @@ def _get_index_mappings(self) -> dict:
         if self.model_class:
             mappings['properties'], mappings['dynamic_templates'] = \
                 build_mapping(self.model_class.fields().values())
-            mappings['dynamic_templates'].insert(0, default_dynamic_strings)
+            mappings['dynamic_templates'].append(default_dynamic_strings)
         else:
             mappings['dynamic_templates'] = deepcopy(default_dynamic_templates)
 
diff --git a/assemblyline/datastore/support/build.py b/assemblyline/datastore/support/build.py
index 1b0af6f5c..2f377a877 100644
--- a/assemblyline/datastore/support/build.py
+++ b/assemblyline/datastore/support/build.py
@@ -204,14 +204,18 @@ def build_templates(name, field, nested_template=False, index=True) -> list:
 
             return [{f"nested_{name}": main_template}]
         else:
+            mapping = __type_mapping[field.__class__]
             field_template = {
                 "path_match": name,
                 "mapping": {
-                    "type": __type_mapping[field.__class__],
+                    "type": mapping,
                 }
             }
 
-            field_template['mapping']['index'] = field.index
+            # Wildcard doesn't support setting index; it's _always_ indexed
+            if mapping != 'wildcard':
+                field_template['mapping']['index'] = field.index
+
             if field.copyto:
                 assert len(field.copyto) == 1
                 field_template['mapping']['copy_to'] = field.copyto[0]
diff --git a/test/test_odm_mapping.py b/test/test_odm_mapping.py
index 236047163..425a6cee8 100644
--- a/test/test_odm_mapping.py
+++ b/test/test_odm_mapping.py
@@ -81,3 +81,45 @@ def test_field_upgrade_ok(datastore_connection):
     assert properties['swapped_text_field']['type'] == 'wildcard'
     assert properties['stable_number_field']['type'] == 'integer'
     assert properties['swapped_number_field']['type'] == 'long'
+
+
+def test_metadata_indexing(datastore_connection):
+
+    @odm.model(index=True)
+    class TestMapping(odm.Model):
+        metadata = odm.Mapping(odm.wildcard(copyto='__text__'))
+
+    # Clean up from any previous runs
+    collection = ESCollection(datastore_connection.ds, "test_metadata_indexing", TestMapping, validate=False)
+    collection.wipe(recreate=False)
+
+    print(build_mapping(TestMapping.fields().values()))
+
+    # Create with new mapping configuration
+    collection = ESCollection(datastore_connection.ds, "test_metadata_indexing", TestMapping, validate=True)
+
+    # Insert data to trigger dynamic field creation
+    collection.save("1", {"metadata": {'field1': 123}})
+    collection.save("2", {"metadata": {'field2': "123"}})
+    collection.save("3", {"metadata": {'field3': {'subfield': "cat dog cat"}}})
+    collection.save("4", {"metadata": {'address': "https://cyber.gc.ca"}})
+    collection.commit()
+
+    # Check if those fields are the type and config we want
+    fields = collection.fields()
+    fields.pop('id')
+
+    assert len(fields) == 4
+    for field_name, field in fields.items():
+        assert field['type'] == 'wildcard', (field_name, field)
+        assert field['indexed']
+        assert field['default'], (field_name, field)
+
+    # Check that copyto and regex work
+    search = collection.search("cyber.gc.ca")
+    assert search['total'] == 1
+    assert search['items'][0].id == "4"
+
+    search = collection.search("address: /http[s]://cyber\\.(gc\\.ca|com)/")
+    assert search['total'] == 1
+    assert search['items'][0].id == "4"