From 3c7c33f0bc7416cf6bc2221bc66de050cbc38889 Mon Sep 17 00:00:00 2001 From: cccs-rs <62077998+cccs-rs@users.noreply.github.com> Date: Wed, 27 Nov 2024 06:04:15 +0000 Subject: [PATCH 1/3] Leverage multiprocessing to handle importing of extractors and their metadata --- configextractor/frameworks/base.py | 64 +++++++++++++++++++++++++----- configextractor/frameworks/maco.py | 28 ++----------- configextractor/frameworks/mwcp.py | 19 ++------- configextractor/main.py | 59 ++++++++++++++++----------- tests/requirements.txt | 2 +- 5 files changed, 98 insertions(+), 74 deletions(-) diff --git a/configextractor/frameworks/base.py b/configextractor/frameworks/base.py index 279e0a5..7e68564 100644 --- a/configextractor/frameworks/base.py +++ b/configextractor/frameworks/base.py @@ -5,19 +5,30 @@ class Extractor: - def __init__(self, id, framework, module, module_path, root_directory, yara_rule, venv=None) -> None: + def __init__(self, id, author, description, sharing, framework, module_path, yara_rule, venv=None) -> None: self.id = id + self.author = author + self.description = description self.framework = framework - self.module = module self.module_path = module_path - self.root_directory = root_directory self.rule = yara_rule + self.sharing = sharing self.venv = venv class Framework: - def __init__(self, logger: Logger, yara_attr_name=None): + def __init__( + self, + logger: Logger, + author_attr_name=None, + description_attr_name=None, + sharing_attr_name=None, + yara_attr_name=None, + ): self.log = logger + self.author_attr_name = author_attr_name + self.description_attr_name = description_attr_name + self.sharing_attr_name = sharing_attr_name self.yara_attr_name = yara_attr_name self.venv_script = "" self.yara_rule = "" @@ -25,19 +36,50 @@ def __init__(self, logger: Logger, yara_attr_name=None): @staticmethod # Get classification of module def get_classification(extractor: Extractor) -> str: - return None + return extractor.sharing @staticmethod # Get name of module def get_name(extractor: Extractor): - return extractor.module.__name__.split(".")[-1] + return extractor.id.split(".")[-1] # Define a template for results from this Extractor def result_template(self, extractor: Extractor, yara_matches: List[yara.Match]) -> Dict[str, str]: - return dict(id=extractor.id, yara_hits=[y.rule for y in yara_matches]) + return dict( + author=extractor.author, + description=extractor.description, + id=extractor.id, + yara_hits=[y.rule for y in yara_matches], + ) + + def extract_metadata_from_module(self, decoder: object) -> Dict[str, str]: + return { + "author": self.extract_author(decoder), + "description": self.extract_description(decoder), + "sharing": self.extract_sharing(decoder), + "yara_rule": self.extract_yara(decoder), + } + + # Extract author from module + def extract_author(self, decoder: object) -> str: + if self.author_attr_name and hasattr(decoder, self.author_attr_name): + # Author information found + return getattr(decoder, self.author_attr_name) + + # Extract description from module + def extract_description(self, decoder: object) -> str: + if self.description_attr_name and hasattr(decoder, self.description_attr_name): + # Extractor description found + return getattr(decoder, self.description_attr_name) + + # Extract sharing from module + def extract_sharing(self, decoder: object) -> str: + if self.sharing_attr_name and hasattr(decoder, self.sharing_attr_name): + # Sharing information found + return getattr(decoder, self.sharing_attr_name) # Extract YARA rules from module - def extract_yara_from_module(self, decoder: object) -> str: + def extract_yara(self, decoder: object) -> str: if self.yara_attr_name and hasattr(decoder, self.yara_attr_name): # YARA rule found return getattr(decoder, self.yara_attr_name) @@ -52,9 +94,11 @@ def run(self, sample_path: str, parsers: Dict[Extractor, List[yara.Match]]) -> L def run_in_venv(self, sample_path: str, extractor: Extractor) -> Dict[str, dict]: # Run in extractor with sample in virtual enviroment using the MACO utility - return utils.run_in_venv( + module_name, extractor_class = extractor.id.rsplit(".", 1) + return utils.run_extractor( sample_path, - extractor.module, + module_name, + extractor_class, extractor.module_path, extractor.venv, self.venv_script, diff --git a/configextractor/frameworks/maco.py b/configextractor/frameworks/maco.py index 3e4a170..07e5adb 100644 --- a/configextractor/frameworks/maco.py +++ b/configextractor/frameworks/maco.py @@ -3,35 +3,21 @@ from typing import Any, Dict, List, Union from maco.model import ExtractorModel -from maco.utils import VENV_SCRIPT as MACO_VENV_SCRIPT, maco_extractor_validation, Base64Decoder, MACO_YARA_RULE +from maco.utils import MACO_YARA_RULE, Base64Decoder, maco_extractor_validation +from maco.utils import VENV_SCRIPT as MACO_VENV_SCRIPT from configextractor.frameworks.base import Extractor, Framework class MACO(Framework): def __init__(self, logger: Logger): - super().__init__(logger, "yara_rule") + super().__init__(logger, "author", "__doc__", "sharing", "yara_rule") self.venv_script = MACO_VENV_SCRIPT self.yara_rule = MACO_YARA_RULE - @staticmethod - def get_classification(extractor: Extractor): - if hasattr(extractor.module, "sharing"): - return extractor.module.sharing - def validate(self, module: Any) -> bool: return maco_extractor_validation(module) - def result_template(self, extractor: Extractor, yara_matches: List) -> Dict[str, str]: - template = super().result_template(extractor, yara_matches) - template.update( - { - "author": extractor.module.author, - "description": extractor.module.__doc__, - } - ) - return template - def run(self, sample_path: str, parsers: Dict[Extractor, List[str]]) -> List[dict]: results = list() for extractor, yara_matches in parsers.items(): @@ -39,13 +25,7 @@ def run(self, sample_path: str, parsers: Dict[Extractor, List[str]]) -> List[dic result = self.result_template(extractor, yara_matches) # Run MaCo parser with YARA matches - r: ExtractorModel = None - if extractor.venv: - # Run in special mode using the virtual environment detected - r = self.run_in_venv(sample_path, extractor) - else: - with open(sample_path, "rb") as f: - r = extractor.module().run(f, matches=yara_matches) + r: ExtractorModel = self.run_in_venv(sample_path, extractor) if not (r or yara_matches): # Nothing to report diff --git a/configextractor/frameworks/mwcp.py b/configextractor/frameworks/mwcp.py index dba07bf..5ecf896 100644 --- a/configextractor/frameworks/mwcp.py +++ b/configextractor/frameworks/mwcp.py @@ -1,15 +1,14 @@ # MWCP framework import inspect +import re as regex from logging import Logger -from typing import Any, Dict, List +from typing import Any import mwcp -import re as regex from maco.model import ConnUsageEnum, Encryption, ExtractorModel -from mwcp import Parser -from configextractor.frameworks.base import Extractor, Framework +from configextractor.frameworks.base import Framework IP_REGEX_ONLY = r"^(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)$" @@ -200,7 +199,7 @@ def handle_encryption(meta: dict) -> dict: class MWCP(Framework): def __init__(self, logger: Logger): - super().__init__(logger, "yara_rule") + super().__init__(logger, "AUTHOR", "DESCRIPTION", None, "yara_rule") self.venv_script = """ import importlib import os @@ -223,16 +222,6 @@ def validate(self, module: Any) -> bool: # 'DESCRIPTION' has to be implemented otherwise will raise an exception according to MWCP return hasattr(module, "DESCRIPTION") and module.DESCRIPTION - def result_template(self, extractor: Extractor, yara_matches: List) -> Dict[str, str]: - template = super().result_template(extractor, yara_matches) - template.update( - { - "author": extractor.module.AUTHOR, - "description": extractor.module.DESCRIPTION, - } - ) - return template - def run(self, sample_path, parsers): results = list() diff --git a/configextractor/main.py b/configextractor/main.py index a2dcc65..f02c652 100644 --- a/configextractor/main.py +++ b/configextractor/main.py @@ -1,15 +1,16 @@ # Main module for ConfigExtractor library -import cart import inspect import re as regex import tempfile - from collections import defaultdict from logging import Logger, getLogger -from maco import utils, yara +from multiprocessing import Manager, Process from typing import Dict, List from urllib.parse import urlparse +import cart +from maco import utils, yara + from configextractor.frameworks import MACO, MWCP from configextractor.frameworks.base import Extractor, Framework @@ -31,10 +32,10 @@ def __init__( } self.parsers: Dict[str, Extractor] = dict() - namespaced_yara_rules: Dict[str, List[str]] = dict() block_regex = regex.compile("|".join(parser_blocklist)) if parser_blocklist else None scanner = yara.compile("\n".join([fw_class.yara_rule for fw_class in self.FRAMEWORK_LIBRARY_MAPPING.values()])) - for parsers_dir in parsers_dirs: + with Manager() as manager: + parsers = manager.dict() def extractor_module_callback(module, venv): # Check to see if we're blocking this potential extractor @@ -59,23 +60,33 @@ def extractor_module_callback(module, venv): if block_regex and block_regex.match(module_id): return - rules = fw_class.extract_yara_from_module(member) - if rules: - namespaced_yara_rules[module_id] = rules - - self.parsers[module_id] = Extractor( - module_id, - fw_name, - member, - module.__file__, - parsers_dir, - rules, - venv, + parsers[module_id] = dict( + id=module_id, + framework=fw_name, + module_path=module.__file__, + venv=venv, + **fw_class.extract_metadata_from_module(member), ) - utils.import_extractors(parsers_dir, scanner, extractor_module_callback, logger, create_venv) - - self.yara = yara.compile(sources={ns: rules for ns, rules in namespaced_yara_rules.items()}) + # Launch importing extractors as separate processes + processes = [] + for parsers_dir in parsers_dirs: + p = Process( + target=utils.import_extractors, + args=(parsers_dir, scanner, extractor_module_callback, logger, create_venv), + ) + processes.append(p) + p.start() + + # Wait for all the processes to terminate + for p in processes: + p.join() + + self.parsers = {id: Extractor(**extractor_kwargs) for id, extractor_kwargs in dict(parsers).items()} + + self.yara = yara.compile( + sources={name: extractor.rule for name, extractor in self.parsers.items() if extractor.rule} + ) for fw_name in self.FRAMEWORK_LIBRARY_MAPPING: self.log.debug( f"# of YARA-dependent parsers under {fw_name}: " @@ -141,16 +152,16 @@ def run_parsers(self, sample, parser_blocklist=[]): for yara_match in self.yara.match(sample_copy.name): # Retrieve relevant parser information extractor = self.parsers[yara_match.namespace] - if block_regex and block_regex.match(extractor.module.__name__): - self.log.info(f"Blocking {extractor.module.__name__} based on passed blocklist regex list") + if block_regex and block_regex.match(extractor.id): + self.log.info(f"Blocking {extractor.id} based on passed blocklist regex list") continue # Pass in yara.Match objects since some framework can leverage it parsers_to_run[extractor.framework][extractor].append(yara_match) # Add standalone parsers that should run on any file for parser in [p for p in self.parsers.values() if not p.rule]: - if block_regex and block_regex.match(parser.module.__name__): - self.log.info(f"Blocking {parser.module.__name__} based on passed blocklist regex list") + if block_regex and block_regex.match(parser.id): + self.log.info(f"Blocking {parser.id} based on passed blocklist regex list") continue parsers_to_run[parser.framework][parser].extend([]) diff --git a/tests/requirements.txt b/tests/requirements.txt index 55f18c3..871ca7a 100644 --- a/tests/requirements.txt +++ b/tests/requirements.txt @@ -2,4 +2,4 @@ pytest git-python # Test with latest MACO code -git+https://github.com/CybercentreCanada/Maco.git +git+https://github.com/CybercentreCanada/Maco.git@bugfixes From 844a959b7a6e4809a54778d9edb5e68858edc9d1 Mon Sep 17 00:00:00 2001 From: cccs-rs <62077998+cccs-rs@users.noreply.github.com> Date: Wed, 27 Nov 2024 15:33:35 +0000 Subject: [PATCH 2/3] Allow exceptions during import to propagate to main process --- configextractor/main.py | 28 +++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) diff --git a/configextractor/main.py b/configextractor/main.py index f02c652..468ed20 100644 --- a/configextractor/main.py +++ b/configextractor/main.py @@ -5,7 +5,10 @@ from collections import defaultdict from logging import Logger, getLogger from multiprocessing import Manager, Process -from typing import Dict, List +from multiprocessing.managers import ListProxy +from traceback import format_exc +from types import ModuleType +from typing import Callable, Dict, List from urllib.parse import urlparse import cart @@ -15,6 +18,20 @@ from configextractor.frameworks.base import Extractor, Framework +def import_extractors( + root_directory: str, + scanner: yara.Rules, + extractor_module_callback: Callable[[ModuleType, str], None], + logger: Logger, + create_venv: bool, + exceptions: ListProxy, +): + try: + utils.import_extractors(root_directory, scanner, extractor_module_callback, logger, create_venv) + except Exception: + exceptions.append(format_exc()) + + class ConfigExtractor: def __init__( self, @@ -36,6 +53,7 @@ def __init__( scanner = yara.compile("\n".join([fw_class.yara_rule for fw_class in self.FRAMEWORK_LIBRARY_MAPPING.values()])) with Manager() as manager: parsers = manager.dict() + exceptions = manager.list() def extractor_module_callback(module, venv): # Check to see if we're blocking this potential extractor @@ -72,8 +90,8 @@ def extractor_module_callback(module, venv): processes = [] for parsers_dir in parsers_dirs: p = Process( - target=utils.import_extractors, - args=(parsers_dir, scanner, extractor_module_callback, logger, create_venv), + target=import_extractors, + args=(parsers_dir, scanner, extractor_module_callback, logger, create_venv, exceptions), ) processes.append(p) p.start() @@ -82,6 +100,10 @@ def extractor_module_callback(module, venv): for p in processes: p.join() + exceptions = list(exceptions) + if exceptions: + raise Exception(f"Exception occurred while importing extractors: {exceptions}") + self.parsers = {id: Extractor(**extractor_kwargs) for id, extractor_kwargs in dict(parsers).items()} self.yara = yara.compile( From 0e3475179e3dd64cebd39ca1394880333de59018 Mon Sep 17 00:00:00 2001 From: cccs-rs <62077998+cccs-rs@users.noreply.github.com> Date: Wed, 27 Nov 2024 16:58:23 +0000 Subject: [PATCH 3/3] Update requirements --- requirements.txt | 2 +- tests/requirements.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index c2c9cc9..5a58b28 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,6 @@ cart click -maco >= 1.2.0 +maco >= 1.2.1 mwcfg mwcp setuptools diff --git a/tests/requirements.txt b/tests/requirements.txt index 871ca7a..55f18c3 100644 --- a/tests/requirements.txt +++ b/tests/requirements.txt @@ -2,4 +2,4 @@ pytest git-python # Test with latest MACO code -git+https://github.com/CybercentreCanada/Maco.git@bugfixes +git+https://github.com/CybercentreCanada/Maco.git