From 4ce45c2def58e8b1aee1671acfd9ce2c6234e40d Mon Sep 17 00:00:00 2001 From: sydp Date: Wed, 17 Apr 2024 23:08:12 +1000 Subject: [PATCH] Updates to LevelDB record parsing (#38) * Updates * Update docs * Update version --- README.md | 64 ++++++++++- dfindexeddb/indexeddb/chromium/record.py | 32 +++--- dfindexeddb/indexeddb/cli.py | 118 ++++++++++++++++++-- dfindexeddb/leveldb/cli.py | 12 ++- dfindexeddb/leveldb/definitions.py | 2 + dfindexeddb/leveldb/record.py | 130 +++++++++++++++++++---- dfindexeddb/version.py | 2 +- pyproject.toml | 2 +- 8 files changed, 313 insertions(+), 49 deletions(-) diff --git a/README.md b/README.md index 45b2db2..3f2e7fe 100644 --- a/README.md +++ b/README.md @@ -61,14 +61,61 @@ installation: ``` $ dfindexeddb -h -usage: dfindexeddb [-h] -s SOURCE [-o {json,jsonl,repr}] +usage: dfindexeddb [-h] {db,ldb,log} ... A cli tool for parsing indexeddb files +positional arguments: + {db,ldb,log} + db Parse a directory as indexeddb. + ldb Parse a ldb file as indexeddb. + log Parse a log file as indexeddb. + +options: + -h, --help show this help message and exit +``` + +To parse Indexeddb records from a LevelDB folder, use the following command: + +``` +dfindexeddb db -h +usage: dfindexeddb db [-h] -s SOURCE [--use_manifest] [-o {json,jsonl,repr}] + options: -h, --help show this help message and exit -s SOURCE, --source SOURCE The source leveldb folder + --use_manifest Use manifest file to determine active/recovered records. + -o {json,jsonl,repr}, --output {json,jsonl,repr} + Output format. Default is json +``` + +To parse Indexeddb records from a LevelDB ldb (.ldb) file, use the following +command: + +``` +dfindexeddb ldb -h +usage: dfindexeddb ldb [-h] -s SOURCE [-o {json,jsonl,repr}] + +options: + -h, --help show this help message and exit + -s SOURCE, --source SOURCE + The source .ldb file. + -o {json,jsonl,repr}, --output {json,jsonl,repr} + Output format. 
Default is json +``` + +To parse Indexeddb records from a LevelDB log (.log) file, use the following +command: + +``` +dfindexeddb log -h +usage: dfindexeddb log [-h] -s SOURCE [-o {json,jsonl,repr}] + +options: + -h, --help show this help message and exit + -s SOURCE, --source SOURCE + The source .log file. -o {json,jsonl,repr}, --output {json,jsonl,repr} Output format. Default is json ``` @@ -92,6 +139,21 @@ options: -h, --help show this help message and exit ``` +To parse records from a LevelDB folder, use the following command: + +``` +dfleveldb db -h +usage: dfleveldb db [-h] -s SOURCE [--use_manifest] [-o {json,jsonl,repr}] + +options: + -h, --help show this help message and exit + -s SOURCE, --source SOURCE + The source leveldb directory + --use_manifest Use manifest file to determine active/deleted records. + -o {json,jsonl,repr}, --output {json,jsonl,repr} + Output format. Default is json +``` + To parse records from a LevelDB log (.log) file, use the following command: ``` diff --git a/dfindexeddb/indexeddb/chromium/record.py b/dfindexeddb/indexeddb/chromium/record.py index 6ee0520..1a1b7ee 100644 --- a/dfindexeddb/indexeddb/chromium/record.py +++ b/dfindexeddb/indexeddb/chromium/record.py @@ -22,8 +22,7 @@ from dfindexeddb import errors from dfindexeddb.indexeddb.chromium import blink from dfindexeddb.indexeddb.chromium import definitions -from dfindexeddb.leveldb import ldb -from dfindexeddb.leveldb import log +from dfindexeddb.leveldb import record from dfindexeddb.leveldb import utils @@ -1337,24 +1336,33 @@ class IndexedDBRecord: value: the value of the record. sequence_number: if available, the sequence number of the record. type: the type of the record. + level: the leveldb level, None indicates the record came from a log file. + recovered: True if the record is a recovered record. 
""" + path: str offset: int key: Any value: Any - sequence_number: int + sequence_number: Optional[int] type: int + level: Optional[int] + recovered: Optional[bool] @classmethod def FromLevelDBRecord( - cls, record: Union[ldb.KeyValueRecord, log.ParsedInternalKey] + cls, db_record: record.LevelDBRecord ) -> IndexedDBRecord: """Returns an IndexedDBRecord from a ParsedInternalKey.""" - idb_key = IndexedDbKey.FromBytes(record.key, base_offset=record.offset) - idb_value = idb_key.ParseValue(record.value) + idb_key = IndexedDbKey.FromBytes( + db_record.record.key, base_offset=db_record.record.offset) + idb_value = idb_key.ParseValue(db_record.record.value) return cls( - offset=record.offset, - key=idb_key, - value=idb_value, - sequence_number=record.sequence_number if hasattr( - record, 'sequence_number') else None, - type=record.record_type) + path=db_record.path, + offset=db_record.record.offset, + key=idb_key, + value=idb_value, + sequence_number=db_record.record.sequence_number if hasattr( + db_record.record, 'sequence_number') else None, + type=db_record.record.record_type, + level=db_record.level, + recovered=db_record.recovered) diff --git a/dfindexeddb/indexeddb/cli.py b/dfindexeddb/indexeddb/cli.py index faf8c0c..377df52 100644 --- a/dfindexeddb/indexeddb/cli.py +++ b/dfindexeddb/indexeddb/cli.py @@ -73,23 +73,78 @@ def _Output(structure, output): print(structure) -def IndexeddbCommand(args): - """The CLI for processing a log/ldb file as indexeddb.""" - for db_record in leveldb_record.LevelDBRecord.FromDir(args.source): +def DbCommand(args): + """The CLI for processing a directory as indexeddb.""" + if args.use_manifest: + for db_record in leveldb_record.LevelDBRecord.FromManifest(args.source): + record = db_record.record + try: + idb_record = chromium_record.IndexedDBRecord.FromLevelDBRecord( + db_record) + except( + errors.ParserError, + errors.DecoderError, + NotImplementedError) as err: + print(( + f'Error parsing Indexeddb record 
{record.__class__.__name__}: {err}' + f' at offset {record.offset} in {db_record.path}'), file=sys.stderr) + print(f'Traceback: {traceback.format_exc()}', file=sys.stderr) + continue + _Output(idb_record, output=args.output) + else: + for db_record in leveldb_record.LevelDBRecord.FromDir(args.source): + record = db_record.record + try: + idb_record = chromium_record.IndexedDBRecord.FromLevelDBRecord( + db_record) + except( + errors.ParserError, + errors.DecoderError, + NotImplementedError) as err: + print(( + f'Error parsing Indexeddb record {record.__class__.__name__}: {err}' + f' at offset {record.offset} in {db_record.path}'), file=sys.stderr) + print(f'Traceback: {traceback.format_exc()}', file=sys.stderr) + continue + _Output(idb_record, output=args.output) + + +def LdbCommand(args): + """The CLI for processing a leveldb table (.ldb) file as indexeddb.""" + for db_record in leveldb_record.LevelDBRecord.FromFile(args.source): + record = db_record.record + try: + idb_record = chromium_record.IndexedDBRecord.FromLevelDBRecord( + db_record) + except( + errors.ParserError, + errors.DecoderError, + NotImplementedError) as err: + print( + (f'Error parsing Indexeddb record {record.__class__.__name__}: {err} ' + f'at offset {record.offset} in {db_record.path}'), file=sys.stderr) + print(f'Traceback: {traceback.format_exc()}', file=sys.stderr) + continue + _Output(idb_record, output=args.output) + + +def LogCommand(args): + """The CLI for processing a leveldb log file as indexeddb.""" + for db_record in leveldb_record.LevelDBRecord.FromFile(args.source): record = db_record.record try: - db_record.record = chromium_record.IndexedDBRecord.FromLevelDBRecord( - record) + idb_record = chromium_record.IndexedDBRecord.FromLevelDBRecord( + db_record) except( errors.ParserError, errors.DecoderError, NotImplementedError) as err: print( - (f'Error parsing blink value: {err} for {record.__class__.__name__} ' + (f'Error parsing Indexeddb record {record.__class__.__name__}: {err} ' 
f'at offset {record.offset} in {db_record.path}'), file=sys.stderr) print(f'Traceback: {traceback.format_exc()}', file=sys.stderr) - print(f'Record: {record}', file=sys.stderr) - _Output(db_record, output=args.output) + continue + _Output(idb_record, output=args.output) def App(): @@ -98,10 +153,51 @@ def App(): prog='dfindexeddb', description='A cli tool for parsing indexeddb files', epilog=f'Version {version.GetVersion()}') - parser.add_argument( + + subparsers = parser.add_subparsers() + + parser_db = subparsers.add_parser( + 'db', help='Parse a directory as indexeddb.') + parser_db.add_argument( '-s', '--source', required=True, type=pathlib.Path, help='The source leveldb folder') - parser.add_argument( + parser_db.add_argument( + '--use_manifest', + action='store_true', + help='Use manifest file to determine active/deleted records.') + parser_db.add_argument( + '-o', + '--output', + choices=[ + 'json', + 'jsonl', + 'repr'], + default='json', + help='Output format. Default is json') + parser_db.set_defaults(func=DbCommand) + + parser_ldb = subparsers.add_parser( + 'ldb', help='Parse a ldb file as indexeddb.') + parser_ldb.add_argument( + '-s', '--source', required=True, type=pathlib.Path, + help='The source .ldb file.') + parser_ldb.add_argument( + '-o', + '--output', + choices=[ + 'json', + 'jsonl', + 'repr'], + default='json', + help='Output format. Default is json') + parser_ldb.set_defaults(func=LdbCommand) + + parser_log = subparsers.add_parser( + 'log', help='Parse a log file as indexeddb.') + parser_log.add_argument( + '-s', '--source', required=True, type=pathlib.Path, + help='The source .log file.') + parser_log.add_argument( '-o', '--output', choices=[ @@ -110,7 +206,7 @@ def App(): 'repr'], default='json', help='Output format. 
Default is json') - parser.set_defaults(func=IndexeddbCommand) + parser_log.set_defaults(func=LogCommand) args = parser.parse_args() args.func(args) diff --git a/dfindexeddb/leveldb/cli.py b/dfindexeddb/leveldb/cli.py index a620fe0..894ceea 100644 --- a/dfindexeddb/leveldb/cli.py +++ b/dfindexeddb/leveldb/cli.py @@ -66,8 +66,12 @@ def _Output(structure, output): def DbCommand(args): """The CLI for processing leveldb folders.""" - for rec in record.LevelDBRecord.FromDir(args.source): - _Output(rec, output=args.output) + if args.use_manifest: + for rec in record.LevelDBRecord.FromManifest(args.source): + _Output(rec, output=args.output) + else: + for rec in record.LevelDBRecord.FromDir(args.source): + _Output(rec, output=args.output) def LdbCommand(args): @@ -159,6 +163,10 @@ def App(): required=True, type=pathlib.Path, help='The source leveldb directory') + parser_db.add_argument( + '--use_manifest', + action='store_true', + help='Use manifest file to determine active/deleted records.') parser_db.add_argument( '-o', '--output', diff --git a/dfindexeddb/leveldb/definitions.py b/dfindexeddb/leveldb/definitions.py index 90067a7..fbced36 100644 --- a/dfindexeddb/leveldb/definitions.py +++ b/dfindexeddb/leveldb/definitions.py @@ -25,6 +25,8 @@ SEQUENCE_LENGTH = 7 TYPE_LENGTH = 1 +MANIFEST_FILENAME_PATTERN = r'MANIFEST-[0-9]{6}' + class BlockCompressionType(enum.IntEnum): """Block compression types.""" diff --git a/dfindexeddb/leveldb/record.py b/dfindexeddb/leveldb/record.py index 881e1de..9a5b9a3 100644 --- a/dfindexeddb/leveldb/record.py +++ b/dfindexeddb/leveldb/record.py @@ -16,9 +16,12 @@ from __future__ import annotations import dataclasses import pathlib +import re import sys -from typing import Any, Generator, Union +from typing import Any, Generator, Optional, Union +from dfindexeddb import errors +from dfindexeddb.leveldb import definitions from dfindexeddb.leveldb import descriptor from dfindexeddb.leveldb import ldb from dfindexeddb.leveldb import log @@ 
-34,18 +37,20 @@ class LevelDBRecord: Attributes: path: the file path where the record was parsed from. record: the leveldb record. + level: the leveldb level, None indicates the record came from a log file. + recovered: True if the record is a recovered record. """ path: str record: Union[ ldb.KeyValueRecord, - log.ParsedInternalKey, - descriptor.VersionEdit] + log.ParsedInternalKey] + level: Optional[int] = None + recovered: Optional[bool] = None @classmethod def FromFile( cls, - file_path: pathlib.Path, - include_versionedit: bool = False + file_path: pathlib.Path ) -> Generator[LevelDBRecord, Any, Any]: """Yields leveldb records from the given path. @@ -54,7 +59,6 @@ def FromFile( Args: file_path: the file path. - include_versionedit: include VersionEdit records from descriptor files. """ if file_path.name.endswith('.log'): for record in log.FileReader( @@ -64,12 +68,7 @@ def FromFile( for record in ldb.FileReader(file_path.as_posix()).GetKeyValueRecords(): yield cls(path=file_path.as_posix(), record=record) elif file_path.name.startswith('MANIFEST'): - if not include_versionedit: - print(f'Ignoring {file_path.as_posix()}', file=sys.stderr) - return - for record in descriptor.FileReader( - file_path.as_posix()).GetVersionEdits(): - yield cls(path=file_path.as_posix(), record=record) + print(f'Ignoring descriptor file {file_path.as_posix()}', file=sys.stderr) elif file_path.name in ('LOCK', 'CURRENT', 'LOG', 'LOG.old'): print(f'Ignoring {file_path.as_posix()}', file=sys.stderr) else: @@ -78,25 +77,114 @@ def FromFile( @classmethod def FromDir( cls, - path: pathlib.Path, - include_versionedit: bool = False + path: pathlib.Path ) -> Generator[LevelDBRecord, Any, Any]: """Yields LevelDBRecords from the given directory. Args: path: the file path. - include_versionedit: include VersionEdit records from descriptor files. 
+ + Yields: + LevelDBRecords + """ + if not path or not path.is_dir(): + raise ValueError(f'{path} is not a directory') + for file_path in path.iterdir(): + yield from cls.FromFile(file_path=file_path) + + @classmethod + def FromManifest( + cls, + path: pathlib.Path + ) -> Generator[LevelDBRecord, Any, Any]: + """Yields LevelDBRecords from the given directory using the manifest. + + Args: + path: the file path. Yields: LevelDBRecords Raises: + ParserError: if the CURRENT or MANIFEST-* file does not exist. ValueError: if path is not a directory. """ - if path.is_dir(): - for file_path in path.iterdir(): - yield from cls.FromFile( - file_path=file_path, - include_versionedit=include_versionedit) - else: + if not path or not path.is_dir(): raise ValueError(f'{path} is not a directory') + + current_path = path / 'CURRENT' + if not current_path.exists(): + raise errors.ParserError(f'{current_path!s} does not exist.') + + current_manifest = current_path.read_text().strip() + manifest_regex = re.compile(definitions.MANIFEST_FILENAME_PATTERN) + if not manifest_regex.fullmatch(current_manifest): + raise errors.ParserError( + f'{current_path!s} does not contain the expected content') + + manifest_path = path / current_manifest + if not manifest_path.exists(): + raise errors.ParserError(f'{manifest_path!s} does not exist.') + + latest_version = descriptor.FileReader( + str(manifest_path)).GetLatestVersion() + if not latest_version: + raise errors.ParserError( + f'Could not parse a leveldb version from {manifest_path!s}') + + # read log records + log_records = [] + if latest_version.current_log: + current_log = path / latest_version.current_log + if current_log.exists(): + for log_record in cls.FromFile(file_path=current_log): + log_records.append(log_record) + else: + print('No current log file.', file=sys.stderr) + + # read records from the "young" or 0-level + young_records = [] + for active_file in latest_version.active_files.get(0, {}).keys(): + current_young = path / 
active_file + if current_young.exists(): + for young_record in cls.FromFile(current_young): + young_records.append(young_record) + + active_records = {} + for record in sorted( + log_records, + key=lambda record: record.record.sequence_number, + reverse=True): + if record.record.key not in active_records: + record.recovered = False + active_records[record.record.key] = record + else: + record.recovered = True + + for record in sorted( + young_records, + key=lambda record: record.record.sequence_number, + reverse=True): + if record.record.key not in active_records: + record.recovered = False + active_records[record.record.key] = record + else: + record.recovered = True + record.level = 0 + + yield from sorted( + log_records + young_records, + key=lambda record: record.record.sequence_number, + reverse=False) + + if latest_version.active_files.keys(): + for level in range(1, max(latest_version.active_files.keys()) + 1): + for filename in latest_version.active_files.get(level, []): + current_filename = path / filename + for record in cls.FromFile(file_path=current_filename): + if record.record.key in active_records: + record.recovered = True + else: + record.recovered = False + record.level = level + yield record diff --git a/dfindexeddb/version.py b/dfindexeddb/version.py index b84276b..7cba406 100644 --- a/dfindexeddb/version.py +++ b/dfindexeddb/version.py @@ -15,7 +15,7 @@ """Version information for dfIndexeddb.""" -__version__ = "20240402" +__version__ = "20240417" def GetVersion(): diff --git a/pyproject.toml b/pyproject.toml index b32292b..16ab1fd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "dfindexeddb" -version = "20240402" +version = "20240417" requires-python = ">=3.8" description = "dfindexeddb is an experimental Python tool for performing digital forensic analysis of IndexedDB and leveldb files." license = {file = "LICENSE"}