Skip to content

Commit

Permalink
add md5 file utf-8 bom header support (#1818)
Browse files Browse the repository at this point in the history
  • Loading branch information
mikkonie committed Feb 7, 2025
1 parent fcc2a97 commit c1a00f2
Show file tree
Hide file tree
Showing 4 changed files with 48 additions and 1 deletion.
1 change: 1 addition & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ Added
- Support for numeric field values as list (#1789, #2033)
- **Taskflowbackend**
- ``TaskflowAPI.raise_submit_api_exception()`` helper (#1847)
- UTF-8 BOM header support for MD5 files (#1818)

Changed
-------
Expand Down
1 change: 1 addition & 0 deletions docs_manual/source/sodar_release_notes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ Release for SODAR Core v1.0 upgrade, iRODS v4.3 upgrade and feature updates.
- Add token-based iRODS/IGV basic auth support for OIDC users
- Add support for comment, performer and contact field values as list
- Add support for numeric field values as list
- Add support for UTF-8 BOM header in MD5 checksum files
- Update minimum supported iRODS version to v4.3.3
- Update REST API versioning
- Update REST API views for OpenAPI support
Expand Down
8 changes: 7 additions & 1 deletion taskflowbackend/tasks/irods_tasks.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
"""iRODS tasks for Taskflow"""

import codecs
import logging
import os
import random
Expand Down Expand Up @@ -674,7 +675,12 @@ def execute(self, paths, zone_path, *args, **kwargs):
md5_path = path + '.md5'
try:
with self.irods.data_objects.open(md5_path, mode='r') as f:
file_sum = re.split(MD5_RE, f.read().decode('utf-8'))[0]
dec = 'utf-8'
md5_content = f.read()
# Support for BOM header forced by PowerShell (see #1818)
if md5_content[:3] == codecs.BOM_UTF8:
dec += '-sig'
file_sum = re.split(MD5_RE, md5_content.decode(dec))[0]
except Exception as ex:
msg = 'Unable to read checksum file "{}"'.format(
'/'.join(md5_path.split('/')[zone_path_len:])
Expand Down
39 changes: 39 additions & 0 deletions taskflowbackend/tests/test_flows.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
"""Tests for Taskflow flows in the taskflowbackend app"""

import codecs
import os

from irods.exception import (
Expand Down Expand Up @@ -1018,6 +1019,44 @@ def test_validate_upper_case(self):
self.zone.refresh_from_db()
self.assertEqual(self.zone.status, ZONE_STATUS_ACTIVE)

def test_validate_bom_header(self):
    """
    Test landing_zone_move validation with BOM header in MD5 file.

    Creates a data object and a matching ``.md5`` object whose content is
    the object's MD5 checksum prefixed with the UTF-8 byte order mark, then
    runs the ``landing_zone_move`` flow with ``validate_only`` set.
    """
    zone_coll = self.irods.collections.create(
        os.path.join(self.zone_path, COLL_NAME)
    )
    obj = self.make_irods_object(zone_coll, OBJ_NAME)
    obj_path = obj.path
    md5_path = obj.path + '.md5'
    # Checksum file content: UTF-8 BOM followed by the encoded checksum
    checksum = self.get_md5_checksum(obj)
    bom_content = codecs.BOM_UTF8 + bytes(checksum, encoding='utf-8')
    make_object(self.irods, md5_path, bom_content)
    # Both the data object and its checksum object must exist before the run
    for irods_path in (obj_path, md5_path):
        self.assertEqual(self.irods.data_objects.exists(irods_path), True)

    flow = self.taskflow.get_flow(
        irods_backend=self.irods_backend,
        project=self.project,
        flow_name='landing_zone_move',
        flow_data={
            'zone_uuid': str(self.zone.sodar_uuid),
            'validate_only': True,
        },
    )
    self.build_and_run(flow)

    self.zone.refresh_from_db()
    self.assertEqual(self.zone.status, ZONE_STATUS_ACTIVE)
    # Both objects are expected to still be present under the zone path
    for irods_path in (obj_path, md5_path):
        self.assertEqual(self.irods.data_objects.exists(irods_path), True)
    # The collection must not have been created under the sample path
    self.assertEqual(
        self.irods.collections.exists(
            os.path.join(self.sample_path, COLL_NAME)
        ),
        False,
    )

def test_validate_no_checksum(self):
"""Test landing_zone_move validation with missing checksum"""
coll_path = os.path.join(self.zone_path, COLL_NAME)
Expand Down

0 comments on commit c1a00f2

Please sign in to comment.