diff --git a/src/formpack/utils/expand_content.py b/src/formpack/utils/expand_content.py index dfae2cb..53e489e 100644 --- a/src/formpack/utils/expand_content.py +++ b/src/formpack/utils/expand_content.py @@ -95,18 +95,28 @@ def _get_translations_from_special_cols( return translations, set(translated_cols) -def clean_column_name(column_name: str) -> str: +def clean_column_name(column_name: str, already_seen: dict[str, str]) -> str: + """ + + Preserves ":" vs "::" and any spaces around the colons + """ RE_MEDIA_COLUMN_NAMES = '|'.join(MEDIA_COLUMN_NAMES) + if column_name in already_seen: + return already_seen[column_name] # "LaBeL" -> "label", "HiNT" -> "hint" if column_name.lower() in ['label', 'hint']: - return column_name.lower() + cleaned = column_name.lower() + already_seen[column_name] = cleaned + return cleaned # "Bind:Some:Thing" -> "bind:Some:Thing", "BodY:" -> "body:" match = re.match(r'^(bind|body):.*', column_name, flags=re.IGNORECASE) if match: - lower_cased = match.group(0).lower() - return re.sub(r'^(bind|body)', lower_cased, column_name, flags=re.IGNORECASE) + lower_cased = match.groups()[0].lower() + cleaned = re.sub(r'^(bind|body)', lower_cased, column_name, flags=re.IGNORECASE) + already_seen[column_name] = cleaned + return cleaned # "Media:Audio::ES" -> "media:audio::ES", "ViDeO : ES" -> "video : ES" match = re.match( @@ -116,40 +126,47 @@ def clean_column_name(column_name: str) -> str: ) if match: matched = match.groups() - lower_media_prefix = matched[0].lower() + lower_media_prefix = matched[0].lower() if matched[0] else '' lower_media_type = matched[1].lower() - return re.sub(rf'^(media\s*::?\s*)?({RE_MEDIA_COLUMN_NAMES})(\s*::?\s*)([^:]+)$', + cleaned = re.sub(rf'^(media\s*::?\s*)?({RE_MEDIA_COLUMN_NAMES})(\s*::?\s*)([^:]+)$', rf'{lower_media_prefix}{lower_media_type}\3\4', column_name, flags=re.IGNORECASE) + already_seen[column_name] = cleaned + return cleaned # "Media: AuDiO" -> "media: audio", "VIDEO" -> "video" match = re.match( - rf'^(media\s*::?\s*)?({RE_MEDIA_COLUMN_NAMES})$', column_name + rf'^(media\s*::?\s*)?({RE_MEDIA_COLUMN_NAMES})$', column_name, flags=re.IGNORECASE ) if match: matched = match.groups() - lower_media_prefix = matched[0].lower() + lower_media_prefix = matched[0].lower() if matched[0] else '' lower_media_type = matched[1].lower() - return re.sub(rf'^(media\s*::?\s*)?({RE_MEDIA_COLUMN_NAMES})$', + cleaned = re.sub(rf'^(media\s*::?\s*)?({RE_MEDIA_COLUMN_NAMES})$', rf'{lower_media_prefix}{lower_media_type}', column_name, flags=re.IGNORECASE) + already_seen[column_name] = cleaned match = re.match(r'^([^:]+)(\s*::?\s*)([^:]+)$', column_name) if match: # example: label::x, constraint_message::x, hint::x matched = match.groups() lower_column_shortname = matched[0].lower() - return re.sub(r'^([^:]+)(\s*::?\s*)([^:]+)$', rf'{lower_column_shortname}\2\3', column_name, + cleaned = re.sub(r'^([^:]+)(\s*::?\s*)([^:]+)$', rf'{lower_column_shortname}\2\3', column_name, flags=re.IGNORECASE) - return column_name.lower() + already_seen[column_name] = cleaned + return cleaned + cleaned = column_name.lower() + already_seen[column_name] = cleaned + return cleaned def preprocess_columns(content: Dict[str, List[Any]]) -> None: - + seen = {} for sheet, rows in content.items(): for row in rows: for column_name, value in row.copy().items(): - cleaned_name = clean_column_name(column_name) + cleaned_name = clean_column_name(column_name, seen) del row[column_name] row[cleaned_name] = value diff --git a/tests/test_expand_content.py b/tests/test_expand_content.py index f6a618f..03c4414 100644 --- a/tests/test_expand_content.py +++ b/tests/test_expand_content.py @@ -7,14 +7,13 @@ from formpack import FormPack from formpack.constants import OR_OTHER_COLUMN as _OR_OTHER from formpack.constants import UNTRANSLATED -from formpack.utils.expand_content import SCHEMA_VERSION +from formpack.utils.expand_content import SCHEMA_VERSION, clean_column_name from formpack.utils.expand_content import _expand_tags from formpack.utils.expand_content import _get_special_survey_cols from formpack.utils.expand_content import expand_content, _expand_type_to_dict from formpack.utils.flatten_content import flatten_content from formpack.utils.string import orderable_with_none -from formpack.src.formpack.utils.expand_content import clean_column_name def test_expand_selects_with_or_other(): @@ -629,7 +628,7 @@ class ColumnTestCase(TestCase): ('VIDEO :: SPANISH', 'video :: SPANISH'), ('MEDIA:AUDIO', 'media:audio'), ('IMAGE', 'image'), - ('LABEL : SPANISH', 'label : Spanish') + ('LABEL : SPANISH', 'label : SPANISH') ) @unpack def test_clean_column_name(self, name, expected):