Refactor combine_journal_lists scripts to improve quality #163

Merged · 1 commit · Oct 7, 2024
scripts/combine_journal_lists_dotless.py — 97 changes: 76 additions & 21 deletions
@@ -3,7 +3,7 @@
 """
 Python script for combining several journal abbreviation lists
 and producing an alphabetically sorted list. If the same journal
-names are repeated, only the version found last is retained.
+names are repeated, only the version found first is retained.

 This version of the script specifically combines the lists following the ISO4
 standard WITHOUT dots after abbreviated words.
@@ -13,37 +13,92 @@
 Output: writes file 'journalList_dotless.csv'
 """

+import csv
+import json
+from pathlib import Path
+import re
 import sys
-import pandas as pd

 # Define the list of CSV files
 import_order = [
-    'journals/journal_abbreviations_entrez.csv',
-    'journals/journal_abbreviations_medicus.csv',
-    'journals/journal_abbreviations_webofscience-dotless.csv'
+    "journals/journal_abbreviations_entrez.csv",
+    "journals/journal_abbreviations_medicus.csv",
+    "journals/journal_abbreviations_webofscience-dotless.csv",
 ]


-def main(output_filename):
-    # Read and merge CSV files
-    # dfs = [pd.read_csv(file, header=None) for file in import_order]
-    dfs = []
-    for file in import_order:
-        df = pd.read_csv(file, header=None)
-        dfs.append(df)
-        print(f"{file}: {len(df)}")
-    merged_df = pd.concat(dfs, ignore_index=True)
+def load_data(file_paths):
+    """Load and combine data from CSV files."""
+    journal_dict = {}
+    normalized_keys = set()
+    for path in file_paths:
+        with open(path, mode="r", encoding="utf-8") as file:
+            reader = csv.reader(file)
+            for row in reader:
+                name = row[0].strip()
+                abbr = row[1].strip()

-    # Drop duplicates based on the first column value and keep the last one obtained
-    merged_df.drop_duplicates(subset=[0], keep='last', inplace=True)
+                # Discard entries where name or abbr is missing
+                if not (name and abbr):
+                    continue
+                # Discard entries that are too long or too short
+                if len(name) >= 80 or len(name) <= 3:
+                    continue
+                # Discard names that start with non-alphanumeric characters
+                if not name[0].isalnum():
+                    continue
+                # Discard names that consist only of numbers
+                if name.replace(" ", "").isnumeric():
+                    continue
+                # Discard names containing \
+                if name.count("\\"):
+                    continue
+                # Discard entries where the first letters of name and abbr do not match
+                if abbr[0] != name.replace("The", "").replace("A ", "")[0]:
+                    continue
+                # Only keep the first occurrence
+                if name in journal_dict:
+                    continue
+                # Generate normalizedKey, keeping only the first match
+                normalized_key = normalize_name(name)
+                if normalized_key in normalized_keys:
+                    continue

-    # Sort alphabetically
-    sorted_df = merged_df.sort_values(by=[0])
+                journal_dict[name] = abbr
+                normalized_keys.add(normalized_key)  # Add to the set of used keys
+    return journal_dict

-    # Save the result to the specified CSV file and ensure values are quoted
-    sorted_df.to_csv(output_filename, index=False, header=False, quoting=1)

-    print(f"Write {output_filename}, Combined key count: {len(merged_df)}")
+def normalize_name(name):
+    """
+    Normalize the journal name by removing specified characters using regex.
+    See src/utils/str.ts -> normalizeKey()
+    """
+    return re.sub(r"\b(the|and)\b|[&\-:, ()]", "", name, flags=re.IGNORECASE).lower()
+
+
+def save_to_json(data, output_file):
+    """Save the data to a JSON file."""
+    with open(output_file, mode="w", encoding="utf-8") as json_file:
+        json.dump(data, json_file, indent=2, ensure_ascii=False)
+
+
+def save_to_csv(data, output_file):
+    """Save the data to a CSV file."""
+    with open(output_file, mode="w", newline="", encoding="utf-8") as csv_file:
+        writer = csv.writer(csv_file, quoting=1)
+        for name, abbr in data.items():
+            writer.writerow([name, abbr])
+
+
+def main(filename):
+    base_path = Path().cwd()
+    output_filename = base_path / filename
+    import_paths = [base_path / file for file in import_order]
+
+    journal_data = load_data(import_paths)
+    sorted_journal_data = dict(sorted(journal_data.items()))  # Sort alphabetically
+    save_to_csv(sorted_journal_data, output_filename)


 if __name__ == "__main__":
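
For reference, the new load_data deduplicates on two keys: the exact journal name and a normalized form of it. A minimal standalone sketch of that behavior, using the same regex as the script (the journal names and rows below are invented for illustration):

import re

def normalize_name(name):
    # Same regex as the scripts: drop "the"/"and" as whole words plus
    # the characters & - : , space ( ), then lowercase.
    return re.sub(r"\b(the|and)\b|[&\-:, ()]", "", name, flags=re.IGNORECASE).lower()

# Invented sample rows; all three names normalize to "journaloffoobar".
rows = [
    ("Journal of Foo and Bar", "J Foo Bar"),
    ("Journal of Foo & Bar", "J Foo Bar"),
    ("Journal of Foo, Bar", "J Foo Bar"),
]

journal_dict = {}
normalized_keys = set()
for name, abbr in rows:
    key = normalize_name(name)
    if name in journal_dict or key in normalized_keys:
        continue  # first occurrence wins
    journal_dict[name] = abbr
    normalized_keys.add(key)

print(journal_dict)  # {'Journal of Foo and Bar': 'J Foo Bar'}

Because both the raw name and its normalized key are tracked, near-duplicate spellings from later lists are dropped rather than overwriting earlier entries, as the old pandas drop_duplicates(keep='last') did.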
scripts/combine_journal_lists_dots.py — 117 changes: 87 additions & 30 deletions
@@ -3,7 +3,7 @@
 """
 Python script for combining several journal abbreviation lists
 and producing an alphabetically sorted list. If the same journal
-names are repeated, only the version found last is retained.
+names are repeated, only the version found first is retained.

 This version of the script specifically combines the lists following the ISO4
 standard WITH dots after abbreviated words.
@@ -13,45 +13,102 @@
 Output: writes file 'journalList_dots.csv' (or specified output file)
 """

+import csv
+import json
+from pathlib import Path
+import re
 import sys
-import pandas as pd

 # Define the list of CSV files
 import_order = [
-    'journals/journal_abbreviations_acs.csv',
-    'journals/journal_abbreviations_ams.csv',
-    'journals/journal_abbreviations_general.csv',
-    'journals/journal_abbreviations_geology_physics.csv',
-    'journals/journal_abbreviations_ieee.csv',
-    'journals/journal_abbreviations_lifescience.csv',
-    'journals/journal_abbreviations_mathematics.csv',
-    'journals/journal_abbreviations_mechanical.csv',
-    'journals/journal_abbreviations_meteorology.csv',
-    'journals/journal_abbreviations_sociology.csv',
-    'journals/journal_abbreviations_webofscience-dots.csv'
+    # Keep IEEE before ubc, because IEEE has its own style.
+    "journals/journal_abbreviations_ieee.csv",
+    "journals/journal_abbreviations_acs.csv",
+    # Keep ubc before other jabref's, because ubc's data is more accurate.
+    "journals/journal_abbreviations_ubc.csv",
+    "journals/journal_abbreviations_ams.csv",
+    "journals/journal_abbreviations_general.csv",
+    "journals/journal_abbreviations_geology_physics.csv",
+    "journals/journal_abbreviations_lifescience.csv",
+    "journals/journal_abbreviations_mathematics.csv",
+    "journals/journal_abbreviations_mechanical.csv",
+    "journals/journal_abbreviations_meteorology.csv",
+    "journals/journal_abbreviations_sociology.csv",
+    "journals/journal_abbreviations_webofscience-dots.csv",
 ]


-def main(output_filename):
-    # Read and merge CSV files
-    # dfs = [pd.read_csv(file, header=None) for file in import_order]
-    dfs = []
-    for file in import_order:
-        df = pd.read_csv(file, header=None)
-        dfs.append(df)
-        print(f"{file}: {len(df)}")
-    merged_df = pd.concat(dfs, ignore_index=True)
+def load_data(file_paths):
+    """Load and combine data from CSV files."""
+    journal_dict = {}
+    normalized_keys = set()
+    for path in file_paths:
+        with open(path, mode="r", encoding="utf-8") as file:
+            reader = csv.reader(file)
+            for row in reader:
+                name = row[0].strip()
+                abbr = row[1].strip()

-    # Drop duplicates based on the first column value and keep the last one obtained
-    merged_df.drop_duplicates(subset=[0], keep='last', inplace=True)
+                # Discard entries where name or abbr is missing
+                if not (name and abbr):
+                    continue
+                # Discard entries that are too long or too short
+                if len(name) >= 80 or len(name) <= 3:
+                    continue
+                # Discard names that start with non-alphanumeric characters
+                if not name[0].isalnum():
+                    continue
+                # Discard names that consist only of numbers
+                if name.replace(" ", "").isnumeric():
+                    continue
+                # Discard names containing \
+                if name.count("\\"):
+                    continue
+                # Discard entries where the first letters of name and abbr do not match
+                if abbr[0] != name.replace("The", "").replace("A ", "")[0]:
+                    continue
+                # Only keep the first occurrence
+                if name in journal_dict:
+                    continue
+                # Generate normalizedKey, keeping only the first match
+                normalized_key = normalize_name(name)
+                if normalized_key in normalized_keys:
+                    continue

-    # Sort alphabetically
-    sorted_df = merged_df.sort_values(by=[0])
+                journal_dict[name] = abbr
+                normalized_keys.add(normalized_key)  # Add to the set of used keys
+    return journal_dict

-    # Save the result to the specified CSV file and ensure values are quoted
-    sorted_df.to_csv(output_filename, index=False, header=False, quoting=1)

-    print(f"Write {output_filename}, Combined key count: {len(merged_df)}")
+def normalize_name(name):
+    """
+    Normalize the journal name by removing specified characters using regex.
+    See src/utils/str.ts -> normalizeKey()
+    """
+    return re.sub(r"\b(the|and)\b|[&\-:, ()]", "", name, flags=re.IGNORECASE).lower()
+
+
+def save_to_json(data, output_file):
+    """Save the data to a JSON file."""
+    with open(output_file, mode="w", encoding="utf-8") as json_file:
+        json.dump(data, json_file, indent=2, ensure_ascii=False)
+
+
+def save_to_csv(data, output_file):
+    """Save the data to a CSV file."""
+    with open(output_file, mode="w", newline="", encoding="utf-8") as csv_file:
+        writer = csv.writer(csv_file, quoting=1)
+        for name, abbr in data.items():
+            writer.writerow([name, abbr])
+
+
+def main(filename):
+    base_path = Path().cwd()
+    output_filename = base_path / filename
+    import_paths = [base_path / file for file in import_order]
+
+    journal_data = load_data(import_paths)
+    sorted_journal_data = dict(sorted(journal_data.items()))  # Sort alphabetically
+    save_to_csv(sorted_journal_data, output_filename)


 if __name__ == "__main__":
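
One consequence of this rewrite worth noting: because load_data keeps only the first occurrence of a name, import_order is now a precedence list, which is why the comments pin the IEEE and ubc files near the top. A minimal sketch of that keep-first behavior (the file names and rows below are invented for illustration):

# Keep-first merge, mirroring load_data: earlier sources win conflicts.
sources = [
    ("ieee.csv", [("IEEE Transactions on Foo", "IEEE Trans. Foo")]),
    ("general.csv", [("IEEE Transactions on Foo", "IEEE T. Foo")]),
]

journal_dict = {}
for filename, rows in sources:  # iteration order mirrors import_order
    for name, abbr in rows:
        if name in journal_dict:
            continue  # a later list can never override an earlier one
        journal_dict[name] = abbr

print(journal_dict["IEEE Transactions on Foo"])  # IEEE Trans. Foo (IEEE style wins)

Under the old pandas version, drop_duplicates(keep='last') resolved the same conflict the opposite way, letting the last list in import_order win.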