Skip to content

Commit 6377ef8

Browse files
committed
feat(harvester): add Memovs audiovisual archives API
Add new API harvester for Médiathèque Valais audiovisual archives (archives.memovs.ch). This harvester enables automatic ingestion of Film, Photo, and Audio metadata from the Memovs digital archives.

Implementation includes:
- ApiMemovs class with pagination and date filtering support
- JSON transformation layer mapping Memovs metadata to RERO-ILS
- Document type detection (Film, Audio, Photo, Other)
- Holdings creation with electronic locations
- Support for creators, contributors, subjects, and descriptions
- Integration with existing CLI and task infrastructure
- Fix duplicate logging in CLI by disabling logger propagation

Configuration added as VS-MEMO in apisources.yml with endpoint https://archives.memovs.ch/docs/api/ and code 'memovs'.

Technical notes:
- Uses current_app.logger with lazy % formatting for performance
- Sets app.logger.propagate = False to prevent duplicate log messages in CLI commands

Co-authored-by: Peter Weber <[email protected]>
1 parent 51036d4 commit 6377ef8

File tree

12 files changed

+1937
-2
lines changed

12 files changed

+1937
-2
lines changed

data/apisources.yml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,3 +26,8 @@ NJ-CANTOOK:
2626
url: https://bm.ebibliomedia.ch
2727
classname: 'rero_ils.modules.api_harvester.cantook.api.ApiCantook'
2828
code: 'ebibliomedia'
29+
30+
VS-MEMO:
31+
url: https://archives.memovs.ch/docs/api/
32+
classname: 'rero_ils.modules.api_harvester.memovs.api.ApiMemovs'
33+
code: 'memovs'

data/organisations.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
"default_currency": "GBP",
2020
"current_budget_pid": "2",
2121
"online_harvested_source": [
22-
"mv-cantook"
22+
"mv-cantook", "memovs"
2323
],
2424
"collection_enabled_on_public_view": false
2525
},

rero_ils/config.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -525,6 +525,12 @@ def _(x):
525525
"kwargs": {"name": "NJ-CANTOOK"},
526526
"enabled": False,
527527
},
528+
"celery.harvest-vs-memo": {
529+
"task": "rero_ils.modules.api_harvester.tasks.harvest_records",
530+
"schedule": schedules.crontab(minute=55, hour=5), # Every day at 05:55 UTC
531+
"kwargs": {"name": "VS-MEMO"},
532+
"enabled": False,
533+
},
528534
}
529535

530536
INDEXER_BULK_REQUEST_TIMEOUT = 60
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
# -*- coding: utf-8 -*-
2+
#
3+
# RERO ILS
4+
# Copyright (C) 2024 RERO
5+
#
6+
# This program is free software: you can redistribute it and/or modify
7+
# it under the terms of the GNU Affero General Public License as published by
8+
# the Free Software Foundation, version 3 of the License.
9+
#
10+
# This program is distributed in the hope that it will be useful,
11+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
12+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13+
# GNU Affero General Public License for more details.
14+
#
15+
# You should have received a copy of the GNU Affero General Public License
16+
# along with this program. If not, see <http://www.gnu.org/licenses/>.
17+
18+
"""Memovs API harvester module."""
Lines changed: 189 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,189 @@
1+
# -*- coding: utf-8 -*-
2+
#
3+
# RERO ILS
4+
# Copyright (C) 2024 RERO
5+
#
6+
# This program is free software: you can redistribute it and/or modify
7+
# it under the terms of the GNU Affero General Public License as published by
8+
# the Free Software Foundation, version 3 of the License.
9+
#
10+
# This program is distributed in the hope that it will be useful,
11+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
12+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13+
# GNU Affero General Public License for more details.
14+
#
15+
# You should have received a copy of the GNU Affero General Public License
16+
# along with this program. If not, see <http://www.gnu.org/licenses/>.
17+
18+
"""API for Memovs audiovisual archives records."""
19+
20+
from invenio_db import db
21+
from requests import codes as requests_codes
22+
23+
from rero_ils.modules.documents.api import Document, DocumentsSearch
24+
from rero_ils.modules.holdings.api import Holding, HoldingsSearch, create_holding
25+
from rero_ils.modules.utils import JsonWriter, requests_retry_session
26+
27+
from ..api import ApiHarvest
28+
from ..models import HarvestActionType
29+
from .dojson.json import memovs_json
30+
31+
32+
class ApiMemovs(ApiHarvest):
    """ApiMemovs class.

    Class for harvesting audiovisual archives from Memovs API.

    The attributes ``_url``, ``_code``, ``_info``, ``_count`` and the
    ``_count_*`` counters used below are not defined in this class; they are
    presumably provided by ``ApiHarvest`` -- confirm against the base class.
    """

    def __init__(self, name, file_name=None, process=False, harvest_count=-1, verbose=False):
        """Class init.

        :param name: name of the API source, forwarded to ``ApiHarvest``
        :param file_name: optional file name; when given, a ``JsonWriter``
            for this file is stored in ``self.file``
        :param process: forwarded to ``ApiHarvest``
        :param harvest_count: forwarded to ``ApiHarvest``; ``harvest_records``
            treats a negative value as "no limit"
        :param verbose: forwarded to ``ApiHarvest``
        """
        super().__init__(
            name=name,
            process=process,
            harvest_count=harvest_count,
            verbose=verbose,
        )
        if file_name:
            self.file = JsonWriter(file_name)
        self._vendor = "MEMOVS"

    def get_request_url(self, start_date="1990-01-01", page=1):
        """Get request URL.

        :param start_date: date from where records have to be harvested
        :param page: page from where records have to be harvested
        :returns: request url
        """
        params = f"from={start_date}&currentPage={page}"
        return f"{self._url}?{params}"

    def delete_holdings(self, document_pid):
        """Delete the holdings of a document that were created by this source.

        A holding is deleted when at least one of its electronic locations
        has ``source`` equal to ``self._code``.

        :param document_pid: document pid
        """
        # The pids are materialized first because holdings are deleted while looping.
        for hold_pid in list(Holding.get_holdings_pid_by_document_pid(document_pid)):
            if holding := Holding.get_record_by_pid(hold_pid):
                for electronic_location in holding.get("electronic_location", []):
                    if electronic_location.get("source") == self._code:
                        holding.delete(dbcommit=True, delindex=True)
                        # The holding is gone; do not inspect its other locations.
                        break

    def create_holdings(self, document_pid, link):
        """Create the missing electronic holdings for a document.

        One holding is created for every location found in ``self._info``
        that does not yet have an electronic holding from this source for
        the document.

        :param document_pid: document pid
        :param link: link to memovs document
        """
        holdings = []
        for info in self._info.values():
            item_type_pid = info["item_type_pid"]
            for location_pid, url in info["locations"].items():
                # Always derive the holding link from the original ``link``.
                # Reassigning ``link`` here would leak one location's
                # specific URL into all the following locations.
                holding_link = link
                if url:
                    # Use organization-specific URL if available: keep the
                    # path of the link (everything after "scheme://host/")
                    # and prefix it with the location URL.
                    link_parts = link.split("/")[3:]
                    link_parts.insert(0, url.rstrip("/"))
                    holding_link = "/".join(link_parts)
                # Check if the holding already exists
                query = (
                    HoldingsSearch()
                    .filter("term", document__pid=document_pid)
                    .filter("term", location__pid=location_pid)
                    .filter("term", holdings_type="electronic")
                    .filter("term", electronic_location__source=self._code)
                )
                if query.count() == 0:
                    holding = create_holding(
                        document_pid=document_pid,
                        location_pid=location_pid,
                        item_type_pid=item_type_pid,
                        electronic_location={"source": self._code, "uri": holding_link},
                        holdings_type="electronic",
                    )
                    holdings.append(holding)
        # Commit once for all new holdings, then index them.
        db.session.commit()
        for holding in holdings:
            holding.reindex()

    def create_update_record(self, data):
        """Create, update or delete record.

        The data is transformed with ``memovs_json.do``. The transformed
        record must contain a ``pid`` (the harvested id) and may contain the
        ``deleted`` and ``link`` keys; these three keys are removed from the
        record before it is stored.

        :param data: data for record operation
        :returns: harvested id and status
        :raises KeyError: if the transformed record has no ``pid``
        """
        status = HarvestActionType.NOTSET
        record = None
        record_data = memovs_json.do(data)
        if record_data.pop("deleted", None):
            status = HarvestActionType.DELETED
        link = record_data.pop("link", None)

        # Get harvested ID
        harvested_id = record_data.pop("pid")
        # Check if document already exists
        query = DocumentsSearch().filter("term", identifiedBy__value__raw=harvested_id).source(includes=["pid"])
        try:
            pid = next(query.scan()).pid
        except StopIteration:
            pid = None

        if pid:
            if doc := Document.get_record_by_pid(pid):
                if status == HarvestActionType.DELETED:
                    self._count_del += 1
                    self.delete_holdings(document_pid=doc.pid)
                    # Try to delete document
                    doc.pop("harvested", None)
                    if not doc.reasons_not_to_delete():
                        doc.delete(dbcommit=True, delindex=True)
                else:
                    self._count_upd += 1
                    status = HarvestActionType.UPDATED
                    # Keep the pid of the existing document when replacing its data.
                    record_data["pid"] = doc.pid
                    record = doc.replace(data=record_data, dbcommit=True, reindex=True)
                    if link:
                        self.create_holdings(document_pid=record.pid, link=link)
        elif status == HarvestActionType.NOTSET:
            # Unknown record that is not flagged as deleted: create it.
            self._count_new += 1
            status = HarvestActionType.CREATED
            record = Document.create(data=record_data, dbcommit=True, reindex=True)
            if link:
                self.create_holdings(document_pid=record.pid, link=link)
        return harvested_id, status

    def harvest_records(self, from_date):
        """Harvest Memovs records.

        Pages are requested one after the other, starting with page 1, until
        the last page is processed, a request fails or the harvest limit is
        reached.

        :param from_date: record changed after this date to get
        :returns: count and total items
        """
        self._count = 0
        url = self.get_request_url(start_date=from_date, page=1)
        request = requests_retry_session().get(url)

        if request.status_code != requests_codes.ok:
            self.verbose_print(f"Error fetching data: {request.status_code}")
            return self._count, 0

        response_data = request.json()
        total_pages = response_data.get("totalPages", 0)
        total_items = response_data.get("totalDocuments", 0)
        current_page = response_data.get("currentPage", 1)

        # NOTE(review): ``self._count`` is not changed in this method; it is
        # presumably incremented by ``process_records`` -- confirm.
        while (
            request.status_code == requests_codes.ok
            and current_page <= total_pages
            and (self.harvest_count < 0 or self._count < self.harvest_count)
        ):
            self.verbose_print(f"API page: {current_page}/{total_pages} url: {url}")
            self.process_records(response_data.get("documents", []))

            # Get next page
            current_page += 1
            if current_page <= total_pages:
                url = self.get_request_url(start_date=from_date, page=current_page)
                request = requests_retry_session().get(url)
                if request.status_code == requests_codes.ok:
                    response_data = request.json()
                else:
                    # The loop condition ends the harvest on this status;
                    # report it like a failure on the first page.
                    self.verbose_print(f"Error fetching data: {request.status_code}")

        return self._count, total_items
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
# -*- coding: utf-8 -*-
2+
#
3+
# RERO ILS
4+
# Copyright (C) 2024 RERO
5+
#
6+
# This program is free software: you can redistribute it and/or modify
7+
# it under the terms of the GNU Affero General Public License as published by
8+
# the Free Software Foundation, version 3 of the License.
9+
#
10+
# This program is distributed in the hope that it will be useful,
11+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
12+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13+
# GNU Affero General Public License for more details.
14+
#
15+
# You should have received a copy of the GNU Affero General Public License
16+
# along with this program. If not, see <http://www.gnu.org/licenses/>.
17+
18+
"""Memovs dojson module."""
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
# -*- coding: utf-8 -*-
2+
#
3+
# RERO ILS
4+
# Copyright (C) 2024 RERO
5+
#
6+
# This program is free software: you can redistribute it and/or modify
7+
# it under the terms of the GNU Affero General Public License as published by
8+
# the Free Software Foundation, version 3 of the License.
9+
#
10+
# This program is distributed in the hope that it will be useful,
11+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
12+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13+
# GNU Affero General Public License for more details.
14+
#
15+
# You should have received a copy of the GNU Affero General Public License
16+
# along with this program. If not, see <http://www.gnu.org/licenses/>.
17+
18+
"""Memovs json transformation."""
19+
20+
from .model import Transformation
21+
22+
# Module-level Transformation instance; the Memovs harvester API calls its
# ``do`` method on each harvested record before creating or updating a document.
memovs_json = Transformation()

0 commit comments

Comments
 (0)