Skip to content

Commit 690bcdd

Browse files
committed
update synchro script
1 parent 028e7ca commit 690bcdd

File tree

9 files changed

+109
-96
lines changed

9 files changed

+109
-96
lines changed

.gitignore

+2
Original file line numberDiff line numberDiff line change
@@ -36,3 +36,5 @@ coverage/
3636
###> phpstan/phpstan ###
3737
phpstan.neon
3838
###< phpstan/phpstan ###
39+
40+
openchurch_elasticsearch_data

compose.yaml

+5
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,8 @@ services:
3939
- app-network
4040
ports:
4141
- 9200:9200
42+
volumes:
43+
- ./openchurch_elasticsearch_data:/usr/share/elasticsearch/data
4244
backend:
4345
extra_hosts: *default-extra_hosts
4446
container_name: openchurch_backend
@@ -74,6 +76,7 @@ services:
7476
dockerfile: ./docker/python/Dockerfile
7577
depends_on:
7678
- redis
79+
- backend
7780
networks:
7881
- app-network
7982
extra_hosts:
@@ -83,6 +86,7 @@ services:
8386
OPENCHURCH_HOST: "https://api.openchurch.local/api"
8487
PYTHONWARNINGS: "ignore:Unverified HTTPS request"
8588
SYNCHRO_SECRET_KEY: "secret"
89+
REDIS_URL: redis://redis:6379
8690
volumes:
8791
- ./scripts:/app
8892
redis:
@@ -101,6 +105,7 @@ services:
101105

102106
volumes:
103107
openchurch_db_data: {}
108+
openchurch_elasticsearch_data: {}
104109

105110
networks:
106111
app-network:

docker/python/Dockerfile

-5
Original file line numberDiff line numberDiff line change
@@ -14,15 +14,10 @@ FROM python:3.9-slim as runner
1414
WORKDIR /app/
1515
COPY --from=compiler /opt/venv /opt/venv
1616

17-
# install and configure cron daemon
18-
RUN apt update; apt install cron libterm-readline-gnu-perl procps -y; apt -yq autoremove; apt clean
19-
COPY etc/cron.d/syncro /etc/cron.d/syncro
2017
COPY usr/local/bin/docker-python-entrypoint /usr/local/bin/docker-python-entrypoint
2118

2219
# Enable venv
2320
ENV PATH="/opt/venv/bin:$PATH"
2421
COPY ./scripts /app/
2522

26-
CMD ["/usr/sbin/cron", "-f", "-l", "15"]
27-
2823
ENTRYPOINT ["/usr/local/bin/docker-python-entrypoint"]

etc/cron.d/syncro

-27
This file was deleted.

scripts/requirements.txt

+2-1
Original file line numberDiff line numberDiff line change
@@ -6,4 +6,5 @@ SPARQLWrapper==1.8.5
66
SQLAlchemy==1.3.20
77
urllib3==1.25.11
88
sentry-sdk==1.3.0
9-
mwparserfromhell>=0.5.0
9+
mwparserfromhell>=0.5.0
10+
redis==5.2.1

scripts/synchro.py

+87-31
Original file line numberDiff line numberDiff line change
@@ -9,13 +9,16 @@
99
import sentry_sdk
1010
import urllib.parse
1111
import urllib3
12+
import argparse
13+
import redis
1214

1315
from codecs import open
1416
from dotenv import load_dotenv
1517
from sqlalchemy import create_engine, exc, MetaData, Table, orm, func, insert, update
1618
from SPARQLWrapper import SPARQLWrapper, JSON
1719
from requests.adapters import HTTPAdapter
1820
from urllib3.util.retry import Retry
21+
from datetime import datetime, timedelta
1922

2023
load_dotenv(dotenv_path='.env')
2124
sentry_sdk.init(dsn=os.getenv('SENTRY_DSN_SYNCHRO'))
@@ -110,6 +113,7 @@ class Query(object):
110113
373074, # suffragan diocese
111114
]
112115
dateformat = '%Y-%m-%d %H:%M:%S'
116+
verbosity_level = 0
113117

114118
def __init__(self):
115119
self.cache_places = {}
@@ -149,7 +153,7 @@ def split_into_batches(lst, batch_size):
149153

150154
def fetch(self, file_name, query):
151155
if os.path.isfile(file_name):
152-
if os.path.getmtime(file_name) > time.time() - 12 * 3600: # cache JSON for 12 hours
156+
if os.path.getmtime(file_name) > time.time() - 1 * 1800: # cache JSON for 30 mins
153157
with open(file_name, 'r', encoding='utf-8') as content_file:
154158
print('Loading from file', file_name ,'please wait...')
155159
return json.loads(content_file.read())
@@ -195,7 +199,7 @@ def extractDiocesesFromSparqlQuery(self, sparqlData):
195199
'name': Query.ucfirst(label_fr),
196200
'contactCountryCode': 'fr',
197201
'website': website,
198-
'wikidataUpdatedAt': str(datetime.datetime.strptime(modified, Query.dateformat)),
202+
'wikidataUpdatedAt': str(datetime.strptime(modified, Query.dateformat)),
199203
}
200204
return dioceses
201205

@@ -284,7 +288,7 @@ def extractChurchesFromSparqlQuery(self, sparqlData):
284288
'name': Query.ucfirst(label_fr),
285289
'latitude': float(latitude),
286290
'longitude': float(longitude),
287-
'wikidataUpdatedAt': str(datetime.datetime.strptime(modified, Query.dateformat)),
291+
'wikidataUpdatedAt': str(datetime.strptime(modified, Query.dateformat)),
288292
}
289293
return churches
290294

@@ -295,8 +299,10 @@ def update_dioceses(self, sparqlData, client):
295299
fields = client.populate_fields(wikidataIdDioceses[wikidataId], wikidataId)
296300
wikidataEntities['wikidataEntities'].append(fields)
297301
if len(wikidataEntities['wikidataEntities']) > 0:
302+
self.print_logs(wikidataEntities['wikidataEntities'], 2)
298303
response = client.upsert_wikidata_entities('/communities/upsert', wikidataEntities)
299-
print('DIOCESES : ', response)
304+
self.print_logs(response, 1)
305+
return response
300306

301307
def update_parishes(self, sparqlData, client):
302308
wikidataEntities = {'wikidataEntities': []}
@@ -305,8 +311,10 @@ def update_parishes(self, sparqlData, client):
305311
fields = client.populate_fields(wikidataIdParishes[wikidataId], wikidataId)
306312
wikidataEntities['wikidataEntities'].append(fields)
307313
if len(wikidataEntities['wikidataEntities']) > 0:
314+
self.print_logs(wikidataEntities['wikidataEntities'], 2)
308315
response = client.upsert_wikidata_entities('/communities/upsert', wikidataEntities)
309-
print("PARISHES : ", response)
316+
self.print_logs(response, 1)
317+
return response
310318

311319
def update_churches(self, sparqlData, client):
312320
wikidataEntities = {'wikidataEntities': []}
@@ -315,8 +323,14 @@ def update_churches(self, sparqlData, client):
315323
fields = client.populate_fields(wikidataIdChurches[wikidataId], wikidataId)
316324
wikidataEntities['wikidataEntities'].append(fields)
317325
if len(wikidataEntities['wikidataEntities']) > 0:
326+
self.print_logs(wikidataEntities['wikidataEntities'], 2)
318327
response = client.upsert_wikidata_entities('/places/upsert', wikidataEntities)
319-
print("CHURCHES : ", response)
328+
self.print_logs(response, 1)
329+
return response
330+
331+
def print_logs(self, data, required_level):
    """Pretty-print ``data`` as JSON when the verbosity level allows it.

    Nothing is printed unless ``self.verbosity_level`` is at least
    ``required_level``. The output is indented by four spaces and keeps
    non-ASCII characters unescaped.
    """
    if self.verbosity_level < required_level:
        return
    rendered = json.dumps(data, indent=4, ensure_ascii=False)
    print(rendered)
320334

321335
class UuidDoesNotExistException(Exception):
322336
pass
@@ -330,7 +344,7 @@ class OpenChurchClient(object):
330344
session = requests.Session()
331345
# Configure retries and timeouts
332346
retry_strategy = Retry(
333-
total=3,
347+
total=1,
334348
backoff_factor=1,
335349
status_forcelist=[429, 500, 502, 503, 504]
336350
)
@@ -347,15 +361,15 @@ class OpenChurchClient(object):
347361
session.request_timeout = (3.05, 27)
348362

349363
def upsert_wikidata_entities(self, path, body):
    """Send a batch of Wikidata entities to the OpenChurch API with a PUT.

    Args:
        path: API path appended to ``self.hostname``
            (for example ``/places/upsert``).
        body: JSON-serialisable payload sent as the request body.

    Returns:
        The decoded JSON response when the API answers 200, otherwise
        ``None`` (non-200 status, undecodable body or request failure).
        Failures are printed rather than raised so that the caller can
        record the batch as failed and carry on.
    """
    try:
        response = self.session.put(
            self.hostname + path,
            json=body,
            headers=self.headers,
            # NOTE(review): TLS verification is disabled here; presumably for
            # a self-signed local certificate -- confirm before production use.
            verify=False,
            # The timeout stored on the session is not applied by requests on
            # its own, so it is forwarded explicitly (None keeps the default).
            timeout=getattr(self.session, 'request_timeout', None),
        )
    except requests.exceptions.RequestException as e:
        # Report the failure instead of swallowing it silently.
        print('Request failed for PUT', path, e)
        return None

    if response.status_code != 200:
        print(response.status_code, 'for PUT', path)
        return None

    try:
        return response.json()
    except ValueError as e:
        # A 200 answer whose body is not valid JSON is treated as a failure.
        print('Invalid JSON in response for PUT', path, e)
        return None
360374

361375
def populate_fields(self, values, wikidata_id):
@@ -375,22 +389,64 @@ def populate_fields(self, values, wikidata_id):
375389
def percentage(num, total):
    """Format ``num`` together with its share of ``total`` as a percentage.

    Returns a string such as ``'1 = 25.0%'``; the ratio is rounded to two
    decimal places.
    """
    ratio = round(100 * num / total, 2)
    return '%s = %s%%' % (num, ratio)
377391

378-
if __name__ == '__main__':
392+
def get_redis_key(type, origin, to):
    """Build a Redis key of the form ``<type>_<origin>-<to>``."""
    return '{}_{}-{}'.format(type, origin, to)
394+
395+
def process_entity(type, batch_size, verbosity_level):
    """Synchronise one kind of Wikidata entity with OpenChurch, batch by batch.

    The query result is obtained through ``Query.fetch``, split into batches
    of ``batch_size`` and each batch is sent through the matching
    ``Query.update_*`` method. The state of every batch is kept in a Redis
    hash (``status``, ``updatedAt``, ``successCount``, ``failureCount``) so
    that a batch finished less than 30 minutes ago is skipped.

    Args:
        type: Entity kind, one of ``"diocese"``, ``"parish"`` or ``"church"``.
        batch_size: Number of entities sent per upsert request.
        verbosity_level: Value assigned to ``Query.verbosity_level`` on the
            instance used for this run.

    Raises:
        ValueError: If ``type`` is not a supported entity kind.
    """
    redis_url = os.getenv('REDIS_URL')
    redis_client = redis.from_url(redis_url)
    q = Query()
    q.verbosity_level = verbosity_level
    client = OpenChurchClient()

    print("starting synchro for", type)
    if type == "diocese":
        data = q.fetch('wikidata_dioceses.json', dioceses_query)
        method = "update_dioceses"
    elif type == "parish":
        data = q.fetch('wikidata_parishes.json', parishes_query)
        method = "update_parishes"
    elif type == "church":
        data = q.fetch('wikidata_churches.json', churches_query)
        method = "update_churches"
    else:
        # Raising a bare string is itself a TypeError; raise a real exception.
        raise ValueError("Type d'entité non reconnu")

    batches = Query.split_into_batches(data, batch_size)
    total = len(batches)
    for index, batch in enumerate(batches):
        iteration = index + 1
        key = get_redis_key(type, index, len(batch))
        stored = redis_client.hgetall(key)

        can_process = True
        if stored:
            # A key exists. We check whether the batch may be processed again.
            decoded_data = {
                field.decode('utf-8'): content.decode('utf-8')
                for field, content in stored.items()
            }
            updated_at = decoded_data.get('updatedAt')
            if decoded_data.get('status') in {'success', 'error'} and updated_at:
                try:
                    # fromisoformat accepts str(datetime) with or without
                    # the fractional-second part.
                    last_run = datetime.fromisoformat(updated_at)
                except ValueError:
                    # Unreadable timestamp: process the batch again.
                    last_run = None
                if last_run is not None and datetime.now() - last_run <= timedelta(minutes=30):
                    # The batch was updated less than 30 minutes ago. Skip it.
                    can_process = False

        if can_process:
            redis_client.hset(key, mapping={
                "status": "processing",
                "updatedAt": str(datetime.now()),
            })
            print("Processing batch %s/%s" % (iteration, total))
            res = getattr(q, method)(batch, client)
            if res:
                success_count = sum(
                    1 for outcome in res.values() if outcome in {'Updated', 'Inserted'}
                )
                failure_count = len(res) - success_count
                redis_client.hset(key, mapping={
                    "successCount": success_count,
                    "failureCount": failure_count,
                    "status": "success",
                })
                print(success_count, failure_count)
            else:
                redis_client.hset(key, "status", "error")
        else:
            print("Ignore batch %s/%s" % (iteration, total))
    print("ended syncrho for", type)
446+
447+
if __name__ == '__main__':
    # Command-line entry point: choose the entity kind and the verbosity.
    cli = argparse.ArgumentParser()
    cli.add_argument(
        "--entity-only",
        type=str,
        required=True,
        choices=["parish", "diocese", "church"],
        help="Spécifiez l'entité à traiter : 'diocese', 'parish' ou 'church'",
    )
    cli.add_argument(
        "-v",
        "--verbose",
        action="count",
        default=0,
        help="Augmente le niveau de verbosité (utilisez -vvv pour plus de détails).",
    )
    options = cli.parse_args()
    # Entities are sent in batches of 100 per upsert request.
    process_entity(options.entity_only, 100, options.verbose)

src/FieldHolder/Community/Infrastructure/ApiPlatform/State/Processor/UpsertCommunityProcessor.php

+5-15
Original file line numberDiff line numberDiff line change
@@ -56,21 +56,11 @@ public function process(mixed $data, Operation $operation, array $uriVariables =
5656
$communities = $this->communityRepo->addSelectField()->withWikidataIds($wikidataIds)->asCollection();
5757
foreach ($communities as $community) {
5858
$wikidataId = $community->getMostTrustableFieldByName(FieldCommunity::WIKIDATA_ID)->getValue();
59-
$openChurchWikidataUpdatedAt = $community->getMostTrustableFieldByName(FieldCommunity::WIKIDATA_UPDATED_AT);
60-
$wikidataUpdatedAt = $this->fieldHolderUpsertService->getFieldByName($wikidataIdFields[$wikidataId], FieldCommunity::WIKIDATA_UPDATED_AT->value);
61-
if (
62-
!$openChurchWikidataUpdatedAt
63-
|| intval((new \DateTimeImmutable($wikidataUpdatedAt->value))->diff($openChurchWikidataUpdatedAt->getValue())->format('%a')) >= 1
64-
) {
65-
// WikidataUpdatedAt diff is greater than 1 day. We have to update the data
66-
try {
67-
$this->fieldService->upsertFields($community, $wikidataIdFields[$wikidataId]);
68-
$result[$wikidataId] = 'Updated';
69-
} catch (\Exception $e) {
70-
$result[$wikidataId] = $this->fieldHolderUpsertService->handleError($community, $e, [$this->communityRepo, 'detach']);
71-
}
72-
} else {
73-
$result[$wikidataId] = 'No need to update';
59+
try {
60+
$this->fieldService->upsertFields($community, $wikidataIdFields[$wikidataId]);
61+
$result[$wikidataId] = 'Updated';
62+
} catch (\Exception $e) {
63+
$result[$wikidataId] = $this->fieldHolderUpsertService->handleError($community, $e, [$this->communityRepo, 'detach']);
7464
}
7565
unset($wikidataIdFields[$wikidataId]);
7666
}

src/FieldHolder/Place/Infrastructure/ApiPlatform/State/Processor/UpsertPlaceProcessor.php

+5-15
Original file line numberDiff line numberDiff line change
@@ -57,21 +57,11 @@ public function process(mixed $data, Operation $operation, array $uriVariables =
5757
$places = $this->placeRepo->addSelectField()->withWikidataIds($wikidataIds)->asCollection();
5858
foreach ($places as $place) {
5959
$wikidataId = $place->getMostTrustableFieldByName(FieldPlace::WIKIDATA_ID)->getValue();
60-
$openChurchWikidataUpdatedAt = $place->getMostTrustableFieldByName(FieldPlace::WIKIDATA_UPDATED_AT);
61-
$wikidataUpdatedAt = $this->fieldHolderUpsertService->getFieldByName($wikidataIdFields[$wikidataId], FieldPlace::WIKIDATA_UPDATED_AT->value);
62-
if (
63-
!$openChurchWikidataUpdatedAt
64-
|| intval((new \DateTimeImmutable($wikidataUpdatedAt->value))->diff($openChurchWikidataUpdatedAt->getValue())->format('%a')) >= 1
65-
) {
66-
// WikidataUpdatedAt diff is greater than 1 day. We have to update the data
67-
try {
68-
$this->fieldService->upsertFields($place, $wikidataIdFields[$wikidataId]);
69-
$result[$wikidataId] = 'Updated';
70-
} catch (\Exception $e) {
71-
$result[$wikidataId] = $this->fieldHolderUpsertService->handleError($place, $e, [$this->placeRepo, 'detach']);
72-
}
73-
} else {
74-
$result[$wikidataId] = 'No need to update';
60+
try {
61+
$this->fieldService->upsertFields($place, $wikidataIdFields[$wikidataId]);
62+
$result[$wikidataId] = 'Updated';
63+
} catch (\Exception $e) {
64+
$result[$wikidataId] = $this->fieldHolderUpsertService->handleError($place, $e, [$this->placeRepo, 'detach']);
7565
}
7666
unset($wikidataIdFields[$wikidataId]);
7767
}

usr/local/bin/docker-python-entrypoint

+3-2
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,8 @@ set -e
44
env > /etc/environment
55
echo -------------------; echo Versions: ; python --version ; pip --version ; echo -------------------
66

7-
/opt/venv/bin/python /app/synchro.py
8-
/opt/venv/bin/python /app/synchro.py push
7+
/opt/venv/bin/python /app/synchro.py --entity-only diocese
8+
/opt/venv/bin/python /app/synchro.py --entity-only parish
9+
/opt/venv/bin/python /app/synchro.py --entity-only church
910

1011
exec "$@"

0 commit comments

Comments
 (0)