Skip to content

Commit 67e3147

Browse files
committed
Download safeguarding files from Google Drive (optionally)
1 parent 73e2190 commit 67e3147

File tree

3 files changed

+28
-9
lines changed

3 files changed

+28
-9
lines changed

pyproject.toml

+1-1
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ dependencies = [
3232
"packaging~=21.3",
3333
"rapidpro-abtesting @ git+https://github.com/IDEMSInternational/rapidpro_abtesting.git@master",
3434
"requests~=2.31",
35-
"rpft @ git+https://github.com/IDEMSInternational/rapidpro-flow-toolkit.git@1.6.0",
35+
"rpft @ git+https://github.com/IDEMSInternational/rapidpro-flow-toolkit.git@1.7.0",
3636
]
3737

3838
[project.scripts]

src/parenttext_pipeline/extract_keywords.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -16,9 +16,9 @@ def process_keywords(sources):
1616

1717

1818
def process_source(source):
19-
input_file = source["path"]
19+
input_file = source.get("location") or source["path"]
2020
language = source["key"]
21-
book = openpyxl.load_workbook(input_file)
21+
book = openpyxl.load_workbook(input_file, read_only=True)
2222
all_tables = {}
2323

2424
for sheet in book.worksheets:

src/parenttext_pipeline/pull_data.py

+25-6
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,13 @@
11
import os
2+
import re
23
import shutil
34
import tempfile
45
from datetime import datetime, timezone
56
from pathlib import Path
67

78
import requests
89
from rpft.converters import convert_to_json
10+
from rpft.google import Drive
911

1012
from parenttext_pipeline.common import (
1113
clear_or_create_folder,
@@ -139,16 +141,33 @@ def pull_json(config, source, source_name):
139141
shutil.copyfile(filepath, source_input_path / f"{new_name}.json")
140142

141143

def is_google_drive_file_id(location):
    """Return True if *location* looks like a Google Drive file ID.

    Drive IDs are opaque, URL-safe tokens (letters, digits, '-', '_').
    Their length varies by artifact type (commonly 33 for uploaded
    files, 44 for native Google Docs/Sheets), so a length range is
    accepted rather than exactly 33 characters.  Local file paths can
    never match because they contain '/' or '.', which are outside the
    allowed alphabet.
    """
    # str() guards against a pathlib.Path being passed; a real path's
    # separators/extension dots keep it from matching the pattern.
    return bool(re.fullmatch(r"[a-z0-9_-]{25,60}", str(location), re.IGNORECASE))
def pull_safeguarding(config, source, source_name):
    """Produce safeguarding_words.json in the source's input folder.

    If the source declares sub-sources, any entry whose location looks
    like a Google Drive file ID is first downloaded into a temporary
    workspace, then all entries are processed into a single keywords
    file.  Otherwise an already-built keywords file is copied into place.
    """
    out_dir = get_input_subfolder(config, source_name, makedirs=True, in_temp=False)
    keywords_file_path = out_dir / "safeguarding_words.json"

    if not source.sources:
        # Pre-built keywords file: copy it straight into place.
        shutil.copyfile(source.filepath, keywords_file_path)
        return

    with tempfile.TemporaryDirectory() as workspace:
        for entry in source.sources:
            location = entry.get("location") or entry["path"]

            if is_google_drive_file_id(location):
                # Fetch the file from Google Drive into the temp
                # workspace, preserving the original file extension.
                name, content = Drive.fetch(location)
                local_path = Path(workspace) / (location + Path(name).suffix)
                local_path.write_bytes(content)
                entry["location"] = local_path

        # Must run inside the with-block: downloaded files live in the
        # temporary workspace and vanish when it is cleaned up.
        process_keywords_to_file(source.sources, keywords_file_path)
152171

153172

154173
def unpack_archive(destination, location):

0 commit comments

Comments
 (0)