IDEMSInternational · istride · Feb 20, 2025 · Feb 14, 2025
diff --git a/pyproject.toml b/pyproject.toml
@@ -32,7 +32,7 @@ dependencies = [
     "packaging~=21.3",
     "rapidpro-abtesting @ git+https://github.com/IDEMSInternational/rapidpro_abtesting.git@master",
     "requests~=2.31",
-    "rpft @ git+https://github.com/IDEMSInternational/rapidpro-flow-toolkit.git@1.6.0",
+    "rpft @ git+https://github.com/IDEMSInternational/rapidpro-flow-toolkit.git@1.7.0",
 ]
 
 [project.scripts]

diff --git a/src/parenttext_pipeline/extract_keywords.py b/src/parenttext_pipeline/extract_keywords.py
@@ -16,9 +16,9 @@ def process_keywords(sources):
 
 
 def process_source(source):
-    input_file = source["path"]
+    input_file = source.get("location") or source["path"]
     language = source["key"]
-    book = openpyxl.load_workbook(input_file)
+    book = openpyxl.load_workbook(input_file, read_only=True)
     all_tables = {}
 
     for sheet in book.worksheets:

diff --git a/src/parenttext_pipeline/pull_data.py b/src/parenttext_pipeline/pull_data.py
@@ -1,11 +1,13 @@
 import os
+import re
 import shutil
 import tempfile
 from datetime import datetime, timezone
 from pathlib import Path
 
 import requests
 from rpft.converters import convert_to_json
+from rpft.google import Drive
 
 from parenttext_pipeline.common import (
     clear_or_create_folder,
@@ -139,16 +141,58 @@ def pull_json(config, source, source_name):
         shutil.copyfile(filepath, source_input_path / f"{new_name}.json")
 
 
+def is_google_drive_file_id(location):
+    """Return True if ``location`` has the shape of a Google Drive file ID.
+
+    The check is purely syntactic: exactly 33 characters, each one a letter
+    (either case), digit, underscore or hyphen. No remote lookup is made, so
+    a local file name of that exact shape also matches. Values that are not
+    ``str`` (for example ``pathlib.Path``) are reported as not being IDs
+    instead of raising ``TypeError``.
+    """
+    return isinstance(location, str) and bool(
+        re.fullmatch(r"[a-z0-9_-]{33}", location, re.IGNORECASE)
+    )
+
+
 def pull_safeguarding(config, source, source_name):
-    # Safeguarding files
-    source_input_path = get_input_subfolder(
-        config, source_name, makedirs=True, in_temp=False
+    """Produce ``safeguarding_words.json`` in the input subfolder of a source.
+
+    When ``source.sources`` is non-empty, its entries are handed to
+    ``process_keywords_to_file``. An entry's location is its ``location``
+    value, falling back to ``path``; a location that looks like a Google
+    Drive file ID is downloaded into a temporary directory first. Otherwise
+    ``source.filepath`` is copied to the destination unchanged.
+
+    Args:
+        config: Pipeline configuration, passed to ``get_input_subfolder``.
+        source: Source definition providing ``sources`` and ``filepath``.
+        source_name: Name used to select the input subfolder.
+    """
+    keywords_file_path = (
+        get_input_subfolder(config, source_name, makedirs=True, in_temp=False)
+        / "safeguarding_words.json"
     )
-    safeguarding_file_path = source_input_path / "safeguarding_words.json"
+
     if source.sources:
-        process_keywords_to_file(source.sources, safeguarding_file_path)
+        with tempfile.TemporaryDirectory() as dest:
+            # Resolve locations on copies: the downloaded files vanish with
+            # the temporary directory, so their paths must not be written
+            # back into the caller's source entries.
+            sources = [dict(s) for s in source.sources]
+
+            for s in sources:
+                location = s.get("location") or s["path"]
+
+                if is_google_drive_file_id(location):
+                    name, content = Drive.fetch(location)
+                    download_path = Path(dest) / (location + Path(name).suffix)
+                    download_path.write_bytes(content)
+                    s["location"] = download_path
+
+            process_keywords_to_file(sources, keywords_file_path)
     else:
-        shutil.copyfile(source.filepath, safeguarding_file_path)
+        shutil.copyfile(source.filepath, keywords_file_path)
 
 
 def unpack_archive(destination, location):