@@ -1,11 +1,13 @@
 import os
+import re
 import shutil
 import tempfile
 from datetime import datetime, timezone
 from pathlib import Path
 
 import requests
 from rpft.converters import convert_to_json
+from rpft.google import Drive
 
 from parenttext_pipeline.common import (
     clear_or_create_folder,
@@ -139,16 +141,33 @@ def pull_json(config, source, source_name):
     shutil.copyfile(filepath, source_input_path / f"{new_name}.json")
 
 
+def is_google_drive_file_id(location):
+    return bool(re.fullmatch(r"[a-z0-9_-]{33}", location, re.IGNORECASE))
+
+
 def pull_safeguarding(config, source, source_name):
-    # Safeguarding files
-    source_input_path = get_input_subfolder(
-        config, source_name, makedirs=True, in_temp=False
+    keywords_file_path = (
+        get_input_subfolder(config, source_name, makedirs=True, in_temp=False)
+        / "safeguarding_words.json"
     )
-    safeguarding_file_path = source_input_path / "safeguarding_words.json"
+
     if source.sources:
-        process_keywords_to_file(source.sources, safeguarding_file_path)
+
+        with tempfile.TemporaryDirectory() as dest:
+
+            for s in source.sources:
+                location = s.get("location") or s["path"]
+
+                if is_google_drive_file_id(location):
+                    name, content = Drive.fetch(location)
+                    s["location"] = Path(dest) / (location + Path(name).suffix)
+
+                    with open(s["location"], "wb") as f:
+                        f.write(content)
+
+            process_keywords_to_file(source.sources, keywords_file_path)
     else:
-        shutil.copyfile(source.filepath, safeguarding_file_path)
+        shutil.copyfile(source.filepath, keywords_file_path)
 
 
 def unpack_archive(destination, location):
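
A minimal sketch of how the new is_google_drive_file_id helper separates Google Drive file IDs from ordinary file paths, assuming the 33-character ID format used in the diff; the example values below are hypothetical and not taken from the pipeline.

import re

def is_google_drive_file_id(location):
    # Drive file IDs are 33 characters drawn from letters, digits, "_" and "-".
    return bool(re.fullmatch(r"[a-z0-9_-]{33}", location, re.IGNORECASE))

# A 33-character ID is treated as a Drive file to fetch via Drive.fetch;
# any other location (e.g. a local path) is passed through unchanged.
print(is_google_drive_file_id("1A2b3C4d5E6f7G8h9I0j1K2l3M4n5O6p7"))  # True
print(is_google_drive_file_id("sources/safeguarding_words.xlsx"))    # False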