# stream.py - forked from RNAcentral/rnacentral-import-pipeline (81 lines, 66 loc, 2.45 KB)
# -*- coding: utf-8 -*-
"""
Copyright [2009-2020] EMBL-European Bioinformatics Institute
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
import csv
import logging
import collections as coll
from rnacentral_pipeline.databases.helpers.publications import reference
from rnacentral_pipeline.databases.europepmc import fetch
from . import xml
LOGGER = logging.getLogger(__name__)
def load_ids(handle, column):
    """
    Group the rows of a CSV file by the reference ID stored in one column.

    Every non-blank row read from `handle` is keyed by the value of
    `reference(row[column])`. The remaining cells of the row (everything
    except the key cell, in their original order) are appended to the list
    stored under that key.

    :param handle: Iterable of CSV lines, e.g. an open text file.
    :param int column: Index of the cell holding the ID. Negative indexes
        count from the end of the row.
    :returns: A plain `dict` mapping each reference to the list of rows
        (each row a list of the non-key cells) that mentioned it.
    :raises IndexError: If a non-blank row has no cell at `column`.
    """
    grouped = coll.defaultdict(list)
    for row in csv.reader(handle):
        # csv.reader yields an empty list for a blank line. Such a line has
        # no ID to key on, so skip it instead of failing with an IndexError.
        if not row:
            continue
        rest = list(row)
        # pop() removes exactly the key cell, including when `column` is
        # negative; comparing enumerate() positions to `column` would not.
        raw_id = rest.pop(column)
        grouped[reference(raw_id)].append(rest)
    # Return a plain dict so that a later lookup of an unknown key cannot
    # silently create an entry.
    return dict(grouped)
def fallback(data):
    """
    Resolve reference IDs one at a time through `fetch.lookup`.

    :param dict data: Mapping of reference ID to the rows attached to it.
    :returns: A generator of `(id_ref, ref, rows)` tuples, one for every ID
        that `fetch.lookup` resolved. IDs for which it raises
        `fetch.UnknownReference` or `fetch.TooManyPublications` are skipped,
        leaving it to the caller to decide how to report them.
    """
    for id_ref, rows in data.items():
        # Guard only the lookup call, so the handler cannot swallow an
        # exception that originates anywhere else (e.g. at the yield).
        try:
            ref = fetch.lookup(id_ref)
        except (fetch.UnknownReference, fetch.TooManyPublications) as err:
            LOGGER.debug("Fallback lookup failed for %s: %s", id_ref, err)
            continue
        yield id_ref, ref, rows
def lookup(ids, directory, column, allow_fallback=True, ignore_missing=False):
    """
    Match the IDs listed in a CSV file to publication references.

    The IDs are loaded with `load_ids` and matched against the
    `id_references` of every reference produced by
    `xml.parse_directory(directory)`. IDs that are still unresolved after
    the scan are optionally resolved through `fallback`.

    :param ids: Iterable of CSV lines, passed to `load_ids`.
    :param directory: Passed unchanged to `xml.parse_directory`.
    :param int column: Index of the CSV cell that holds the ID.
    :param bool allow_fallback: If true, try `fallback` for IDs that the
        directory scan did not resolve.
    :param bool ignore_missing: If true, unresolved IDs are only logged;
        otherwise a `ValueError` is raised after logging them.
    :returns: A generator of `(ref, rows)` pairs, where `rows` is the list
        that `load_ids` grouped under the matched ID.
    :raises ValueError: If any ID could not be resolved and
        `ignore_missing` is false.
    """
    data = load_ids(ids, column)
    # Tracks the IDs still waiting for a match; entries are removed as they
    # are resolved.
    not_found = dict(data)

    for ref in xml.parse_directory(directory):
        for possible_id in ref.id_references:
            # Resolve every requested ID at most once. An ID that shows up
            # again (in the same reference or a later one) is ignored
            # rather than raising a KeyError on a second removal.
            if possible_id not in not_found:
                continue
            yield ref, not_found.pop(possible_id)
        # Nothing left to find, so there is no point scanning further; the
        # fallback and reporting steps below would have nothing to do.
        if not not_found:
            return

    if allow_fallback:
        # fallback() gets its own copy, so removing entries from not_found
        # while it is being consumed is safe.
        for id_ref, ref, rows in fallback(dict(not_found)):
            yield ref, rows
            del not_found[id_ref]

    # Report every unresolved ID before failing, so that a single run shows
    # the complete list of problems.
    for id_ref, rows in not_found.items():
        LOGGER.warning("Failed to lookup: %s (%i entries)", id_ref, len(rows))

    if not_found and not ignore_missing:
        missing = ", ".join(str(id_ref) for id_ref in not_found)
        raise ValueError("Could not lookup %s" % missing)
def write_lookup(
    ids, directory, output, column=0, allow_fallback=True, ignore_missing=False
):
    """
    Run `lookup` and write the results to `output` as CSV.

    For every `(ref, rows)` pair produced by `lookup`, and for every entry
    of `rows`, the rows returned by `ref.writeable(entry)` are written with
    a `csv.writer`.

    :param ids: Iterable of CSV lines holding the IDs, passed to `lookup`.
    :param directory: Passed unchanged to `lookup`.
    :param output: Writable text handle that receives the CSV output.
    :param int column: Index of the CSV cell that holds the ID.
    :param bool allow_fallback: Forwarded to `lookup`.
    :param bool ignore_missing: Forwarded to `lookup`.
    """
    csv_out = csv.writer(output)
    matches = lookup(
        ids,
        directory,
        column,
        allow_fallback=allow_fallback,
        ignore_missing=ignore_missing,
    )
    for publication, attached in matches:
        for extra in attached:
            csv_out.writerows(publication.writeable(extra))