forked from CRG-Beato/utils_beatolab
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathio_metadata.py
executable file
·108 lines (83 loc) · 3.5 KB
/
io_metadata.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
#!/usr/bin/python
#==================================================================================================
# Created on: 2016-02-05
# Usage: ./io_metadata.py <database> <mode>
# Author: [email protected]
# Goal: update metadata
#==================================================================================================
# IMPORTANT:
# in the presence of fields with (some) missing values, the following error can occur:
# >sqlalchemy.exc.StatementError: (exceptions.ValueError) could not convert string to float:
# it seems python/sql thinks the expected value is a float when in reality is a string
# the solution is delete that field in the table of the database and re-run the script
# Import python modules and functions
import os, sys
import dataset
import pandas as pd
import numpy as np
from collections import OrderedDict
import glob
import time
import datetime
# Parse positional command-line arguments:
#   argv[1] = path to the SQLite metadata database file
#   argv[2] = operation mode (selects the branch executed below)
metadata, mode = sys.argv[1], sys.argv[2]
# Open (or create) the SQLite database through the `dataset` wrapper
db = dataset.connect('sqlite:///' + metadata)
#==================================================================================================
# COMMANDS
#==================================================================================================
if mode == "download_input_metadata":
    # Bulk-load a tab-separated spreadsheet of input metadata (one row per sample)
    # into the 'input_metadata' table, keyed on SAMPLE_ID.
    spreadsheet = sys.argv[3]
    df = pd.read_table(spreadsheet)
    # create/load table and add input metadata from multiple samples
    input_metadata = db.get_table('input_metadata', primary_id = 'SAMPLE_ID', primary_type = 'String')
    for index, row in df.iterrows():
        # upsert: update the existing row for this SAMPLE_ID, or insert a new one
        input_metadata.upsert(row.to_dict(), ['SAMPLE_ID'])
elif mode == "quality_control_raw_reads":
    # Record a single attribute/value pair for one sample in the
    # 'quality_control_raw_reads' table.
    sample_id = sys.argv[3]
    attribute = sys.argv[4]
    value = sys.argv[5]
    # create/load table and add sample data
    # important: because this script may be used in parallel, with many instances trying to access to the data
    # do not use transactions (e.g. db.begin(), db.commit()...) as these do not allow multiple writings to the database
    quality_control_raw_reads = db.get_table('quality_control_raw_reads', primary_id = 'SAMPLE_ID', primary_type = 'String')
    new_data = {}
    new_data['SAMPLE_ID'] = sample_id
    new_data[attribute] = value
    quality_control_raw_reads.upsert(new_data, ['SAMPLE_ID'])
elif mode == "get_from_metadata":
    # Look up one attribute for one sample in an arbitrary table and print it
    # to stdout (this is how calling shell scripts read metadata back).
    table = sys.argv[3]
    sample_id = sys.argv[4]
    attribute = sys.argv[5]
    # load table, select sample and print attribute
    tab = db.load_table(table)
    my_sample = tab.find(SAMPLE_ID = sample_id)
    for s in my_sample:
        # print() with a single argument behaves identically on Python 2 and 3
        # (the original `print s[attribute]` statement was Python-2-only)
        print(s[attribute])
elif mode == "add_to_metadata":
    # Store an attribute/value pair twice: once in the per-sample table
    # (latest value wins) and once in the 'jobs' table (history per job run,
    # keyed on "sample_id;time_stamp").
    table = sys.argv[3]
    sample_id = sys.argv[4]
    time_stamp = sys.argv[5]
    attribute = sys.argv[6]
    value = sys.argv[7]
    my_key = ";".join([sample_id, time_stamp])
    # important: because this script may be used in parallel, with many instances trying to access to the data
    # do not use transactions (e.g. db.begin(), db.commit()...) as these do not allow multiple writings to the database
    # *seq table stores the most recent values
    tab = db.get_table(table, primary_id = 'SAMPLE_ID', primary_type = 'String')
    new_data = {}
    new_data['SAMPLE_ID'] = sample_id
    new_data[attribute] = value
    tab.upsert(new_data, ['SAMPLE_ID'])
    # jobs table stores all values from different jobs
    tab = db.get_table('jobs', primary_id = 'JOB_ID', primary_type = 'String')
    new_data = {}
    new_data['JOB_ID'] = my_key
    new_data[attribute] = value
    tab.upsert(new_data, ['JOB_ID'])