-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathrefseq_to_library.py
246 lines (191 loc) · 10 KB
/
refseq_to_library.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
'''
Script to make data library of RefSeq reference genomes for specified genus (or species)
usage: refseq_to_library.py [-h] [-s SPECIES] [-u URL] [-d DIR] [-k KEY] [-v]
[-t [FILETYPES [FILETYPES ...]]] [-e]
genus
Add RefSeq reference genomes to galaxy data libraries.
positional arguments:
genus the genus to create a library for
optional arguments:
-h, --help show this help message and exit
-s SPECIES, --species SPECIES
the species to create the library for
-u URL, --url URL the galaxy URL
-d DIR, --dir DIR the RefSeq directory containing all species (overrides
default)
-k KEY, --key KEY the Galaxy API key to use (overrides default)
-v, --verbose Print out debugging information
-t [FILETYPES [FILETYPES ...]], --filetypes [FILETYPES [FILETYPES ...]]
A space-separated list of filetypes to include in the
data library. Defaults to fna, faa, ffn, gbk, gff
-e, --exclude Exclude the file types specified in -t. Defaults to
excluding fna, faa, ffn, gbk, gff
Needs an API key in GALAXY_KEY unless specified via command line
Assumes Galaxy instance exists at localhost and refseq folder has the following structure:
refseq_folder/
species/
fna files
'''
from __future__ import print_function
from collections import defaultdict
from bioblend.galaxy import GalaxyInstance
import os
import sys
import argparse
def printerr(*args):
    '''
    Write the given values to standard error.
    Values are formatted exactly as print() would format them: separated
    by single spaces and terminated by a newline.
    :param args: The values to be written to stderr
    :return: None
    '''
    # Look the stream up at call time so a redirected sys.stderr is honoured.
    error_stream = sys.stderr
    print(*args, file=error_stream)
def getFilesInLibraryFolder(contents, folder):
    '''
    Function for getting a list of all files in a library's folder.
    :param contents: The contents of the library - can be obtained with show_library(lib['id'], contents=True).
                     Each item is read as a dict with a 'type' key and a '/'-separated 'name' path.
    :param folder: The name of the folder (as a string)
    :return: A list of file names (strings) within folder. Entries that are not
             files, or whose path is too short to sit inside a folder, are skipped.
    '''
    file_names = []
    for item in contents:
        # Only file entries are of interest; folders are listed in contents too.
        if item['type'] != 'file':
            continue
        # A path such as "/folder/file" splits into ['', 'folder', 'file'], so the
        # folder name is at index 1 and the file name at index 2.
        path_parts = item['name'].split('/')
        # Guard against paths with fewer than three parts (e.g. "/file" or "file"),
        # which would otherwise raise IndexError.
        if len(path_parts) > 2 and path_parts[1] == folder:
            file_names.append(path_parts[2])
    return file_names
def getFilesToInclude(filePath, fileTypes, exclude=False):
    '''
    Function for getting a list of all files of a given type (or the inverse).
    :param filePath: The path of the folder containing the files.
    :param fileTypes: An iterable of file name suffixes (e.g. 'fna', 'gbk') to include/exclude.
    :param exclude: False (the default) to return the entries whose name ends with one of fileTypes,
                    True to return the entries whose name does NOT end with any of them.
    :return: A list of entry names (strings) from filePath, in os.listdir order
    '''
    # str.endswith accepts a tuple of suffixes. Build it once, outside the loop,
    # so that one-shot iterables (generators, iterators) also work correctly.
    # NOTE: this is a plain suffix test - 'fna' matches any name ending in "fna",
    # whether or not a dot precedes it.
    suffixes = tuple(fileTypes)
    invert = bool(exclude)
    # Keep a name when "matches a suffix" disagrees with "we are excluding".
    return [fileName for fileName in os.listdir(filePath)
            if fileName.endswith(suffixes) != invert]
if __name__ == "__main__":
    # Script entry point: build (or update) a Galaxy data library named after the
    # requested genus (and optional species) and populate it with the matching
    # files found under the RefSeq directory.

    # Default values - each one can be overridden from the command line.
    GALAXY_URL = 'http://127.0.0.1:8080/galaxy/'
    GALAXY_KEY = ''
    REFSEQ_DIR = '/mnt/galaxyIndices/Bacteria/'
    FILE_TYPES = ['fna', 'faa', 'ffn', 'gbk', 'gff']

    # Get things like API Key, RefSeq directory and genus from command line
    parser = argparse.ArgumentParser(description='Add RefSeq reference genomes to galaxy data libraries.')
    parser.add_argument("genus", type=str, help="the genus to create a library for")
    parser.add_argument('-s', '--species', type=str, help='the species to create the library for', default="")
    parser.add_argument('-u', '--url', type=str, help='the galaxy URL', default=GALAXY_URL)
    parser.add_argument('-d', '--dir', type=str, help='the RefSeq directory containing all species (overrides default)', default=REFSEQ_DIR)
    parser.add_argument('-k', '--key', type=str, help='the Galaxy API key to use (overrides default)', default=GALAXY_KEY)
    parser.add_argument('-v', '--verbose', action="store_true", help='Print out debugging information')
    parser.add_argument('-t', '--filetypes', nargs='*', help='A space-seperated list of filetypes to include in the data library. Defaults to fna, faa, ffn, gbk, gff', default=FILE_TYPES)
    parser.add_argument('-e', '--exclude', action='store_true', help='Exclude the file types specified in -t. Defaults to excluding fna, faa, ffn, gbk, gff')

    # Parse args; genus and species are lower-cased so they can be compared
    # against the lower-cased names derived from the folder names below.
    args = parser.parse_args()
    genus = args.genus.lower()
    species = args.species.lower()
    GALAXY_URL = args.url
    REFSEQ_DIR = args.dir
    GALAXY_KEY = args.key
    FILE_TYPES = args.filetypes

    # Ensure the RefSeq directory and Galaxy URL end in a / because paths are
    # built by concatenation later. Empty values are left alone (rather than
    # indexing [-1], which raises IndexError) so that an empty directory falls
    # through to the "could not be found" error below.
    if REFSEQ_DIR and not REFSEQ_DIR.endswith("/"):
        REFSEQ_DIR += "/"
    if GALAXY_URL and not GALAXY_URL.endswith("/"):
        GALAXY_URL += "/"

    # Print out debugging info
    # NOTE(review): this echoes the API key to stdout in verbose mode.
    if args.verbose:
        print("Galaxy URL: " + GALAXY_URL)
        print("Galaxy Key: " + GALAXY_KEY)
        print("RefSeq Directory: " + REFSEQ_DIR)
        print("Genus: " + genus)
        print("Species: " + species)

    # Check the RefSeq directory exists, exit if we can't find it
    if not os.path.isdir(REFSEQ_DIR):
        printerr("ERROR: The RefSeq directory could not be found at " + REFSEQ_DIR)
        sys.exit(1)

    # Initiating Galaxy connection
    gi = GalaxyInstance(url=GALAXY_URL, key=GALAXY_KEY)

    # Map genus -> species -> list of RefSeq folder names. Folder names are
    # expected to look like Genus_species_<anything>, optionally prefixed by "_".
    dirs = defaultdict(lambda: defaultdict(list))
    for folder in os.listdir(REFSEQ_DIR):
        if args.verbose:
            print("Processing folder - " + folder)
        # Ignore hidden folders/files
        if folder.startswith("."):
            continue
        # Ignore plain files: only directories can hold genome files, and a stray
        # file with a Genus_species_x style name would crash the listing later on.
        if not os.path.isdir(REFSEQ_DIR + folder):
            continue
        # Drop a single leading underscore before splitting, but keep the real
        # folder name for the dict so that paths still resolve.
        trimmed = folder[1:] if folder.startswith("_") else folder
        name_parts = trimmed.split("_")
        # Need at least genus, species and one further component.
        if len(name_parts) > 2:
            dirs[name_parts[0].lower()][name_parts[1].lower()].append(folder)

    # If we don't have the genus, error and exit
    if genus not in dirs:
        printerr("ERROR: There are no genomes for your specified genus " + genus)
        sys.exit(1)

    # If we don't have the species, error and exit
    if species and species not in dirs[genus]:
        printerr("ERROR: There are no genomes for your specified species " + species)
        sys.exit(1)

    # Check for existing libraries
    libraries = gi.libraries.get_libraries(deleted=False)

    # Determine the library name - if species is not specified, nothing is added and trailing whitespace trimmed
    possible_lib_name = (genus + " " + species).strip()

    # Get existing library info if it does exist, if it doesn't exist create library
    library_exists = any(
        entry['name'] == possible_lib_name
        for entry in libraries if not entry['deleted'])
    if library_exists:
        if args.verbose:
            print("Library already exists - checking it is up to date")
        # Get library - assumes there is only one library of that name
        lib = gi.libraries.get_libraries(name=possible_lib_name, deleted=False)[0]
    else:
        if args.verbose:
            print("Library doesn't exist - adding new library")
        lib = gi.libraries.create_library(possible_lib_name, "Reference genomes for " + possible_lib_name)

    # Species needs to be an iterable: either the single requested species or,
    # if it was unspecified, every species found for the genus.
    if species:
        species = [species]
    else:
        species = list(dirs[genus])

    # Names of the folders already in the library (leading "/" stripped), used to
    # decide whether a folder has to be created.
    lib_dirs = set(d['name'][1:] for d in gi.libraries.get_folders(lib['id']))

    # Local Galaxy servers can link to the files in place; remote ones need a copy.
    # This does not change while looping, so work it out once.
    galaxy_is_local = "127.0.0.1" in GALAXY_URL or "localhost" in GALAXY_URL

    # For each species specified, go through each folder and add appropriate files
    for spc in species:
        for folder in dirs[genus][spc]:
            # Check if folder exists, get required info if it does, otherwise create it
            if folder in lib_dirs:
                if args.verbose:
                    print("Directory exists: " + folder)
                # Get directory information
                fldr = gi.libraries.get_folders(lib['id'], name="/" + folder)[0]
            else:
                if args.verbose:
                    print("Adding directory to library - " + folder)
                fldr = gi.libraries.create_folder(lib['id'], folder)[0]

            candidate_files = getFilesToInclude(REFSEQ_DIR + folder, FILE_TYPES, args.exclude)
            if not candidate_files:
                continue

            # Fetch the library contents once per folder instead of once per file.
            # Names returned by a directory listing are unique, so an upload made
            # in this loop can never affect the check for a later file.
            existing_files = set(getFilesInLibraryFolder(
                gi.libraries.show_library(lib['id'], contents=True), folder))

            for fna in candidate_files:
                if fna in existing_files:
                    if args.verbose:
                        print("File exists - " + fna)
                    continue

                # File doesn't exist in the library yet, add it
                if args.verbose:
                    print("Adding file - " + fna)
                file_path = REFSEQ_DIR + folder + "/" + fna
                if galaxy_is_local:
                    # Local Galaxy server - create a symbolic link instead of a copy
                    gi.libraries.upload_from_galaxy_filesystem(
                        library_id=lib['id'],
                        filesystem_paths=file_path,
                        folder_id=fldr['id'],
                        link_data_only="link_to_files")
                else:
                    # Remote Galaxy server - copy files from local machine
                    gi.libraries.upload_file_from_local_path(
                        library_id=lib['id'],
                        file_local_path=file_path,
                        folder_id=fldr['id'])