-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathkhmer-batch.rb
executable file
·144 lines (131 loc) · 4.57 KB
/
khmer-batch.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
#!/usr/bin/env ruby
#
# khmer-batch
#
# Take in a set of fastq files and run khmer normalize-by-median.py on each,
# storing the hashtable at each step and using it for the next step
#
# Make sure normalize-by-median.py is in your PATH or specify the location
# with the --script option
#
# Chris Boursnell ([email protected]) && Richard Smith ([email protected])
# created: 08/07/2013
# last modified: 14/07/2013
#
require 'rubygems'
require 'trollop'
# Command line options, parsed by Trollop into `opts`.
# NOTE(review): presumably the declaration order drives the --help layout and
# the automatic short flags - keep the order stable when adding options.
opts = Trollop.options do
  version "v0.0.1a"

  # where the fastq files come from (checked later: only one may be given)
  opt :input, "A file of fastq files, 1 per line", :type => String
  opt :files, "A list of colon separated input fastq files", :type => String

  # how khmer is invoked
  opt :script, "Specify the location of the khmer normalize-by-median.py script if it is not in your PATH", :type => String, :default => "normalize-by-median.py"
  opt :paired, "If the input fastq files are interleaved paired reads"
  opt :interleave, "Do the input fastq files need to be interleaved"
  opt :continue, "Continue a previous run using existing table.kh"
  opt :memory, "Maximum amount of memory to be used by khmer in gigabytes", :type => :float, :default => 4.0
  opt :kmer, "K value to use in khmer", :type => :int, :default => 21
  opt :buckets, "Number of buckets", :type => :int, :default => 4

  # what happens to the files once they have been processed
  opt :cleanup, "Remove input files after they are processed"
  opt :gzip, "gzip input and output files after they are processed"
  opt :dsrc, "compress input and output files with dsrc after they are processed"

  # diagnostics
  opt :test, "Don't run the command"
  opt :verbose, "Be verbose"
end
# Collect the fastq files to process, either from --input (a text file with
# one path per line) or from --files (a colon separated list of paths).
# Aborts when both or neither option is given, when the list is empty, or
# when any listed file does not exist.
filelist = []
if opts.input && opts.files
  abort "Choose either --input or --files but not both"
elsif opts.input
  # File.exist? is used because File.exists? is no longer available in
  # current Ruby versions
  abort "Can't find file \"#{opts.input}\"" unless File.exist?(opts.input)
  # readlines closes the list file once it has been read; blank lines
  # (e.g. a trailing empty line) are not file names and are dropped
  filelist = File.readlines(opts.input).map { |line| line.chomp }
  filelist.reject! { |line| line.strip.empty? }
elsif opts.files
  filelist = opts.files.split(":")
else
  abort "Specify the fastq files to process with --input or --files"
end
abort "No fastq files to process" if filelist.empty?

# Later steps change into each file's directory, so work with absolute paths
# and fail early if any of the files is missing.
filelist.map! { |file| File.expand_path(file) }
filelist.each do |file|
  abort "Can't find file \"#{file}\"" unless File.exist?(file)
end
# Optionally merge each consecutive pair of fastq files into one interleaved
# file named "<first file of the pair>.in". The interleaved files replace the
# pairs in filelist. Depending on the options the original pair is then
# deleted (--cleanup), gzipped (--gzip) or compressed with dsrc (--dsrc).
# With --test nothing is executed and no file is touched.
if opts.interleave
  # files are consumed two at a time, so the list has to pair up exactly
  if filelist.length.odd?
    abort "There needs to be an even number of fastq files in the list if you want to interleave them"
  end

  # awk receives one tab separated record per read pair, with the fields
  # alternating between the two files; it prints the pair as two 4-line
  # records whose header lines are replaced by @read<N>:1 and @read<N>:2
  awk_program = '{print("@read"NR":1",$3,$5,$7,"@read"NR":2",$4,$6,$8)}'

  filelist = filelist.each_slice(2).map do |left, right|
    puts "Interleaving #{left} and #{right}" if opts.verbose
    # \\t and \\n reach awk as the escape sequences \t and \n, which awk
    # expands itself; a literal newline inside a -v assignment is rejected by
    # some awk implementations
    cmd = "paste #{left} #{right} | paste - - - - | " \
          "awk -v FS=\"\\t\" -v OFS=\"\\n\" '#{awk_program}' > #{left}.in"
    unless opts.test
      `#{cmd}`
      # the inputs are removed or compressed below, so stop if the
      # interleaved file could not be written
      abort "Interleaving #{left} and #{right} failed" unless $?.success?
    end

    if opts.cleanup
      # only delete the inputs when the interleave command really ran
      unless opts.test
        File.delete(left)
        File.delete(right)
      end
    elsif opts.gzip
      `gzip #{left} #{right}` unless opts.test
    elsif opts.dsrc
      [left, right].each do |path|
        puts "compressing #{path}" if opts.verbose
        `dsrc e #{path} #{path}.dsrc` unless opts.test
      end
      `rm #{left}` unless opts.test
      `rm #{right}` unless opts.test
    end

    "#{left}.in"
  end
end
# Settings shared by every normalize-by-median.py invocation.

# The first run only saves table.kh; with --continue a table is expected to
# exist already, so even the first file is treated as a follow-up run.
first = !opts.continue

# -N: number of buckets, -x: the memory budget (gigabytes) split evenly over
# the buckets and scaled by 1e9
n = opts.buckets
x = (opts.memory / opts.buckets * 1e9).to_i

# -p is passed when the reads are paired, either because the input already is
# interleaved or because this script interleaved it
pair = (opts.paired || opts.interleave) ? "-p" : ""
# run
# Process every file in turn from inside its own directory. The first run
# saves table.kh; each later run loads table.kh, saves table2.kh and moves it
# over table.kh. Afterwards the input is deleted (--cleanup), gzipped (--gzip)
# or compressed with dsrc (--dsrc). With --test no command is executed.
filelist.each do |file|
  puts "processing: #{file}" if opts.verbose
  # Resolve the path before changing directory: inside the chdir block a
  # relative path would otherwise be looked up relative to the file's own
  # directory and the cleanup step would miss the file.
  path = File.expand_path(file)
  filename = File.basename(file)
  Dir.chdir(File.dirname(file)) do |dir|
    puts "changing working directory to #{dir}" if opts.verbose
    base = "#{opts.script} #{pair} -k #{opts.kmer} -N #{n} -x #{x}"
    follow_up = !first
    if follow_up
      cmd = "#{base} --loadhash table.kh --savehash table2.kh #{filename}"
      puts "running #{cmd}" if opts.verbose
    else
      cmd = "#{base} --savehash table.kh #{filename}"
      puts "running: #{cmd}" if opts.verbose
    end

    unless opts.test
      puts `#{cmd}`
      # a failed run must not replace the table or lead to the input being
      # removed below
      abort "Command failed: #{cmd}" unless $?.success?
      `mv table2.kh table.kh` if follow_up
    end
    first = false

    if opts.cleanup
      File.delete(path) unless opts.test
    elsif opts.gzip
      `gzip #{path}` unless opts.test
    elsif opts.dsrc
      puts "compressing #{file}" if opts.verbose
      # NOTE(review): the interleave step calls `dsrc e <in> <out>` while this
      # call passes no mode argument - confirm which form the installed dsrc
      # expects and align the two.
      `dsrc #{path} #{path}.dsrc` unless opts.test
      `rm #{path}` unless opts.test
    end
  end
end
# Remove the saved table from the directory of the first input file.
# Nothing is done when there were no files to process (filelist[0] would be
# nil and File.dirname would raise) or when --test is given.
# NOTE(review): only the first file's directory is cleaned; files that live in
# other directories were run from those directories - confirm whether a
# table.kh can be left behind there.
unless filelist.empty?
  Dir.chdir(File.dirname(filelist.first)) do
    `rm table.kh` unless opts.test
  end
end