forked from aleimba/bac-genomics-scripts
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtrunc_seq.pl
340 lines (241 loc) · 10.6 KB
/
trunc_seq.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
#!/usr/bin/perl
#######
# POD #
#######
=pod
=head1 NAME
C<trunc_seq.pl> - truncate sequence files
=head1 SYNOPSIS
C<perl trunc_seq.pl 20 3500 seq-file.embl E<gt>
seq-file_trunc_20_3500.embl>
B<or>
C<perl trunc_seq.pl file_of_filenames_and_coords.tsv>
=head1 DESCRIPTION
This script truncates sequence files according to the given
coordinates. The features/annotations in RichSeq files (e.g. EMBL or
GENBANK format) will also be adapted accordingly. Use option B<-o> to
specify a different output sequence format. Input can be given directly
as a file and truncation coordinates to the script, with the start
position as the first argument, stop as the second and (the path to)
the sequence file as the third. In this case the truncated sequence
entry is printed to C<STDOUT>. Input sequence files should contain only
one sequence entry, if a multi-sequence file is used as input only the
B<first> sequence entry is truncated.
Alternatively, a file of filenames (fof) with respective coordinates
and sequence files in the following B<tab-separated> format can be
given to the script (the header is optional):
#start\tstop\tseq-file
300\t9000\t(path/to/)seq-file
50\t1300\t(path/to/)seq-file2
With a fof the resulting truncated sequence files are printed into a
results directory. Use option B<-r> to specify a different results
directory than the default.
It is also possible to truncate a RichSeq sequence file loaded into the
L<Artemis|http://www.sanger.ac.uk/science/tools/artemis> genome browser
from the Sanger Institute: Select a subsequence and then go to Edit
-E<gt> Subsequence (and Features)
=head1 OPTIONS
=over 20
=item B<-h>, B<-help>
Help (perldoc POD)
=item B<-o>=I<str>, B<-outformat>=I<str>
Specify different sequence format for the output (files) [fasta, embl,
or gbk]
=item B<-r>=I<str>, B<-result_dir>=I<str>
Path to result folder for fof input [default = './trunc_seq_results']
=item B<-v>, B<-version>
Print version number to C<STDOUT>
=back
=head1 OUTPUT
=over 20
=item C<STDOUT>
If a single sequence file is given to the script the truncated sequence
file is printed to C<STDOUT>. Redirect or pipe into another tool as
needed.
=back
B<or>
=over 20
=item F<./trunc_seq_results>
If a fof is given to the script, all output files are stored in a
results folder
=item F<./trunc_seq_results/seq-file_trunc_start_stop.format>
Truncated output sequence files are named appended with 'trunc' and the
corresponding start and stop positions
=back
=head1 EXAMPLES
=over
=item C<perl trunc_seq.pl -o gbk 120 30000 seq-file.embl E<gt>
seq-file_trunc_120_3000.gbk>
=back
B<or>
=over
=item C<perl trunc_seq.pl -o fasta 5300 18500 seq-file.gbk | perl
revcom_seq.pl -i fasta E<gt> seq-file_trunc_revcom.fasta>
=back
B<or>
=over
=item C<perl trunc_seq.pl -r path/to/trunc_embl_dir -o embl
file_of_filenames_and_coords.tsv>
=back
=head1 DEPENDENCIES
=over
=item B<L<BioPerl|http://www.bioperl.org>>
Tested with BioPerl version 1.007001
=back
=head1 VERSION
0.2 update: 2015-12-07
0.1 2013-08-02
=head1 AUTHOR
Andreas Leimbach aleimba[at]gmx[dot]de
=head1 LICENSE
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3 (GPLv3) of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see L<http://www.gnu.org/licenses/>.
=cut
########
# MAIN #
########
use strict;
use warnings;
use autodie;
use Getopt::Long;
use Pod::Usage;
use Bio::SeqIO; # bioperl module to handle sequence input/output
#use Bio::Seq; # bioperl module to handle sequences with features ### apparently not needed, methods inherited
#use Bio::SeqUtils; # bioperl module with additional methods (including features) for Bio::Seq objects ### apparently not needed, methods inherited
### Get options with Getopt::Long
my $Out_Format_Opt; # optional different output seq file format
my $Result_Dir = 'trunc_seq_results'; # path to result folder for fof input
my $VERSION = 0.2;
my ($Opt_Version, $Opt_Help);
GetOptions ('outformat=s' => \$Out_Format_Opt,
'result_dir=s' => \$Result_Dir,
'version' => \$Opt_Version,
'help|?' => \$Opt_Help)
or pod2usage(-verbose => 1, -exitval => 2);
### Run perldoc on POD
pod2usage(-verbose => 2) if ($Opt_Help);
if ($Opt_Version) {
print "$0 $VERSION\n";
exit;
}
### Check input (@ARGV); didn't include STDIN as input option, too complicated here with fof etc.
my $Fof; # file of filenames (fof) with truncation coords
my $Start;
my $Stop;
my $Seq_File;
if (@ARGV < 1 || @ARGV == 2 || @ARGV > 3) {
my $warning = "\n### Fatal error: Give either three arguments,\n$0\tstart\tstop\tseq-file\nor one file of sequence filenames with truncation coordinates as argument! Please see the usage with option '-h' if unclear!\n";
pod2usage(-verbose => 0, -message => $warning, -exitval => 2);
} elsif (@ARGV == 1) { # fof
check_file_exists($ARGV[0]); # subroutine to check for file existence
$Fof = shift;
} elsif (@ARGV == 3) {
check_file_exists($ARGV[2]); # subroutine
if ($ARGV[0] !~ /^\d+$/ || $ARGV[1] !~ /^\d+$/) {
my $warning = "\n### Fatal error: With a single sequence file input the first and second arguments are the start and stop positions for truncation, and need to include ONLY digits:\n$0\tstart\tstop\tseq-file\nPlease see the usage with option '-h' if unclear!\n";
pod2usage(-verbose => 0, -message => $warning, -exitval => 2);
}
($Start, $Stop, $Seq_File) = @ARGV;
}
### Truncate the sequence and write either to STDOUT for single seq file input or output files for fof
if ($Fof) {
open (my $fof_fh, "<", "$Fof");
# create result folder
$Result_Dir =~ s/\/$//; # get rid of a potential '/' at the end of $Result_Dir path
if (-e $Result_Dir) {
empty_dir($Result_Dir); # subroutine to empty a directory with user interaction
} else {
mkdir $Result_Dir;
}
while (my $line = <$fof_fh>) {
chomp $line;
next if ($line =~ /^\s*$/ || $line =~ /^#/); # skip empty or comment lines
die "\n### Fatal error: Line '$.' of the '$Fof' file of sequence filenames plus truncation coordinates does not include the mandatory tab-separated two NUMERICAL start and stop truncation positions, and the sequence file (without any other whitespaces):\nstart\tstop\tpath/to/seq-file\n" if ($line !~ /^\d+\t\d+\t\S+$/);
($Start, $Stop, $Seq_File) = split(/\t/, $line);
check_file_exists($Seq_File); # subroutine
my ($seqin, $truncseq) = trunc_seq($Start, $Stop, $Seq_File); # subroutine to create a Bio::SeqIO input object and truncate the respective Bio::Seq object
my $seqout = seq_out($seqin, $Start, $Stop, $Seq_File); # subroutine to create a Bio::SeqIO output object, $seqin needed for format guessing, $Start/$Stop/$Seq_File needed for output filenames
$seqout->write_seq($truncseq);
}
close $fof_fh;
} else { # single seq file, @ARGV == 3
my ($seqin, $truncseq) = trunc_seq($Start, $Stop, $Seq_File); # subroutine
my $seqout = seq_out($seqin); # subroutine, without $Start/$Stop/$Seq_file for STDOUT output
$seqout->write_seq($truncseq);
}
exit;
###############
# Subroutines #
###############
### Subroutine to check if file exists
sub check_file_exists {
my $file = shift;
die "\n### Fatal error: File '$file' does not exist: $!\n" if (!-e $file);
}
### Subroutine to empty a directory with user interaction
sub empty_dir {
my $dir = shift;
print STDERR "\nDirectory '$dir' already exists! You can use either option '-r' to set a different output result directory name, or do you want to replace the directory and all its contents [y|n]? ";
my $user_ask = <STDIN>;
if ($user_ask =~ /y/i) {
unlink glob "$dir/*"; # remove all files in results directory
} else {
die "\nScript abborted!\n";
}
return 1;
}
### Subroutine to create a Bio::SeqIO output object
sub seq_out {
my ($seqin, $start, $stop, $seq_file) = @_;
my $out_format; # need to keep $Out_Format_Opt for several seq files with fof
if ($Out_Format_Opt) {
$Out_Format_Opt = 'genbank' if ($Out_Format_Opt =~ /(gbk|gb)/i); # allow shorter input for GENBANK format
$out_format = $Out_Format_Opt;
} else { # same format as input file
if (ref($seqin) =~ /Bio::SeqIO::(genbank|embl|fasta)/) { # from bioperl guessing
$out_format = $1;
} else {
die "\n### Fatal error: Could not determine input file format, please set an output file format with option '-o'!\n";
}
}
my $seqout; # Bio::SeqIO object
if ($seq_file) { # fof
$seq_file =~ s/\S+(\/|\\)//; # remove input filepaths, aka 'basename' ('/' for Unix and '\' for Windows)
my $file_ext;
if ($out_format eq 'genbank') {
$file_ext = 'gbk'; # back to shorter file extension for GENBANK format
} else {
$file_ext = $out_format;
}
$seq_file =~ s/^(\S+)\.\w+$/$Result_Dir\/$1\_trunc_$start\_$stop\.$file_ext/; # append also result directory to output filename
$seqout = Bio::SeqIO->new(-file => ">$seq_file", -format => $out_format);
} else { # single seq file input
$seqout = Bio::SeqIO->new(-fh => \*STDOUT, -format => $out_format); # printing to STDOUT requires '-format'
}
return $seqout;
}
### Subroutine create a Bio::SeqIO input object and truncate the respective Bio::Seq object
sub trunc_seq {
my ($start, $stop, $seq_file) = @_;
print STDERR "\nTruncating \"$seq_file\" to coordinates $start..$stop ...\n";
my $seqin = Bio::SeqIO->new(-file => "<$seq_file"); # Bio::SeqIO object; no '-format' given, leave it to bioperl guessing
my $count = 0;
my $truncseq;
while (my $seq_obj = $seqin->next_seq) { # Bio::Seq object
$count++;
if ($count > 1) {
warn "\n### Warning: More than one sequence entry in sequence file '$seq_file', but only the FIRST sequence entry will be truncated and printed to STDOUT or a result file!\n\n";
last;
}
$truncseq = Bio::SeqUtils->trunc_with_features($seq_obj, $start, $stop);
}
return ($seqin, $truncseq); # $seqin needed for outformat guessing in subroutine seqout
}