-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathGapDistFromFasta.pl
executable file
·130 lines (117 loc) · 2.73 KB
/
GapDistFromFasta.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
#!/usr/bin/perl
# Help text printed together with the error message when the script is
# started with a missing or invalid mode flag. The script name shown here
# matches the actual file name so the example commands can be copied as-is,
# and the output prefix is marked optional ([outpref]) because the script
# falls back to the fasta file name when it is not given.
my $usage = "
# # # # # #
# GapDistFromFasta.pl
# written by Linnéa Smeds 19 oct 2010
# ======================================================
# Makes a list of all gap sequences (N:s) in a fasta
# file, and also prints the length distribution of
# gaps. Can be run in \"fast\" mode, and then only return
# the total number of N:s in the input file.
# ======================================================
# Usage: perl GapDistFromFasta.pl <fastafile> <slow|fast>
# [outpref]
#
# Example 1: perl GapDistFromFasta.pl mySeqs.fa fast
# (Returns the total number of Ns in file)
# Example 2: perl GapDistFromFasta.pl mySeqs.fa slow prefix
# (Returns prefix.gaps with a list of all gaps (start,
# stop and length), and a prefix.gaphist with the gap
# size distribution: column1 = gap size,
# column2 = #gap of this size)
";
use strict;
use warnings;
# Input parameters:
#   $fasta   - path to the input fasta file (required)
#   $flag    - run mode, either "slow" or "fast" (required)
#   $outpref - prefix for the output files (optional, slow mode only)
my ($fasta, $flag, $outpref) = @ARGV;

# Validate before any string comparison so that a missing argument gives a
# clear message instead of "uninitialized value" warnings.
if (!defined $fasta || !defined $flag) {
    die "Both a fasta file and a mode flag are required\n\n$usage";
}
if ($flag ne "slow" && $flag ne "fast") {
    die "Flag must be either \"slow\" or \"fast\"\n\n$usage";
}

# Output file names are only needed in slow mode; fast mode writes no files.
# Without an explicit prefix the fasta file name itself is used as prefix.
my ($listOut, $histOut);
if ($flag eq "slow") {
    my $prefix = defined $outpref ? $outpref : $fasta;
    $listOut = $prefix . ".gaps";
    $histOut = $prefix . ".gaphist";
}
# Open the input fasta file for reading. The three-argument form keeps
# characters such as '>' or '|' in the file name from being interpreted as
# an open mode, and a failure is reported instead of silently reading nothing.
open(IN, '<', $fasta) or die "Could not open input file $fasta: $!\n";

# The gap list file is only written in slow mode.
if ($flag eq "slow") {
    open(OUT, '>', $listOut) or die "Could not open output file $listOut: $!\n";
}
# Running state shared with the reporting code further down.
my %hist = ();                  # gap size => number of gaps with that size (slow mode)
my ($head, $seq) = ("", "");    # id and concatenated sequence of the current record
my $sum = 0;                    # total number of N/n bases seen so far
my $have_record = 0;            # true once the first header line has been read

# Returns every run of N/n in the given sequence as a list of array refs
# [start, end, size], with 1-based inclusive coordinates. A run that reaches
# the end of the sequence is reported like any other run.
sub find_gaps {
    my ($sequence) = @_;
    my @gaps;
    while ($sequence =~ m/[Nn]+/g) {
        # $-[0] / $+[0] are the 0-based start offset and the offset just
        # past the end of the last match.
        push @gaps, [ $-[0] + 1, $+[0], $+[0] - $-[0] ];
    }
    return @gaps;
}

# Processes the record currently held in $head/$seq according to the mode.
my $process_record = sub {
    # Only sum up the number of N:s. Both cases are counted so the total
    # agrees with slow mode, which also treats 'n' as a gap base.
    if ($flag eq "fast") {
        $sum += ($seq =~ tr/Nn//);
    }
    # Print each gap with start, end and size to the .gaps file
    elsif ($flag eq "slow") {
        for my $gap (find_gaps($seq)) {
            my ($start, $end, $size) = @$gap;
            print OUT $head . "\t" . $start . "\t" . $end . "\t" . $size . "\n";
            $sum += $size;
            $hist{$size}++;
        }
    }
};

# Read the file line by line. A record is finished when the next header (or
# the end of the file) is reached, so no seeking back is needed and the input
# may also be a pipe.
while (my $line = <IN>) {
    chomp($line);
    $line =~ s/\r\z//;          # tolerate Windows line endings
    if ($line =~ m/^>/) {
        $process_record->() if $have_record;
        # The record id is the first whitespace-delimited word without '>'.
        my @tab = split(/\s+/, $line);
        $head = $tab[0];
        $head =~ s/>//;
        $seq = "";
        $have_record = 1;
    }
    elsif ($have_record) {
        # Lines before the first header are ignored.
        $seq .= $line;
    }
}
# The last record has no following header to trigger its processing.
$process_record->() if $have_record;
close(IN);

# Print the distribution of gap sizes to a second file (.gaphist):
# column 1 = gap size, column 2 = number of gaps of that size, sorted
# numerically by gap size.
if ($flag eq "slow") {
    # Buffered write errors (e.g. a full disk) only surface at close.
    close(OUT) or die "Could not finish writing $listOut: $!\n";

    open(my $hist_fh, '>', $histOut)
        or die "Could not open output file $histOut: $!\n";
    foreach my $size (sort { $a <=> $b } keys %hist) {
        print $hist_fh $size . "\t" . $hist{$size} . "\n";
    }
    close($hist_fh) or die "Could not finish writing $histOut: $!\n";
}

# The total is reported in both modes.
print "Total number of N: $sum bp\n";