-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathprot2cdnAln.pl
executable file
·122 lines (96 loc) · 3.09 KB
/
prot2cdnAln.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
#!/usr/bin/env perl
# (c) Pablo Vinuesa & Bruno Contreras-Moreira
# CCG-UNAM; http://www.ccg.unam.mx/~vinuesa/
# AAaln2cdnAln.pl Sep 25th, 2009
# reverse translates a protein alignment, given the DNA seqs, to generate the underlying codon alignment!
# use warnings;
use strict;
use File::Basename;
my $progname = basename($0);
my $version = 0.2; # 2022-11-10; added NOTE in help and checks if cdn_aln was written to disk
# v0.1Sep 25th, 2009;
die "\n# $progname V.$version needs two arguments:
1) the dna input sequence to align;
2) the corresponding protein alignment
NOTE: requires that the sequences in the protein alignment are in the same order as in the input fasta\n\n" unless @ARGV == 2;
my ($dna_input,$prot_aln) = @ARGV;
my $cdnAln=(convert_AAaln2NTaln($dna_input,$prot_aln));
if (-s $cdnAln){ print "# wrote $cdnAln\n";}
else { print STDERR " ERROR: the expected codon alignment file $cdnAln was not produced !!!\n";}
sub convert_AAaln2NTaln
{
my ($DNAfile,$AA_aln_file) = @_;
my ($basename,$ext) = (split(/\./, $DNAfile))[0,1];
my $outfile = $basename . "_cdnaln.$ext";
my ($seq,$sequence,$aa,$n_of_aa,$aligned_nts)=('','','','','',);
my ($align_positions,$FASTA,@length,$l);
# 1) read AA aligned fasta sequences
my %AAalign = read_FASTA_sequence($AA_aln_file);
# 2) read NT fasta sequence
my %NTsequences = read_FASTA_sequence($DNAfile);
# 3) replicate alignment
foreach $seq (sort {$a<=>$b} (keys(%NTsequences)))
{
$FASTA .= $NTsequences{$seq}{'NAME'};
# 3.1) dissect codons
my @codons;
$sequence = $NTsequences{$seq}{'SEQ'};
while($sequence)
{
push(@codons,substr($sequence,0,3));
$sequence = substr($sequence,3);
}
# 3.2) loop through aminoacids in $AAalign{$seq}
$sequence = $AAalign{$seq}{'SEQ'};
$n_of_aa = length($sequence);
$align_positions=0;
$aligned_nts = '';
for($aa=0;$aa<=$n_of_aa;$aa++)
{
if(substr($sequence,$aa,1) ne '-')
{
$aligned_nts .= $codons[$align_positions];
$align_positions++;
}
else
{
$aligned_nts .= '---';
}
}
$l = length($aligned_nts);
if(!grep(/$l/,@length)){ push(@length,$l); }
$FASTA .= $aligned_nts."\n";
}
if(scalar(@length) > 1)
{
print "# convert_AAaln2NTaln : warning, sequence lengths differ\n";
}
# write the output FASTA file
open(FASTA,">$outfile") || die "# convert_AAaln2NTaln : cannot write to $outfile\n";
print FASTA $FASTA;
close FASTA;
return $outfile;
}
sub read_FASTA_sequence
{
my ($infile) = @_;
my (%FASTA,$name,$seq,$n_of_sequences);
$n_of_sequences = 0;
open(FASTA,$infile) || die "# read_FASTA_sequence: cannot read $infile\n";
while(<FASTA>)
{
if(/^\>/)
{
$name = $_;
$n_of_sequences++;
$FASTA{$n_of_sequences}{'NAME'} = $name;
}
else
{
$FASTA{$n_of_sequences}{'SEQ'} .= $_;
$FASTA{$n_of_sequences}{'SEQ'} =~ s/[\s|\n]//g;
}
}
close(FASTA);
return %FASTA;
}