-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathConcat_Seq.pl
153 lines (110 loc) · 4.47 KB
/
Concat_Seq.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
use strict;
use warnings;
use threads;
use FindBin;
use lib "$FindBin::Bin/lib";
use Getopt::Long;
use File::Basename qw<basename dirname>;
#########################################################################################################################################
# Before using, Concat_Seq.pl needs to pre-install several Perl Modules and NCBI BLAST+ programs
# Required Perl Modules: threads File::Basename Getopt::Long Bio::SeqIO; (part of BioPerl, http://bioperl.org)
#########################################################################################################################################
my $now_time = localtime;
print "\n$now_time : Concat_Seq start...\n\n";
my $usage = <<USAGE;
=NAME
Concat_Seq.pl
=DESCRIPTION
Run this command to enble users to concatenate alignment files into a pseudo-DNA fasta file.
=USAGE
Concat_Seq.pl -dir alignment_file_directory
FOR EXAMPLE:
# perl /media/xiangyang/X128/test/127/Orthogroups/Concat_Seq.pl -dir /media/xiangyang/X128/test/127/Orthogroups/alignment_dir
=ARGUMENTS
=======================
REQUIRED ARGUMENTS:
~~~~~~~~~~~~~~~~~~~
-dir, --alignment_file_directory
A directory containing alignment files as FASTA format file. To avoid a mistake, sequence ids cannot use special character,
such as space, equal.
-h, --help
Show this message.
=AUTHOR
Dr. Xiangyang Li (E-mail: lixiangyang\@fudan.edu.cn), Fudan university; Kaili University; Bacterial Genome Data mining & Bioinformatic Analysis (www.microbialgenomic.com/).
=COPYRIGHT
Copyright 2021, Xiangyang Li. All Rights Reserved.
USAGE
my %options = (
'alignment_file_directory=s' => undef,
'help' => undef
);
GetOptions(
'dir|alignment_file_directory=s' => \$options{alignment_file_directory},
'h|help' => \$options{help}
);
if (defined $options{help}) {
print "$usage\n";
exit(0);
}
my $home_directory = $FindBin::Bin; # obtaining the home directory where concatenation sequence file located
print "\nConcatenate alignment files into a pseudo-DNA fasta file\n";
my $SNP_dir =
my $snp_fasta_out = concatenation_sequence ($options{alignment_file_directory}, $home_directory);
$now_time = localtime;
print "$now_time : Finished!\n";
sub concatenation_sequence {
my ($path, $out_dir) = @_;
my $snp_fasta_out = "$out_dir/concatenation.fasta";
open (SNP_OUT, ">$snp_fasta_out");
my $Equnce_number=0;
my $number_files = 0; # count number of files
my @hash_ref_array = (); # array with hash references
my %nseq_hash = (); # key:infile, val:nseq
# Read all infiles to count sequences
opendir DIR, $path or die $!;
my @dir = readdir DIR;
closedir DIR;
foreach my $each_file (@dir) {
my $infile = "$path/$each_file";
my %seq_hash = parse_fasta($infile, "T"); # key: seqid, value:sequence
## Save sequences in array with hash references.
push(@hash_ref_array, \%seq_hash);
} # Done with file
######################################concat sequences of identy strain from multiple file############
## First, concatenate all sequences from hashes
my %print_hash = (); # key:label, value:sequence
foreach my $h_ref (@hash_ref_array) {
$number_files++;
foreach my $seqid (sort keys %$h_ref) {
$Equnce_number++; # %$h_ref 为哈希引用
$print_hash{$seqid} .= $h_ref->{$seqid};
}
}
foreach my $label (sort keys %print_hash) {
$print_hash{$label} =~ s/\S{60}/$&\n/gs; ## replace word of size $lwidth with itself and "\n"
print SNP_OUT ">$label\n";
print SNP_OUT $print_hash{$label}, "\n";
}
return $snp_fasta_out;
}
sub parse_fasta {
my ($infile, $unify_id) = @_;
my $term = $/; # input record separator;
my %seq_hash = (); # key:seqid, val:seq
open my $INFILE, $infile or die "could not open infile '$infile' : $! \n";
$/ = ">"; # input record separator;
while(<$INFILE>) {
chomp;
next if($_ eq '');
my ($id, @sequencelines) = split /\n/;
if ($unify_id eq "T"){
my @ww = split ";", $id; #使每个文件的ID变为一致
$id = $ww[2];
}
foreach my $line (@sequencelines) {
$seq_hash{$id} .= $line;
}
}
$/ = $term;
return(%seq_hash);
} # end of parse_fasta