Skip to content

Add clustal output format as an option #50

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 7 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
famsa
FAMSA.VC.VC.opendb
Debug
x64
Expand All @@ -9,3 +10,6 @@ FAMSA.VC.db
*.opendb
/.vs
/src/famsa.vcxproj.user
**/*.o
libs/mimalloc/*.o
.vscode/**
1 change: 1 addition & 0 deletions src/core/defs.h
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,7 @@ const symbol_t NO_AA_SYMBOLS = UNKNOWN_SYMBOL; // no. of symbols that can be
#define MAX3(x, y, z) (max(x, max(y, z)))
#define ABS(x) ((x) >= 0 ? (x) : -(x))

enum class seq_t {AA, DNA, RNA};

inline void *my_align(std::size_t alignment, std::size_t size,
void *&ptr, std::size_t &space) {
Expand Down
265 changes: 264 additions & 1 deletion src/core/io_service.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,19 @@ Authors: Sebastian Deorowicz, Agnieszka Debudaj-Grabysz, Adam Gudys
using namespace std;

// *******************************************************************
bool IOService::saveAlignment(const std::string& file_name, vector<CGappedSequence*>& sequences, int no_threads, int gzip_level)
bool IOService::saveAlignment(const std::string& file_name, vector<CGappedSequence*>& sequences, const std::string& format, seq_t seq_type, int no_threads, int gzip_level)
{
if (format == "fasta"){
return saveFASTAFormat(file_name, sequences, no_threads, gzip_level);
} else if (format == "clustal"){
return saveCLUSTALFormat(file_name, sequences, seq_type);
} else {
return saveFASTAFormat(file_name, sequences, no_threads, gzip_level);
}
}

// *******************************************************************
bool IOService::saveFASTAFormat(const std::string& file_name, vector<CGappedSequence*>& sequences, int no_threads, int gzip_level)
{
string s;
string id, seq;
Expand Down Expand Up @@ -180,3 +192,254 @@ bool IOService::saveAlignment(const std::string& file_name, vector<CGappedSequen

return true;
}


// BEGIN CLUSTAL OUTPUT SPECIFIC CODE
// *******************************************************************
FILE* openFile(const std::string& file_name, const char* mode = "r")
{
FILE* file_ptr = std::fopen(file_name.c_str(), mode);
if (!file_ptr) {
std::cerr << "Error: could not open file \""
<< file_name << "\" with mode \"" << mode << "\"\n";
std::exit(EXIT_FAILURE);
}
return file_ptr;
}

// *******************************************************************
size_t utf8len(const char *s)
{
size_t len = 0;
for (; *s; ++s) if ((*s & 0xC0) != 0x80) ++len;
return len;
}

// This was taken and modified from:
// https://github.com/GSLBiotech/clustal-omega/blob/d21fab82d380638c568c9427ed39cb42dd87d93b/src/squid/clustal.c#L191

// *******************************************************************
bool IOService::saveCLUSTALFormat(const std::string& file_name, vector<CGappedSequence*>& sequences, seq_t seq_type)
{
int idx; /* counter for sequences */
int len; /* tmp variable for name lengths */
int namelen; /* maximum name length used */
int pos; /* position counter */
char *buf; /* buffer for writing seq */
int cpl = 60; //msa->alen<iWrap ? msa->alen+10 : (iWrap > 0 ? iWrap : 60); /* char per line (< 64) */

/* consensus line stuff */
int subpos;
char first;
int bail;
int strong_bins[9];
int weak_bins[11];
/*int cons;*/
int bin;
int nseq = sequences.size(); // msa->nseq
bool bResno = false;
FILE *fp = openFile(file_name, "w");
int *piResCnt = NULL;

if (1 == bResno) {
if (NULL == (piResCnt = (int *)malloc(nseq * sizeof(int)))) {
printf("%s:%s:%d: could not malloc %d int for residue count",
__FUNCTION__, __FILE__, __LINE__, nseq);
return false;
} else {
memset(piResCnt, 0, nseq * sizeof(int));
}
} /* do print residue numbers */

if (NULL == (buf = (char *)malloc(cpl+20))) {
printf("%s:%s:%d: could not malloc %d char for buffer",
__FUNCTION__, __FILE__, __LINE__, cpl+20);
return false;
} else {
memset(buf, 0, cpl+20);
}

/* calculate max namelen used */
namelen = 0;
for (idx = 0; idx < nseq; idx++)
{
/*if ((len = strlen(msa->sqname[idx])) > namelen) */ /* strlen() gives problems for unicode, FS, -> 290 */
if ((len = utf8len(sequences[idx]->id.c_str())) > namelen)
namelen = len;
}

fprintf(fp, "CLUSTAL multiple sequence alignment format. Alignment performed with FAMSA.\n");

/*****************************************************
* Write the sequences
*****************************************************/

fprintf(fp, "\n"); /* original had two blank lines */

map<string, string> alignments;
int alen = 0; // length of global alignment

string firstseq;
for (idx = 0; idx < nseq; idx++)
{
auto p = sequences[idx];
string seq = p->Decode();
string id = p->id;
if (idx == 0){
alen = seq.size();
firstseq = seq;
}
alignments[id] = seq;

// cout << id << " -- " << alignments[id] << endl;
}

for (pos = 0; pos < alen; pos += cpl)
{
fprintf(fp, "\n"); /* Blank line between sequence blocks */
for (idx = 0; idx < nseq; idx++)
{
auto p = sequences[idx];
string id = p->id;
string sqname = id.c_str();// msa->sqname[idx]
if (sqname[0] == '>') {
sqname.erase(0, 1);
}
const char *aseq = alignments[id].c_str(); // msa->aseq
const char *csqname = sqname.c_str();
strncpy(buf, aseq + pos, cpl);

buf[cpl] = '\0';
if (1 == bResno) {
char *pc = NULL;
for (pc = buf; *pc != '\0'; pc++){
if ( ( (*pc >= 'a') && (*pc <= 'z') ) || ( (*pc >= 'A') && (*pc <= 'Z') ) ) {
piResCnt[idx]++;
}
}
/* printf("%*s") gives problems for unicode, FS, -> 290 */
/*fprintf(fp, "%-*s\t%s\t%d\n", namelen+5, msa->sqname[idx], buf, piResCnt[idx]);*/
fprintf(fp, "%s%*s %s\t%d\n", csqname, (int)(namelen+5-utf8len(csqname)), "", buf, piResCnt[idx]);
} else {
/* printf("%*s") gives problems for unicode, FS, -> 290 */
/*fprintf(fp, "%-*s\t%s\n", namelen+5, msa->sqname[idx], buf);*/
fprintf(fp, "%s%*s %s\n", csqname, (int)(namelen+5-utf8len(csqname)), "", buf);
}
}
/* do consensus dots */

/* print namelen+5 spaces */
for(subpos = 0; subpos <= namelen+5; subpos++)
fprintf(fp, " ");

const char *firstaseq = firstseq.c_str(); // msa->aseq
for(subpos = pos; subpos < min(pos + cpl, alen); subpos++)
{
/* see if 100% conservation */
first = firstaseq[subpos];
bail = 0;
for (idx = 1; idx < nseq; idx++)
{
auto p = sequences[idx];
string id = p->id;
const char *aseq = alignments[id].c_str(); // msa->aseq
if(toupper(aseq[subpos]) != toupper(first)) { /* toupper makes consensus case-insensitive, FS, r290 -> */
bail = 1;
break;
}
}
if(!bail)
fprintf(fp, "*");
else {
/* if not then check strong */
for(bin = 0; bin < 9; bin++)
strong_bins[bin] = 0; /* clear the bins */

for (idx = 0; (seq_type == seq_t::AA) && (idx < nseq); idx++)
{ /* do this only for amino acids, no strong/weak consensus for nucleotide, FS, r290 -> */
auto p = sequences[idx];
string id = p->id;
const char *aseq = alignments[id].c_str(); // msa->aseq

switch(toupper(aseq[subpos])) /* toupper makes consensus case-insensitive, FS, r290 -> */
{
case 'S': strong_bins[0]++; break;
case 'T': strong_bins[0]++; break;
case 'A': strong_bins[0]++; break;
case 'N': strong_bins[1]++; strong_bins[2]++; strong_bins[3]++; break;
case 'E': strong_bins[1]++; strong_bins[3]++; break;
case 'Q': strong_bins[1]++; strong_bins[2]++; strong_bins[3]++; strong_bins[4]++; break;
case 'K': strong_bins[1]++; strong_bins[2]++; strong_bins[4]++; break;
case 'D': strong_bins[3]++; break;
case 'R': strong_bins[4]++; break;
case 'H': strong_bins[2]++; strong_bins[4]++; strong_bins[7]++; break; /* added bin-2 (NHQK), FS 2016-07-14 */
case 'M': strong_bins[5]++; strong_bins[6]++; break;
case 'I': strong_bins[5]++; strong_bins[6]++; break;
case 'L': strong_bins[5]++; strong_bins[6]++; break;
case 'V': strong_bins[5]++; break;
case 'F': strong_bins[6]++; strong_bins[8]++; break;
case 'Y': strong_bins[7]++; strong_bins[8]++; break;
case 'W': strong_bins[8]++; break;
}
}
bail = 0;
for(bin = 0; bin < 9; bin++)
if(strong_bins[bin] == nseq) {
bail = 1;
break;
}

if(bail)
fprintf(fp, ":");
else {
/* check weak */
for(bin = 0; bin < 11; bin++)
weak_bins[bin] = 0; /* clear the bins */

for(idx = 0; (seq_type == seq_t::AA) && (idx < nseq); idx++)
{ /* do this only for amino acids, no strong/weak consensus for nucleotide, FS, r290 -> */
auto p = sequences[idx];
string id = p->id;
const char *aseq = alignments[id].c_str(); // msa->aseq

switch(toupper(aseq[subpos])) /* toupper makes consensus case-insensitive, FS, r290 -> */
{
case 'C': weak_bins[0]++; break;
case 'S': weak_bins[0]++; weak_bins[2]++; weak_bins[3]++; weak_bins[4]++; weak_bins[5]++; weak_bins[6]++; break;
case 'A': weak_bins[0]++; weak_bins[1]++; weak_bins[2]++; weak_bins[4]++; break;
case 'T': weak_bins[1]++; weak_bins[3]++; weak_bins[4]++; break;
case 'V': weak_bins[1]++; weak_bins[9]++; break;
case 'G': weak_bins[2]++; weak_bins[5]++; break; /* Added bin-5, FS 2016-07-14 */
case 'N': weak_bins[3]++; weak_bins[5]++; weak_bins[6]++; weak_bins[7]++; weak_bins[8]++; break;
case 'K': weak_bins[3]++; weak_bins[6]++; weak_bins[7]++; weak_bins[8]++; break;
case 'D': weak_bins[5]++; weak_bins[6]++; weak_bins[7]++; break;
case 'E': weak_bins[6]++; weak_bins[7]++; weak_bins[8]++; break;
case 'Q': weak_bins[6]++; weak_bins[7]++; weak_bins[8]++; break;
case 'H': weak_bins[7]++; weak_bins[8]++; weak_bins[10]++; break;
case 'R': weak_bins[8]++; break;
case 'F': weak_bins[9]++; weak_bins[10]++; break;
case 'L': weak_bins[9]++; break;
case 'I': weak_bins[9]++; break;
case 'M': weak_bins[9]++; break;
case 'Y': weak_bins[10]++; break;
}
}
bail = 0;
for(bin = 0; bin < 11; bin++)
if(weak_bins[bin] == nseq) {
bail = 1;
break;
}
if(bail)
fprintf(fp, ".");
else
fprintf(fp, " ");
}
}
}
fprintf(fp,"\n");
} // end position for loop

free(piResCnt); piResCnt = NULL;
return true;
}
7 changes: 6 additions & 1 deletion src/core/io_service.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,12 @@ class IOService {
public:
template <class seq_t>
static size_t loadFasta(const std::string& file_name, std::vector<seq_t>& sequences, memory_monotonic_safe* mma = nullptr);
static bool saveAlignment(const std::string& file_name, vector<CGappedSequence*> & sequences, int no_threads, int gzip_level);
static bool saveAlignment(const std::string& file_name, vector<CGappedSequence*> & sequences, const std::string& format, seq_t seq_type, int no_threads, int gzip_level);

private:
static bool saveFASTAFormat(const std::string& file_name, vector<CGappedSequence*> & sequences, int no_threads, int gzip_level);
static bool saveCLUSTALFormat(const std::string& file_name, vector<CGappedSequence*> & sequences, seq_t seq_type);

};


Expand Down
26 changes: 25 additions & 1 deletion src/core/params.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,9 @@ void CParams::show_usage(bool expert)

<< " -gz - enable gzipped output (default: " << bool2str[gzippd_output] << ")\n"
<< " -gz-lev <value> - gzip compression level [0-9] (default: " << gzip_level << ")\n"
<< " -refine_mode <on | off | auto> - refinement mode (default: auto - the refinement is enabled for sets <= " << thr_refinement << " seq.)\n\n";
<< " -refine_mode <on | off | auto> - refinement mode (default: auto - the refinement is enabled for sets <= " << thr_refinement << " seq.)\n"
<< " -output_format <fasta | clustal> - output format type (default: fasta)\n"
<< " -seq_type <aa | dna | rna> - input sequence type type, only used for clustal output format (default: aa)\n\n";


if (expert) {
Expand Down Expand Up @@ -186,6 +188,28 @@ bool CParams::parse(int argc, char** argv, bool& showExpert)
findOption(params, "-cluster_fraction", cluster_fraction);
findOption(params, "-cluster_iters", cluster_iters);

string def_val = output_format;
findOption(params, "-output_format", output_format);
if (output_format != "fasta" && output_format != "clustal")
{
LOG_NORMAL << "Incorrect output format: " << output_format << ". This was changed to default value: " << def_val << endl;
output_format = def_val;
}

string input_seq_type = "aa"; // default is amino acid.
def_val = input_seq_type;
findOption(params, "-seq_type", input_seq_type);
if (input_seq_type == "aa")
seq_type = seq_t::AA;
else if (input_seq_type == "dna")
seq_type = seq_t::DNA;
else if (input_seq_type == "rna")
seq_type = seq_t::RNA;
else {
LOG_NORMAL << "Incorrect seq_type: " << input_seq_type << ". This was changed to default value: " << def_val << endl;
seq_type = seq_t::AA;
}

export_tree = findSwitch(params, "-gt_export");
export_distances = findSwitch(params, "-dist_export");
generate_square_matrix = findSwitch(params, "-square_matrix");
Expand Down
2 changes: 2 additions & 0 deletions src/core/params.h
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,8 @@ class CParams
string input_file_name;
string input_file_name_2;
string output_file_name;
string output_format = "fasta";
seq_t seq_type = seq_t::AA;

vector<vector<score_t>> score_matrix;
vector<score_t> score_vector;
Expand Down
7 changes: 4 additions & 3 deletions src/famsa.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ int main(int argc, char *argv[])
CParams params;

if (!params.parse(argc, argv, showExpert)) {
// some parameters could be parsed - used default values for printing
// some parameters could be parsed - used clustal values for printing
CParams def_params;
def_params.show_usage(showExpert);
return 0;
Expand Down Expand Up @@ -86,7 +86,7 @@ int main(int argc, char *argv[])

profile_aligner.GetAlignment(result);

IOService::saveAlignment(params.output_file_name, result, params.n_threads,
IOService::saveAlignment(params.output_file_name, result, params.output_format, params.seq_type, params.n_threads,
params.gzippd_output ? params.gzip_level : -1);
return 0;
}
Expand All @@ -113,7 +113,8 @@ int main(int argc, char *argv[])
famsa.getStatistics().put("alignment.length", result[0]->gapped_size);

LOG_VERBOSE << "Saving alignment in " << params.output_file_name;
ok = IOService::saveAlignment(params.output_file_name, result, params.n_threads,

ok = IOService::saveAlignment(params.output_file_name, result, params.output_format, params.seq_type, params.n_threads,
params.gzippd_output ? params.gzip_level : -1);

LOG_VERBOSE << " [OK]" << endl;
Expand Down
Loading