-
Notifications
You must be signed in to change notification settings - Fork 7
/
Copy pathduplex_filter.cpp
314 lines (290 loc) · 12.4 KB
/
duplex_filter.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
//
// Created by Ruolin Liu on 2/18/20.
//
#include <iostream>
#include <fstream>
#include <string>
#include <getopt.h>
#include "Alignment.h"
#include "AlignmentConsensus.h"
#include "BamIO.h"
#include "MutCounter.h"
#ifdef BGZF_MAX_BLOCK_SIZE
#pragma push_macro("BGZF_MAX_BLOCK_SIZE")
#undef BGZF_MAX_BLOCK_SIZE
#define BGZF_MAX_BLOCK_SIZE_BAK
#endif
#ifdef BGZF_BLOCK_SIZE
#pragma push_macro("BGZF_BLOCK_SIZE")
#undef BGZF_BLOCK_SIZE
#define BGZF_BLOCK_SIZE_BAK
#endif
#include "InsertSeqFactory.h"
#ifdef BGZF_MAX_BLOCK_SIZE_BAK
#undef BGZF_MAX_BLOCK_SIZE_BAK
#pragma pop_macro("BGZF_MAX_BLOCK_SIZE")
#endif
#ifdef BGZF_BLOCK_SIZE_BAK
#undef BGZF_BLOCK_SIZE_BAK
#pragma pop_macro("BGZF_BLOCK_SIZE")
#endif
using std::string;
struct CssOptions {
string bam;
string reference;
int mapq = 10;
// bool output_nononverlapping_pair = false;
bool clip3 = false;
// int pair_min_overlap = 1;
bool trim_overhang = false;
string tmpdir = "/tmp";
//bool output_singleend = false;
int bqual_min = 0;
string outbam = "";
int thread = 1;
int min_fraglen = 30;
int max_fraglen = std::numeric_limits<int>::max();
float min_passQ_frac = 0;
bool filter_5endclip = false;
int max_N_filter = std::numeric_limits<int>::max();
int max_snv_filter = std::numeric_limits<int>::max();
int clustered_mut_cutoff = std::numeric_limits<int>::max();
float max_frac_prim_AS = std::numeric_limits<float>::max();
float max_pair_mismatch_frac = 1.0;
//hidden options
int count_read = 0;
bool standard_ngs_filter = false;
bool load_unpair = true;
bool load_supplementary = false;
bool load_secondary = false;
};
static struct option filter_long_options[] = {
{"bam", required_argument, 0, 'b'},
{"reference", required_argument, 0, 'r'},
// {"load_supplementary", no_argument, 0, 'l'},
// {"load_unpair", no_argument, 0, 'u'},
// {"load_secondary", no_argument, 0, '2'},
// {"load_duplicate", no_argument, 0, 'D'},
{"clip3", no_argument, 0, 'C'},
{"mapq", required_argument , 0, 'm'},
{"baseq", required_argument , 0, 'q'},
{"outbam", required_argument , 0, 'o'},
// {"pair_min_overlap", required_argument, 0, 'p'},
{"trim_overhang", no_argument, 0, 't'},
// {"output_nononverlapping_pair",no_argument, 0, 'i'},
{"min_fraglen", required_argument, 0, 'g'},
{"max_fraglen", required_argument, 0, 'G'},
{"max_frac_prim_AS", required_argument, 0, 'B'},
{"min_passQ_frac", required_argument, 0, 'Q'},
{"max_N_filter", required_argument, 0, 'y'},
{"max_snv_filter", required_argument, 0, 'x'},
{"max_pair_mismatch_frac", required_argument, 0, 'N'},
{"clustered_mut_cutoff", required_argument, 0, 'c'},
{"filter_5endclip", no_argument, 0, '5'},
// {"dirtmp", required_argument , 0, 'd'},
// {"thread", required_argument, 0, 'T'},
{0,0,0,0}
};
const char* filter_short_options = "b:m:o:lCq:tg:G:B:Q:y:x:c:5r:N:";
void filter_print_help()
{
std::cerr<< "---------------------------------------------------\n";
std::cerr<< "Description: A standalone program designed for fragment-level and base-level filtering used in the 'codec call' process.\n";
std::cerr<< "This can remove non-duplex and low quality reads and bases.\n";
std::cerr<< "Usage: codec filter [options].\n";
std::cerr<< "General Options:\n";
std::cerr<< "-b/--bam, Input bam in query-name sorted order [required]\n";
std::cerr<< "-o/--outbam, Output bam in query-name sorted order [required].\n";
std::cerr<< "-r/--reference, Reference genome [required].\n";
std::cerr<< "-m/--mapq, Min mapping quality [10].\n";
std::cerr<< "-q/--baseq, If one of the baseq < cutoff, make all baseq = 2 so that VC will ingnore them. [0].\n";
std::cerr<< "-t/--trim_overhang, Remove overhang regions (where R1 and R2 do not overlap) in the output reads. [false].\n";
std::cerr<< "-C/--clip3, trim the 3'end soft clipping. [false].\n";
// std::cerr<< "-s/--output_singleend, The R1R2 consensus will be output in a single end format [false].\n";
// std::cerr<< "-p/--pair_min_overlap, When using selector, the minimum overlap between the two ends of the pair [1].\n";
// std::cerr<< "-d/--dirtmp, Temporary dir for sorted bam [/tmp]\n";
// std::cerr<< "-T/--thread, Number of threads for sort [1]\n";
std::cerr<<"filtering options\n";
std::cerr<< "-G/--max_fraglen, Filter out a read if its fragment length is larger than this value [INT_MAX].\n";
std::cerr<< "-g/--min_fraglen, Filter out a read if its fragment length is less than this value [30].\n";
std::cerr<< "-5/--filter_5endclip, Filtering out reads with 5'end soft clipping [False].\n";
std::cerr<< "-B/--max_frac_prim_AS, Filter out a read if the AS of the secondary alignment is >= this fraction times the AS of the primary alignment [FLOAT_MAX].\n";
std::cerr<< "-Q/--min_passQ_frac, Filter out a read if the fraction of bases passing quality threshold (together with -q) is less than this number [0].\n";
std::cerr<< "-x/--max_snv_filter, Skip a read if the number of mismatch bases is larger than this value [INT_MAX].\n";
std::cerr<< "-y/--max_N_filter, Skip a read if its num of N bases is larger than this value [INT_MAX].\n";
std::cerr<< "-N/--max_pair_mismatch_frac, Filter out a read-pair if its R1 and R2 has mismatch fraction larger than this value [1.0].\n";
std::cerr<< "-c/--clustered_mut_cutoff, Filter out a read if at least this number of mutations occur in a window of 30 bp near the read end (<15 bp). [INT_MAX].\n";
//std::cerr<< "-M/--consensus_mode, 0 for paired end consensus. [0]\n";
}
int filter_parse_options(int argc, char* argv[], CssOptions& opt) {
int option_index;
int next_option = 0;
do {
next_option = getopt_long(argc, argv, filter_short_options, filter_long_options, &option_index);
switch (next_option) {
case -1:break;
case 'r':
opt.reference = optarg;
break;
case 'b':
opt.bam = optarg;
break;
case 'o':
opt.outbam = optarg;
break;
case 'm':
opt.mapq = atoi(optarg);
break;
case 'q':
opt.bqual_min = atoi(optarg);
break;
case 'l':
opt.load_supplementary = true;
break;
case 't':
opt.trim_overhang = true;
break;
// case 'i':
// opt.output_nononverlapping_pair = true;
// break;
case 'C':
opt.clip3 = true;
break;
case 'd':
opt.tmpdir = optarg;
break;
case 'T':
opt.thread = atoi(optarg);
break;
case 'g':
opt.min_fraglen = atoi(optarg);
break;
case 'G':
opt.max_fraglen = atoi(optarg);
break;
case '5':
opt.filter_5endclip = true;
break;
case 'Q':
opt.min_passQ_frac = atof(optarg);
break;
case 'B':
opt.max_frac_prim_AS = atof(optarg);
break;
case 'y':
opt.max_N_filter = atoi(optarg);
break;
case 'x':
opt.max_snv_filter = atoi(optarg);
break;
case 'N':
opt.max_pair_mismatch_frac = atof(optarg);
break;
case 'c':
opt.clustered_mut_cutoff = atoi(optarg);
break;
default:filter_print_help();
return 1;
}
} while (next_option != -1);
return 0;
}
int codec_filter(int argc, char ** argv) {
CssOptions opt;
int parse_ret = filter_parse_options(argc, argv, opt);
if (parse_ret) return 1;
if (argc == 1) {
filter_print_help();
return 1;
}
if (opt.outbam.empty()) {
std::cerr << "-o/--outbam is required" << std::endl;
filter_print_help();
return 1;
}
if (opt.bam.empty()) {
std::cerr << "-b/--bam is required" << std::endl;
filter_print_help();
return 1;
}
//char temp[100];
//strcpy(temp, opt.tmpdir.c_str());
//strcat(temp, "/tempsort.XXXXXX");
//int fd = mkstemp(temp);
//if (fd == -1) {
// std::cerr << "unable to create temp file for sorting bam in queryname order\n";
// return 1;
//}
//string samsort = "samtools sort -n " + opt.bam + " -o " + string(temp) + " -@ " + std::to_string(opt.thread);
//std::cout << "sorting bam: " << samsort << std::endl;
//std::system(samsort.c_str());
//std::cout << "sorting done" << std::endl;
SeqLib::RefGenome ref;
ref.LoadIndex(opt.reference);
const int L = 250;
auto errorstat = cpputil::ErrorStat(L, opt.bqual_min);
cpputil::InsertSeqFactory isf(opt.bam,
0,
opt.load_supplementary,
opt.load_secondary,
false,
!opt.load_unpair,
opt.clip3);
//cpputil::UnMappedBamWriter writer(opt.outbam, isf.bamheader());
SeqLib::BamWriter bamwriter;
bamwriter.Open(opt.outbam);
bamwriter.SetHeader(isf.bamheader());
bamwriter.WriteHeader();
int64_t read_counter = 0;
int64_t duplex_counter = 0;
int64_t pass_counter = 0;
while (!isf.finished()) {
std::vector<cpputil::Segments> frag;
while( (frag= isf.FetchReadNameSorted(opt.load_unpair)).size() > 0) {
for (auto seg : frag) {
assert (seg.size() == 2);
++ read_counter;
int ol = cpputil::GetNumOverlapBasesPEAlignment(seg);
if (ol > 0) {
++duplex_counter;
std::vector<std::string> orig_quals; // consensus output
std::vector<std::string> seqs;
for (auto &s: seg) {
seqs.push_back(s.Sequence());
}
int pair_nmismatch = 0, olen = 0;
float nqpass;
int fail = cpputil::FailFilter(frag, isf.bamheader(), ref, opt, errorstat, true, pair_nmismatch, nqpass,
olen);
if (fail) {
continue;
}
++pass_counter;
auto seq = cpputil::PairConsensus(seg, seqs, opt.trim_overhang, opt.bqual_min,
orig_quals); // trim overhang if true
//if (seg.front().FirstFlag()) {
seg.front().SetSequence(seq.first);
seg.front().SetQualities(orig_quals.front(), 33);
seg.back().SetSequence(seq.second);
seg.back().SetQualities(orig_quals.back(), 33);
//writer.WriteRecord(seg.front(), seg.back(), seq.first, seq.second, orig_quals.front(), orig_quals.back());
//}
//else if (seg.back().FirstFlag()) {
//writer.WriteRecord(seg.back(), seg.front(), seq.second, seq.first, orig_quals.back(), orig_quals.front());
//}
bamwriter.WriteRecord(seg.front());
bamwriter.WriteRecord(seg.back());
}
}
}
}
bamwriter.Close();
std::cout << "total_molecules" << "\t" << "duplex_molecules" << "\t" << "pass_filter_duplexes" << "\t" << "duplexes_filtered_by_mapq" \
<< "\t" << "duplexes_filtered_by_mismatch_rate" << "\t" << "duplexes_filtered_by_base_quality_rate" << "\t" << "duplexes_filtered_by_edit_distance" \
<< "\t" << "duplexes_filtered_by_clustered_errors" << "\t" << "duplexes_filtered_by_sclip" << "\t" << "duplexes_filtered_by_largefrag" << std::endl;
std::cout << read_counter << "\t" << duplex_counter << "\t" << pass_counter << "\t" << errorstat.n_filtered_by_mapq << \
"\t" << errorstat.n_filtered_pairmismatch_rate << "\t" << errorstat.n_filtered_q30rate << "\t" << \
errorstat.n_filtered_edit << "\t" << errorstat.n_filtered_clustered << "\t" << errorstat.n_filtered_sclip << \
"\t" << errorstat.n_filtered_largefrag << std::endl;
return 0;
}