Skip to content

Commit

Permalink
Merge pull request #69 from ICGC-TCGA-PanCancer/dev
Browse files Browse the repository at this point in the history
Merge bam_stats update for more robust insert size handling
  • Loading branch information
keiranmraine committed Mar 2, 2015
2 parents a214c4e + 04fa18e commit 51fb0cc
Show file tree
Hide file tree
Showing 11 changed files with 707 additions and 49 deletions.
33 changes: 33 additions & 0 deletions Changes
Original file line number Diff line number Diff line change
@@ -1,3 +1,36 @@
1.5.3
bam_stats C - changed array for khash in insert size calculations in order to make code more robust.
Header RG line reading now reads anything not a tab or newline as it should when determining what the values of tags are.
1.5.2
bamToBw.pl fixes
* Pull actual binaries from jkent_util not html page associated
* process name corrections in bamToBw.pm command line args
1.5.1
bam_stats c now has CRAM support.
Also dropped need for samtools v1.x api as can be handled by htslib on it's own.
1.5.0
bamToBw.pl and new biobambam dep
No changes to old tools, just additions and prep for handling CRAM input.
1.4.0
bam_stats in C, less than 2 hours to generate stats on a sample level BAM file of ~120GB.
1.3.0
bam_stats.pl is now multi-threaded, can get ~50% runtime reduction with 3-4 threads, memory still <500MB.
Upgrades biobambam to 0.0.185 (and dependencies).
1.2.3
xml_to_bas.pl - detect readgroup id clashes and attempt to reconcile, #54
1.2.2
Fixed bug in bwa_mem.pl when using '-f' option on paired fastq.
1.2.1
Makes xml_to_bas.pl more robust on AWS. Retrieved XML was being truncated on some network configurations.
1.2.0
Modifications made to the bwa_mem.pl code to split a lane of data into fragments to reduce failure recovery time. Primarily added to handle X10 data better.
Also updated samtools to 0.1.20, last version that is currently compatible with Bio::DB::Sam.
1.1.1
Fix missing dependancy and build a relocatable version of biobambam suitable for use in artifactory.
1.1.0
Minor enhancement to bwa_mem.pl to automatically generate the *.bas file.
Added xml_to_bas.pl for pancancer users, see the wiki for details.
Fixed a few minor issues, #36, #37, #39
1.0.4
Install biobambam 0.0.142 to prevent over-counting of duplicates when multiple
libraries, also required libmaus 0.0.124.
Expand Down
1 change: 1 addition & 0 deletions MANIFEST
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ c/bam_access.c
c/bam_access.h
c/bam_stats.c
c/dbg.h
c/khash.h
Changes
dists/snappy-1.1.2.tar.gz
docs.tar.gz
Expand Down
4 changes: 2 additions & 2 deletions MYMETA.json
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
"unknown"
],
"dynamic_config" : 0,
"generated_by" : "ExtUtils::MakeMaker version 6.68, CPAN::Meta::Converter version 2.142690",
"generated_by" : "ExtUtils::MakeMaker version 6.68, CPAN::Meta::Converter version 2.131560",
"license" : [
"gpl_2"
],
Expand Down Expand Up @@ -55,5 +55,5 @@
}
},
"release_status" : "stable",
"version" : "v1.5.2"
"version" : "v1.5.3"
}
48 changes: 24 additions & 24 deletions MYMETA.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,38 +3,38 @@ abstract: unknown
author:
- unknown
build_requires:
ExtUtils::MakeMaker: '0'
ExtUtils::MakeMaker: 0
configure_requires:
ExtUtils::MakeMaker: '0'
ExtUtils::MakeMaker: 0
dynamic_config: 0
generated_by: 'ExtUtils::MakeMaker version 6.68, CPAN::Meta::Converter version 2.142690'
generated_by: 'ExtUtils::MakeMaker version 6.68, CPAN::Meta::Converter version 2.131560'
license: gpl
meta-spec:
url: http://module-build.sourceforge.net/META-spec-v1.4.html
version: '1.4'
version: 1.4
name: PCAP
no_index:
directory:
- t
- inc
requires:
Bio::DB::Sam: '1.39'
Bio::Root::Version: '1.006923'
Capture::Tiny: '0.24'
Const::Fast: '0.014'
Data::UUID: '1.219'
Devel::Cover: '1.09'
File::Which: '0.05'
GD: '2.52'
IPC::System::Simple: '1.25'
List::Util: '1.38'
Math::Gradient: '0.04'
Module::Build: '0.42'
Pod::Coverage: '0.23'
Proc::ProcessTable: '0.5'
Sub::Exporter::Progressive: '0.001011'
Term::UI: '0.42'
Test::Fatal: '0.013'
Try::Tiny: '0.19'
XML::Simple: '2.2'
version: v1.5.2
Bio::DB::Sam: 1.39
Bio::Root::Version: 1.006923
Capture::Tiny: 0.24
Const::Fast: 0.014
Data::UUID: 1.219
Devel::Cover: 1.09
File::Which: 0.05
GD: 2.52
IPC::System::Simple: 1.25
List::Util: 1.38
Math::Gradient: 0.04
Module::Build: 0.42
Pod::Coverage: 0.23
Proc::ProcessTable: 0.5
Sub::Exporter::Progressive: 0.001011
Term::UI: 0.42
Test::Fatal: 0.013
Try::Tiny: 0.19
XML::Simple: 2.2
version: v1.5.3
2 changes: 1 addition & 1 deletion c/Makefile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
VERSION=1.5.2
VERSION=1.5.3

#Compiler
CC = gcc -O3 -DVERSION='"$(VERSION)"' -g
Expand Down
30 changes: 14 additions & 16 deletions c/bam_access.c
Original file line number Diff line number Diff line change
Expand Up @@ -35,39 +35,39 @@ void parse_rg_line(char *tmp_line, const int idx, rg_info_t **groups,
//Now tokenise tmp_line on \t and read in
char *tag = strtok(tmp_line,"\t");
while(tag != NULL){
int chk = sscanf(tag,"ID:%s",id);
int chk = sscanf(tag,"ID:%[^\t\n]",id);
if(chk == 1){
groups[idx]->id = malloc(sizeof(char) * 1000);
strcpy(groups[idx]->id,id);
tag = strtok(NULL,"\t");
continue;
}
chk=0;
chk = sscanf(tag,"SM:%s",sm);
chk = sscanf(tag,"SM:%[^\t\n]",sm);
if(chk == 1){
groups[idx]->sample = (char *) malloc(sizeof(char) * 1000);
strcpy(groups[idx]->sample,sm);
tag = strtok(NULL,"\t");
continue;
}
chk = 0;
chk = sscanf(tag,"PL:%s",pl);
chk = sscanf(tag,"PL:%[^\t\n]",pl);
if(chk == 1){
groups[idx]->platform = (char *) malloc(sizeof(char) * 1000);
strcpy(groups[idx]->platform,pl);
tag = strtok(NULL,"\t");
continue;
}
chk = 0;
chk = sscanf(tag,"PU:%s",pu);
chk = sscanf(tag,"PU:%[^\t\n]",pu);
if(chk == 1){
groups[idx]->platform_unit = (char *) malloc(sizeof(char) * 1000);
strcpy(groups[idx]->platform_unit,pu);
tag = strtok(NULL,"\t");
continue;
}
chk = 0;
chk = sscanf(tag,"LB:%s",lib);
chk = sscanf(tag,"LB:%[^\t\n]",lib);
if(chk == 1){
groups[idx]->lib = (char *) malloc(sizeof(char) * 1000);
strcpy(groups[idx]->lib,lib);
Expand Down Expand Up @@ -166,12 +166,7 @@ rg_info_t **parse_header(bam_hdr_t *head, int *grps_size, stats_rd_t ****grp_sta
(*grp_stats)[j][0]->divergent= 0;
(*grp_stats)[j][0]->mapped_bases= 0;
(*grp_stats)[j][0]->proper= 0;
(*grp_stats)[j][0]->inserts = malloc(sizeof(uint64_t)*200000);
check_mem((*grp_stats)[j][0]->inserts);
int k=0;
for(k=0;k<200000;k++){
(*grp_stats)[j][0]->inserts[k] = 0;
}
(*grp_stats)[j][0]->inserts = kh_init(ins);
(*grp_stats)[j][1] = (stats_rd_t *)malloc(sizeof(stats_rd_t));//Setup read two stats store
check_mem((*grp_stats)[j][1]);
(*grp_stats)[j][1]->length= 0;
Expand Down Expand Up @@ -263,11 +258,14 @@ int process_reads(htsFile *input, bam_hdr_t *head, rg_info_t **grps, int grps_si
if((b->core.flag & (BAM_FPROPER_PAIR|BAM_FREAD1)) == (BAM_FPROPER_PAIR|BAM_FREAD1)){
(*grp_stats)[rg_index][read]->proper++;
uint32_t ins = b->core.isize;
//if(abs(ins)>49999){
// fprintf(stderr,"Absolute insert size above 49999 : %d. Not in useful range, ignoring.\n",abs(ins));
// }else{
(*grp_stats)[rg_index][read]->inserts[abs(ins)-1]++;
// }
int res;
khint_t k;
k = kh_put(ins,(*grp_stats)[rg_index][read]->inserts,abs(ins),&res);
if(res){
kh_value((*grp_stats)[rg_index][read]->inserts,k) = 1;
}else{
kh_value((*grp_stats)[rg_index][read]->inserts,k) = kh_value((*grp_stats)[rg_index][read]->inserts,k)+1;
}
}

}
Expand Down
6 changes: 5 additions & 1 deletion c/bam_access.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,10 @@
#include "htslib/sam.h"
#include "dbg.h"

#include "khash.h"
KHASH_MAP_INIT_INT(ins,uint64_t)
//KHASH_INIT2(ins,, khint32_t, uint64_t, 1, kh_int_hash_func, kh_int_hash_equal)

typedef struct {
uint32_t length;
uint64_t count;
Expand All @@ -36,7 +40,7 @@ typedef struct {
uint64_t mapped_bases;
uint64_t proper;
//list of counts of possible insert sizes....
uint64_t *inserts; //from counts of insert size from 0-200000 bp. See how this works, might need amore dynamic means for some data.
khash_t(ins) *inserts; //from counts of insert size from 0-200000 bp. See how this works, might need amore dynamic means for some data.
//FQP is not included as we're not covering quality plots yet.
} stats_rd_t;

Expand Down
40 changes: 36 additions & 4 deletions c/bam_stats.c
Original file line number Diff line number Diff line change
Expand Up @@ -24,10 +24,13 @@
#include <stdint.h>
#include <assert.h>
#include <string.h>
#include <libgen.h>
#include <math.h>
#include "dbg.h"
#include <bam_access.h>

#include "khash.h"

static char *input_file;
static char *output_file;
static char *ref_file;
Expand Down Expand Up @@ -133,18 +136,27 @@ void options(int argc, char *argv[]){
return;
}

int calculate_mean_sd_median_insert_size(uint64_t *inserts,double *mean, double *sd, double *median){
int calculate_mean_sd_median_insert_size(khash_t(ins) *inserts,double *mean, double *sd, double *median){

uint64_t pp_mean = 0;
uint64_t tt_mean = 0;
uint32_t key;
uint64_t val;

kh_foreach(inserts,key,val,
{ pp_mean += key * val;
tt_mean += val;
});

/*
int i = 0;
for(i=0;i<200000;i++){
if(inserts[i]>0){
pp_mean += (i+1) * inserts[i];
tt_mean += inserts[i];
}
}
*/

if(tt_mean){//Calculate mean , median, sd
*mean = (double) ((double)pp_mean/(double)tt_mean);
Expand All @@ -155,15 +167,24 @@ int calculate_mean_sd_median_insert_size(uint64_t *inserts,double *mean, double
uint32_t prev_insert = 0;
uint64_t running_total = 0;

kh_foreach(inserts,key,val,
{ insert = key;
running_total += val;
if(running_total >= midpoint) break;
prev_insert = key;
});

/*
int j=0;
for(j=0;j<200000;j++){
if(inserts[j]>0){
insert = j+1;
running_total += inserts[j];
if(running_total >= midpoint) break ;
prev_insert = j+1;
}
}
}*/

if(tt_mean %2 == 0 && ( running_total - midpoint2 >= insert )){
//warn "Thinks is even AND split between bins ";
Expand All @@ -176,14 +197,23 @@ int calculate_mean_sd_median_insert_size(uint64_t *inserts,double *mean, double
//We have mean and median so calculate the SD
uint64_t pp_sd = 0;
uint64_t tt_sd = 0;

kh_foreach(inserts,key,val,
{ double diff = (double)(key) - (*mean);
pp_sd += (diff * diff) * (double)val;
tt_sd += val;
});

/*
int k=0;
for(k=0;k<200000;k++){
if(inserts[k]>0){
double diff = (double)(k+1) - (*mean);
pp_sd += (diff * diff) * inserts[k];
tt_sd += inserts[k];
}
}
}*/
kh_destroy(ins, inserts);

if(tt_sd){
double variance = fabs((double)((double)pp_sd / (double)tt_sd));
Expand Down Expand Up @@ -256,8 +286,10 @@ int print_results(rg_info_t **grps){
uint32_t read_length_r1 = grp_stats[i][0]->length;
uint32_t read_length_r2 = grp_stats[i][1]->length;

char *file = basename(input_file);

chk = fprintf(out,rg_line_pattern,
input_file,
file,
grps[i]->sample,
grps[i]->platform,
grps[i]->platform_unit,
Expand Down
Loading

0 comments on commit 51fb0cc

Please sign in to comment.