DKFZ-ODCF · NagaComBio · Nov 14, 2018 · Nov 28, 2018 · Nov 28, 2018 · Nov 28, 2018
diff --git a/.gitignore b/.gitignore
@@ -4,3 +4,4 @@ build/
 *.swp
 *.DS_Store
 #.+
+hg19_GRCh37_1000genomes
diff --git a/ACEseqWorkflow.jar b/ACEseqWorkflow.jar
diff --git a/ACEseqWorkflow_5.0.1.iml → ACEseqWorkflow_6.0.0.iml b/ACEseqWorkflow_5.0.1.iml → ACEseqWorkflow_6.0.0.iml
diff --git a/README.ACEseqWorkflow.txt b/README.ACEseqWorkflow.txt
@@ -15,6 +15,10 @@ isNoControlWorkflow	        false   Run analysis with matching control and estim
 
 == Changelist
 
+* Version update to 6.0.0
+- Changed the phasing routine. The program "impute2" was replaced by "Beagle". Files and tools were renamed accordingly.
+- Generally renamed all tools and files from "imputeGenotype" to "phaseGenotype" (and so on) as the subroutine does not actually perform imputation but rather phasing.
+
 * Version update to 5.0.1
 - fixed density(NA) bug and index bug for frequencies (as.character) in clustering step
 

diff --git a/buildversion.txt b/buildversion.txt
@@ -1,2 +1,2 @@
-5.0
+6.0
 0
diff --git a/documentation/source/methods.rst b/documentation/source/methods.rst
@@ -2,7 +2,7 @@ Methods - Theory
 ================
 
 
-| ACEseq can be used to estimate copy-numbers from WGS data using a tumor vs. control approach. Thus a pre-requesite is WGS data from healthy tissue and tumor tissue of the same patient with at least 30x coverage. Samtools [] mpileup is used to determine the coverage for tumor and control sample - position specific for each single nucleotide polymorphism (SNP) position recorded in dbSNP and per 1 kb window. To get chromosome specific allele frequencies, the genotypes of SNP positions are phased with Impute2 [] and A and B allele are assigned accordingly. Haploblocks are defined as regions with consecutively phased SNPs. Subsequently, B-allele frequencies (BAFs) are estimated for all SNP positions in tumor and control with sufficient coverage in the control: 
+| ACEseq can be used to estimate copy-numbers from WGS data using a tumor vs. control approach. Thus a prerequisite is WGS data from healthy tissue and tumor tissue of the same patient with at least 30x coverage. Samtools [] mpileup is used to determine the coverage for tumor and control sample - position specific for each single nucleotide polymorphism (SNP) position recorded in dbSNP and per 1 kb window. To get chromosome specific allele frequencies, the genotypes of SNP positions are phased with Impute2 (up to v5) or Beagle (since v6) [] and A and B allele are assigned accordingly. Haploblocks are defined as regions with consecutively phased SNPs. Subsequently, B-allele frequencies (BAFs) are estimated for all SNP positions in tumor and control with sufficient coverage in the control:
 
   .. math::
      \label{eq:BAF}

diff --git a/documentation/source/parameters.rst b/documentation/source/parameters.rst
@@ -11,7 +11,7 @@ Multiple parameters can be set with ACEseq though not all are necessary to chang
  svOutputDirectory,${outputAnalysisBaseDirectory}/SV_calls,path,
  crestOutputDirectory,${outputAnalysisBaseDirectory}/crest,path,
  cnvSnpOutputDirectory,${aceseqOutputDirectory}/cnv_snp,path,
- imputeOutputDirectory,${aceseqOutputDirectory}/phasing,path,
+ phasingOutputDirectory,${aceseqOutputDirectory}/phasing,path,
  plotOutputDirectory,${aceseqOutputDirectory}/plots,path,
  runWithoutControl,false,boolean,use control for analysis (false|true)
  minHT,5,integer,minimum number of consecutive SNPs to be considered for haploblocks
@@ -41,7 +41,7 @@ Multiple parameters can be set with ACEseq though not all are necessary to chang
  min_distance,0.05,float,min_distance
  haplogroupFilePrefix,haploblocks_chr,string,prefix for file with haplogroups per chromosome
  haplogroupFileSuffix,txt,string,suffix for file with haplogroups per chromosome
- haplogroupFilePath,${imputeOutputDirectory}/${haplogroupFilePrefix},path,
+ haplogroupFilePath,${phasingOutputDirectory}/${haplogroupFilePrefix},path,
  min_length_purity,1000000,integer,minimum length of segments to be considered for tumor cell content and ploidy estimation
  min_hetSNPs_purity,500,integer,minimum number of control heterozygous SNPs in segments to be considered for tumor cell content and ploidy estimation
  dh_stop,max,string,
@@ -72,18 +72,9 @@ Multiple parameters can be set with ACEseq though not all are necessary to chang
  PATIENTSEX,male,string,patient sex used in case of no control workflow (male|female|klinefelter)
  CNV_ANNO_SUFFIX,cnv.anno.tab.gz,string,suffix for mappability annotated chromosome-wise 1kb coverage files
  CNV_SUFFIX,cnv.tab.gz,string,suffix chromosome-wise 1kb coverage files
- FILE_UNPHASED_PRE,${imputeOutputDirectory}/${unphasedGenotypesFilePrefix},path,
- FILE_UNPHASED_GENOTYPE,${imputeOutputDirectory}/unphased_genotype_chr,path,
- FILE_PHASED_PRE,${imputeOutputDirectory}/${phasedGenotypesFilePrefix},path,
- FILE_PHASED_GENOTYPE,${imputeOutputDirectory}/phased_genotype_chr,path,
- FILE_INFO,info,string,
- FILE_INFO_SAMPLE,info_by_sample,string,
- FILE_HAPS,haps,string,
- FILE_HAPS_CONF,haps_confidence,string,
- FILE_SUMMARY,summary,string,
- FILE_WARNINGS,warnings,string,
- FILE_PART,part,string,
- FILE_SAMPLE_G,${imputeOutputDirectory}/sample_g.txt,path,sample_g file used by imputation on X chromosome for females
+ FILE_UNPHASED_PRE,${phasingOutputDirectory}/${unphasedGenotypesFilePrefix},path,
+ FILE_PHASED_GENOTYPE,${phasingOutputDirectory}/phased_genotype_chr,path,
+ FILE_SAMPLE_G,${phasingOutputDirectory}/sample_g.txt,path,sample_g file used by imputation on X chromosome for females
  MALE_FAKE_CONTROL_PRE,${pathToACEseqResults}/cnv_snp/${pid}.chr,path,path and prefix to chromosome-wise 1kb coverage file used for fake control workflow for male patients
  FEMALE_FAKE_CONTROL_PRE,${pathToACEseqResults}/cnv_snp/${pid}.chr,path,path and prefix to chromosome-wise 1kb coverage file used for fake control workflow for female patients
  PLOT_PRE,${aceseqOutputDirectory}/${pid}_plot,path,
@@ -104,16 +95,13 @@ Multiple parameters can be set with ACEseq though not all are necessary to chang
  CHROMOSOME_LENGTH_FILE,${path}/chrlengths.txt,path,
  REPLICATION_TIME_FILE,${path}/ReplicationTime_10cellines_mean_10KB.Rda,path,"replication timing file"
  GC_CONTENT_FILE,${path}/hg19_GRch37_100genomes_gc_content_10kb.txt,path,
- GENETIC_MAP_FILE,${path}/genetic_map_chr${CHR_NAME}_combined_b37.txt,path,"impute files"
- KNOWN_HAPLOTYPES_FILE,${path}/ALL.chr${CHR_NAME}.integrated_phase1_v3. 20101123.snps_indels_svs.genotypes.nomono.haplotypes.gz,path,"impute files"
- KNOWN_HAPLOTYPES_LEGEND_FILE,${path}ALL.chr${CHR_NAME}.integrated_phase1_v3. 20101123.snps_indels_svs.genotypes.nomono.legend.gz,path,"impute files"
- GENETIC_MAP_FILE_X,${path}/genetic_map_chrX_nonPAR_combined_b37.txt,path,"impute files"
- KNOWN_HAPLOTYPES_FILE_X,${path}/ALL_1000G_phase1integrated_v3_chrX_nonPAR_impute.hap.gz,path,"impute files"
- KNOWN_HAPLOTYPES_LEGEND_FILE_X,${path}/ALL_1000G_phase1integrated_v3_chrX_nonPAR_impute.legend.gz,path,"impute files"
  outputExecutionDirectory,${path}/exec_${executionTimeString},,"path to log files"
- imputeBaseDirectory,${path}/,path,"directory for impute files"
  mergedBamSuffix,merged.mdup.bam,string,"A list of all known suffixes for merged bam files. I.e. merged.dupmark.bam, merged.mdup.bam..." 
  mergedBamSuffixList,${mergedBamSuffix},string,"A list of all known suffixes for merged bam files. I.e. merged.dupmark.bam, merged.mdup.bam..."
  defaultMergedBamSuffix,${mergedBamSuffix},string,The default suffix for merged bam files when they are created by Roddy.
  libloc_PSCBS,,string,path to PSCBS library in R
  libloc_flexclust,,string,path to felxclust library in R
+ BEAGLE_REFERENCE_FILE,${baseDirectoryReference}/tools_data/Beagle/chr${CHR_NAME}.1kg.phase3.v5a.b37.bref3,path,
+ BEAGLE_REFERENCE_FILE_X,${baseDirectoryReference}/tools_data/Beagle/chrX.1kg.phase3.v5a.b37.bref3,path,
+ BEAGLE_GENETIC_MAP,${baseDirectoryReference}/tools_data/genetic_maps/plink.chr${CHR_NAME}.GRCh37.map,path,
+ BEAGLE_GENETIC_MAP_X,${baseDirectoryReference}/tools_data/genetic_maps/plink.chrX.GRCh37.map,path,
diff --git a/installation/GRCh38_related_files/README_downloadReferences_GRCh38.md b/installation/GRCh38_related_files/README_downloadReferences_GRCh38.md
@@ -0,0 +1,105 @@
+### Reference files for GRCh38
+
+Information on downloading and parsing files needed for GRCh38 reference genome
+
+
+#### Reference genome
+
+We are using the GRCh38 version hosted at `http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/GRCh38_reference_genome/GRCh38_full_analysis_set_plus_decoy_hla.fa`.
+
+The above reference genome (GRCh38_decoy_ebv_phiX_alt_hla) can be downloaded and processed using the repository at `https://github.com/DKFZ-ODCF/setup-reference-data`.
+
+List of files needed from the above process
+- GRCh38_decoy_ebv_phiX_alt_hla_chr.fa
+- GRCh38_decoy_ebv_phiX_alt_hla_chr.fa.chrLenOnlyACGT_realChromosomes.tsv
+- GRCh38_decoy_ebv_phiX_alt_hla_chr.fa.chrLength.tsv
+
+
+#### dbSNP version 135
+
+Downloaded from ftp://ftp.ncbi.nih.gov/snp/organisms/human_9606_b151_GRCh38p7/VCF/00-All.vcf.gz
+
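+Post-processing for usage in ACEseq (the download is dbSNP build 151; the filter below keeps only SNV records whose `dbSNPBuildID` is at most 135):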
+```
+DBSNP_VERSION=135
+zcat 00-All.vcf.gz |
+    awk '/^#/{print} /VC=SNV/{ v=$8; sub(/.*dbSNPBuildID=/, "", v); sub(/;.*/, "", v); if (v~/^[0-9]+$/ && int(v)<='$DBSNP_VERSION') print }' |
+    bgzip > 00-All.SNV.vcf.gz
+tabix -p vcf 00-All.SNV.vcf.gz
+```
+
+#### Generating mappability track
+The mappability file (GRCh38_Mappability_Align_100mer.bedGraph.gz) is created using the `create_mappability.sh` bash script.
+
+The tools used have to be in the `tools` folder. The tools `gem-2-wig`, `gem-indexer`, `gem-mappability` are from the package *gem-tools* from the Vlaams Instituut voor Biotechnologie (VIB). 
+Information about the tools can be found here: https://wiki.bits.vib.be/index.php/Create_a_mappability_track#Install_and_run_the_GEM_library_tools. The tools were downloaded from https://sourceforge.net/projects/gemlibrary/files/gem-library/Binary%20pre-release%202/. The corresponding paper should be this one: https://www.researchgate.net/publication/221776385_Fast_Computation_and_Applications_of_Genome_Mappability
+
+The two tools `wigToBigWig` and `bigWigToBedGraph` are downloaded from http://hgdownload.soe.ucsc.edu/admin/exe/linux.x86_64/
+
+Finally, `tabix` from htslib is used (see http://www.htslib.org/doc/tabix.html)
+
+Run the script `sh create_mappability.sh`
+
+
+#### Replication time
+Replication time from individual cell lines from GRCh37 ENCODE data were lifted over to GRCh38, and averages were re-calculated.
+The new R-object is uploaded to the `$repo_root/installation/GRCh38_related_files/time_mean_10KB.Rda`
+
+
+#### Gaps and centromeres
+- `gap.txt.gz` downloaded from `http://hgdownload.cse.ucsc.edu/goldenpath/hg38/database/gap.txt.gz`
+- `centromeres.txt.gz` downloaded from `http://hgdownload.cse.ucsc.edu/goldenpath/hg38/database/centromeres.txt.gz`
+- `centromeres_merged.txt.gz` created with the following command
+``` 
+zcat centromeres.txt.gz | sort -k 2 -V | awk 'BEGIN {printf "#bin\tchrom\tchromStart\tchromEnd\n"; chr=""} $2!=chr { if (end != ""){printf $1"\t" chr "\t" start "\t" end "\n"}; chr=$2; start=$3} {end=$4} END {printf $1"\t" chr "\t" start "\t" end "\n"}' | gzip > centromeres_merged.txt.gz
+```
+- `gap_with_centromeres.txt.gz` file created with the following command 
+```
+zcat centromeres_merged.txt.gz | tail -n +2 | awk ' BEGIN {OFS="\t"} {print $0, "1", "N", $4-$3, "centromere", "no"}' | gzip > gap_with_centromeres.txt.gz
+```
+
+
+#### GC content
+The GC-content (`gc_content_hg38.txt`) is calculated directly from the reference genome with the script `calc_gc_content.py`.
+```
+python3 calc_gc_content.py -v -i ${reference_path}/GRCh38_decoy_ebv_phiX_alt_hla_chr.fa -o gc_content_hg38.txt
+```
+The -v flag increases verbosity. Note that you have to use python3!
+
+
+#### Beagle reference files
+The variants were downloaded for all the chromosomes mapped to the hg38 reference genome.
+Info: https://www.internationalgenome.org/announcements/Variant-calls-from-1000-Genomes-Project-data-on-the-GRCh38-reference-assemlby/
+FTP site: http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000_genomes_project/release/20190312_biallelic_SNV_and_INDEL/
+
+Added 'chr' prefix and converted the VCF files to BREF format using Beagle. Refer to the script `beagle_vcf_to_bref.sh`
+
+This step will generate the following files,
+- `ALL.chr${CHR_NAME}.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.CHR.bref3`
+- `ALL.chrX.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.CHR.bref3`
+
+
+#### Beagle genetic map files
+Downloaded from `http://bochet.gcc.biostat.washington.edu/beagle/genetic_maps/plink.GRCh38.map.zip`
+
+Add `chr` prefix
+
+```
+for chr in `seq 1 22` X ; do cat plink.chr${chr}.GRCh38.map | sed 's/^/chr/' > plink.chr${chr}.GRCh38.CHR.map; done
+```
+
+
+#### Local controls for no-control workflow (optional)
+These are lift-over files from the hg19 workflow. New hg38 native files will be generated for the next versions.
+
+
+#### Exclusion list or blacklist files
+These are lift-over files from the hg19 workflow. New hg38 native files will be generated for the next versions.
+`ACEseqWorkflow/resources/analysisTools/copyNumberEstimationWorkflow/artifact.homoDels.potentialArtifacts.hg38_liftover.txt`
+
+
+#### Hg38 cytoband
+Cytoband file was copied from ANNOVAR database files. Only the coordinates from Chr1-22, X, and Y were kept.
+```
+cat ANNOVAR/annovar_April2018/humandb/hg38_cytoBand.txt | grep -v "_" | grep -v "^chrM" > hg38_cytoBand.txt
+```
diff --git a/installation/GRCh38_related_files/beagle_vcf_to_bref.sh b/installation/GRCh38_related_files/beagle_vcf_to_bref.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+
+# Convert the 1000 Genomes GRCh38 phased VCF files into Beagle bref3 reference
+# panels, prepending "chr" to every variant record so that the chromosome
+# column carries the "chr" prefix.
+# Expects the per-chromosome *.phased.vcf.gz files and the bref3 jar in the
+# current working directory.
+
+module load java/1.8.0_131
+
+for chr in `seq 1 22` X Y
+do
+  echo $chr
+  vcf="ALL.chr${chr}.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz"
+
+  # Skip chromosomes without an input file instead of writing an unusable
+  # reference panel from empty input.
+  if [ ! -f "${vcf}" ]; then
+    echo "WARNING: ${vcf} not found, skipping chr${chr}" >&2
+    continue
+  fi
+
+  # Single pass over the VCF: lines starting with '#' (header) are passed
+  # through unchanged, every other line (a variant record) gets the "chr" prefix.
+  # The output uses the .bref3 extension, matching the bref3 converter.
+  zcat "${vcf}" | awk '/^#/ {print; next} {print "chr" $0}' | java -jar bref3.18May20.d20.jar > "${vcf%.vcf.gz}.CHR.bref3"
+done
diff --git a/installation/GRCh38_related_files/calc_gc_content.py b/installation/GRCh38_related_files/calc_gc_content.py
@@ -0,0 +1,58 @@
+# Compute the GC content of a reference genome in fixed-size windows.
+#
+# Reads a FASTA file, keeps the sequences named with a one- or two-digit number,
+# X or Y (optionally prefixed with "chr") and writes one tab-separated line per
+# complete window: chromosome, start, end, gc_content. A trailing partial window
+# at the end of a sequence is not reported.
+import sys
+import argparse
+import re
+
+assert sys.version_info.major >= 3, "Python 3 is required for string operations"
+
+parser = argparse.ArgumentParser()
+parser.add_argument("-i", "--input", help="Input sequence - fasta file format", required=True)
+parser.add_argument("-o", "--output", help="Output file name", default='gc_content.txt')
+parser.add_argument("-t", "--threshold", type=float, default=0.0,
+                    help="Threshold (regions whose gc content does not exceed the threshold are discarded)")
+parser.add_argument("--stepsize", type=int, default=10000, help="Window size in bases")
+parser.add_argument("-v", "--verbose", action = 'store_true', help="Add verbosity")
+args = parser.parse_args()
+
+threshold = args.threshold
+stepsize = args.stepsize
+# A window size of zero or less would make the window count below undefined.
+if stepsize <= 0:
+    parser.error("--stepsize must be a positive integer")
+
+# Accepted sequence names: one- or two-digit number, X or Y, optional "chr" prefix.
+chr_name_pattern = re.compile(r"^(chr)?([0-9]{1,2}|X|Y)$")
+
+output_data = ['\t'.join(['chromosome', 'start', 'end', 'gc_content'])]
+
+if args.verbose: print('Load data')
+with open(args.input, "r") as f:
+    data = f.read().splitlines()
+
+if args.verbose: print('Calculate chromosome stats')
+chr_names, chr_borders = [], []
+
+# Record the line index of every FASTA header and the name that follows '>'
+# (up to the first space).
+for line_index, line in enumerate(data):
+    if line.startswith('>'):
+        chr_names.append(line[1:].split(' ')[0])
+        chr_borders.append(line_index)
+# Sentinel so that chr_borders[i+1] is valid for the last sequence.
+chr_borders.append(len(data))
+
+for i, chr_name in enumerate(chr_names):
+
+    if not chr_name_pattern.match(chr_name):
+        continue
+
+    # Sequence lines of this record (header line excluded), joined into one string.
+    current_chr = ''.join(data[chr_borders[i] + 1:chr_borders[i+1]])
+
+    if args.verbose: print('Calculating Chromosome {} - length = {} bp'.format(chr_name, len(current_chr)))
+
+    for j in range(0, len(current_chr)//stepsize):
+        window_start = stepsize*j
+        window_end = stepsize*(j+1)
+        current_data = current_chr[window_start:window_end]
+
+        # Count G/C and A/C/G/T (either case); any other character, e.g. N, is ignored.
+        gc = sum(current_data.count(base) for base in 'gcGC')
+        gcta = gc + sum(current_data.count(base) for base in 'taTA')
+        # A window without any A/C/G/T gets a gc content of 0.
+        gc_content = gc/gcta if gcta else 0.0
+
+        if gc_content > threshold:
+            output_data.append('\t'.join([chr_name, str(window_start), str(window_end), '{:1.6f}'.format(gc_content)]))
+if args.verbose: print('Finished with calculation. Writing to file {}'.format(args.output))
+
+with open(args.output, 'w') as f:
+    for line in output_data:
+        f.write("%s\n" % line)
+if args.verbose: print('GC-content calculated succesfully')
diff --git a/installation/GRCh38_related_files/create_mappability.sh b/installation/GRCh38_related_files/create_mappability.sh
@@ -0,0 +1,29 @@
+#!/bin/bash
+
+# Build the GRCh38 mappability track with the GEM tools and the UCSC
+# wig/bigWig converters located in ./tools, then compress and index it.
+# Only POSIX shell constructs are used so the script also runs via `sh`.
+
+# Stop at the first failing command or unset variable so that a broken
+# intermediate step never ends up in the final track.
+set -eu
+
+nr_cores=4
+kmer=100
+# Directory containing the reference genome; can be overridden from the environment.
+reference_path="${reference_path:-/path/to/reference}"
+reference_genome="${reference_path}/GRCh38_decoy_ebv_phiX_alt_hla_chr.fa"
+output_file="GRCh38_Mappability_Align_${kmer}mer.bedGraph.gz"
+
+# Uniquely named working directories inside the current directory, so that
+# existing 'ref' or 'out' directories are neither reused nor deleted.
+ref_dir=$(mktemp -d "${PWD}/ref-XXXXXXXXXX")
+out_dir=$(mktemp -d "${PWD}/out-XXXXXXXXXX")
+
+# Remove the working directories on every exit path, including failures.
+cleanup() {
+  echo "cleaning"
+  rm -r "${ref_dir}" "${out_dir}"
+}
+trap cleanup EXIT
+
+# copy the reference but only chr1-22, X and Y
+# (copying stops at the chrM header; header lines are cut down to their first field)
+echo "copy reference genome"
+awk '/^>chrM/ {exit} /^>/ {print $1}  !/^>/ {print}' "${reference_genome}" > "${ref_dir}/reference.fa"
+echo "gem-indexer"
+./tools/gem-indexer -T ${nr_cores} -c dna -i "${ref_dir}/reference.fa" -o "${out_dir}/index"
+echo "gem-mappability"
+./tools/gem-mappability -T ${nr_cores} -I "${out_dir}/index.gem" -l ${kmer} -o "${out_dir}/outfiles"
+echo "gem-2-wig"
+./tools/gem-2-wig -I "${out_dir}/index.gem" -i "${out_dir}/outfiles.mappability" -o "${out_dir}/outfiles"
+echo "wigToBigWig"
+./tools/wigToBigWig "${out_dir}/outfiles.wig" "${out_dir}/outfiles.sizes" "${out_dir}/outfiles.bigWig"
+echo "bigWigToBedGraph"
+./tools/bigWigToBedGraph "${out_dir}/outfiles.bigWig" "${out_dir}/outfiles.bedGraph"
+
+# Keep only intervals whose value (4th column) is greater than zero.
+echo "Filter lines and compress"
+awk '$4 > 0.0 {print $0}' "${out_dir}/outfiles.bedGraph" | ./tools/bgzip > "${output_file}"
+
+echo "Create Index with Tabix"
+./tools/tabix -p bed "${output_file}"
diff --git a/installation/GRCh38_related_files/time_mean_10KB.Rda b/installation/GRCh38_related_files/time_mean_10KB.Rda
-Original file line number
+Diff line change
@@ Expand Up / @@ -4,3 +4,4 @@ build/ @@
     *.swp
     *.DS_Store
     #.+
+    hg19_GRCh37_1000genomes