Merge branch 'master' of https://github.com/imgag/AIdiva
dboceck committed Jul 11, 2022
2 parents 68bf433 + 2f6fc7a commit cbf4191
Showing 6 changed files with 22 additions and 21 deletions.
2 changes: 1 addition & 1 deletion aidiva/variant_annotation/annotate_with_vep.py
@@ -143,7 +143,7 @@ def annotate_from_bed(input_vcf_file, output_vcf_file, annotation_dict, num_core
tmp_repeatmasker.close()

subprocess.run(f"{command} -bed {bed_annotation['segmentDuplication']} -name SegDup -sep '&' -in {input_vcf_file} -out {tmp_segDup.name} -threads {num_cores}", shell=True, check=True)
subprocess.run(f"{command} -bed {bed_annotation['simpleRepeat']} -name SimpleRepeat -sep '&' -in {tmp_segDup.name} -out {tmp_simpleRepeat.name} -threads {num_cores}", shell=True, check=True)
subprocess.run(f"{command} -bed {bed_annotation['simpleRepeat']} -name SimpleRepeats -sep '&' -in {tmp_segDup.name} -out {tmp_simpleRepeat.name} -threads {num_cores}", shell=True, check=True)
subprocess.run(f"{command} -bed {bed_annotation['oe_lof']} -name oe_lof -sep '&' -in {tmp_simpleRepeat.name} -out {tmp_oe_lof.name} -threads {num_cores}", shell=True, check=True)

if os.path.isfile(bed_annotation["omim"]):
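The hunk above chains several `VcfAnnotateFromBed` calls, each reading the output of the previous call and adding one BED track as an INFO field; the change itself only renames the simple-repeat INFO field from `SimpleRepeat` to `SimpleRepeats`. Below is a minimal sketch of that chaining pattern, not AIdiva's actual helper: the function name, the `bed_tracks` structure, and the default `command` value are illustrative assumptions.

```python
import os
import subprocess
import tempfile

def annotate_bed_tracks(input_vcf, output_vcf, bed_tracks,
                        command="VcfAnnotateFromBed", num_cores=1):
    """Chain ngs-bits VcfAnnotateFromBed calls; each track is layered onto the previous output.

    bed_tracks: list of (bed_file, info_name) pairs, e.g.
        [("segdup.bed", "SegDup"), ("simple_repeats.bed", "SimpleRepeats")]
    """
    current_input = input_vcf
    intermediates = []
    try:
        for index, (bed_file, info_name) in enumerate(bed_tracks):
            last = index == len(bed_tracks) - 1
            target = output_vcf if last else tempfile.NamedTemporaryFile(suffix=".vcf", delete=False).name
            if not last:
                intermediates.append(target)
            subprocess.run(
                f"{command} -bed {bed_file} -name {info_name} -sep '&' "
                f"-in {current_input} -out {target} -threads {num_cores}",
                shell=True, check=True)
            current_input = target
    finally:
        for tmp in intermediates:
            os.remove(tmp)  # drop the intermediate VCFs once the chain is done
```

Since the `-name` value becomes the annotation name written to the output VCF, any downstream code that reads this field has to match the renamed `SimpleRepeats` key exactly.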
8 changes: 4 additions & 4 deletions data/AIdiva_example_configuration_annotated.yaml
@@ -12,8 +12,8 @@ Analysis-Input:

# trained scoring models used to predict the pathogenicity score
# if no trained model is present you can use the train_model.py script to train a new custom model
-scoring-model-snps: <full-path-to-model>/rf_model_snp.pkl
-scoring-model-indels: <full-path-to-model>/rf_model_indel.pkl
+scoring-model-snp: <full-path-to-model>/rf_model_snp.pkl
+scoring-model-indel: <full-path-to-model>/rf_model_indel.pkl

prioritization-information:
# Identifier to get the score from the annotated file
@@ -25,7 +25,7 @@ Analysis-Input:
Model-Features:
# List containing the names of the allele frequency sources (populations) that are present in the data set (the MaxAF will be based on these) if not wanted use a empty list "[]" instead
# NOTE: currently not used, we use directly the MAX_AF annotation from VEP
-allele_frequency_list:
+allele-frequency-list:
- gnomAD_AFR_AF
- gnomAD_ASJ_AF
- gnomAD_EAS_AF
@@ -43,7 +43,7 @@ Model-Features:

# List containing the names of the features used for the model training
# the exact order of the features is crucial (make sure to have the exact same order as in the training step)
-feature_list:
+feature-list:
- SIFT
- PolyPhen
- CADD_PHRED
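The same key renames recur in all four example configurations in this commit: `scoring-model-snps`/`scoring-model-indels` become `scoring-model-snp`/`scoring-model-indel`, and `allele_frequency_list`/`feature_list` become `allele-frequency-list`/`feature-list`. A minimal sketch of reading the renamed keys with PyYAML follows; the nesting under `Analysis-Input` and `Model-Features` is inferred from the hunk headers above and may differ in the full file.

```python
import yaml  # PyYAML

with open("AIdiva_example_configuration_annotated.yaml") as config_file:
    config = yaml.safe_load(config_file)

# NOTE: nesting assumed from the diff's hunk headers; adjust to the real file layout.
# Hyphenated YAML keys are ordinary strings, so they are read by subscription.
analysis_input = config["Analysis-Input"]
snp_model = analysis_input["scoring-model-snp"]      # was: scoring-model-snps
indel_model = analysis_input["scoring-model-indel"]  # was: scoring-model-indels

model_features = config["Model-Features"]
allele_frequencies = model_features["allele-frequency-list"]  # was: allele_frequency_list
features = model_features["feature-list"]                     # was: feature_list
```

Configurations written against the old plural/underscore names no longer match code that looks up the new keys, so both sides have to be updated together.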
8 changes: 4 additions & 4 deletions data/AIdiva_example_configuration_annotated_grch38.yaml
@@ -12,8 +12,8 @@ Analysis-Input:

# trained scoring models used to predict the pathogenicity score
# if no trained model is present you can use the train_model.py script to train a new custom model
-scoring-model-snps: <full-path-to-model>/rf_model_snp.pkl
-scoring-model-indels: <full-path-to-model>/rf_model_indel.pkl
+scoring-model-snp: <full-path-to-model>/rf_model_snp.pkl
+scoring-model-indel: <full-path-to-model>/rf_model_indel.pkl

prioritization-information:
# Identifier to get the score from the annotated file
@@ -25,7 +25,7 @@ Analysis-Input:
Model-Features:
# List containing the names of the allele frequency sources (populations) that are present in the data set (the MaxAF will be based on these) if not wanted use a empty list "[]" instead
# NOTE: currently not used, we use directly the MAX_AF annotation from VEP
-allele_frequency_list:
+allele-frequency-list:
- gnomAD_AFR_AF
- gnomAD_ASJ_AF
- gnomAD_EAS_AF
@@ -43,7 +43,7 @@ Model-Features:

# List containing the names of the features used for the model training
# the exact order of the features is crucial (make sure to have the exact same order as in the training step)
-feature_list:
+feature-list:
- SIFT
- PolyPhen
- CADD_PHRED
8 changes: 4 additions & 4 deletions data/AIdiva_example_configuration_with_annotation.yaml
@@ -12,8 +12,8 @@ Analysis-Input:

# trained scoring models used to predict the pathogenicity score
# if no trained model is present you can use the train_model.py script to train a new custom model
-scoring-model-snps: <full-path-to-model>/rf_model_snp.pkl
-scoring-model-indels: <full-path-to-model>/rf_model_indel.pkl
+scoring-model-snp: <full-path-to-model>/rf_model_snp.pkl
+scoring-model-indel: <full-path-to-model>/rf_model_indel.pkl

prioritization-information:
# Identifier to get the score from the annotated file
@@ -25,7 +25,7 @@ Analysis-Input:
Model-Features:
# List containing the names of the allele frequency sources (populations) that are present in the data set (the MaxAF will be based on these)
# NOTE: currently not used, we use directly the MAX_AF annotation from VEP
-allele_frequency_list:
+allele-frequency-list:
- gnomAD_AFR_AF
- gnomAD_ASJ_AF
- gnomAD_EAS_AF
@@ -43,7 +43,7 @@ Model-Features:

# List containing the names of the features used for the model training
# the exact order of the features is crucial (make sure to have the exact same order as in the training step)
-feature_list:
+feature-list:
- SIFT
- PolyPhen
- CADD_PHRED
8 changes: 4 additions & 4 deletions data/AIdiva_example_configuration_with_annotation_grch38.yaml
@@ -12,8 +12,8 @@ Analysis-Input:

# trained scoring models used to predict the pathogenicity score
# if no trained model is present you can use the train_model.py script to train a new custom model
-scoring-model-snps: <full-path-to-model>/rf_model_snp.pkl
-scoring-model-indels: <full-path-to-model>/rf_model_indel.pkl
+scoring-model-snp: <full-path-to-model>/rf_model_snp.pkl
+scoring-model-indel: <full-path-to-model>/rf_model_indel.pkl

prioritization-information:
# Identifier to get the score from the annotated file
@@ -25,7 +25,7 @@ Analysis-Input:
Model-Features:
# List containing the names of the allele frequency sources (populations) that are present in the data set (the MaxAF will be based on these)
# NOTE: currently not used, we use directly the MAX_AF annotation from VEP
-allele_frequency_list:
+allele-frequency-list:
- gnomAD_AFR_AF
- gnomAD_ASJ_AF
- gnomAD_EAS_AF
@@ -43,7 +43,7 @@ Model-Features:

# List containing the names of the features used for the model training
# the exact order of the features is crucial (make sure to have the exact same order as in the training step)
-feature_list:
+feature-list:
- SIFT
- PolyPhen
- CADD_PHRED
9 changes: 5 additions & 4 deletions doc/install_additional_tools.md
@@ -11,8 +11,9 @@ Ngs-bits is used to annotate the VCF files.
```
git clone https://github.com/imgag/ngs-bits.git
cd ngs-bits
-git checkout cba4aa891b5af683f74f0b0dabbe143719e0883a && git submodule update --recursive --init
+git checkout 2022_04 && git submodule update --recursive --init
make build_3rdparty
make build_libs_release
make build_tools_release
```

@@ -41,13 +42,13 @@ mkdir -p $vep_data_dir
cd $vep_data_dir
mkdir -p ftp
cd ftp
-wget ftp://ftp.ensembl.org/pub/release-100/variation/indexed_vep_cache/homo_sapiens_vep_103_GRCh37.tar.gz
-#wget ftp://ftp.ensembl.org/pub/release-100/variation/indexed_vep_cache/homo_sapiens_vep_103_GRCh38.tar.gz
+wget ftp://ftp.ensembl.org/pub/release-103/variation/indexed_vep_cache/homo_sapiens_vep_103_GRCh37.tar.gz
+#wget ftp://ftp.ensembl.org/pub/release-103/variation/indexed_vep_cache/homo_sapiens_vep_103_GRCh38.tar.gz
# install ensembl-vep
PERL5LIB=$vep_install_dir/Bio/:$vep_cpan_dir/lib/perl5/:$PERL5LIB
cd $vep_install_dir
perl INSTALL.pl --SPECIES homo_sapiens --ASSEMBLY GRCh37 --AUTO acp --NO_UPDATE --NO_BIOPERL --CACHEDIR $vep_data_dir/cache --CACHEURL $vep_data_dir/ftp --NO_TEST
cp $vep_data_dir/cache/Plugins/*.pm $vep_install_dir/modules/ #should not be necessary - probably a bug in the VEP installation script when using the CACHEDIR option (MS)
```
```

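As a quick, optional sanity check after following the install steps above, one can verify that the ngs-bits tools and the VEP executable are reachable. The sketch below assumes the binaries end up on `PATH`; AIdiva may equally take explicit tool paths from its configuration, in which case this check is not needed.

```python
import shutil
import sys

# VcfAnnotateFromBed ships with ngs-bits, vep with the ensembl-vep installation above.
required_tools = ["VcfAnnotateFromBed", "vep"]

missing = [tool for tool in required_tools if shutil.which(tool) is None]
if missing:
    sys.exit(f"Not found on PATH: {', '.join(missing)} -- re-check the install steps above.")
print("All required annotation tools were found.")
```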