Merge branch 'master' of https://github.com/imgag/AIdiva
dboceck committed Jul 11, 2022
2 parents 68bf433 + 2f6fc7a commit cbf4191
Showing 6 changed files with 22 additions and 21 deletions.
2 changes: 1 addition & 1 deletion aidiva/variant_annotation/annotate_with_vep.py
@@ -143,7 +143,7 @@ def annotate_from_bed(input_vcf_file, output_vcf_file, annotation_dict, num_core
tmp_repeatmasker.close()

subprocess.run(f"{command} -bed {bed_annotation['segmentDuplication']} -name SegDup -sep '&' -in {input_vcf_file} -out {tmp_segDup.name} -threads {num_cores}", shell=True, check=True)
subprocess.run(f"{command} -bed {bed_annotation['simpleRepeat']} -name SimpleRepeat -sep '&' -in {tmp_segDup.name} -out {tmp_simpleRepeat.name} -threads {num_cores}", shell=True, check=True)
subprocess.run(f"{command} -bed {bed_annotation['simpleRepeat']} -name SimpleRepeats -sep '&' -in {tmp_segDup.name} -out {tmp_simpleRepeat.name} -threads {num_cores}", shell=True, check=True)
subprocess.run(f"{command} -bed {bed_annotation['oe_lof']} -name oe_lof -sep '&' -in {tmp_simpleRepeat.name} -out {tmp_oe_lof.name} -threads {num_cores}", shell=True, check=True)

if os.path.isfile(bed_annotation["omim"]):
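The hunk above chains several `VcfAnnotateFromBed` calls, each reading the output of the previous call and adding one BED track as an INFO field; the change itself only renames the simple-repeat INFO field from `SimpleRepeat` to `SimpleRepeats`. Below is a minimal sketch of that chaining pattern, not AIdiva's actual helper: the function name, the `bed_tracks` structure, and the default `command` value are illustrative assumptions.

```python
import os
import subprocess
import tempfile

def annotate_bed_tracks(input_vcf, output_vcf, bed_tracks,
                        command="VcfAnnotateFromBed", num_cores=1):
    """Chain ngs-bits VcfAnnotateFromBed calls; each track is layered onto the previous output.

    bed_tracks: list of (bed_file, info_name) pairs, e.g.
        [("segdup.bed", "SegDup"), ("simple_repeats.bed", "SimpleRepeats")]
    """
    current_input = input_vcf
    intermediates = []
    try:
        for index, (bed_file, info_name) in enumerate(bed_tracks):
            last = index == len(bed_tracks) - 1
            target = output_vcf if last else tempfile.NamedTemporaryFile(suffix=".vcf", delete=False).name
            if not last:
                intermediates.append(target)
            subprocess.run(
                f"{command} -bed {bed_file} -name {info_name} -sep '&' "
                f"-in {current_input} -out {target} -threads {num_cores}",
                shell=True, check=True)
            current_input = target
    finally:
        for tmp in intermediates:
            os.remove(tmp)  # drop the intermediate VCFs once the chain is done
```

Since the `-name` value becomes the annotation name written to the output VCF, any downstream code that reads this field has to match the renamed `SimpleRepeats` key exactly.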
8 changes: 4 additions & 4 deletions data/AIdiva_example_configuration_annotated.yaml
@@ -12,8 +12,8 @@ Analysis-Input:

# trained scoring models used to predict the pathogenicity score
# if no trained model is present you can use the train_model.py script to train a new custom model
-scoring-model-snps: <full-path-to-model>/rf_model_snp.pkl
-scoring-model-indels: <full-path-to-model>/rf_model_indel.pkl
+scoring-model-snp: <full-path-to-model>/rf_model_snp.pkl
+scoring-model-indel: <full-path-to-model>/rf_model_indel.pkl

prioritization-information:
# Identifier to get the score from the annotated file
@@ -25,7 +25,7 @@ Analysis-Input:
Model-Features:
# List containing the names of the allele frequency sources (populations) that are present in the data set (the MaxAF will be based on these) if not wanted use a empty list "[]" instead
# NOTE: currently not used, we use directly the MAX_AF annotation from VEP
-allele_frequency_list:
+allele-frequency-list:
- gnomAD_AFR_AF
- gnomAD_ASJ_AF
- gnomAD_EAS_AF
@@ -43,7 +43,7 @@ Model-Features:

# List containing the names of the features used for the model training
# the exact order of the features is crucial (make sure to have the exact same order as in the training step)
-feature_list:
+feature-list:
- SIFT
- PolyPhen
- CADD_PHRED
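The same key renames recur in all four example configurations in this commit: `scoring-model-snps`/`scoring-model-indels` become `scoring-model-snp`/`scoring-model-indel`, and `allele_frequency_list`/`feature_list` become `allele-frequency-list`/`feature-list`. A minimal sketch of reading the renamed keys with PyYAML follows; the nesting under `Analysis-Input` and `Model-Features` is inferred from the hunk headers above and may differ in the full file.

```python
import yaml  # PyYAML

with open("AIdiva_example_configuration_annotated.yaml") as config_file:
    config = yaml.safe_load(config_file)

# NOTE: nesting assumed from the diff's hunk headers; adjust to the real file layout.
# Hyphenated YAML keys are ordinary strings, so they are read by subscription.
analysis_input = config["Analysis-Input"]
snp_model = analysis_input["scoring-model-snp"]      # was: scoring-model-snps
indel_model = analysis_input["scoring-model-indel"]  # was: scoring-model-indels

model_features = config["Model-Features"]
allele_frequencies = model_features["allele-frequency-list"]  # was: allele_frequency_list
features = model_features["feature-list"]                     # was: feature_list
```

Configurations written against the old plural/underscore names no longer match code that looks up the new keys, so both sides have to be updated together.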
8 changes: 4 additions & 4 deletions data/AIdiva_example_configuration_annotated_grch38.yaml
@@ -12,8 +12,8 @@ Analysis-Input:

# trained scoring models used to predict the pathogenicity score
# if no trained model is present you can use the train_model.py script to train a new custom model
-scoring-model-snps: <full-path-to-model>/rf_model_snp.pkl
-scoring-model-indels: <full-path-to-model>/rf_model_indel.pkl
+scoring-model-snp: <full-path-to-model>/rf_model_snp.pkl
+scoring-model-indel: <full-path-to-model>/rf_model_indel.pkl

prioritization-information:
# Identifier to get the score from the annotated file
@@ -25,7 +25,7 @@ Analysis-Input:
Model-Features:
# List containing the names of the allele frequency sources (populations) that are present in the data set (the MaxAF will be based on these) if not wanted use a empty list "[]" instead
# NOTE: currently not used, we use directly the MAX_AF annotation from VEP
-allele_frequency_list:
+allele-frequency-list:
- gnomAD_AFR_AF
- gnomAD_ASJ_AF
- gnomAD_EAS_AF
@@ -43,7 +43,7 @@ Model-Features:

# List containing the names of the features used for the model training
# the exact order of the features is crucial (make sure to have the exact same order as in the training step)
-feature_list:
+feature-list:
- SIFT
- PolyPhen
- CADD_PHRED
8 changes: 4 additions & 4 deletions data/AIdiva_example_configuration_with_annotation.yaml
@@ -12,8 +12,8 @@ Analysis-Input:

# trained scoring models used to predict the pathogenicity score
# if no trained model is present you can use the train_model.py script to train a new custom model
-scoring-model-snps: <full-path-to-model>/rf_model_snp.pkl
-scoring-model-indels: <full-path-to-model>/rf_model_indel.pkl
+scoring-model-snp: <full-path-to-model>/rf_model_snp.pkl
+scoring-model-indel: <full-path-to-model>/rf_model_indel.pkl

prioritization-information:
# Identifier to get the score from the annotated file
@@ -25,7 +25,7 @@ Analysis-Input:
Model-Features:
# List containing the names of the allele frequency sources (populations) that are present in the data set (the MaxAF will be based on these)
# NOTE: currently not used, we use directly the MAX_AF annotation from VEP
-allele_frequency_list:
+allele-frequency-list:
- gnomAD_AFR_AF
- gnomAD_ASJ_AF
- gnomAD_EAS_AF
@@ -43,7 +43,7 @@ Model-Features:

# List containing the names of the features used for the model training
# the exact order of the features is crucial (make sure to have the exact same order as in the training step)
-feature_list:
+feature-list:
- SIFT
- PolyPhen
- CADD_PHRED
8 changes: 4 additions & 4 deletions data/AIdiva_example_configuration_with_annotation_grch38.yaml
@@ -12,8 +12,8 @@ Analysis-Input:

# trained scoring models used to predict the pathogenicity score
# if no trained model is present you can use the train_model.py script to train a new custom model
-scoring-model-snps: <full-path-to-model>/rf_model_snp.pkl
-scoring-model-indels: <full-path-to-model>/rf_model_indel.pkl
+scoring-model-snp: <full-path-to-model>/rf_model_snp.pkl
+scoring-model-indel: <full-path-to-model>/rf_model_indel.pkl

prioritization-information:
# Identifier to get the score from the annotated file
@@ -25,7 +25,7 @@ Analysis-Input:
Model-Features:
# List containing the names of the allele frequency sources (populations) that are present in the data set (the MaxAF will be based on these)
# NOTE: currently not used, we use directly the MAX_AF annotation from VEP
-allele_frequency_list:
+allele-frequency-list:
- gnomAD_AFR_AF
- gnomAD_ASJ_AF
- gnomAD_EAS_AF
@@ -43,7 +43,7 @@ Model-Features:

# List containing the names of the features used for the model training
# the exact order of the features is crucial (make sure to have the exact same order as in the training step)
-feature_list:
+feature-list:
- SIFT
- PolyPhen
- CADD_PHRED
9 changes: 5 additions & 4 deletions doc/install_additional_tools.md
@@ -11,8 +11,9 @@ Ngs-bits is used to annotate the VCF files.
```
git clone https://github.com/imgag/ngs-bits.git
cd ngs-bits
-git checkout cba4aa891b5af683f74f0b0dabbe143719e0883a && git submodule update --recursive --init
+git checkout 2022_04 && git submodule update --recursive --init
make build_3rdparty
make build_libs_release
make build_tools_release
```

@@ -41,13 +42,13 @@ mkdir -p $vep_data_dir
cd $vep_data_dir
mkdir -p ftp
cd ftp
-wget ftp://ftp.ensembl.org/pub/release-100/variation/indexed_vep_cache/homo_sapiens_vep_103_GRCh37.tar.gz
-#wget ftp://ftp.ensembl.org/pub/release-100/variation/indexed_vep_cache/homo_sapiens_vep_103_GRCh38.tar.gz
+wget ftp://ftp.ensembl.org/pub/release-103/variation/indexed_vep_cache/homo_sapiens_vep_103_GRCh37.tar.gz
+#wget ftp://ftp.ensembl.org/pub/release-103/variation/indexed_vep_cache/homo_sapiens_vep_103_GRCh38.tar.gz
# install ensembl-vep
PERL5LIB=$vep_install_dir/Bio/:$vep_cpan_dir/lib/perl5/:$PERL5LIB
cd $vep_install_dir
perl INSTALL.pl --SPECIES homo_sapiens --ASSEMBLY GRCh37 --AUTO acp --NO_UPDATE --NO_BIOPERL --CACHEDIR $vep_data_dir/cache --CACHEURL $vep_data_dir/ftp --NO_TEST
cp $vep_data_dir/cache/Plugins/*.pm $vep_install_dir/modules/ #should not be necessary - probably a bug in the VEP installation script when using the CACHEDIR option (MS)
```
```

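As a quick, optional sanity check after following the install steps above, one can verify that the ngs-bits tools and the VEP executable are reachable. The sketch below assumes the binaries end up on `PATH`; AIdiva may equally take explicit tool paths from its configuration, in which case this check is not needed.

```python
import shutil
import sys

# VcfAnnotateFromBed ships with ngs-bits, vep with the ensembl-vep installation above.
required_tools = ["VcfAnnotateFromBed", "vep"]

missing = [tool for tool in required_tools if shutil.which(tool) is None]
if missing:
    sys.exit(f"Not found on PATH: {', '.join(missing)} -- re-check the install steps above.")
print("All required annotation tools were found.")
```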