Skip to content

Commit 87f6b5a

Browse files
Merge pull request #303 from micheldumontier/pharmgkb
PharmGKB hotfix for relationships
2 parents 8c80443 + ccdaf66 commit 87f6b5a

File tree

1 file changed

+101
-88
lines changed

1 file changed

+101
-88
lines changed

pharmgkb/pharmgkb.php

Lines changed: 101 additions & 88 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ class PharmGKBParser extends Bio2RDFizer
3939

4040
function __construct($argv) {
4141
parent::__construct($argv, "pharmgkb");
42-
$this->AddParameter('files',true,'all|drugs|genes|diseases|relationships|pathways|rsid|variant_annotations|offsides|twosides','all','all or comma-separated list of files to process');
42+
$this->AddParameter('files',true,'all|drugs|genes|diseases|relationships|rsid|variant_annotations|offsides|twosides','all','all or comma-separated list of files to process'); /** pathways **/
4343
$this->AddParameter('download_url',false,null,'http://www.pharmgkb.org/commonFileDownload.action?filename=');
4444
parent::initialize();
4545
}
@@ -155,9 +155,9 @@ function process(){
155155
$zipentries = array('clinical_ann_metadata.tsv','var_drug_ann.tsv','var_pheno_ann.tsv','var_fa_ann.tsv'); //'study_parameters.tsv'
156156
else if($file == "pathways") {
157157
for( $i = 0; $i < $zin->numFiles; $i++ ){
158-
$stat = $zin->statIndex( $i );
159-
$entry = $stat['name'];
160-
$ext = pathinfo($entry, PATHINFO_EXTENSION);
158+
$stat = $zin->statIndex( $i );
159+
$entry = $stat['name'];
160+
$ext = pathinfo($entry, PATHINFO_EXTENSION);
161161
if($ext != "txt"){
162162
$zipentries[] = $entry;
163163
}
@@ -191,17 +191,19 @@ function process(){
191191
if($zipentry == "clinical_ann_metadata.tsv") $fnx = "clinical_ann_metadata";
192192
else $fnx = 'variant_annotation';
193193
echo "processing $zipentry..";
194+
} else if($file == 'pathways') {
195+
$fnx = 'pathways';
196+
echo "processing $fnx ($zipentry)... ";
194197
} else {
195198
$fnx = $file;
196-
echo "processing $fnx... ";
199+
echo "processing $fnx ... ";
197200
}
198201

199202
$this->$fnx();
200203
parent::writeRDFBufferToWriteFile();
201204
echo "done!".PHP_EOL;
202205

203206
// generate the dataset release file
204-
echo "Generating dataset description... ";
205207
$source_file = (new DataResource($this))
206208
->setURI($rfile)
207209
->setTitle("Pharmacogenomics Knowledge Base ($zipentry)")
@@ -240,7 +242,8 @@ function process(){
240242
}
241243
$this->GetWriteFile()->Close();
242244
} // foreach
243-
245+
246+
echo "Generating dataset description... ";
244247
parent::setGraphURI($graph_uri);
245248
parent::setWriteFile($odir.parent::getBio2RDFReleaseFile());
246249
parent::getWriteFile()->write($dataset_description);
@@ -259,16 +262,20 @@ function process(){
259262
5 Alternate Names
260263
6 Alternate Symbols
261264
7 Is Genotyped
262-
9 Is VIP
263-
12 Has Variant Annotation
265+
8 Is VIP
266+
9 Has Variant Annotation
267+
10 Has CPIC Dosing Guideline
264268
*/
265269
function genes()
266270
{
267-
if(($n = count(explode("\t",$this->GetReadFile()->Read()))) != 10) {
268-
trigger_error("Found $n columns in gene file - expecting 10!", E_USER_WARNING);
271+
$h = explode("\t",parent::getReadFile()->read());
272+
$expected_columns = 11;
273+
if(($n = count($h)) != $expected_columns) {
274+
trigger_error("Found $n columns in gene file - expecting $expected_columns!", E_USER_WARNING);
275+
return false;
269276
}
270277

271-
while($l = $this->GetReadFile()->Read(200000)) {
278+
while($l = parent::getReadFile()->read(200000)) {
272279
$a = explode("\t",$l);
273280

274281
$id = parent::getNamespace().$a[0];
@@ -401,32 +408,26 @@ function MapXrefs($xref, &$url = false)
401408
}
402409
return $ns.":".$id;
403410
}
404-
/*
405-
0 PharmGKB Accession Id
406-
1 Name
407-
2 Generic Names
408-
3 Trade Names
409-
4 Brand Mixtures
410-
5 Type
411-
6 Cross References
412-
7 SMILES
413-
8 External Vocabulary
414-
415-
0 PA164748388
416-
1 diphemanil methylsulfate
417-
2
418-
3 Prantal
419-
4
420-
5 Drug/Small Molecule
421-
6 drugBank:DB00729,pubChemCompound:6126,pubChemSubstance:149020
422-
7
423-
8 ATC:A03AB(Synthetic anticholinergics, quaternary ammonium compounds)
424-
425-
*/
411+
/*
412+
[0] => PharmGKB Accession Id
413+
[1] => Name
414+
[2] => Generic Names
415+
[3] => Trade Names
416+
[4] => Brand Mixtures
417+
[5] => Type
418+
[6] => Cross-references
419+
[7] => SMILES
420+
[8] => Dosing Guideline
421+
[9] => External Vocabulary
422+
*/
426423
function drugs()
427424
{
428425
$declared = '';
429-
$this->GetReadFile()->Read(1000); // first line is header
426+
$h = explode("\t",$this->GetReadFile()->Read(1000)); // first line is header
427+
if(count($h) != 10) {
428+
trigger_error("Change in number of columns for drugs file",E_USER_ERROR);
429+
return FALSE;
430+
}
430431
while($l = $this->GetReadFile()->Read(200000)) {
431432
$a = explode("\t",$l);
432433
$id = parent::getNamespace().$a[0];
@@ -503,17 +504,17 @@ function drugs()
503504
}
504505
}
505506
}
506-
if(trim($a[8])) {
507+
if(trim($a[9])) {
507508
// External Vocabulary
508509
// ATC:H01AC(Somatropin and somatropin agonists),ATC:V04CD(Tests for pituitary function)
509510
// ATC:D07AB(Corticosteroids, moderately potent (group II)) => this is why you don't use brackets and commas as separators.
510-
$b = explode(',',trim($a[8]),2);
511+
$b = explode(',',trim($a[9]),2);
511512
foreach($b as $c) {
512513
preg_match_all("/ATC:([A-Z0-9]+)\((.*)\)$/",$c,$m);
513514
if(isset($m[1][0])) {
514515
$atc = "atc:".$m[1][0];
515516
parent::addRDF(
516-
parent::triplify($id, parent::getVoc()."xref", $atc)
517+
parent::triplify($id, parent::getVoc()."x-atc", $atc)
517518
);
518519
if(!isset($declared[$atc])) {
519520
$declared[$atc] = '';
@@ -524,18 +525,25 @@ function drugs()
524525
}
525526
}
526527
}
528+
parent::WriteRDFBufferToWriteFile();
527529
}
528-
parent::WriteRDFBufferToWriteFile();
529530
}
530531

531-
/*
532-
0 PharmGKB Accession Id
533-
1 Name
534-
2 Alternate Names
535-
*/
532+
/*
533+
[0] => PharmGKB Accession Id
534+
[1] => Name
535+
[2] => Alternate Names
536+
[3] => Cross-references
537+
[4] => External Vocabulary
538+
*/
536539
function diseases()
537540
{
538-
$this->GetReadFile()->Read(10000);
541+
$h = explode("\t",$this->GetReadFile()->Read(10000)); // first line is header
542+
if(count($h) != 5) {
543+
trigger_error("Change in number of columns for diseases file",E_USER_ERROR);
544+
return FALSE;
545+
}
546+
539547
while($l = $this->GetReadFile()->Read(10000)) {
540548
$a = explode("\t",$l);
541549

@@ -571,24 +579,24 @@ function diseases()
571579
parent::addRDF(
572580
parent::triplify($id, "owl:sameAs", $sameID)
573581
);
574-
if(isset($a[4]) && trim($a[4]) != '') {
575-
$d = preg_match_all('/(MeSH|SnoMedCT|UMLS):([A-Z0-9]+)\(([^\)]+)\)/',$a[4],$m, PREG_SET_ORDER);
582+
if(isset($a[4]) && trim($a[4]) != '') {
583+
$d = preg_match_all('/[,]?([^\:]+):([A-Za-z0-9]+)\(([^\)]+)\)/',$a[4],$m, PREG_SET_ORDER);
576584
foreach($m AS $n) {
577-
$n[1] = strtolower($n[1]);
578-
if($n[1] == 'snomedct') $n[1] = 'snomed';
579-
$id2 = $n[1].':'.$n[2];
580-
parent::addRDF(
581-
parent::triplify($id, "rdfs:seeAlso", $id2)
582-
);
583-
if(isset($n[3]) && $n[2] != $n[3]){
585+
if(isset($n[1]) && isset($n[2]) && !strstr($n[1]," ")) {
586+
$id2 = $n[1].':'.$n[2];
584587
parent::addRDF(
585-
parent::triplifyString($id2, "rdfs:label", str_replace(array("\'", "\""),array("\\\'", ""),$n[3]))
588+
parent::triplify($id, "pharmgkb_vocabulary:xref", $id2)
586589
);
590+
if(isset($n[3]) && $n[2] != $n[3]){
591+
parent::addRDF(
592+
parent::triplifyString($id2, "rdfs:label", str_replace(array("\'", "\""),array("\\\'", ""),$n[3]))
593+
);
594+
}
587595
}
588-
}
596+
}
589597
}
598+
parent::writeRDFBufferToWriteFile();
590599
}
591-
parent::writeRDFBufferToWriteFile();
592600
}
593601

594602
/*
@@ -608,7 +616,12 @@ function diseases()
608616
function variant_annotations()
609617
{
610618
$hash = ''; // md5 hash list
611-
$this->GetReadFile()->Read();
619+
$h = explode("\t",$this->GetReadFile()->Read(100000)); // first line is header
620+
if(count($h) != 12) {
621+
triger_error("Change in number of columns for variant annotations file",E_USER_ERROR);
622+
return FALSE;
623+
}
624+
612625
while($l = $this->GetReadFile()->Read(10000)) {
613626
$a = explode("\t",$l);
614627
$id = parent::getNamespace().$a[11];
@@ -736,36 +749,36 @@ function variant_annotations()
736749
}
737750

738751
/*
739-
Entity1_id - PA267, rs5186, Haplotype for PA121
740-
Entity1_type - Drug, Gene, VariantLocation, Disease, Haplotype, Association
741-
Entity2_id - PA267, rs5186, Haplotype for PA121
742-
Entity2_type - Drug, Gene, VariantLocation, Disease, Haplotype, Association
743-
Evidence - VariantAnnotation, Pathway, VIP, ClinicalAnnotation, DosingGuideline, DrugLabel, Annotation
744-
Evidence Sources - Publication
745-
Pharmacodynamic - Y
746-
Pharmacokinetic - Y
747-
748-
Entity1_id Entity1_type Entity2_id Entity2_type Evidence Association PK PD PMIDs
749-
PA445738 Disease PA134866404 Gene VariantAnnotation associated PD 21912425
750-
752+
0 Entity1_id - PA267, rs5186, Haplotype for PA121
753+
1 Entity1_name
754+
2 Entity1_type - Drug, Gene, VariantLocation, Disease, Haplotype, Association
755+
3 Entity2_id - PA267, rs5186, Haplotype for PA121
756+
4 Entity2_name
757+
5 Entity2_type - Drug, Gene, VariantLocation, Disease, Haplotype, Association
758+
6 Evidence - VariantAnnotation, Pathway, VIP, ClinicalAnnotation, DosingGuideline, DrugLabel, Annotation
759+
7 Association
760+
8 Pharmacokinetic - Y
761+
9 Pharmacodynamic - Y
762+
10 PMIDS
751763
*/
752764
function relationships()
753765
{
754766
$declared = '';
755767
$hash = ''; // md5 hash list
756-
$this->GetReadFile()->Read();
768+
$h = explode("\t", $this->GetReadFile()->Read());
769+
if(count($h) != 11) {
770+
trigger_error("Change in number of columns for relationships file (again)");
771+
return FALSE;
772+
}
773+
757774
while($l = $this->GetReadFile()->Read(10000)) {
758775
$a = explode("\t",$l);
759-
if(count($a) != 9) {
760-
trigger_error("Change in number of columns for relationships file");
761-
return FALSE;
762-
}
763776

764777
// id1
765778
$ns1 = parent::getNamespace();
766779
$id1 = $a[0];
767780
$id1 = str_replace(" ","_",$id1);
768-
$type1 = $a[1];
781+
$type1 = $a[2];
769782
if($id1[0] == 'r') {
770783
$ns1 = 'dbsnp:';
771784
} else if($id1[0] == 'H') {
@@ -774,9 +787,9 @@ function relationships()
774787

775788
// id2
776789
$ns2 = parent::getNamespace();
777-
$id2 = $a[2];
790+
$id2 = $a[3];
778791
$id2 = str_replace(" ","_",$id2);
779-
$type2 = $a[3];
792+
$type2 = $a[5];
780793
if($id2[0] == 'r') {
781794
$ns2 = 'dbsnp:';
782795
} else if($id2[0] == 'H') {
@@ -791,15 +804,15 @@ function relationships()
791804
$id = parent::getRes()."association_".$id1."_".$id2;
792805
$association = $type1.' '.$type2.' Association';
793806
parent::addRDF(
794-
parent::describeIndividual($id, $assocation, parent::getVoc().strtolower($type1)."-".strtolower($type2)."-Association").
807+
parent::describeIndividual($id, $association, parent::getVoc().strtolower($type1)."-".strtolower($type2)."-Association").
795808
parent::triplify($id, parent::getVoc().strtolower($type1), $ns1.$id1).
796809
parent::triplify($id, parent::getVoc().strtolower($type2), $ns2.$id2).
797810
parent::describeClass(parent::getVoc().strtolower($type1)."-".strtolower($type2)."-Association", "PharmGKB $type1 $type2 Association").
798811
parent::describeProperty(parent::getVoc().strtolower($type1), "Relationship between a PharmGKB association and a $type1").
799812
parent::describeProperty(parent::getVoc().strtolower($type2), "Relationship between a PharmGKB association and a $type2")
800813
);
801814

802-
$b = explode(',',$a[4]);
815+
$b = explode(',',$a[7]);
803816
foreach($b AS $c) {
804817
parent::addRDF(
805818
parent::triplifyString($id, parent::getVoc()."association_type", $c)
@@ -809,27 +822,27 @@ function relationships()
809822
parent::describeProperty(parent::getVoc()."association_type", "Relationship between a PharmGKB association and its type")
810823
);
811824

812-
if($a[6]){
825+
if($a[8]){
813826
parent::addRDF(
814827
parent::triplifyString($id, parent::getVoc()."pk_relationship", "true")
815828
);
816829
}
817-
if($a[7]){
830+
if($a[9]){
818831
parent::addRDF(
819832
parent::triplifyString($id, parent::getVoc()."pd_relationship", "true")
820833
);
821834
}
822-
$a[8] = trim($a[8]);
823-
if($a[8]) {
824-
$b = explode(',',$a[8]);
835+
$a[10] = trim($a[10]);
836+
if($a[10]) {
837+
$b = explode(';',$a[10]);
825838
foreach($b AS $pubmed_id) {
826839
parent::addRDF(
827840
parent::triplify($id, parent::getVoc()."article", "pubmed:".$pubmed_id)
828841
);
829842
}
830843
}
844+
parent::writeRDFBufferToWriteFile();
831845
}
832-
parent::writeRDFBufferToWriteFile();
833846
}
834847

835848

@@ -1231,10 +1244,10 @@ function variant_annotation()
12311244

12321245
function pathways()
12331246
{
1247+
/** @todo changed completely */
12341248
$entry = false;
12351249
while($l = $this->GetReadFile()->Read(20000)) {
12361250
$a = explode("\t",trim($l));
1237-
print_r($a);
12381251
if(strlen(trim($l)) == 0) {
12391252
// end of entry
12401253
$entry = false;
@@ -1273,8 +1286,8 @@ function pathways()
12731286
parent::describeProperty(parent::getVoc()."chemical", "Relationship between a PharmGKB entity and a chemical")
12741287
);
12751288
}
1289+
parent::writeRDFBufferToWriteFile();
12761290
}
1277-
parent::writeRDFBufferToWriteFile();
12781291
}
12791292

12801293
/*

0 commit comments

Comments
 (0)