From 42e2978b1f4de45cb98b070e06329aaaa245f5bf Mon Sep 17 00:00:00 2001 From: Jeremy Carroll Date: Mon, 18 Mar 2013 17:02:29 -0700 Subject: [PATCH 01/64] Added MIT license hopefully as per mailing list discussion. --- MIT-LICENSE.txt | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 MIT-LICENSE.txt diff --git a/MIT-LICENSE.txt b/MIT-LICENSE.txt new file mode 100644 index 0000000..9088070 --- /dev/null +++ b/MIT-LICENSE.txt @@ -0,0 +1,21 @@ +Copyright 2013 Bio2RDF project team and other contributors +http://bio2rdf.org + +Permission is hereby granted, free of charge, to any person obtaining +a copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be +included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION +WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. From ef0c9068de7c47dc03f8afc16ec8a5de52e7221b Mon Sep 17 00:00:00 2001 From: Michel Dumontier Date: Thu, 8 Aug 2013 17:53:32 -0400 Subject: [PATCH 02/64] Update README.md slight update --- README.md | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/README.md b/README.md index 1c9df08..0d1061e 100644 --- a/README.md +++ b/README.md @@ -7,12 +7,10 @@ Requirements The majority of these scripts were created so as to run on Linux servers. Depending on the script you wish to run you will need: * PHP5 -* Perl 6 -* Ruby(1.9.3) * Java 1.5 * Bio2RDF API (available from: https://github.com/micheldumontier/php-lib) -The Bio2RDF API should be installed in the root directory of the project tree (i.e bio2rdf-scripts). +The Bio2RDF API should be installed in the root directory of the project tree (i.e sibling to bio2rdf-scripts). Usage ----- From 7c5b436887e2fa45944300477e7693b64256e501 Mon Sep 17 00:00:00 2001 From: Paul Rigor Date: Mon, 16 Sep 2013 17:19:36 -0700 Subject: [PATCH 03/64] Fixed (line 392): removed premature closing parenthesis in chembl quad instantiation ~Paul Rigor --- chembl/chembl.php | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/chembl/chembl.php b/chembl/chembl.php index dad0bb9..8aae3ae 100644 --- a/chembl/chembl.php +++ b/chembl/chembl.php @@ -389,7 +389,7 @@ function process_assays() { //chembl assay id $chembl = "chembl:". $row['chembl_id']; - $this->AddRDF($this->QQuadl($assay,"dc:identifier"),$row['chembl_id'])); + $this->AddRDF($this->QQuadl($assay,"dc:identifier",$row['chembl_id'])); $this->AddRDF($this->QQuad($assay,"owl:equivalentClass",$chembl)); $this->AddRDF($this->QQuad($chembl,"owl:equivalentClass",$assay)); $this->WriteRDFBufferToWriteFile(); @@ -541,4 +541,4 @@ function process_properties(){ $parser = new ChemblParser($argv); $parser->Run(); -?> \ No newline at end of file +?> From 37775ec36a271b461fee0b4b4b84042a1e14984d Mon Sep 17 00:00:00 2001 From: Jose Cruz-Toledo Date: Fri, 24 Jan 2014 17:32:08 -0500 Subject: [PATCH 04/64] added license notice at the bottom --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 0d1061e..7c370f9 100644 --- a/README.md +++ b/README.md @@ -19,3 +19,5 @@ Each script is run independently. See README files inside each script directory. Contact Us --------- For more information visit http://bio2rdf.org or http://dumontierlab.com +--- +Licensed under [MIT License](http://en.wikipedia.org/wiki/MIT_License), see [license page](https://github.com/bio2rdf/bio2rdf-scripts/wiki/MIT-License) for details. From 5d3cfc8080e70e1e4f7c69072dac5a8bb775baca Mon Sep 17 00:00:00 2001 From: Jose Cruz-Toledo Date: Fri, 24 Jan 2014 17:32:32 -0500 Subject: [PATCH 05/64] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 7c370f9..c24cfad 100644 --- a/README.md +++ b/README.md @@ -19,5 +19,5 @@ Each script is run independently. See README files inside each script directory. Contact Us --------- For more information visit http://bio2rdf.org or http://dumontierlab.com ---- +--------------------------------------------------------------------- Licensed under [MIT License](http://en.wikipedia.org/wiki/MIT_License), see [license page](https://github.com/bio2rdf/bio2rdf-scripts/wiki/MIT-License) for details. From 7eddae2191758a84032b94c5828925e18c52b368 Mon Sep 17 00:00:00 2001 From: Jose Cruz-Toledo Date: Fri, 24 Jan 2014 17:33:27 -0500 Subject: [PATCH 06/64] Update README.md --- README.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/README.md b/README.md index c24cfad..10906a3 100644 --- a/README.md +++ b/README.md @@ -19,5 +19,4 @@ Each script is run independently. See README files inside each script directory. Contact Us --------- For more information visit http://bio2rdf.org or http://dumontierlab.com ---------------------------------------------------------------------- -Licensed under [MIT License](http://en.wikipedia.org/wiki/MIT_License), see [license page](https://github.com/bio2rdf/bio2rdf-scripts/wiki/MIT-License) for details. + From a383804ef04602c8d85fa47f73144fe5b76e6594 Mon Sep 17 00:00:00 2001 From: Jose Cruz-Toledo Date: Fri, 24 Jan 2014 17:36:11 -0500 Subject: [PATCH 07/64] Update MIT-LICENSE.txt --- MIT-LICENSE.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MIT-LICENSE.txt b/MIT-LICENSE.txt index 9088070..4639eef 100644 --- a/MIT-LICENSE.txt +++ b/MIT-LICENSE.txt @@ -1,4 +1,4 @@ -Copyright 2013 Bio2RDF project team and other contributors +Copyright 2014 Bio2RDF project team and other contributors http://bio2rdf.org Permission is hereby granted, free of charge, to any person obtaining From 4baf88cd4b8356297a2cbf9e29b6d2ef1c4fae10 Mon Sep 17 00:00:00 2001 From: Jose Cruz-Toledo Date: Fri, 24 Jan 2014 17:39:40 -0500 Subject: [PATCH 08/64] Update README.md --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 10906a3..1c6e203 100644 --- a/README.md +++ b/README.md @@ -20,3 +20,5 @@ Contact Us --------- For more information visit http://bio2rdf.org or http://dumontierlab.com +--- +Licensed under [MIT License](http://en.wikipedia.org/wiki/MIT_License), see [license page](https://github.com/bio2rdf/bio2rdf-scripts/wiki/MIT-License) for details. From bb06b37a81d6f358e433bbc5007ae75e5a586533 Mon Sep 17 00:00:00 2001 From: Michel Dumontier Date: Sun, 27 Mar 2016 10:29:55 +0300 Subject: [PATCH 09/64] changed gene id to uppercase organism id #430 --- kegg/kegg.php | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/kegg/kegg.php b/kegg/kegg.php index fe393e9..8c189f5 100644 --- a/kegg/kegg.php +++ b/kegg/kegg.php @@ -82,7 +82,7 @@ function run() // get the list of genes for this organims echo "processing $org".PHP_EOL; - $this->org = $org; // local variable + $this->org = strtoupper($org); // local variable $lfile = $ldir.$org.".txt"; $rfile = parent::getParameterValue("download_url")."list/$org"; @@ -223,9 +223,10 @@ function process($db) if(isset($this->idlist) and !in_array($id,$this->idlist)) continue; if(isset($this->org)) { - $id = $ns."_".$id; + $id = strtoupper($ns)."_".$id; } $uri = $this->getNamespace().$id; + parent::addRDF( parent::describeIndividual($uri,$name,parent::getVoc().ucfirst($db)). parent::describeClass(parent::getVoc().ucfirst($db),"KEGG $db"). @@ -293,6 +294,7 @@ function parseEntry($lfile) $uri = parent::getNamespace().$e['id']; continue; } + // key with value if(in_array($k, array("NAME","DESCRIPTION","DEFINITION","EQUATION","COMMENT"))) { if($k == "NAME") { From 59801deaed1f3b479424295a250c555c603ac0a7 Mon Sep 17 00:00:00 2001 From: Michel Dumontier Date: Sun, 27 Mar 2016 10:31:18 +0300 Subject: [PATCH 10/64] include more than just human genes --- kegg/kegg.php | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kegg/kegg.php b/kegg/kegg.php index 8c189f5..8f613fe 100644 --- a/kegg/kegg.php +++ b/kegg/kegg.php @@ -59,7 +59,8 @@ function run() // handle genes separately if(in_array("genes",$files)) { - $orgs = array("hsa"); //,"mmu","eco","dre","dme","ath","sce","ddi"); + $orgs = array("hsa","mmu","eco","dre","dme","ath","sce","ddi"); + //$orgs = array("hsa"); echo "processing genes".PHP_EOL; $ofile = "kegg-genes.".parent::getParameterValue('output_format'); From 38e86a50b359e6304b16df81ca526e6c831f0eea Mon Sep 17 00:00:00 2001 From: Michel Dumontier Date: Sun, 27 Mar 2016 10:58:20 +0300 Subject: [PATCH 11/64] fixed field tokenizer #429 --- pharmgkb/pharmgkb.php | 36 ++++++++++++++++++++---------------- 1 file changed, 20 insertions(+), 16 deletions(-) diff --git a/pharmgkb/pharmgkb.php b/pharmgkb/pharmgkb.php index 117323a..dfdd7f9 100644 --- a/pharmgkb/pharmgkb.php +++ b/pharmgkb/pharmgkb.php @@ -773,8 +773,9 @@ function clinical_ann_metadata() trigger_error("Change in the number of columns. Expected ".count($header).", but found ".count($this_header),E_USER_ERROR); return (-1); } - while($l = $this->GetReadFile()->Read(200000)) { + while($l = $this->GetReadFile()->Read(2000000)) { $a = explode("\t",$l); + $id = parent::getNamespace().$a[0]; $label = "clinical annotation for ".$a[1]; // [0] => Clinical Annotation Id @@ -818,23 +819,24 @@ function clinical_ann_metadata() ); } - // [6] => Clinical Annotation Types + // [4] => Clinical Annotation Types if($a[4]) { - $types = explode(";",$a[4]); + $types = explode('","',$a[4]); foreach($types AS $t) { + $t = strtolower(str_replace('"','',$t)); parent::addRDF( - parent::triplifyString($id, parent::getVoc()."annotation-type", strtolower($t)) + parent::triplifyString($id, parent::getVoc()."annotation-type", $t) ); } } // [5] => Genotype-Phenotypes IDs // [6] => Text if($a[5]) { - $gps = explode(";",$a[5]); - $gps_texts = explode(";",$a[6]); + $gps = explode('","',$a[5]); + $gps_texts = explode('","',$a[6]); foreach($gps AS $i => $gp) { - $gp = trim($gp); - $gp_text = trim($gps_texts[$i]); + $gp = str_replace('"','',trim($gp)); + $gp_text = str_replace('"','',trim($gps_texts[$i])); $b = explode(":",$gp_text,2); parent::addRDF( @@ -851,11 +853,11 @@ function clinical_ann_metadata() // [7] => Variant Annotations IDs // [8] => Variant Annotations if($a[7]) { - $b = explode(";",$a[7]); - $b_texts = explode(";",$a[8]); + $b = explode('","',$a[7]); + $b_texts = explode('","',$a[8]); foreach($b AS $i => $variant) { - $variant = trim($variant); - $variant_text = trim ($b_texts[$i]); + $variant = str_replace('"','',trim($variant)); + $variant_text = str_replace('"','',trim ($b_texts[$i])); parent::addRDF( parent::describeIndividual(parent::getNamespace().$variant, $variant_text, parent::getVoc()."Variant"). parent::triplify($id, parent::getVoc()."variant", parent::getNamespace().$variant) @@ -865,9 +867,9 @@ function clinical_ann_metadata() // [9] => PMIDs if($a[9]) { - $b = explode(";",$a[9]); + $b = explode('","',$a[9]); foreach($b AS $i => $pmid) { - $pmid = trim($pmid); + $pmid = str_replace(',','',trim($pmid)); parent::addRDF( parent::triplify($id, parent::getVoc()."article", "pubmed:".$pmid) ); @@ -884,8 +886,9 @@ function clinical_ann_metadata() // [11] => Related Drugs if($a[11]) { - $b = explode(";",$a[11]); + $b = explode('","',$a[11]); foreach($b AS $drug_label) { + $drug_label = str_replace('"','',$drug_label); // find the id from the label $find = @array_search($drug_label, $this->drug_names_array); if($find !== FALSE and $find !== NULL){ @@ -906,8 +909,9 @@ function clinical_ann_metadata() } // [12] => Related Diseases if($a[12]) { - $b = explode(";",$a[12]); + $b = explode('","',$a[12]); foreach($b AS $disease_label) { + $disease_label = str_replace('"','',$disease_label); // find the id from the label $find = @array_search($disease_label, $this->disease_names_array); if($find !== FALSE and $find !== NULL){ From 7dbd974683e379ef8394c8427798d21b587bb291 Mon Sep 17 00:00:00 2001 From: Michel Dumontier Date: Tue, 29 Mar 2016 09:27:42 +0200 Subject: [PATCH 12/64] fixed multi entry parsing error #432 --- kegg/kegg.php | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/kegg/kegg.php b/kegg/kegg.php index 8f613fe..e144e08 100644 --- a/kegg/kegg.php +++ b/kegg/kegg.php @@ -691,14 +691,19 @@ function parseEntry($lfile) parent::triplify($uri,parent::getVoc().strtolower($k),$id) ); preg_match_all("/ \[([^\]]+)\]/",$v,$m); - if(isset($m[1])) { - foreach($m[1] AS $item) { - if(!strstr($item,"KO")) $item = "kegg:".str_replace(":","_",$item); - else $item = str_replace("KO:","kegg:",$item); - parent::addRDF( - parent::triplify($id,parent::getVoc()."link",$item) - ); + if(isset($m[1]) and !empty($m[1])) { + foreach($m[1] AS $item) { + $a = explode(':',$item); // get the namespace + $b = explode(' ',$a[1]); + foreach($b AS $c) { + if(!strstr($item,"KO")) $i = "kegg:".$a[0].'_'.$c; + else $i = "kegg:".$c; + parent::addRDF( + parent::triplify($id,parent::getVoc()."link",$i) + ); + } } + $test = true; } continue; } From e32c86094215d2f1708176c8172680af1ff61b74 Mon Sep 17 00:00:00 2001 From: micheldumontier Date: Tue, 29 Mar 2016 06:24:32 -0700 Subject: [PATCH 13/64] parse the mesh ids out of the xml files #433 --- pubmed/pubmed.php | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/pubmed/pubmed.php b/pubmed/pubmed.php index 92dd9eb..106828f 100644 --- a/pubmed/pubmed.php +++ b/pubmed/pubmed.php @@ -65,6 +65,8 @@ function process_dir(){ $files = glob($ldir."*.xml.gz"); foreach($files AS $i => $file) { + // if($file != '/data/download/pubmed/medline16n0345.xml.gz') continue; + echo "Processing $file (".($i+1)."/".count($files).") ..."; $this->process_file($file); parent::clear(); @@ -185,19 +187,17 @@ function pubmed() $i = 0; foreach($citation->MeshHeadingList->MeshHeading AS $mh){ $id = parent::getRes().$pmid."_mh_".++$i; - $did = parent::getRes().md5($mh->DescriptorName); + $did = "mesh:".$mh->DescriptorName['UI']; parent::addRDF( parent::describeIndividual($id, $mh->DescriptorName, parent::getVoc()."MeshHeading"). - parent::describeClass(parent::getVoc()."MeshHeading","MeSH Heading"). - parent::triplify($pmid_uri, parent::getVoc()."mesh-heading", $id). - + parent::triplify($id, parent::getVoc()."x-mesh", $did). parent::triplifyString($id, parent::getVoc()."descriptor-major-topic", "".$mh->DescriptorName['MajorTopicYN']). - parent::describeIndividual($did, "".$mh->DescriptorName, parent::getVoc()."Mesh-Descriptor"). - parent::triplify($id, parent::getVoc()."mesh-descriptor", $did) + parent::describeClass(parent::getVoc()."MeshHeading","MeSH Heading"). + parent::triplify($pmid_uri, parent::getVoc()."mesh-heading", $id) ); if(!empty($mh->QualifierName)){ foreach($mh->QualifierName AS $qualifier_name) { - $qid = parent::getRes().md5($qualifier_name); + $qid = "mesh:".$mh->QualifierName['UI']; parent::addRDF( parent::describeIndividual($qid, $qualifier_name, parent::getVoc()."Mesh-Qualifier"). parent::triplify($id, parent::getVoc()."mesh-qualifier", $qid) @@ -211,8 +211,10 @@ function pubmed() $i = 0; foreach($citation->ChemicalList->Chemical as $chemical){ $id = parent::getRes().$pmid."_ch_".++$i; + $mesh_id = "mesh:".$chemical->NameOfSubstance['UI']; parent::addRDF( parent::describeIndividual($id, $chemical->NameOfSubstance, parent::getVoc()."Chemical"). + parent::triplify($id,parent::getVoc()."x-mesh",$mesh_id). parent::describeClass(parent::getVoc()."Chemical","Chemical"). parent::triplify($pmid_uri, parent::getVoc()."chemical", $id) ); @@ -255,7 +257,8 @@ function pubmed() $label = str_replace(" ","-",$publicationType); parent::addRDF( parent::triplify($pmid_uri, parent::getVoc()."publication-type", $id). - parent::describeClass($id, $publicationType) + parent::describeClass($id, $publicationType). + parent::triplify($id,parent::getVoc()."x-mesh","mesh:".$publicationType['UI']) ); } From be0a89e31ddf2193fa822222c1b6b0716faa79ea Mon Sep 17 00:00:00 2001 From: Michel Dumontier Date: Tue, 29 Mar 2016 17:52:22 +0200 Subject: [PATCH 14/64] use Bio2RDF's GO to replace labels with codes #431 --- drugbank/drugbank.php | 52 ++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 49 insertions(+), 3 deletions(-) diff --git a/drugbank/drugbank.php b/drugbank/drugbank.php index 3254e99..22da4e7 100755 --- a/drugbank/drugbank.php +++ b/drugbank/drugbank.php @@ -56,6 +56,16 @@ function Run() if(parent::getParameterValue("id_list")) { $this->id_list = array_flip(explode(",",parent::getParameterValue('id_list'))); } + + $go_cache_file = parent::getParameterValue('indir')."go.cache.json"; + #unlink($go_cache_file); + if(!file_exists($go_cache_file) or parent::getParameterValue('download') == true) { + $this->getGO(); + file_put_contents($go_cache_file,json_encode($this->go)); + } else { + // read the file + $this->go = json_decode( file_get_contents($go_cache_file), true); + } $dataset_description = ''; foreach($files AS $f) { @@ -241,9 +251,16 @@ function parsePartnerEntry($did, $pid, $x) parent::triplify($pid, parent::getVoc()."x-pfam","pfam:"."".$v2->identifier) ); } else if($k2 == "go-classifier") { - parent::addRDF( - parent::triplifyString($pid, parent::getVoc()."go-".$v2->category, $v2->description) - ); + $e = array_search($v2->description, $this->go); + if($e !== FALSE) { + parent::addRDF( + parent::triplify($pid, parent::getVoc()."go-".$v2->category, $e) + ); + } else { + parent::addRDF( + parent::triplifyString($pid, parent::getVoc()."go-".$v2->category, $v2->description) + ); + } } else { trigger_error("no handler for $k2",E_USER_WARNING); /* parent::addRDF( @@ -773,6 +790,35 @@ function AddList(&$x, $id, $list_name, $item_name, $predicate, $list_item_name = } } + function getGO() + { + $this->go = null; + + $server = "http://bio2rdf.org/sparql"; + $sparql = "PREFIX dct: +SELECT distinct ?id ?title +{ + ?go a . + ?go dct:identifier ?id. + ?go dct:title ?title . +} "; + $url = $server."?query=".urlencode($sparql)."&format=".urlencode("text/tab-separated-values"); + + $results = file_get_contents($url); + if($results === FALSE) { + trigger_error("Unable to get Gene Ontology labels",E_USER_WARNING); + return false; + } + $list = explode("\n",$results); + array_shift($list); array_pop($list); // remove first and last + + foreach($list AS $v) { + $b = explode("\t",str_replace('"','',$v)); + $this->go[$b[0]] = $b[1]; + } + return true; + } + } // end class ?> From 5b2b5252a280b30bd6a9d487753fb90e76699e0a Mon Sep 17 00:00:00 2001 From: Michel Dumontier Date: Fri, 1 Apr 2016 09:03:49 +0200 Subject: [PATCH 15/64] fixed changed column issue #436 --- pharmgkb/pharmgkb.php | 55 ++++++++++++++++++++++++------------------- 1 file changed, 31 insertions(+), 24 deletions(-) diff --git a/pharmgkb/pharmgkb.php b/pharmgkb/pharmgkb.php index dfdd7f9..4b0ad6a 100644 --- a/pharmgkb/pharmgkb.php +++ b/pharmgkb/pharmgkb.php @@ -247,7 +247,8 @@ function run() /* 0 PharmGKB Accession Id - 1 Entrez Id + 1 NCBI Gene Id + 1.a HGNC Id 2 Ensembl Id 3 Name 4 Symbol @@ -264,9 +265,10 @@ function run() function genes() { $h = explode("\t",parent::getReadFile()->read()); - $expected_columns = 14; + $expected_columns = 15; if(($n = count($h)) != $expected_columns) { trigger_error("Found $n columns in gene file - expecting $expected_columns!", E_USER_WARNING); + //print_r($h); return false; } @@ -293,28 +295,32 @@ function genes() parent::triplify($id, parent::getVoc()."x-ncbigene", "ncbigene:".$a[1]) ); } - if($a[2]){ parent::addRDF( - parent::triplify($id, parent::getVoc()."x-ensembl", "ensembl:".$a[2]) + parent::triplify($id, parent::getVoc()."x-hgnc", "hgnc:".$a[2]) + ); + } + if($a[3]){ + parent::addRDF( + parent::triplify($id, parent::getVoc()."x-ensembl", "ensembl:".$a[3]) ); } - if($a[3]){ + if($a[4]){ parent::addRDF( - parent::triplifyString($id, parent::getVoc()."name", $a[3]). + parent::triplifyString($id, parent::getVoc()."name", $a[4]). parent::describeProperty(parent::getVoc()."name", "Relationship between a PharmGKB entity and its name") ); } - if($a[4]){ + if($a[5]){ parent::addRDF( - parent::triplify($id, parent::getVoc()."symbol", "symbol:".$a[4]). + parent::triplify($id, parent::getVoc()."symbol", "symbol:".$a[5]). parent::describeProperty(parent::getVoc()."symbol", "Relationship between a PharmGKB gene and a gene symbol") ); } - if($a[5]) { - $b = explode('","',substr($a[5],1,-2)); + if($a[6]) { + $b = explode('","',substr($a[6],1,-2)); foreach($b AS $alt_name) { parent::addRDF( parent::triplifyString($id, parent::getVoc()."alternative-name", parent::safeLiteral(trim(stripslashes($alt_name)))) @@ -324,8 +330,8 @@ function genes() parent::describeProperty(parent::getVoc()."alternative-name", "Relationship between a PharmGKB gene and an alternative name") ); } - if($a[6]) { // these are not hgnc symbols - $b = explode('","',substr($a[6],1,-2)); + if($a[7]) { // these are not hgnc symbols + $b = explode('","',substr($a[7],1,-2)); foreach($b as $alt_symbol) { parent::addRDF( parent::triplifyString($id, parent::getVoc()."alternate-symbol", trim($alt_symbol)) @@ -336,21 +342,22 @@ function genes() ); } - if($a[7]){ + if($a[8]){ parent::addRDF( - parent::triplifyString($id, parent::getVoc()."is-vip", $a[7]). + parent::triplifyString($id, parent::getVoc()."is-vip", $a[8]). parent::describeProperty(parent::getVoc()."is-vip", "Relationship between a PharmGKB gene and its vip status") ); } - if($a[8]){ + if($a[9]){ parent::addRDF( - parent::triplifyString($id, parent::getVoc()."has-variant-annotation", $a[8]). + parent::triplifyString($id, parent::getVoc()."has-variant-annotation", $a[9]). parent::describeProperty(parent::getVoc()."has-variant-annotation", "Relationship between a PharmGKB gene and whether it has a variant annotation") ); } - if($a[9]) { - $b = explode(",",$a[9]); + if($a[10]) { + $b = explode(",",$a[10]); + print_r($b); foreach($b AS $xref) { $xref = trim($xref); if(!$xref) continue; @@ -370,18 +377,18 @@ function genes() } } } - if($a[10]) { + if($a[11]) { parent::addRDF( - parent::triplifyString($id,parent::getVoc()."cpic-dosing-guideline",$a[10]) + parent::triplifyString($id,parent::getVoc()."cpic-dosing-guideline",$a[11]) ); } - if($a[11]) { + if($a[12]) { parent::addRDF( - parent::triplifyString($id,parent::getVoc()."chromosome",$a[11]). + parent::triplifyString($id,parent::getVoc()."chromosome",$a[12]). parent::describeProperty(parent::getVoc()."chrosomome","Relationship between a PharmGKB gene and its chromosomal position"). - parent::triplifyString($id,parent::getVoc()."chromosome-start",$a[12]). - parent::triplifyString($id,parent::getVoc()."chromosome-end",$a[13]) + parent::triplifyString($id,parent::getVoc()."chromosome-start",$a[13]). + parent::triplifyString($id,parent::getVoc()."chromosome-end",$a[14]) ); } parent::WriteRDFBufferToWriteFile(); From ff3b213d17aefd244ddaebe38a7bebbaed867314 Mon Sep 17 00:00:00 2001 From: Michel Dumontier Date: Fri, 1 Apr 2016 09:17:10 +0200 Subject: [PATCH 16/64] fixed parsing of multi-value fields for pharmgkb gene data #436 --- pharmgkb/pharmgkb.php | 37 ++++++++++++++++++++++++------------- 1 file changed, 24 insertions(+), 13 deletions(-) diff --git a/pharmgkb/pharmgkb.php b/pharmgkb/pharmgkb.php index 4b0ad6a..d048a91 100644 --- a/pharmgkb/pharmgkb.php +++ b/pharmgkb/pharmgkb.php @@ -320,8 +320,8 @@ function genes() ); } if($a[6]) { - $b = explode('","',substr($a[6],1,-2)); - foreach($b AS $alt_name) { + $list = $this->parseList($a[6]); + foreach($list AS $alt_name) { parent::addRDF( parent::triplifyString($id, parent::getVoc()."alternative-name", parent::safeLiteral(trim(stripslashes($alt_name)))) ); @@ -331,8 +331,8 @@ function genes() ); } if($a[7]) { // these are not hgnc symbols - $b = explode('","',substr($a[7],1,-2)); - foreach($b as $alt_symbol) { + $list = $this->parseList($a[7]); + foreach($list as $alt_symbol) { parent::addRDF( parent::triplifyString($id, parent::getVoc()."alternate-symbol", trim($alt_symbol)) ); @@ -356,9 +356,8 @@ function genes() } if($a[10]) { - $b = explode(",",$a[10]); - print_r($b); - foreach($b AS $xref) { + $list = $this->parseList($a[10]); + foreach($list AS $xref) { $xref = trim($xref); if(!$xref) continue; @@ -369,7 +368,6 @@ function genes() parent::addRDF( parent::QQuadO_URL($id, parent::getVoc()."x-$ns", $x) ); - } else { parent::addRDF( parent::triplify($id, parent::getVoc()."x-$ns", $x) @@ -377,6 +375,7 @@ function genes() } } } + if($a[11]) { parent::addRDF( parent::triplifyString($id,parent::getVoc()."cpic-dosing-guideline",$a[11]) @@ -386,16 +385,27 @@ function genes() if($a[12]) { parent::addRDF( parent::triplifyString($id,parent::getVoc()."chromosome",$a[12]). - parent::describeProperty(parent::getVoc()."chrosomome","Relationship between a PharmGKB gene and its chromosomal position"). - parent::triplifyString($id,parent::getVoc()."chromosome-start",$a[13]). - parent::triplifyString($id,parent::getVoc()."chromosome-end",$a[14]) + parent::describeProperty(parent::getVoc()."chrosomome","Relationship between a PharmGKB gene and its chromosomal position") ); + if($a[13] != '-1' and $a[14] != '-1') { + parent::addRDF( + parent::triplifyString($id,parent::getVoc()."chromosome-start",$a[13]). + parent::triplifyString($id,parent::getVoc()."chromosome-end",$a[14]) + ); + } } - parent::WriteRDFBufferToWriteFile(); - + parent::writeRDFBufferToWriteFile(); } } + function parseList($str) + { + $list = ''; + if($str[0] == '"') $list = explode('","', substr($str,1,-1)); + else $list[] = $str; + return $list; + } + function MapXrefs($xref, &$url = false, &$ns = null, &$id = null) { $xrefs = array( @@ -419,6 +429,7 @@ function MapXrefs($xref, &$url = false, &$ns = null, &$id = null) if(isset($xrefs[$ns])) { $ns = $xrefs[$ns]; } + $url = false; if($ns == "url") { $url = true; From 6fd2ca6fd597b3acef9c625cb7ad90da7a122fdd Mon Sep 17 00:00:00 2001 From: Michel Dumontier Date: Fri, 1 Apr 2016 10:40:39 +0200 Subject: [PATCH 17/64] added parselist to drugs. more ids extracted #437 --- pharmgkb/pharmgkb.php | 56 +++++++++++++++++++++++++++---------------- 1 file changed, 35 insertions(+), 21 deletions(-) diff --git a/pharmgkb/pharmgkb.php b/pharmgkb/pharmgkb.php index d048a91..128005d 100644 --- a/pharmgkb/pharmgkb.php +++ b/pharmgkb/pharmgkb.php @@ -457,7 +457,7 @@ function MapXrefs($xref, &$url = false, &$ns = null, &$id = null) function drugs() { $declared = ''; - $h = explode("\t",$this->GetReadFile()->Read(1000)); // first line is header + $h = explode("\t",$this->GetReadFile()->Read(10000)); // first line is header if(count($h) != 10) { trigger_error("Change in number of columns for drugs file",E_USER_ERROR); return FALSE; @@ -465,7 +465,6 @@ function drugs() while($l = $this->GetReadFile()->Read(200000)) { $a = explode("\t",$l); $id = parent::getNamespace().$a[0]; - $this->drugs[$a[0]] = $a[1]; parent::addRDF( @@ -476,8 +475,8 @@ function drugs() if(trim($a[2])) { // generic names // Entacapona [INN-Spanish],Entacapone [Usan:Inn],Entacaponum [INN-Latin],entacapone - $b = explode(',',trim($a[2])); - foreach($b AS $c) { + $list = $this->parseList(trim($a[2])); + foreach($list AS $c) { parent::addRDF( parent::triplifyString($id, parent::getVoc()."generic_name", str_replace('"','',$c)) ); @@ -489,8 +488,8 @@ function drugs() if(trim($a[3])) { // trade names //Disorat,OptiPranolol,Trimepranol - $b = explode(',',trim($a[3])); - foreach($b as $c) { + $list = $this->parseList(trim($a[3])); + foreach($list as $c) { parent::addRDF( parent::triplifyString($id, parent::getVoc()."trade_name", str_replace(array("'", "\""), array("\\\'", "") ,$c)) ); @@ -502,8 +501,8 @@ function drugs() if(trim($a[4])) { // Brand Mixtures // Benzyl benzoate 99+ %,"Dermadex Crm (Benzoic Acid + Benzyl Benzoate + Lindane + Salicylic Acid + Zinc Oxide + Zinc Undecylenate)", - $b = explode(',',trim($a[4])); - foreach($b as $c) { + $list = $this->parseList(trim($a[4])); + foreach($list as $c) { parent::addRDF( parent::triplifyString($id, parent::getVoc()."brand_mixture", str_replace(array("'", "\""),array("\\\'",""), $c)) ); @@ -522,8 +521,8 @@ function drugs() if(trim($a[6])) { // Cross References // drugBank:DB00789,keggDrug:D01707,pubChemCompound:55466,pubChemSubstance:192903,url:http://en.wikipedia.org/wiki/Gadopentetate_dimeglumine - $b = explode(',',trim(str_replace('"','',$a[6]))); - foreach($b as $c) { + $list = $this->parseList(trim($a[6])); + foreach($list as $c) { $this->getRegistry()->parseQName($c,$ns,$id1); $ns = str_replace(array('"',' '),'',$ns); $ns = str_replace(array('keggcompound','keggdrug','drugbank','uniprotkb','clinicaltrials.gov','drugsproductdatabase(dpd)','nationaldrugcodedirectory','therapeutictargetsdatabase','fdadruglabelatdailymed'), @@ -531,7 +530,7 @@ function drugs() strtolower(str_replace('"','',$ns))); if($ns == "url") { parent::addRDF( - parent::QQuadO_URL($id, "rdfs:seeAlso", $id) + parent::QQuad($id, "rdfs:seeAlso", $id) ); } else { parent::addRDF( @@ -540,28 +539,43 @@ function drugs() } } } + + if(trim($a[7])) { + parent::addRDF( + parent::triplifyString($id, parent::getVoc()."smiles", substr($a[7],1,-1)). + parent::describeProperty(parent::getVoc()."smiles", "Relationship between a PharmGKB drug and its SMILES string") + ); + } + if($a[8]) { + parent::addRDF( + parent::triplifyString($id,parent::getVoc()."cpic-dosing-guideline",$a[8]) + ); + } if(trim($a[9])) { // External Vocabulary // ATC:H01AC(Somatropin and somatropin agonists),ATC:V04CD(Tests for pituitary function) // ATC:D07AB(Corticosteroids, moderately potent (group II)) => this is why you don't use brackets and commas as separators. - $b = explode(',',trim($a[9]),2); - foreach($b as $c) { - preg_match_all("/ATC:([A-Z0-9]+)\((.*)\)$/",$c,$m); - if(isset($m[1][0])) { - $atc = "atc:".$m[1][0]; + $list = $this->parseList(trim($a[9])); + foreach($list as $c) { + preg_match("/([^\(]+)?\((.*)\)/", $c, $m); + if(isset($m[1])) { + $this->getRegistry()->parseQName($m[1],$ns,$id1); + $myid = $ns.":".$id1; + $label = $m[2]; + parent::addRDF( - parent::triplify($id, parent::getVoc()."x-atc", $atc) + parent::triplify($id, parent::getVoc()."x-$ns", $myid) ); - if(!isset($declared[$atc])) { - $declared[$atc] = ''; + if(!isset($declared[$myid])) { + $declared[$myid] = ''; parent::addRDF( - parent::triplifyString($atc, "rdfs:label", $m[2][0]) + parent::triplifyString($myid, "rdfs:label", $m[2]) ); } } } } - parent::WriteRDFBufferToWriteFile(); + parent::writeRDFBufferToWriteFile(); } } From 84dcc7377a5217397b04161be6985fa22b77c10e Mon Sep 17 00:00:00 2001 From: Michel Dumontier Date: Fri, 1 Apr 2016 10:54:57 +0200 Subject: [PATCH 18/64] Fixed disease data parsing #438 --- pharmgkb/pharmgkb.php | 37 ++++++++++++++++--------------------- 1 file changed, 16 insertions(+), 21 deletions(-) diff --git a/pharmgkb/pharmgkb.php b/pharmgkb/pharmgkb.php index 128005d..60a4696 100644 --- a/pharmgkb/pharmgkb.php +++ b/pharmgkb/pharmgkb.php @@ -610,9 +610,8 @@ function diseases() parent::describeProperty(parent::getVoc()."name", "Relationship between a PharmGKB entity and its name") ); - if(!isset($a[2])) continue; if($a[2] != '') { - $names = explode('",',$a[2]); + $names = $this->parseList($a[2]); foreach($names AS $name) { if($name != ''){ parent::addRDF( @@ -623,30 +622,26 @@ function diseases() } } - // MeSH:D001145(Arrhythmias, Cardiac),SnoMedCT:195107004(Cardiac dysrhythmia NOS),UMLS:C0003811(C0003811) + // $a[3] appears to be null. - $sameID = parent::getRes().md5($a[1]); - parent::addRDF( - parent::triplify($id, "owl:sameAs", $sameID) - ); + // MeSH:D001145(Arrhythmias, Cardiac),SnoMedCT:195107004(Cardiac dysrhythmia NOS),UMLS:C0003811(C0003811) if(isset($a[4]) && trim($a[4]) != '') { - $xrefs = explode('","', $a[4]); + $xrefs = $this->parseList($a[4]); foreach($xrefs AS $xref) { - $xref = str_replace('"','',$xref); - $d = preg_match_all('/[,]?([^\:]+):([A-Za-z0-9]+)\(([^\)]+)\)/',$xref,$m, PREG_SET_ORDER); - foreach($m AS $n) { - if(isset($n[1]) && isset($n[2]) && !strstr($n[1]," ")) { - $n[1] = str_replace("),","",strtolower($n[1])); - $id2 = $n[1].':'.$n[2]; + preg_match("/([^\(]+)?\((.*)\)/", str_replace('"','',$xref), $m); + if(isset($m[1])) { + $this->getRegistry()->parseQName($m[1],$ns,$id1); + $myid = $ns.":".$id1; + $label = $m[2]; + parent::addRDF( + parent::triplify($id, "pharmgkb_vocabulary:x-".$ns, $myid) + ); + if(!isset($declared[$myid]) and $id1 != $label) { + $declared[$myid] = ''; parent::addRDF( - parent::triplify($id, "pharmgkb_vocabulary:x-".$n[1], $id2) + parent::triplifyString($myid, "rdfs:label", $label) ); - if(isset($n[3]) && $n[2] != $n[3]){ - parent::addRDF( - parent::triplifyString($id2, "rdfs:label", str_replace(array("\'", "\""),array("\\\'", ""),$n[3])) - ); - } - } + } } } } From 36001d33ee2346247ef3b4092e2eb655ff7b5284 Mon Sep 17 00:00:00 2001 From: Michel Dumontier Date: Fri, 1 Apr 2016 11:16:54 +0200 Subject: [PATCH 19/64] fixed pharmgkb gene labels #436 --- pharmgkb/pharmgkb.php | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pharmgkb/pharmgkb.php b/pharmgkb/pharmgkb.php index 60a4696..617eec5 100644 --- a/pharmgkb/pharmgkb.php +++ b/pharmgkb/pharmgkb.php @@ -275,8 +275,8 @@ function genes() while($l = parent::getReadFile()->read(200000)) { $a = explode("\t",$l); $id = parent::getNamespace().$a[0]; - $label = $a[3]; - $this->genes[$a[0]] = $a[3]; + $label = $a[4]; + $this->genes[$a[0]] = $a[4]; parent::addRDF( parent::describeIndividual($id, $label, parent::getVoc()."Gene"). From 349384cbbab7f227f9b787938088f58b50589683 Mon Sep 17 00:00:00 2001 From: Michel Dumontier Date: Fri, 1 Apr 2016 13:07:44 +0200 Subject: [PATCH 20/64] generating only a single entity-entity-association #439 --- pharmgkb/pharmgkb.php | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/pharmgkb/pharmgkb.php b/pharmgkb/pharmgkb.php index 617eec5..089bd60 100644 --- a/pharmgkb/pharmgkb.php +++ b/pharmgkb/pharmgkb.php @@ -667,7 +667,6 @@ function relationships() $declared = ''; $hash = ''; // md5 hash list $h = explode("\t", $this->GetReadFile()->Read()); - if(count($h) != 11) { trigger_error("Change in number of columns for relationships file (again)", E_USER_ERROR); return FALSE; @@ -676,7 +675,7 @@ function relationships() while($l = $this->getReadFile()->read(100000)) { $a = explode("\t",$l); - + $id1_list = explode(",",trim($a[0])); $id1_names = explode(",",trim($a[1])); $type1 = $a[2]; @@ -704,14 +703,21 @@ function relationships() // association $z++; $id = parent::getRes().$z; - $label = $id1_names[$i]." - ".$id2_names[$j]." association"; + if($type1 < $type2) { + $type = $type1.'-'.$type2.'-Assocation'; + $label = $id1_names[$i]." - ".$id2_names[$j]." association"; + } else { + $type = $type2.'-'.$type1.'-Assocation'; + $label = $id2_names[$i]." - ".$id1_names[$j]." association"; + } + parent::addRDF( - parent::describeIndividual($id, $label, parent::getVoc().strtolower($type1)."-".strtolower($type2)."-Association"). + parent::describeIndividual($id, $label, parent::getVoc().$type). parent::triplify($id, parent::getVoc().strtolower($type1), $i1). parent::triplify($id, parent::getVoc().strtolower($type2), $i2). parent::triplify($i1, parent::getVoc().strtolower($type2), $i2). parent::triplify($i2, parent::getVoc().strtolower($type1), $i1). - parent::describeClass(parent::getVoc().strtolower($type1)."-".strtolower($type2)."-Association", "PharmGKB $type1 $type2 Association"). + parent::describeClass(parent::getVoc().$type, "PharmGKB $type"). parent::describeProperty(parent::getVoc().strtolower($type1), "Relationship between a PharmGKB association and a $type1"). parent::describeProperty(parent::getVoc().strtolower($type2), "Relationship between a PharmGKB association and a $type2") ); @@ -848,9 +854,9 @@ function clinical_ann_metadata() // [4] => Clinical Annotation Types if($a[4]) { - $types = explode('","',$a[4]); + $types = $this->parseList($a[4]); foreach($types AS $t) { - $t = strtolower(str_replace('"','',$t)); + $t = strtolower($t); parent::addRDF( parent::triplifyString($id, parent::getVoc()."annotation-type", $t) ); From be0c05c076c4ffbb4a07c74aff75b8fb4ea9ea91 Mon Sep 17 00:00:00 2001 From: Michel Dumontier Date: Mon, 11 Apr 2016 15:37:22 +0200 Subject: [PATCH 21/64] Fixed clinical and variant annotations parsing #441 --- pharmgkb/pharmgkb.php | 84 +++++++++++++++++++++++-------------------- 1 file changed, 45 insertions(+), 39 deletions(-) diff --git a/pharmgkb/pharmgkb.php b/pharmgkb/pharmgkb.php index 089bd60..da1a5b7 100644 --- a/pharmgkb/pharmgkb.php +++ b/pharmgkb/pharmgkb.php @@ -147,7 +147,10 @@ function run() $zipentries = array(); if($file == "annotations") { // exclude: 'clinical_ann.tsv','study_parameters.tsv' - $zipentries = array('clinical_ann_metadata.tsv','var_drug_ann.tsv','var_pheno_ann.tsv','var_fa_ann.tsv'); + $zipentries = array( + 'clinical_ann_metadata.tsv', + 'var_drug_ann.tsv','var_pheno_ann.tsv','var_fa_ann.tsv' + ); } else if($file == "pathways") { for( $i = 0; $i < $zin->numFiles; $i++ ){ $stat = $zin->statIndex( $i ); @@ -801,7 +804,7 @@ function rsid() function clinical_ann_metadata() { $header = array("Clinical Annotation Id","Location","Gene","Level of Evidence","Clinical Annotation Types","Genotype-Phenotype IDs","Annotation Text","Variant Annotations IDs","Variant Annotations","PMIDs","Evidence Count","Related Drugs","Related Diseases","Race"); - $this_header = explode("\t",$this->GetReadFile()->Read()); + $this_header = explode("\t",$this->getReadFile()->read()); if(count($this_header) != count($header)) { trigger_error("Change in the number of columns. Expected ".count($header).", but found ".count($this_header),E_USER_ERROR); return (-1); @@ -810,7 +813,7 @@ function clinical_ann_metadata() $a = explode("\t",$l); $id = parent::getNamespace().$a[0]; - $label = "clinical annotation for ".$a[1]; + $label = "clinical genotype to phenotype annotations for ".$a[1]; // [0] => Clinical Annotation Id parent::addRDF( parent::describeIndividual($id, $label, parent::getVoc()."Clinical-Annotation"). @@ -873,10 +876,10 @@ function clinical_ann_metadata() $b = explode(":",$gp_text,2); parent::addRDF( - parent::describeIndividual(parent::getNamespace().$gp, $gp_text, parent::getVoc()."Genotype-Phenotype"). + parent::describeIndividual(parent::getNamespace().$gp, $gp_text, parent::getVoc()."Genotype-Phenotype-Association"). parent::triplify($id, parent::getVoc()."genotype_phenotype", parent::getNamespace().$gp). parent::triplifyString(parent::getNamespace().$gp, parent::getVoc()."genotype", trim($b[0])). - parent::describeClass(parent::getVoc()."Genotype-Phenotype", "PharmGKB Genotype Phenotype"). + parent::describeClass(parent::getVoc()."Genotype-Phenotype-Association", "PharmGKB Genotype Phenotype Association"). parent::describeProperty(parent::getVoc()."genotype_phenotype", "Relationship between a PharmGKB entity and a Genotype Phenotype"). parent::describeProperty(parent::getVoc()."genotype", "Relationship between a PharmGKB Genotype Phenotype and a genotype") ); @@ -892,7 +895,7 @@ function clinical_ann_metadata() $variant = str_replace('"','',trim($variant)); $variant_text = str_replace('"','',trim ($b_texts[$i])); parent::addRDF( - parent::describeIndividual(parent::getNamespace().$variant, $variant_text, parent::getVoc()."Variant"). + parent::describeIndividual(parent::getNamespace().$variant, $variant_text, parent::getVoc()."Variant-Annotation"). parent::triplify($id, parent::getVoc()."variant", parent::getNamespace().$variant) ); } @@ -900,9 +903,8 @@ function clinical_ann_metadata() // [9] => PMIDs if($a[9]) { - $b = explode('","',$a[9]); + $b = $this->parseList($a[9]); foreach($b AS $i => $pmid) { - $pmid = str_replace(',','',trim($pmid)); parent::addRDF( parent::triplify($id, parent::getVoc()."article", "pubmed:".$pmid) ); @@ -919,22 +921,17 @@ function clinical_ann_metadata() // [11] => Related Drugs if($a[11]) { - $b = explode('","',$a[11]); + $b = $this->parseList($a[11]); foreach($b AS $drug_label) { - $drug_label = str_replace('"','',$drug_label); - // find the id from the label - $find = @array_search($drug_label, $this->drug_names_array); - if($find !== FALSE and $find !== NULL){ + preg_match('/\(PA(.*)\)/',$drug_label,$m); + + if(isset($m[1])) { parent::addRDF( - parent::triplify($id, parent::getVoc()."related-drug", $find) + parent::triplify($id, parent::getVoc()."related-drug", "pharmgkb:PA".$m[1]) ); } else { - $drug_id = parent::getRes().md5($drug_label); - parent::addRDF( - parent::describeIndividual($drug_id, $drug_label, parent::getVoc()."Drug"). - parent::triplify($id, parent::getVoc()."related-drug", $drug_id) - ); - } + echo "Error in parsing drug label - ".$drug_label." ".PHP_EOL; + } } parent::addRDF( parent::describeProperty(parent::getVoc()."related-drug", "Relationship between a PharmGKB annotation and a related drug") @@ -942,21 +939,15 @@ function clinical_ann_metadata() } // [12] => Related Diseases if($a[12]) { - $b = explode('","',$a[12]); + $b = $this->parseList($a[12]); foreach($b AS $disease_label) { - $disease_label = str_replace('"','',$disease_label); - // find the id from the label - $find = @array_search($disease_label, $this->disease_names_array); - if($find !== FALSE and $find !== NULL){ - parent::addRDF( - parent::triplify($id, parent::getVoc()."related-disease", $find) - ); - }else { - $disease_id = parent::getRes().md5($disease_label); + preg_match('/\(PA(.*)\)/',$disease_label,$m); + if(isset($m[1])) { parent::addRDF( - parent::describeIndividual($disease_id, $disease_label, parent::getVoc()."Disease"). - parent::triplify($id, parent::getVoc()."related-disease", $disease_id) + parent::triplify($id, parent::getVoc()."related-disease", "pharmgkb:PA".$m[1]) ); + } else { + echo "Error in parsing disease label - ".$disease_label." ".PHP_EOL; } } parent::addRDF( @@ -981,20 +972,29 @@ function var_pheno_ann() {return $this->variant_annotation();} function variant_annotation() { - $canonical_header = array("Annotation ID","Variant","Gene","Drug","Literature Id","Phenotype Category","Significance","Notes","Sentence","StudyParameters"," Alleles"); + $canonical_header = array("Annotation ID","Variant","Gene","Chemical","Literature Id","Phenotype Category","Significance","Notes","Sentence","StudyParameters","Alleles"); $header = explode("\t",$this->getReadFile()->read(20000)); if(count($header) != count($canonical_header)) { trigger_error("column mismatch! Expected ".count($canonical_header).",but found ".count($header),E_USER_ERROR); return (-1); } + foreach($canonical_header AS $i => $ch) { + if($header[$i] != $ch) { + trigger_error("Change in the column header. Expecting $ch and found $header[$i] instead.",E_USER_ERROR); + return (-1); + } + } $declaration = ''; - while($l = $this->GetReadFile()->Read(20000)) { + while($l = $this->getReadFile()->read(20000)) { $a = explode("\t",$l); + //[0] => Annotation ID $id = parent::getNamespace().$a[0]; + $label = "Variant annotation $a[0]"; + if($a[8]) $label = $a[8]; parent::addRDF( - parent::describeIndividual($id, "Variant Annotation $a[0]", parent::getVoc()."Variant-Annotation"). + parent::describeIndividual($id, $label, parent::getVoc()."Variant-Annotation"). parent::describeClass(parent::getVoc()."Variant-Annotation", "PharmGKB Variant Annotation") ); @@ -1016,7 +1016,7 @@ function variant_annotation() //[2] => Gene //CYP3A (PA27114),CYP3A4 (PA130) if($a[2]) { - $genes = explode(",",$a[2]); + $genes = $this->parseList($a[2]); foreach($genes AS $gene) { preg_match("/\((PA[A-Za-z0-9]+)\)/",$gene,$m); if(isset($m[1])) { @@ -1030,7 +1030,7 @@ function variant_annotation() //[3] => Drug if($a[3]) { - $drugs = explode(",",$a[3]); + $drugs = $this->parseList($a[3]); foreach($drugs AS $drug) { preg_match("/\((PA[A-Za-z0-9]+)\)/",$drug,$m); if(isset($m[1])) { @@ -1044,6 +1044,11 @@ function variant_annotation() // [4] => Literature Id if($a[4]) { + if($a[4][0] == 'h') { + // occurs in var_pheno_ann for 2 entries. 10-04-2016 + $a[4] = str_replace('http://sfx.stanford.edu/local?sid=Entrez:PubMed&id=pmid:','',$a[4]); + } + $b = explode(";",$a[4]); foreach($b AS $i => $pmid) { $pmid = trim($pmid); @@ -1056,7 +1061,7 @@ function variant_annotation() //[5] => Phenotype if($a[5]) { - $types = explode(";",$a[5]); + $types = $this->parseList($a[5]); foreach($types AS $t) { parent::addRDF( parent::triplifyString($id, parent::getVoc()."annotation-type", strtolower($t)) @@ -1089,11 +1094,12 @@ function variant_annotation() //[9] => StudyParameters if($a[9]) { - $sps = explode(";",$a[9]); + $sps = $this->parseList($a[9]); foreach($sps AS $sp) { $t = parent::getNamespace().trim($sp); parent::addRDF( parent::describeIndividual($t, $sp, parent::getVoc()."Study-Parameter"). + parent::describeClass(parent::getVoc()."Study-Parameter", "PharmGKB Study Parameter"). parent::triplify($id, parent::getVoc()."study-parameter", $t) ); } From 8959c9d9869ee413a112eca83b971f2304196d30 Mon Sep 17 00:00:00 2001 From: Michel Dumontier Date: Mon, 11 Apr 2016 15:39:07 +0200 Subject: [PATCH 22/64] Fixed phargmbk pathway download #442 --- pharmgkb/pharmgkb.php | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/pharmgkb/pharmgkb.php b/pharmgkb/pharmgkb.php index da1a5b7..6d14528 100644 --- a/pharmgkb/pharmgkb.php +++ b/pharmgkb/pharmgkb.php @@ -128,6 +128,13 @@ function run() echo "Contact PharmGKB to get access to variants/clinical variants; save file as annotations.zip".PHP_EOL; continue; } + } elseif($file == 'pathways') { + if(!file_exists($lfile)) { + echo "Downloading $lfile ... "; + Utils::DownloadSingle('https://www.pharmgkb.org/download.do?objId='.$file.'-tsv.zip&dlCls=common', $lfile); + echo "done".PHP_EOL; + } + } else { if(!file_exists($lfile) or parent::getParameterValue('download') == true) { echo "Downloading $lfile ... "; From f7e4a6f8aeeec0b11de5aecbca4cba40f860358c Mon Sep 17 00:00:00 2001 From: Michel Dumontier Date: Mon, 11 Apr 2016 17:24:38 +0200 Subject: [PATCH 23/64] complete pharmgkb pathway parse #442 --- pharmgkb/pharmgkb.php | 137 +++++++++++++++++++++++++++++++++--------- 1 file changed, 109 insertions(+), 28 deletions(-) diff --git a/pharmgkb/pharmgkb.php b/pharmgkb/pharmgkb.php index 6d14528..f83707b 100644 --- a/pharmgkb/pharmgkb.php +++ b/pharmgkb/pharmgkb.php @@ -193,10 +193,11 @@ function run() $this->GetReadFile()->SetFilePointer($fp); if($file == "annotations") { - $fnx = substr($zipentry,0,strpos($zipentry,".tsv")); + $fnx = substr($zipentry,0,strpos($zipentry,".tsv")); echo "processing $zipentry.."; } else if($file == 'pathways') { $fnx = 'pathways'; + $this->pathway_name = $zipentry; echo "processing $fnx ($zipentry)... "; } else { $fnx = $file; @@ -1124,39 +1125,119 @@ function variant_annotation() function pathways() { - // needs to be finished - return; - + preg_match('/(PA[0-9]+)-([^\.]+)\.tsv/',$this->pathway_name,$m); + if(!isset($m[1]) and !isset($m[2])) { + trigger_error("unable to find pathway identifier in ".$this->pathway_name); + return false; + } + $pathway_id = parent::getNamespace().$m[1]; + $pathway_name = $m[2]; + + parent::addRDF( + parent::describeIndividual($pathway_id,$pathway_name,parent::getVoc()."Pathway"). + parent::describeClass(parent::getVoc()."Pathway","PharmGKB Pathway") + ); + + $fields = array('From','To','Reaction Type','Controller','Control Type','Cell Type','PubMed Id','Genes','Drugs','Diseases'); + $h = explode("\t", $this->getReadFile()->read(50000)); + // @todo check that the fields match + while($l = $this->getReadFile()->read(50000)) { - $a = explode("\t",trim($l)); + $a = explode("\t",$l); - // From To Reaction Type Controller Control Type Cell Type PubMed Id Genes Drugs Diseases - // hmg coa reductase inhibitors Active & Inactive metabolites Biochemical Reaction CYP2C19,CYP2C8,CYP2C9,CYP2D6,CYP3A4,CYP3A5,UGT1A1,UGT1A3,UGT2B7 Catalysis hepatocyte CYP3A4,CYP3A5,UGT1A3,CYP2C19,CYP2C9,CYP2C8,CYP2D6,UGT1A1,UGT2B7 hmg coa reductase inhibitors - - $c1 = array_search($a[0],$this->drugs); - if($c1 === FALSE) { - $c1 = array_search($a[0],$this->genes); - if($c1 === FALSE) { - $c1 = parent::getRes().url_encode($c1); - } else { - $c1 = parent::getNamespace().$c1; - } - } + $id = md5($l); + $uri = parent::getRes().$id; + $label = $a[2]." in ".$pathway_name; + $type = parent::getVoc().urlencode($a[2]); + $from = parent::getRes().md5($a[0]); + $to = parent::getRes().md5($a[1]); - $c2 = array_search($a[1],$this->drugs); - if($c2 === FALSE) { - $c2 = array_search($a[1],$this->genes); - if($c2 === FALSE) { - // not found - $c2 = parent::getRes().url_encode($c2); - } else { - // actual id - $c2 = parent::getNamespace().$c2; + parent::addRDF( + parent::describeIndividual($uri, $label, $type). + parent::describeClass($type, $a[2]). + parent::describeIndividual($from, $a[0], parent::getVoc()."Resource"). + parent::describeIndividual($to, $a[1], parent::getVoc()."Resource"). + parent::triplify($uri, parent::getVoc()."from", $from). + parent::triplify($uri, parent::getVoc()."to", $to). + parent::triplify($uri, parent::getVoc()."pathway", $pathway_id). + parent::triplify($pathway_id, parent::getVoc()."pathway-component", $uri) + ); + + if($a[4]) { + // control type + $ctid= parent::getRes().md5($a[4]); + parent::addRDF( + parent::describeIndividual($ctid, $a[4], parent::getVoc()."Control-Type"). + parent::describeClass(parent::getVoc()."Control-Type", "PharmGKB Control Type"). + parent::triplify($uri, parent::getVoc()."control-type",$ctid) + ); + } + if($a[5]) { + // cell type + $ctid= parent::getRes().md5($a[5]); + parent::addRDF( + parent::describeIndividual($ctid, $a[5], parent::getVoc()."Cell-Type"). + parent::describeClass(parent::getVoc()."Cell-Type", "PharmGKB Cell Type"). + parent::triplify($uri, parent::getVoc()."cell-type",$ctid) + ); + } + if($a[6]) { + $pmids = explode(",",$a[6]); + foreach($pmids AS $pmid) { + parent::addRDF( + parent::triplify($uri, parent::getVoc()."x-pubmed", "pubmed:$pmid") + ); } } - $id = md5($l); - $uri = parent::getRes().$id; + if($a[7]) { + $genes = $this->parseList($a[7]); + foreach($genes AS $gene) { + $c1 = array_search($gene,$this->genes); + if(!$c1) { + $c1 = parent::getRes().urlencode($gene); + } else { + $c1 = parent::getNamespace().$c1; + } + + if($c1 !== FALSE) { + parent::addRDF( + parent::triplify($uri, parent::getVoc()."gene", $c1) + ); + } + }} + + if($a[8]) { + $drugs = $this->parseList($a[8]); + foreach($drugs AS $drug) { + $c2 = array_search($drug,$this->drugs); + if(!$c2) { + $c2 = parent::getRes().urlencode($drug); + } else { + $c2 = parent::getNamespace().$c2; + } + if($c2 !== FALSE) { + parent::addRDF( + parent::triplify($uri, parent::getVoc()."drug", $c2) + ); + } + }} + if($a[9]) { + $diseases = $this->parseList($a[9]); + foreach($diseases AS $disease) { + $c2 = array_search($disease,$this->diseases); + if(!$c2) { + $c2 = parent::getRes().urlencode($disease); + } else { + $c2 = parent::getNamespace().$c2; + } + if($c2 !== FALSE) { + parent::addRDF( + parent::triplify($uri, parent::getVoc()."disease", $c2) + ); + } + }} + parent::writeRDFBufferToWriteFile(); From b565f38de265f0a70d4df13666a4c55a3f233a7c Mon Sep 17 00:00:00 2001 From: Michel Dumontier Date: Tue, 12 Apr 2016 09:53:37 +0200 Subject: [PATCH 24/64] fix for multiple control and cell types #442 --- pharmgkb/pharmgkb.php | 33 +++++++++++++++++++-------------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/pharmgkb/pharmgkb.php b/pharmgkb/pharmgkb.php index f83707b..6fed651 100644 --- a/pharmgkb/pharmgkb.php +++ b/pharmgkb/pharmgkb.php @@ -1148,7 +1148,7 @@ function pathways() $id = md5($l); $uri = parent::getRes().$id; $label = $a[2]." in ".$pathway_name; - $type = parent::getVoc().urlencode($a[2]); + $type = parent::getVoc().urlencode(str_replace(' ','-',$a[2])); $from = parent::getRes().md5($a[0]); $to = parent::getRes().md5($a[1]); @@ -1165,21 +1165,26 @@ function pathways() if($a[4]) { // control type - $ctid= parent::getRes().md5($a[4]); - parent::addRDF( - parent::describeIndividual($ctid, $a[4], parent::getVoc()."Control-Type"). - parent::describeClass(parent::getVoc()."Control-Type", "PharmGKB Control Type"). - parent::triplify($uri, parent::getVoc()."control-type",$ctid) - ); - } + $types = explode(',',$a[4]); + foreach($types as $type) { + $ctid= parent::getRes().md5($type); + parent::addRDF( + parent::describeIndividual($ctid, $type, parent::getVoc()."Control-Type"). + parent::describeClass(parent::getVoc()."Control-Type", "PharmGKB Control Type"). + parent::triplify($uri, parent::getVoc()."control-type",$ctid) + ); + }} if($a[5]) { // cell type - $ctid= parent::getRes().md5($a[5]); - parent::addRDF( - parent::describeIndividual($ctid, $a[5], parent::getVoc()."Cell-Type"). - parent::describeClass(parent::getVoc()."Cell-Type", "PharmGKB Cell Type"). - parent::triplify($uri, parent::getVoc()."cell-type",$ctid) - ); + $list = $this->parseList($a[5]); + foreach($list AS $item) { + $ctid= parent::getRes().md5($item); + parent::addRDF( + parent::describeIndividual($ctid, $item, parent::getVoc()."Cell-Type"). + parent::describeClass(parent::getVoc()."Cell-Type", "PharmGKB Cell Type"). + parent::triplify($uri, parent::getVoc()."cell-type",$ctid) + ); + } } if($a[6]) { $pmids = explode(",",$a[6]); From 108801d1eb904d3b337087f2fc31f15c83c6e9d0 Mon Sep 17 00:00:00 2001 From: Michel Dumontier Date: Tue, 12 Apr 2016 10:27:08 +0200 Subject: [PATCH 25/64] added 'map' to kegg pathway identifiers, to conform with source #440 --- ctd/ctd.php | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/ctd/ctd.php b/ctd/ctd.php index 7fc4260..2fc474f 100644 --- a/ctd/ctd.php +++ b/ctd/ctd.php @@ -420,13 +420,13 @@ function CTD_chem_pathways_enriched() return FALSE; } $first = false; - } - + } $chemical_id = $a[1]; $this->getRegistry()->parseQName($a[4], $pathway_ns, $pathway_id); if($pathway_ns == "react") $pathway_ns = "reactome"; - + if($pathway_ns == "kegg") $pathway_id = "map".$pathway_id; + $pathway_resource_id = parent::getRes().md5($chemical_id.$pathway_ns.$pathway_id.$a[6]); $pathway_resource_label = "Chemical-pathway association between mesh:".$chemical_id." and ".$pathway_ns.":".$pathway_id." with p-value ".$a[6]; @@ -509,7 +509,8 @@ function CTD_diseases_pathways() $this->getRegistry()->parseQName($a[1],$disease_ns,$disease_id); $this->getRegistry()->parseQName($a[3],$pathway_ns,$pathway_id); if($pathway_ns == 'react') $pathway_ns = 'reactome'; - + if($pathway_ns == "kegg") $pathway_id = "map".$pathway_id; + $this->AddRDF( parent::triplify($disease_ns.":".$disease_id, $this->getVoc()."pathway", $pathway_ns.":".$pathway_id). parent::triplifyString($disease_ns.":".$disease_id, "rdfs:label", $a[0]." [$disease_ns:$disease_id]"). @@ -612,6 +613,7 @@ function CTD_genes_pathways() $this->getRegistry()->parseQName($a[3],$pathway_ns,$pathway_id); $pathway_id = trim($pathway_id); if($pathway_ns == "react") $pathway_ns = "reactome"; + if($pathway_ns == "kegg") $pathway_id = "map".$pathway_id; $this->ADDRDF( parent::triplify($gene_ns.":".$gene_id, $this->getVoc()."pathway", $pathway_ns.":".$pathway_id). @@ -645,7 +647,8 @@ function CTD_Pathways() $this->getRegistry()->parseQName(trim($a[1]),$pathway_ns,$pathway_id); if($pathway_ns == "react") $pathway_ns = "reactome"; - + if($pathway_ns == "kegg") $pathway_id = "map".$pathway_id; + $this->AddRDF( parent::describeIndividual($pathway_ns.":".$pathway_id, $a[0], $this->getVoc()."Pathway"). parent::describeClass($this->getVoc()."Pathway", "CTD Pathway") From 8e945742b97febe0f289ee4fa83cce7c45f08127 Mon Sep 17 00:00:00 2001 From: Michel Dumontier Date: Thu, 12 May 2016 17:09:04 -0700 Subject: [PATCH 26/64] Update README.md --- README.md | 16 +--------------- 1 file changed, 1 insertion(+), 15 deletions(-) diff --git a/README.md b/README.md index 1c6e203..1840c13 100644 --- a/README.md +++ b/README.md @@ -4,21 +4,7 @@ Bio2RDF-scripts Requirements ------------- -The majority of these scripts were created so as to run on Linux servers. Depending on the script you wish to run you will need: - -* PHP5 -* Java 1.5 -* Bio2RDF API (available from: https://github.com/micheldumontier/php-lib) - -The Bio2RDF API should be installed in the root directory of the project tree (i.e sibling to bio2rdf-scripts). - -Usage ------ -Each script is run independently. See README files inside each script directory. - -Contact Us ---------- -For more information visit http://bio2rdf.org or http://dumontierlab.com +See the [wiki](https://github.com/bio2rdf/bio2rdf-scripts/wiki) for details. --- Licensed under [MIT License](http://en.wikipedia.org/wiki/MIT_License), see [license page](https://github.com/bio2rdf/bio2rdf-scripts/wiki/MIT-License) for details. From 913d1390b31c0031e472fd003d93d2cc06c1314c Mon Sep 17 00:00:00 2001 From: Michel Dumontier Date: Wed, 18 May 2016 17:39:32 -0700 Subject: [PATCH 27/64] fixed premature end of parse due to blank entry --- bioportal/bioportal.php | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/bioportal/bioportal.php b/bioportal/bioportal.php index 19e1065..136853c 100644 --- a/bioportal/bioportal.php +++ b/bioportal/bioportal.php @@ -394,7 +394,7 @@ function OBO2RDF($abbv) $graph_uri = '<'.parent::getRegistry()->getFQURI(parent::getGraphURI()).'>'; $bid = 1; - while($l = parent::getReadFile()->read()) { + while(FALSE !== ($l = parent::getReadFile()->read())) { $lt = trim($l); if(strlen($lt) == 0) continue; if($lt[0] == '!') continue; @@ -676,7 +676,8 @@ function OBO2RDF($abbv) } else { //header //format-version: 1.0 - $buf .= parent::triplifyString($ouri,"obo_vocabulary:$a[0]",str_replace( array('"','\:'), array('\"',':'), isset($a[1])?$a[1]:"")); + $buf .= parent::triplifyString($ouri,"obo_vocabulary:$a[0]", + str_replace( array('"','\:'), array('\"',':'), isset($a[1])?$a[1]:"")); } if($minimal || $minimalp) parent::getWriteFile()->write($min); From 83f3c7c4ad98fdbac511fbfc4cac8866e8909962 Mon Sep 17 00:00:00 2001 From: Michel Dumontier Date: Fri, 22 Jul 2016 16:26:13 -0700 Subject: [PATCH 28/64] simplify column id and includes a column fix for aliases --- irefindex/irefindex.php | 29 ++++++++++++++++++++++++----- 1 file changed, 24 insertions(+), 5 deletions(-) diff --git a/irefindex/irefindex.php b/irefindex/irefindex.php index 227f6b1..176dd09 100644 --- a/irefindex/irefindex.php +++ b/irefindex/irefindex.php @@ -163,7 +163,7 @@ function Parse() trigger_erorr("Expecting 54 columns, found $c!"); return FALSE; } - + #print_r($header);exit; // check # of columns while($l = parent::getReadFile()->read(500000)) { $a = explode("\t",trim($l)); @@ -280,14 +280,14 @@ function Parse() } // add the alternatives through the taxon + seq redundant group - for($i=2;$i<=3;$i++) { + for($i=0;$i<1;$i++) { $taxid = ''; - $rogid = "irefindex.".$a[32+($i-2)]; + $rogid = "irefindex.".$a[32+$i]; parent::addRDF( parent::describeIndividual($rogid,"",parent::getVoc()."Taxon-Sequence-Identical-Group"). parent::describeClass(parent::getVoc()."Taxon-Sequence-Identical-Group","Taxon + Sequence Identical Group") ); - $tax = $a[9+($i-2)]; + $tax = $a[9+$i]; if($tax && $tax != '-' && $tax != '-1') { $data = $this->ParseStringArray($tax); $taxid = trim($data["ns"]).":".trim($data["id"]); @@ -296,7 +296,8 @@ function Parse() ); } - $list = explode("|",$a[3+($i-2)]); + // parse the alternative identifiers + $list = explode("|",$a[2+$i]); foreach($list AS $item) { $data = $this->ParseStringArray($item); $ns = trim($data["ns"]); @@ -311,6 +312,24 @@ function Parse() ); } } + + // parse the aliases + $list = explode("|",$a[4+$i]); + foreach($list AS $item) { + $data = $this->ParseStringArray($item); + $ns = trim($data["ns"]); + $id = trim($data["id"]); + $qname = $ns.":".$id; + if($ns && $ns != 'rogid' && $ns != 'irogid' and $ns != 'icrogid' and $id != '-') { + parent::addRDF( + parent::triplify($rogid,parent::getVoc()."has-member",$qname) + ); + if($taxid && $taxid != '-' && $taxid != '-1') parent::addRDF( + parent::triplify($qname,parent::getVoc()."x-taxonomy",$taxid) + ); + } + } + } // publications From f1e28556139660136265d348035542ffd32ec1a2 Mon Sep 17 00:00:00 2001 From: maulikkamdar Date: Tue, 1 Nov 2016 15:31:44 -0700 Subject: [PATCH 29/64] MD5 Hash Drugbank Mixture URIs --- drugbank/drugbank.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drugbank/drugbank.php b/drugbank/drugbank.php index 22da4e7..3f26ad9 100755 --- a/drugbank/drugbank.php +++ b/drugbank/drugbank.php @@ -454,7 +454,7 @@ function parseDrugEntry(&$xml) foreach($x->mixtures->mixture AS $item) { if(isset($item)) { $o = $item; - $mid = parent::getRes().str_replace(" ","-",$o->name[0]); + $mid = parent::getRes().md5(str_replace(" ","-",$o->name[0])); parent::addRDF( parent::triplify($did,parent::getVoc()."mixture",$mid). From ce09d8386f71f8a6c7b18f78ee0699e53133de23 Mon Sep 17 00:00:00 2001 From: Johan van Soest Date: Fri, 14 Apr 2017 08:11:57 +0200 Subject: [PATCH 30/64] Array_shift only pops the first location from the stack, therefore randomly selecting a location where the trial was executed. Converted into a loop which parses all locations --- clinicaltrials/clinicaltrials.php | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/clinicaltrials/clinicaltrials.php b/clinicaltrials/clinicaltrials.php index 0025506..ef5ff4e 100644 --- a/clinicaltrials/clinicaltrials.php +++ b/clinicaltrials/clinicaltrials.php @@ -762,11 +762,12 @@ function process_file($infile) { # location of facility doing the testing ############################################################## try { - $location = @array_shift($root->xpath('//location')); - if($location){ + $locations = $root->xpath('//location'); + foreach($locations AS $location) { $location_uri = parent::getRes().md5($location->asXML()); - $name = $this->getString('//facility/name',$location); - $address = @array_shift($location->xpath('//facility/address')); + $facility = $location->facility; + $name = $facility->name[0]; + $address = $facility->address;//@array_shift($location->xpath('//facility/address')); $contact = @array_shift($location->xpath('//contact')); $backups = @array_shift($location->xpath('//contact_backup')); $investigators = @array_shift($location->xpath('//investigator')); From c30e9b3dd2576bc1a153fa8ef7a0bd6c7384abae Mon Sep 17 00:00:00 2001 From: Michel Dumontier Date: Wed, 30 Aug 2017 09:21:55 +0200 Subject: [PATCH 31/64] adding arc2 through composer --- composer.json | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 composer.json diff --git a/composer.json b/composer.json new file mode 100644 index 0000000..bf9a794 --- /dev/null +++ b/composer.json @@ -0,0 +1,5 @@ +{ + "require": { + "semsol/arc2": "2.3.*" + } +} \ No newline at end of file From 9619f973269ce44e3b2a3a3102b2f799238f3e21 Mon Sep 17 00:00:00 2001 From: Michel Dumontier Date: Tue, 23 Jan 2018 21:26:17 +0100 Subject: [PATCH 32/64] added check for null oversight value --- clinicaltrials/clinicaltrials.php | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/clinicaltrials/clinicaltrials.php b/clinicaltrials/clinicaltrials.php index 0025506..2f0d480 100644 --- a/clinicaltrials/clinicaltrials.php +++ b/clinicaltrials/clinicaltrials.php @@ -238,6 +238,7 @@ function parse_dir(){ function process_file($infile) { $indir = parent::getParameterValue('indir'); $xml = new CXML($infile); + $this->setCheckPoint('file'); while($xml->Parse("clinical_study") == TRUE) { $this->setCheckPoint('record'); @@ -364,16 +365,18 @@ function process_file($infile) { ###################################################################################### try { $oversight = @array_shift($root->xpath('//oversight_info')); - $oversight_id = parent::getRes().md5($oversight->asXML()); + if($oversight !== null) { + $oversight_id = parent::getRes().md5($oversight->asXML()); - $authority = $this->getString('//authority', $oversight); - $authority_id = parent::getRes().md5($authority); - parent::addRDF( - parent::describeIndividual($oversight_id,$authority,parent::getVoc()."Organization"). - parent::triplify($study_id,$this->getVoc()."oversight",$oversight_id). - parent::triplify($study_id,$this->getVoc()."authority",$authority_id). - parent::triplifyString($oversight_id, parent::getVoc()."has-dmc", $this->getString('//has_dmc', $oversight)) - ); + $authority = $this->getString('//authority', $oversight); + $authority_id = parent::getRes().md5($authority); + parent::addRDF( + parent::describeIndividual($oversight_id,$authority,parent::getVoc()."Organization"). + parent::triplify($study_id,$this->getVoc()."oversight",$oversight_id). + parent::triplify($study_id,$this->getVoc()."authority",$authority_id). + parent::triplifyString($oversight_id, parent::getVoc()."has-dmc", $this->getString('//has_dmc', $oversight)) + ); + } } catch(Exception $e){ echo "There was an error in the oversight info element: $e\n"; From 8b9922dea43a99ec577ec88a0382f848ed4eb368 Mon Sep 17 00:00:00 2001 From: Katrin Leinweber Date: Tue, 27 Mar 2018 08:21:06 +0200 Subject: [PATCH 33/64] Link DOIs to preferred resolver --- miriam/miriam.php | 2 +- taxonomy/taxonomy.php | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/miriam/miriam.php b/miriam/miriam.php index 31bd80d..5c740ab 100644 --- a/miriam/miriam.php +++ b/miriam/miriam.php @@ -159,7 +159,7 @@ function parseItem($item) else $mylist = $i; foreach($mylist AS $myitem) { if(strstr($myitem, "pubmed")) $uri = "pubmed:".substr($myitem, strrpos($myitem, ":")+1); - else if(strstr($myitem, "doi")) $uri = "http://dx.doi.org/".substr($myitem, strpos($myitem, "doi:")); + else if(strstr($myitem, "doi")) $uri = "https://doi.org/".substr($myitem, strpos($myitem, "doi:")); else $uri = $myitem; parent::addRDF( diff --git a/taxonomy/taxonomy.php b/taxonomy/taxonomy.php index 5807007..c1bbac8 100644 --- a/taxonomy/taxonomy.php +++ b/taxonomy/taxonomy.php @@ -329,7 +329,7 @@ private function citations() $c = parent::getRes()."citation-id-".$a[0]; $seealso = isset($a[4])?trim($a[4]):""; if($seealso) { - $seealso = str_replace(array("lx: DOI ","http;//"), array("http://dx.doi.org/","http://"), $seealso); + $seealso = str_replace(array("lx: DOI ","http;//"), array("https://doi.org/","http://"), $seealso); if(strlen($seealso) > 2 and !strstr($seealso,"http")) $seealso = "http://".$seealso; $seealso = parent::triplify($c, "rdfs:seeAlso", $seealso); } From 93a2c327e03b1174a52577be0387815da9c1b17a Mon Sep 17 00:00:00 2001 From: Michel Dumontier Date: Mon, 29 Oct 2018 15:02:09 +0100 Subject: [PATCH 34/64] fixed merge error --- chembl/chembl.php | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/chembl/chembl.php b/chembl/chembl.php index 5433933..b0f5646 100644 --- a/chembl/chembl.php +++ b/chembl/chembl.php @@ -1017,8 +1017,10 @@ function compounds($connection) { } parent::writeRDFBufferToWriteFile(); } + } + $result->free(); + } -<<<<<<< HEAD /* * parse the assays tables */ @@ -1075,12 +1077,8 @@ function process_assays() { $this->WriteRDFBufferToWriteFile(); } - $this->AddRDF($this->QQuad($assay,"chembl_vocabulary:hasAssayType","chembl_vocabulary:".$row['assay_desc'])); $this->WriteRDFBufferToWriteFile(); -======= - $result->free(); ->>>>>>> release3 } } From 4bc0cce46e4862dddf941e4b19e3e99950eedf37 Mon Sep 17 00:00:00 2001 From: Jonathan Leitschuh Date: Mon, 10 Feb 2020 23:15:48 -0500 Subject: [PATCH 35/64] Use HTTPS instead of HTTP to resolve dependencies This fixes a security vulnerability in this project where the `pom.xml` files were configuring Maven to resolve dependencies over HTTP instead of HTTPS. Signed-off-by: Jonathan Leitschuh --- pdb/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pdb/pom.xml b/pdb/pom.xml index d757bed..0a28a4d 100644 --- a/pdb/pom.xml +++ b/pdb/pom.xml @@ -119,7 +119,7 @@ virtuoso-repo - http://maven.aksw.org/repository/internal/ + https://maven.aksw.org/repository/internal/ true From 45cd715c0e20502c364ccf7f242d9a592f8054f6 Mon Sep 17 00:00:00 2001 From: Michel Dumontier Date: Mon, 25 May 2020 16:15:15 +0200 Subject: [PATCH 36/64] update path --- hgnc/hgnc.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hgnc/hgnc.php b/hgnc/hgnc.php index 164bcf5..ad4bc95 100755 --- a/hgnc/hgnc.php +++ b/hgnc/hgnc.php @@ -36,7 +36,7 @@ class HGNCParser extends Bio2RDFizer { function __construct($argv){ parent::__construct($argv, "hgnc"); parent::addParameter('files',true,'all','all','files to process'); - parent::addParameter('download_url',false,null,'ftp://ftp.ebi.ac.uk/pub/databases/genenames/hgnc_complete_set.txt.gz'); + parent::addParameter('download_url',false,null,'ftp://ftp.ebi.ac.uk/pub/databases/genenames/hgnc/tsv/hgnc_complete_set.txt'); parent::initialize(); }//constructor From 87330b6d60472d7d9208243740017d2b77859aac Mon Sep 17 00:00:00 2001 From: Michel Dumontier Date: Mon, 25 May 2020 18:54:32 +0200 Subject: [PATCH 37/64] updated with new format --- hgnc/hgnc.php | 473 ++++++++++++++++---------------------------------- 1 file changed, 150 insertions(+), 323 deletions(-) diff --git a/hgnc/hgnc.php b/hgnc/hgnc.php index ad4bc95..168d63d 100755 --- a/hgnc/hgnc.php +++ b/hgnc/hgnc.php @@ -41,7 +41,7 @@ function __construct($argv){ }//constructor function Run(){ - $file = "hgnc_complete_set.txt.gz"; + $file = "hgnc_complete_set.txt"; $ldir = parent::getParameterValue('indir'); $odir = parent::getParameterValue('outdir'); $rdir = parent::getParameterValue('download_url'); @@ -80,7 +80,7 @@ function Run(){ ->setFormat('text/tab-separated-value') ->setFormat('application/zip') ->setPublisher('http://www.genenames.org/') - ->setHomepage('http://www.genenames.org/data/gdlw_columndef.html') + ->setHomepage('https://www.genenames.org/help/statistics-and-downloads/') ->setRights('use') ->setRights('attribution') ->setLicense('http://www.genenames.org/about/overview') @@ -115,400 +115,227 @@ function Run(){ }//Run function process(){ - $header = $this->GetReadFile()->Read(200000); + $header = $this->getReadFile()->read(200000); $header_arr = explode("\t", $header); - $n = 41; - $c = count($header_arr); + $h = array_flip($header_arr); + + $c = count($h); + $n = 52; if ($c != $n) { echo PHP_EOL; print_r($header_arr); - trigger_error ("Expected $n columns, found $c . please update the script",E_USER_ERROR); - exit; + trigger_error ("Expected $n columns, found $c . some fields may not be properly processed. update the script",E_USER_ERROR); } + $this->getReadFile()->read(200000); // skip a line - while($l = $this->GetReadFile()->Read(4096)) { - $fields = explode("\t", $l); - $id = strtolower($fields[0]); - $approved_symbol = $fields[1]; - $approved_name = $fields[2]; - $status = $fields[3]; - $locus_type = $fields[4]; - $locus_group = $fields[5]; - $previous_symbols = $fields[6]; - $previous_names = $fields[7]; - $synonyms = $fields[8]; - $name_synonyms = $fields[9]; - $chromosome = $fields[10]; - $date_approved = $fields[11]; - $date_modified = $fields[12]; - $date_symbol_changed = $fields[13]; - $date_name_changed = $fields[14]; - $accession_numbers = $fields[15]; - $enzyme_ids = $fields[16]; - $entrez_gene_id = $fields[17]; - $ensembl_gene_id = $fields[18]; - $mouse_genome_database_id = $fields[19]; - $specialist_database_links = $fields[20]; - $specialist_database_ids = $fields[21]; - $pubmed_ids = $fields[22]; - $refseq_ids = $fields[23]; - $gene_family_tag = $fields[24]; - $gene_family_description = $fields[25]; - $record_type = $fields[26]; - $primary_ids = $fields[27]; - $secondary_ids = $fields [28]; - $ccd_ids = $fields[29]; - $vega_ids = $fields[30]; - $locus_specific_databases = $fields[31]; - $entrez_gene_id_mappeddatasuppliedbyNCBI = $fields[32]; - $omim_id_mappeddatasuppliedbyNCBI = $fields[33]; - $refseq_mappeddatasuppliedbyNCBI = $fields[34]; - $uniprot_id_mappeddatasuppliedbyUniProt = $fields[35]; - $ensembl_id_mappeddatasuppliedbyEnsembl = $fields[36]; - $vega_id_mappeddatasuppliedbyVega = $fields[37]; - $ucsc_id_mappeddatasuppliedbyUCSC = $fields[38]; - $mouse_genome_database_id_mappeddatasuppliedbyMGI = $fields[39]; - $rat_genome_database_id_mappeddatasuppliedbyRGD = $fields[40]; + while($l = $this->getReadFile()->read(4096)) { + $l = str_replace('"','',$l); + $r = explode("\t", $l); + + $id = strtolower($r[$h['hgnc_id']]); + $uid = str_replace(":","_",$id); + $symbol = $r[$h['symbol']]; - $id_res = $id; - $id_label = "Gene Symbol for ".$approved_symbol; - parent::AddRDF( - parent::triplify($id_res, "rdf:type", $this->getVoc()."Gene-Symbol"). - parent::describeIndividual($id_res, $id_label, $this->getVoc()."Gene-Symbol"). + parent::addRDF( + parent::triplify($id, "rdf:type", $this->getVoc()."Gene-Symbol"). + parent::describeIndividual($id, "Gene symbol for ".$symbol, $this->getVoc()."Gene-Symbol"). parent::describeClass($this->getVoc()."Gene-Symbol", "HGNC Official Gene Symbol") ); - if(!empty($approved_symbol)){ - $s = "hgnc.symbol:".$approved_symbol; - parent::AddRDF( - parent::triplifyString($id_res, $this->getVoc()."approved-symbol",utf8_encode(htmlspecialchars($approved_symbol))). + + if(!empty($symbol)){ + $s = "hgnc.symbol:".$symbol; + parent::addRDF( + parent::triplifyString($id, $this->getVoc()."approved-symbol",utf8_encode(htmlspecialchars($symbol))). parent::describeProperty($this->getVoc()."approved-symbol", "HGNC approved gene symbol","The official gene symbol that has been approved by the HGNC and is publicly available. Symbols are approved based on specific HGNC nomenclature guidelines. In the HTML results page this ID links to the HGNC Symbol Report for that gene"). - parent::describeIndividual($s, $approved_symbol, parent::getVoc()."Approved-Gene-Symbol"). + parent::describeIndividual($s, $symbol, parent::getVoc()."Approved-Gene-Symbol"). parent::describeClass(parent::getVoc()."Approved-Gene-Symbol","Approved Gene Symbol"). - parent::triplify($id_res, parent::getVoc()."has-approved-symbol", $s). - parent::triplify($s, parent::getVoc()."is-approved-symbol-of", $id_res) + parent::triplify($id, parent::getVoc()."has-approved-symbol", $s). + parent::triplify($s, parent::getVoc()."is-approved-symbol-of", $id) ); - } - if(!empty($approved_name)){ - parent::AddRDF( - parent::triplifyString($id_res, $this->getVoc()."approved-name",utf8_encode(htmlspecialchars($approved_name))). + if(!empty($r[$h['name']])){ + parent::addRDF( + parent::triplifyString($id, $this->getVoc()."approved-name",utf8_encode(htmlspecialchars($r[$h['name']]))). parent::describeProperty($this->getVoc()."approved-name","HGNC approved name", "The official gene name that has been approved by the HGNC and is publicly available. Names are approved based on specific HGNC nomenclature guidelines.") ); } - if(!empty($status)){ - $s = $this->getVoc().str_replace(" ","-",$status); - parent::AddRDF( - parent::triplify($id_res, $this->getVoc()."status",$s). + if(!empty($r[$h['status']])){ + $s = $this->getVoc().str_replace(" ","-",$r[$h['status']]); + parent::addRDF( + parent::triplify($id, $this->getVoc()."status",$s). parent::describeProperty($this->getVoc()."status","HGNC status", "Indicates whether the gene is classified as: Approved - these genes have HGNC-approved gene symbols. Entry withdrawn - these previously approved genes are no longer thought to exist. Symbol withdrawn - a previously approved record that has since been merged into a another record."). - parent::describeClass($s,$status,$this->getVoc()."Status") + parent::describeClass($s,$r[$h['status']],$this->getVoc()."Status") ); } - if(!empty($locus_id)){ - $locus_res = $this->getRes().$id."_LOCUS"; - parent::AddRDF( - parent::triplify($id_res, $this->getVoc()."locus", $locus_res). - parent::triplifyString($locus_res, $this->getVoc()."locus-type",utf8_encode(htmlspecialchars($locus_type))). - parent::triplifyString($locus_res, $this->getVoc()."locus-group", utf8_encode(htmlspecialchars($locus_group))). + if(!empty($r[$h['locus_group']])){ + $locus_res = $this->getRes().$uid."_locus"; + parent::addRDF( + parent::triplify($id, $this->getVoc()."locus", $locus_res). + parent::triplifyString($locus_res, $this->getVoc()."locus-type",utf8_encode(htmlspecialchars($r[$h['locus_type']]))). + parent::triplifyString($locus_res, $this->getVoc()."locus-group", utf8_encode(htmlspecialchars($r[$h['locus_group']]))). parent::describeProperty($this->getVoc()."locus-type", "locus type","Specifies the type of locus described by the given entry"). parent::describeProperty($this->getVoc()."locus-group", "locus group", "Groups locus types together into related sets. Below is a list of groups and the locus types within the group") ); } - if(!empty($previous_symbols)){ - $previous_symbols = explode(", ", $previous_symbols); + if(!empty($r[$h['prev_symbol']])){ + $s = $r[$h['prev_symbol']]; + $previous_symbols = explode("|", $s); foreach($previous_symbols as $previous_symbol){ $previous_symbol_uri = "hgnc.symbol:".$previous_symbol; - parent::AddRDF( + parent::addRDF( parent::describeIndividual($previous_symbol_uri, $previous_symbol, parent::getVoc()."Previous-Symbol"). parent::describeClass(parent::getVoc()."Previous-Symbol","Previous Symbol"). - parent::triplify($id_res, $this->getVoc()."previous-symbol", $previous_symbol_uri). + parent::triplify($id, $this->getVoc()."previous-symbol", $previous_symbol_uri). parent::describeProperty($this->getVoc()."previous-symbol", "HGNC previous symbol","Symbols previously approved by the HGNC for this gene") ); } } - if(!empty($previous_names)){ - $previous_names = explode(", ", $previous_names); + if(!empty($r[$h['prev_name']])){ + $s = $r[$h['prev_name']]; + $previous_names = explode("|", $s); foreach($previous_names as $previous_name){ - $previous_name = str_replace("\"", "", $previous_name); - parent::AddRDF( - parent::triplifyString($id_res, $this->getVoc()."previous-name", utf8_encode(htmlspecialchars($previous_name))). + parent::addRDF( + parent::triplifyString($id, $this->getVoc()."previous-name", utf8_encode(htmlspecialchars($previous_name))). parent::describeProperty($this->getVoc()."previous-name", "HGNC previous name","Gene names previously approved by the HGNC for this gene") ); } } - if(!empty($synonyms)){ - $synonyms = explode(", ", $synonyms); - foreach ($synonyms as $synonym) { - parent::AddRDF( - parent::triplifyString($id_res, $this->getVoc()."synonym", utf8_encode(htmlspecialchars($synonym))). - parent::describeProperty($this->getVoc()."synonym", "synonym","Other symbols used to refer to this gene") + if(!empty($r[$h['prev_symbol']])){ + $s = $r[$h['prev_symbol']]; + $prev_symbols = explode('|',$s); + foreach ($prev_symbols as $prev_symbol) { + parent::addRDF( + parent::triplifyString($id, $this->getVoc()."prev-symbol", utf8_encode(htmlspecialchars($prev_symbol))). + parent::describeProperty($this->getVoc()."prev-symbol", "previous symbol","Previously used symbols used to refer to this gene") ); } } - if(!empty($name_synonyms)){ - $name_synonyms = explode(", ", $name_synonyms); - foreach ($name_synonyms as $name_synonym) { - $name_synonym = str_replace("\"", "", $name_synonym); - parent::AddRDF( - parent::triplifyString($id_res, $this->getVoc()."name-synonym", utf8_encode(htmlspecialchars($name_synonym))). - parent::describeProperty($this->getVoc()."name-synonym", "name synonym","Other names used to refer to this gene") + if(!empty($r[$h['alias_name']])){ + $s = $r[$h['alias_name']]; + $alias_names = explode("|", $s); + foreach ($alias_names as $alias_name) { + parent::addRDF( + parent::triplifyString($id, $this->getVoc()."alias-name", utf8_encode(htmlspecialchars($alias_name))). + parent::describeProperty($this->getVoc()."alias-name", "alias name","Other names used to refer to this gene") ); } } - if(!empty($chromosome)){ - parent::AddRDF( - parent::triplifyString($id_res, $this->getVoc()."chromosome", utf8_encode(htmlspecialchars($chromosome))). - parent::describeProperty($this->getVoc()."chromosome", "chromosome", "Indicates the location of the gene or region on the chromosome") + if(!empty($r[$h['alias_symbol']])){ + $s = $r[$h['alias_symbol']]; + $alias_symbols = explode("|", $s); + foreach ($alias_symbols as $alias_symbol) { + parent::addRDF( + parent::triplifyString($id, $this->getVoc()."alias-symbol", utf8_encode(htmlspecialchars($alias_symbol))). + parent::describeProperty($this->getVoc()."alias-symbol", "alias symbol","Other symbols used to refer to this gene") + ); + } + } + if(!empty($r[$h['location']])){ + $s = $r[$h['location']]; + parent::addRDF( + parent::triplifyString($id, $this->getVoc()."location", utf8_encode(htmlspecialchars($s))). + parent::describeProperty($this->getVoc()."location", "location", "Indicates the location of the gene or region on the chromosome") ); } - if(!empty($date_approved)){ - parent::AddRDF( - parent::triplifyString($id_res, $this->getVoc()."date-approved", $date_approved, "xsd:date"). + if(!empty($r[$h['date_approved_reserved']])){ + $s = $r[$h['date_approved_reserved']]; + parent::addRDF( + parent::triplifyString($id, $this->getVoc()."date-approved", $s, "xsd:date"). parent::describeProperty($this->getVoc()."date-approved", "date approved","Date the gene symbol and name were approved by the HGNC") ); } - if(!empty($date_modified)){ + if(!empty($r[$h['date_modified']])){ + $s = $r[$h['date_modified']]; parent::AddRDF( - parent::triplifyString($id_res, $this->getVoc()."date-modified", $date_modified, "xsd:date"). + parent::triplifyString($id, $this->getVoc()."date-modified", $s, "xsd:date"). parent::describeProperty($this->getVoc()."date-modified", "date modified", "the date the entry was modified by the HGNC") ); } - if(!empty($date_symbol_changed)){ + if(!empty($r[$h['date_symbol_changed']])){ + $s = $r[$h['date_symbol_changed']]; parent::AddRDF( - parent::triplifyString($id_res, $this->getVoc()."date-symbol-changed", $date_symbol_changed, "xsd:date"). + parent::triplifyString($id, $this->getVoc()."date-symbol-changed", $s, "xsd:date"). parent::describeProperty($this->getVoc()."date-symbol-changed", "date symbol changed","The date the gene symbol was last changed by the HGNC from a previously approved symbol. Many genes receive approved symbols and names which are viewed as temporary (eg C2orf#) or are non-ideal when considered in the light of subsequent information. In the case of individual genes a change to the name (and subsequently the symbol) is only made if the original name is seriously misleading") ); } - if(!empty($date_name_changed)){ - parent::AddRDF( - parent::triplifyString($id_res, $this->getVoc()."date-name-changed", $date_name_changed, "xsd:date"). + if(!empty($r[$h['date_name_changed']])){ + $s = $r[$h['date_name_changed']]; + parent::addRDF( + parent::triplifyString($id, $this->getVoc()."date-name-changed", $s, "xsd:date"). parent::describeProperty($this->getVoc()."date-name-changed", "date name changed", "The date the gene name was last changed by the HGNC from a previously approved name") ); } - if(!empty($accession_numbers)){ - $accession_numbers = explode(", ", $accession_numbers); - foreach ($accession_numbers as $accession_number) { - parent::AddRDF( - parent::triplifyString($id_res, $this->getVoc()."accession", utf8_encode(htmlspecialchars($accession_number))). - parent::describeProperty($this->getVoc()."accession", "accession number", "Accession numbers for each entry selected by the HGNC") - ); - } - } - if(!empty($enzyme_ids)){ - $enzyme_ids = explode(", ", $enzyme_ids); - foreach ($enzyme_ids as $enzyme_id) { - parent::AddRDF( - parent::triplifyString($id_res, $this->getVoc()."x-ec", utf8_encode(htmlspecialchars($enzyme_id))). - parent::describeProperty($this->getVoc()."x-ec","Enzyme Commission (EC) number", "Enzyme entries have Enzyme Commission (EC) numbers associated with them that indicate the hierarchical functional classes to which they belong") - ); - } - } - if(!empty($entrez_gene_id)){ - parent::AddRDF( - parent::triplify($id_res, $this->getVoc()."x-ncbigene", "ncbigene:$entrez_gene_id"). - parent::describeProperty($this->getVoc()."x-ncbigene", "NCBI Gene", "NCBI Gene provides curated sequence and descriptive information about genetic loci including official nomenclature, synonyms, sequence accessions, phenotypes, EC numbers, MIM numbers, UniGene clusters, homology, map locations, and related web sites") - ); - } - if(!empty($ensembl_gene_id)){ - parent::AddRDF( - parent::triplify($id_res, $this->getVoc()."x-ensembl", "ensembl:$ensembl_gene_id"). - parent::describeProperty($this->getVoc()."x-ensembl", "Ensembl Gene") - ); - } - if(!empty($mouse_genome_database_id)){ - if(strpos($mouse_genome_database_id, "MGI:") !== FALSE){ - $mouse_genome_database_id = substr($mouse_genome_database_id, 4); - parent::AddRDF( - parent::triplify($id_res, $this->getVoc()."x-mgi", "mgi:$mouse_genome_database_id"). - parent::describeProperty($this->getVoc()."x-mgi", "MGI entry") - ); - } - } - if(!empty($specialist_database_links)){ - $specialist_database_links = explode(", ", $specialist_database_links); - foreach ($specialist_database_links as $specialist_database_link) { - preg_match('/href="(\S+)"/', $specialist_database_link, $matches); - if(!empty($matches[1])){ - parent::AddRDF( - parent::QQuadO_URL($id_res, $this->getVoc()."xref", $matches[1]). - parent::describeProperty($this->getVoc()."xref", "Specialist database references.") + $idmap = array( + "entrez_id" => "ncbigene", + "ensembl_gene_id" => "ensembl", + "vega_id" => "vega", + "ucsc_id" => "ucsc", + "ena" => "ena", + "refseq_accession" => "refseq", + "ccds_id" => "ccds", + "uniprot_ids" => "uniprot", + "pubmed_id" => "pubmed", + "mgd_id" => "mgd", + "rgd_id" => "rgd", + // "lsdb" => "lsdb", # special structure + "cosmic" => "cosmic", + "omim_id" => "omim", + "mirbase" => "mirbase", + "homeodb" => "homeodb", + "snornabase" => "snornabase", + "bioparadigms_slc" => "bioparadigms_slc", + "orphanet" =>"orphanet", + "pseudogene.org" => "pseudogene", + "horde_id" => "horde", + "merops" => "merops", + "imgt" => "imgt", + "iuphar" => "iuphar", + "kznf_gene_catalog" => "kznf", + "mamit-trnadb" => "mamit", + "cd" => "hcdm", + "lncrnadb" => "lncrnadb", + "enzyme_id" => "ec", + "intermediate_filament_db" => "intermediate_filament_db", + "rna_central_ids" => "rna_central_ids", + "lncipedia" => "lncipedia", + "gtrnadb" => "gtrnadb", + // "agr" => "agr" #uses hgnc? + ); + foreach($idmap AS $fieldname => $prefix) { + if(!empty($r[$h[$fieldname]])){ + $s = $r[$h[$fieldname]]; + $identifiers = explode("|", $s); + foreach ($identifiers as $identifier) { + // some identifiers are prefixed... + $pos = strpos($identifier,":"); + if($pos !== FALSE) { + $identifier = substr($identifier, strpos($identifier, ":")+1); + } + + parent::addRDF( + parent::triplify($id, $this->getVoc()."x-".$prefix, $prefix.":".$identifier). + parent::describeProperty($this->getVoc()."x-".$prefix, $prefix, "reference to an entry in the $prefix database") ); } } } - if(!empty($pubmed_ids)){ - $pubmed_ids = explode(", ", $pubmed_ids); - foreach ($pubmed_ids as $pubmed_id) { - parent::AddRDF( - parent::triplify($id_res, $this->getVoc()."x-pubmed", "pubmed:".trim($pubmed_id)). - parent::describeProperty($this->getVoc()."x-pubmed", "NCBI PubMed entry","Identifier that links to published articles relevant to the entry in the NCBI's PubMed database.") - ); - } - } - if(!empty($refseq_ids)){ - $refseq_ids = explode(", ", $refseq_ids); - foreach ($refseq_ids as $refseq_id) { - parent::AddRDF( - parent::triplify($id_res, $this->getVoc()."x-refseq", "refseq:".trim($refseq_id)). - parent::describeProperty($this->getVoc()."x-refseq", "NCBI Refseq entry","The Reference Sequence (RefSeq) identifier for that entry, provided by the NCBI. As we do not aim to curate all variants of a gene only one selected RefSeq is displayed per gene report. RefSeq aims to provide a comprehensive, integrated, non-redundant set of sequences, including genomic DNA, transcript (RNA), and protein products. RefSeq identifiers are designed to provide a stable reference for gene identification and characterization, mutation analysis, expression studies, polymorphism discovery, and comparative analyses. In the HTML results page this ID links to the RefSeq page for that entry.") - ); - } - } - if(!empty($gene_family_tag)){ + if(!empty($r[$h['gene_family_id']])){ + $s = $r[$h['gene_family_id']]; parent::AddRDF( - parent::triplifyString($id_res, $this->getVoc()."gene-family-tag", utf8_encode(htmlspecialchars($gene_family_tag))). + parent::triplifyString($id_res, $this->getVoc()."gene-family-tag", utf8_encode(htmlspecialchars($s))). parent::describeProperty($this->getVoc()."gene-family-tag", "Gene Family Tag","Tag used to designate a gene family or group the gene has been assigned to, according to either sequence similarity or information from publications, specialist advisors for that family or other databases. Families/groups may be either structural or functional, therefore a gene may belong to more than one family/group. These tags are used to generate gene family or grouping specific pages at genenames.org and do not necessarily reflect an official nomenclature. Each gene family has an associated gene family tag and gene family description. If a particular gene is a member of more than one gene family, the tags and the descriptions will be shown in the same order.") ); } - if(!empty($gene_family_description)){ - $gene_family_description = str_replace("\"", "", $gene_family_description); + if(!empty($r[$h['gene_family']])){ + $s = $r[$h['gene_family']]; parent::AddRDF( - parent::triplifyString($id_res, $this->getVoc()."gene-family-description", utf8_encode(htmlspecialchars($gene_family_description))). + parent::triplifyString($id_res, $this->getVoc()."gene-family-description", utf8_encode(htmlspecialchars($s))). parent::describeProperty($this->getVoc()."gene-family-description", "gene family name","Name given to a particular gene family. The gene family description has an associated gene family tag. Gene families are used to group genes according to either sequence similarity or information from publications, specialist advisors for that family or other databases. Families/groups may be either structural or functional, therefore a gene may belong to more than one family/group.") ); } - if(!empty($record_type)){ - parent::AddRDF( - parent::triplifyString($id_res, $this->getVoc()."record-type", utf8_encode(htmlspecialchars($record_type))) - ); - } - if(!empty($primary_ids)){ - $primary_ids = explode(", ", $primary_ids); - foreach ($primary_ids as $primary_id) { - parent::AddRDF( - parent::triplifyString($id_res, $this->getVoc()."primary-id", utf8_encode(htmlspecialchars($primary_id))). - parent::describeProperty($this->getVoc()."primary-id", "primary identifier") - ); - } - } - if(!empty($secondary_ids)){ - $secondary_ids = explode(", ", $secondary_ids); - foreach ($secondary_ids as $secondary_id) { - parent::AddRDF( - parent::triplifyString($id_res, $this->getVoc()."secondary-id", utf8_encode(htmlspecialchars($secondary_id))). - parent::describeProperty($this->getVoc()."secondary-id", "secondary identifier") - ); - } - } - if(!empty($ccd_ids)){ - $ccd_ids = explode(", ", $ccd_ids); - foreach ($ccd_ids as $ccd_id) { - parent::AddRDF( - parent::triplify($id_res, $this->getVoc()."x-ccds", "ccds:".trim($ccd_id)). - parent::describeProperty($this->getVoc()."x-ccds", "consensus CDS entry","The Consensus CDS (CCDS) project is a collaborative effort to identify a core set of human and mouse protein coding regions that are consistently annotated and of high quality. The long term goal is to support convergence towards a standard set of gene annotations.") - ); - } - } - if(!empty($vega_ids)){ - $vega_ids = explode(", ", $vega_ids); - foreach ($vega_ids as $vega_id) { - parent::AddRDF( - parent::triplify($id_res, $this->getVoc()."x-vega", "vega:".trim($vega_id)). - parent::describeProperty($this->getVoc()."x-vega", "VEGA gene entry") - ); - } - } - if(!empty($locus_specific_databases)){ - parent::AddRDF( - parent::triplifyString($id_res, $this->getVoc()."locus-specific-xref", utf8_encode(htmlspecialchars($locus_specific_databases))). - parent::describeProperty($this->getVoc()."locus-specific-xref", "locus specific xref", "This contains a list of links to databases or database entries pertinent to the gene") - ); - } - if(!empty($entrez_gene_id_mappeddatasuppliedbyNCBI)){ - $entrez_gene_id_mappeddatasuppliedbyNCBI = explode(", ", $entrez_gene_id_mappeddatasuppliedbyNCBI); - foreach ($entrez_gene_id_mappeddatasuppliedbyNCBI as $gene_id) { - if(strstr($gene_id, ":") !== FALSE){ - $a = explode(":", $gene_id); - $gene_id = $a[1]; - } - parent::AddRDF( - parent::triplify($id_res, $this->getVoc()."x-ncbigene", "ncbigene:".trim($gene_id)). - parent::describeProperty($this->getVoc()."x-ncbigene", "NCBI Gene entry") - ); - - } - } - if(!empty($omim_id_mappeddatasuppliedbyNCBI)){ - $omim_id_mappeddatasuppliedbyNCBI = explode(", ", $omim_id_mappeddatasuppliedbyNCBI); - foreach ($omim_id_mappeddatasuppliedbyNCBI as $omim_id) { - parent::AddRDF( - parent::triplify($id_res, $this->getVoc()."x-omim", "omim:".trim($omim_id)). - parent::describeProperty($this->getVoc()."x-omim", "OMIM entry","Identifier provided by Online Mendelian Inheritance in Man (OMIM) at the NCBI. This database is described as a catalog of human genes and genetic disorders containing textual information and links to MEDLINE and sequence records in the Entrez system, and links to additional related resources at NCBI and elsewhere. In the HTML results page this ID links to the OMIM page for that entry.") - ); - } - } - if(!empty($refseq_mappeddatasuppliedbyNCBI)){ - $refseq_mappeddatasuppliedbyNCBI = explode(", ", $refseq_mappeddatasuppliedbyNCBI); - foreach ($refseq_mappeddatasuppliedbyNCBI as $refseq_id) { - parent::AddRDF( - parent::triplify($id_res, $this->getVoc()."x-refseq", "refseq:".trim($refseq_id)). - parent::describeProperty($this->getVoc()."x-refseq", "NCBI Refseq entry","The Reference Sequence (RefSeq) identifier for that entry, provided by the NCBI. As we do not aim to curate all variants of a gene only one selected RefSeq is displayed per gene report. RefSeq aims to provide a comprehensive, integrated, non-redundant set of sequences, including genomic DNA, transcript (RNA), and protein products. RefSeq identifiers are designed to provide a stable reference for gene identification and characterization, mutation analysis, expression studies, polymorphism discovery, and comparative analyses. In the HTML results page this ID links to the RefSeq page for that entry.") - ); - } - } - if(!empty($uniprot_id_mappeddatasuppliedbyUniProt)){ - $uniprot_id_mappeddatasuppliedbyUniProt = explode(", ", $uniprot_id_mappeddatasuppliedbyUniProt); - foreach ($uniprot_id_mappeddatasuppliedbyUniProt as $uniprot_id) { - parent::AddRDF( - parent::triplify($id_res, $this->getVoc()."x-uniprot", "uniprot:".trim($uniprot_id)). - parent::describeProperty($this->getVoc()."x-uniprot", "Uniprot entry","The UniProt identifier, provided by the EBI. The UniProt Protein Knowledgebase is described as a curated protein sequence database that provides a high level of annotation, a minimal level of redundancy and high level of integration with other databases. In the HTML results page this ID links to the UniProt page for that entry.") - ); - } - } - if(!empty($ensembl_id_mappeddatasuppliedbyEnsembl)){ - $ensembl_id_mappeddatasuppliedbyEnsembl = explode(", ", $ensembl_id_mappeddatasuppliedbyEnsembl); - foreach ($ensembl_id_mappeddatasuppliedbyEnsembl as $ensembl_id) { - parent::AddRDF( - parent::triplify($id_res, $this->getVoc()."x-ensembl", "ensembl:".trim($refseq_id)). - parent::describeProperty($this->getVoc()."x-ensembl", "Ensembl entry","The Ensembl ID is derived from the current build of the Ensembl database and provided by the Ensembl team.") - ); - } - } - - if(!empty($ucsc_id_mappeddatasuppliedbyVega)){ - $ucsc_id_mappeddatasuppliedbyVega = explode(", ", $ucsc_id_mappeddatasuppliedbyVega); - foreach ($ucsc_id_mappeddatasuppliedbyVega as $vega_id) { - parent::AddRDF( - parent::triplify($id_res, $this->getVoc()."x-vega", "vega:".trim($vega_id)). - parent::describeProperty($this->getVoc()."x-vega", "Vega entry") - ); - } - } - if(!empty($ucsc_id_mappeddatasuppliedbyUCSC)){ - $ucsc_id_mappeddatasuppliedbyUCSC = explode(", ", $ucsc_id_mappeddatasuppliedbyUCSC); - foreach ($ucsc_id_mappeddatasuppliedbyUCSC as $ucsc_id) { - parent::AddRDF( - parent::triplify($id_res, $this->getVoc()."x-ucsc", "ucsc:".trim($ucsc_id)). - parent::describeProperty($this->getVoc()."x-ucsc", "UCSC entry") - ); - } - } - if(!empty($mouse_genome_database_id_mappeddatasuppliedbyMGI)){ - $mouse_genome_database_id_mappeddatasuppliedbyMGI = explode(", ", $mouse_genome_database_id_mappeddatasuppliedbyMGI); - foreach ($mouse_genome_database_id_mappeddatasuppliedbyMGI as $mgi_id) { - if(strpos($mgi_id, "MGI:") !== FALSE){ - $mgi_id = substr($mgi_id, 4); - } - parent::AddRDF( - parent::triplify($id_res, $this->getVoc()."x-mgi", "mgi:".trim($mgi_id)). - parent::describeProperty($this->getVoc()."x-mgi", "MGI entry") - ); - } - } - if(!empty($rat_genome_database_id_mappeddatasuppliedbyRGD)){ - $rat_genome_database_id_mappeddatasuppliedbyRGD = explode(", ", trim($rat_genome_database_id_mappeddatasuppliedbyRGD)); - foreach ($rat_genome_database_id_mappeddatasuppliedbyRGD as $rgd_id) { - $rgd_id = trim($rgd_id); - if(!empty($rgd_id)){ - parent::AddRDF( - parent::triplify($id_res, $this->getVoc()."x-rgd", trim($rgd_id)). - parent::describeProperty($this->getVoc()."x-rgd", "RGD entry") - ); - } - } - } //write RDF to file $this->WriteRDFBufferToWriteFile(); }//while From fc42265ea64ccaa1bfa54b8bcd37b8db35379149 Mon Sep 17 00:00:00 2001 From: Michel Dumontier Date: Tue, 26 May 2020 16:03:54 +0200 Subject: [PATCH 38/64] updates to file processing; removed automatic download --- pharmgkb/pharmgkb.php | 244 +++++++++++++++++++++--------------------- 1 file changed, 124 insertions(+), 120 deletions(-) diff --git a/pharmgkb/pharmgkb.php b/pharmgkb/pharmgkb.php index 6fed651..cf3fa30 100644 --- a/pharmgkb/pharmgkb.php +++ b/pharmgkb/pharmgkb.php @@ -37,9 +37,9 @@ class PharmGKBParser extends Bio2RDFizer function __construct($argv) { parent::__construct($argv, "pharmgkb"); - $this->AddParameter('files',true,'all|drugs|genes|diseases|pathways|relationships|annotations|rsid','all','all or comma-separated list of files to process'); /** pathways **/ + $this->AddParameter('files',true,'all|drugs|genes|phenotypes|pathways|relationships|annotations|variants','all','all or comma-separated list of files to process'); /** pathways **/ $this->addParameter('additional',false,'none|offsides|twosides','none','process offsides and/or twosides'); - $this->AddParameter('download_url',false,null,'https://www.pharmgkb.org/download.do?dlCls=common&objId='); + $this->AddParameter('download_url',false,null,'https://www.pharmgkb.org/downloads'); parent::initialize(); } @@ -60,33 +60,7 @@ function download() $ldir = $this->GetParameterValue('indir'); $rdir = $this->GetParameterValue('download_url'); - foreach($files AS $file) { - $lfile = $ldir.$file.".zip"; - if($file == 'annotations' or $file == 'relationships') { - if(!file_exists($lfile)) { - echo "Unable to file $lfile . Contact PharmGKB to get access to license-restricted data".PHP_EOL; - continue; - } - } - - // download - $rfile = $rdir.$file.".zip"; - echo "Downloading $file ..."; - if($file == 'offsides') { - if(!file_exists($lfile)) { - Utils::DownloadSingle('https://www.pharmgkb.org/redirect.jsp?p=ftp%3A%2F%2Fftpuserd%3AGKB4ftp%40ftp.pharmgkb.org%2Fdownload%2Ftatonetti%2F3003377s-offsides.zip', $lfile); - } - } elseif($file == 'twosides') { - if(!file_exists($lfile)) { - Utils::DownloadSingle('https://www.pharmgkb.org/redirect.jsp?p=ftp%3A%2F%2Fftpuserd%3AGKB4ftp%40ftp.pharmgkb.org%2Fdownload%2Ftatonetti%2F3003377s-twosides.zip', $lfile); - } - } elseif($file == 'pathways') { - Utils::DownloadSingle('https://www.pharmgkb.org/download.do?dlCls=common&objId='.$file.'-tsv.zip', $lfile); - } else { - Utils::DownloadSingle('https://www.pharmgkb.org/download.do?dlCls=common&objId='.$file.'.zip', $lfile); - } - echo "done.".PHP_EOL; - } + echo "Download the data from https://www.pharmgkb.org/downloads.".PHP_EOL; } function run() @@ -109,40 +83,11 @@ function run() $dataset_description = ''; foreach($files AS $file) { + if($file == "pathways") $file = "pathways-tsv"; $suffix = ".zip"; $lfile = $ldir.$file.$suffix; $rfile = $rdir.$file.$suffix; - if($file == "offsides" and !file_exists($lfile)){ - echo "downloading twosides..."; - $rfile = "http://www.pharmgkb.org/redirect.jsp?p=ftp%3A%2F%2Fftpuserd%3AGKB4ftp%40ftp.pharmgkb.org%2Fdownload%2Ftatonetti%2F3003377s-offsides.zip"; - utils::DownloadSingle($rfile,$lfile); - echo "done".PHP_EOL; - } elseif($file == "twosides" and !file_exists($lfile)){ - echo "downloading $file ..."; - $rfile = "http://www.pharmgkb.org/redirect.jsp?p=ftp%3A%2F%2Fftpuserd%3AGKB4ftp%40ftp.pharmgkb.org%2Fdownload%2Ftatonetti%2F3003377s-twosides.zip"; - utils::DownloadSingle($rfile,$lfile); - echo "done".PHP_EOL; - } elseif($file == 'annotations' or $file == 'relationships') { - if(!file_exists($lfile)) { - echo "Contact PharmGKB to get access to variants/clinical variants; save file as annotations.zip".PHP_EOL; - continue; - } - } elseif($file == 'pathways') { - if(!file_exists($lfile)) { - echo "Downloading $lfile ... "; - Utils::DownloadSingle('https://www.pharmgkb.org/download.do?objId='.$file.'-tsv.zip&dlCls=common', $lfile); - echo "done".PHP_EOL; - } - - } else { - if(!file_exists($lfile) or parent::getParameterValue('download') == true) { - echo "Downloading $lfile ... "; - Utils::DownloadSingle('https://www.pharmgkb.org/download.do?objId='.$file.'.zip&dlCls=common', $lfile); - echo "done".PHP_EOL; - } - } - // get a pointer to the file in the zip archive if(!file_exists($lfile)) {echo "no local copy of $lfile . skipping".PHP_EOL;continue;} @@ -158,7 +103,7 @@ function run() 'clinical_ann_metadata.tsv', 'var_drug_ann.tsv','var_pheno_ann.tsv','var_fa_ann.tsv' ); - } else if($file == "pathways") { + } else if($file == "pathways-tsv") { for( $i = 0; $i < $zin->numFiles; $i++ ){ $stat = $zin->statIndex( $i ); $entry = $stat['name']; @@ -195,7 +140,7 @@ function run() if($file == "annotations") { $fnx = substr($zipentry,0,strpos($zipentry,".tsv")); echo "processing $zipentry.."; - } else if($file == 'pathways') { + } else if($file == 'pathways-tsv') { $fnx = 'pathways'; $this->pathway_name = $zipentry; echo "processing $fnx ($zipentry)... "; @@ -276,14 +221,14 @@ function run() function genes() { $h = explode("\t",parent::getReadFile()->read()); - $expected_columns = 15; + $expected_columns = 17; if(($n = count($h)) != $expected_columns) { trigger_error("Found $n columns in gene file - expecting $expected_columns!", E_USER_WARNING); //print_r($h); return false; } - while($l = parent::getReadFile()->read(200000)) { + while($l = parent::getReadFile()->read(2000000)) { $a = explode("\t",$l); $id = parent::getNamespace().$a[0]; $label = $a[4]; @@ -302,19 +247,29 @@ function genes() ); if($a[1]){ - parent::addRDF( - parent::triplify($id, parent::getVoc()."x-ncbigene", "ncbigene:".$a[1]) - ); + $list = $this->parseList($a[1]); + foreach($list as $c) { + parent::addRDF( + parent::triplify($id, parent::getVoc()."x-ncbigene", "ncbigene:".$c) + ); + } } if($a[2]){ - parent::addRDF( - parent::triplify($id, parent::getVoc()."x-hgnc", "hgnc:".$a[2]) - ); + $list = $this->parseList($a[2]); + foreach($list as $c) { + parent::addRDF( + parent::triplify($id, parent::getVoc()."x-hgnc", "hgnc:".$c) + ); + } + } if($a[3]){ - parent::addRDF( - parent::triplify($id, parent::getVoc()."x-ensembl", "ensembl:".$a[3]) - ); + $list = $this->parseList($a[3]); + foreach($list as $c) { + parent::addRDF( + parent::triplify($id, parent::getVoc()."x-ensembl", "ensembl:".$c) + ); + } } if($a[4]){ @@ -400,8 +355,14 @@ function genes() ); if($a[13] != '-1' and $a[14] != '-1') { parent::addRDF( - parent::triplifyString($id,parent::getVoc()."chromosome-start",$a[13]). - parent::triplifyString($id,parent::getVoc()."chromosome-end",$a[14]) + parent::triplifyString($id,parent::getVoc()."grch37.p13-chromosome-start",$a[13]). + parent::triplifyString($id,parent::getVoc()."grch37.p13-chromosome-end",$a[14]) + ); + } + if($a[15] != '-1' and $a[16] != '-1') { + parent::addRDF( + parent::triplifyString($id,parent::getVoc()."grch38.p7-chromosome-start",$a[13]). + parent::triplifyString($id,parent::getVoc()."grch38.p7-chromosome-end",$a[14]) ); } } @@ -413,7 +374,7 @@ function parseList($str) { $list = ''; if($str[0] == '"') $list = explode('","', substr($str,1,-1)); - else $list[] = $str; + else $list = array($str); return $list; } @@ -433,7 +394,9 @@ function MapXrefs($xref, &$url = false, &$ns = null, &$id = null) 'refseqprotein' => 'refseq', 'refseqdna' => 'refseq', 'comparativetoxicogenomicsdatabase' => 'ctd', - 'humancycgene' => 'humancyc' + 'humancycgene' => 'humancyc', + 'chemicalabstractsservice' => 'cas', + 'chebi:chebi' => 'chebi' ); $this->getRegistry()->ParseQName($xref,$ns,$id); $ns = str_replace(array('"',' '),'',$ns); @@ -467,14 +430,19 @@ function MapXrefs($xref, &$url = false, &$ns = null, &$id = null) */ function drugs() { - $declared = ''; + $declared = array(); $h = explode("\t",$this->GetReadFile()->Read(10000)); // first line is header - if(count($h) != 10) { - trigger_error("Change in number of columns for drugs file",E_USER_ERROR); - return FALSE; + $ncols = count($h); + $nexp = 24; + if($ncols != $nexp) { + trigger_error("Change in number of columns for drugs file. Expected $nexp but found $ncols.",E_USER_ERROR); + #return FALSE; } + $this->GetReadFile()->Read(200000); + while($l = $this->GetReadFile()->Read(200000)) { $a = explode("\t",$l); + $id = parent::getNamespace().$a[0]; $this->drugs[$a[0]] = $a[1]; @@ -535,10 +503,13 @@ function drugs() $list = $this->parseList(trim($a[6])); foreach($list as $c) { $this->getRegistry()->parseQName($c,$ns,$id1); - $ns = str_replace(array('"',' '),'',$ns); - $ns = str_replace(array('keggcompound','keggdrug','drugbank','uniprotkb','clinicaltrials.gov','drugsproductdatabase(dpd)','nationaldrugcodedirectory','therapeutictargetsdatabase','fdadruglabelatdailymed'), - array('kegg','kegg','drugbank', 'uniprot','clinicaltrials','dpd','ndc','ttd','dailymed'), - strtolower(str_replace('"','',$ns))); + if($ns == "chebi") $id1 = substr($id1, 6); + $ns = str_replace( + array('chemicalabstractsservice','keggcompound','keggdrug','drugbank','uniprotkb','clinicaltrials.gov','drugsproductdatabase(dpd)','nationaldrugcodedirectory','therapeutictargetsdatabase','fdadruglabelatdailymed'), + array('cas','kegg','kegg','drugbank', 'uniprot','clinicaltrials','dpd','ndc','ttd','dailymed'), + strtolower(str_replace(' ','',$ns))); + + #echo $ns." ".$id1.PHP_EOL; if($ns == "url") { parent::addRDF( parent::QQuad($id, "rdfs:seeAlso", $id) @@ -553,20 +524,27 @@ function drugs() if(trim($a[7])) { parent::addRDF( - parent::triplifyString($id, parent::getVoc()."smiles", substr($a[7],1,-1)). + parent::triplifyString($id, parent::getVoc()."smiles", addslashes(substr($a[7],1,-1))). parent::describeProperty(parent::getVoc()."smiles", "Relationship between a PharmGKB drug and its SMILES string") ); } - if($a[8]) { + if(trim($a[8])) { parent::addRDF( - parent::triplifyString($id,parent::getVoc()."cpic-dosing-guideline",$a[8]) + parent::triplifyString($id, parent::getVoc()."inchi", $a[8]). + parent::describeProperty(parent::getVoc()."smiles", "Relationship between a PharmGKB drug and its SMILES string") + ); + } + + if($a[9]) { + parent::addRDF( + parent::triplifyString($id,parent::getVoc()."cpic-dosing-guideline",$a[9]) ); } - if(trim($a[9])) { + if(trim($a[10])) { // External Vocabulary // ATC:H01AC(Somatropin and somatropin agonists),ATC:V04CD(Tests for pituitary function) // ATC:D07AB(Corticosteroids, moderately potent (group II)) => this is why you don't use brackets and commas as separators. - $list = $this->parseList(trim($a[9])); + $list = $this->parseList(trim($a[10])); foreach($list as $c) { preg_match("/([^\(]+)?\((.*)\)/", $c, $m); if(isset($m[1])) { @@ -586,6 +564,16 @@ function drugs() } } } + if(trim($a[22])) { + // ATC identifiers + $list = $this->parseList(trim($a[22])); + foreach($list as $c) { + parent::addRDF( + parent::triplify($id, parent::getVoc()."x-atc", "atc:".$c) + ); + } + } + parent::writeRDFBufferToWriteFile(); } } @@ -597,7 +585,7 @@ function drugs() [3] => Cross-references [4] => External Vocabulary */ - function diseases() + function phenotypes() { $h = explode("\t",$this->GetReadFile()->Read(10000)); // first line is header if(count($h) != 5) { @@ -778,49 +766,64 @@ function relationships() /* - THIS FILE ONLY INCLUDES RSIDs IN GENES - RSID Gene IDs Gene Symbols - rs8331 PA27674;PA162375713 EGR2;ADO + THIS FILE ONLY INCLUDES variants IN GENES + +Variant ID Variant Name Gene IDs Gene Symbols Location Variant Annotation count Clinical Annotation count Level 1/2 Clinical Annotation count Guideline Annotation count Label Annotation count Synonyms +PA166156302 rs1000002 PA395 ABCC5 NC_000003.11:183635768 1 0 0 0 0 rs17623022, NC_000003.12:g.183917980C>T, rs386508637, rs1000002, 1000002, [GRCh37]chr3:183635768, rs60664316, NC_000003.11:g.183635768C>T + */ - function rsid() + function variants() { $z = 0; - $this->GetReadFile()->Read(); - $this->GetReadFile()->Read(); + $header = $this->GetReadFile()->Read(); parent::addRDF( - parent::describeClass(parent::getVoc()."Variation", "PharmGKB Variation") + parent::describeClass(parent::getVoc()."Variant", "PharmGKB Variant") ); while($l = $this->GetReadFile()->Read()) { - if($z % 10000 == 0) { - parent::writeRDFBufferToWriteFile(); - } $a = explode("\t",$l); - $rsid = "dbsnp:".$a[0]; - $genes = explode(";",$a[1]); - parent::addRDF( - parent::describeIndividual($rsid, $rsid, parent::getVoc()."Variation") - ); - foreach($genes AS $gene) { + if(isset($a[1])) { + $id = parent::getNamespace().$a[0]; + $rsid = "dbsnp:".$a[1]; + $genes = explode(",",$a[2]); parent::addRDF( - parent::triplify($rsid, parent::getVoc()."gene", parent::getNamespace().$gene) + parent::describeIndividual($id, $id, parent::getVoc()."Variant"). + parent::triplify($id, parent::getVoc()."x-dbsnp", $rsid) ); + foreach($genes AS $gene) { + parent::addRDF( + parent::triplify($id, parent::getVoc()."gene", parent::getNamespace().$gene) + ); + } } } + parent::writeRDFBufferToWriteFile(); } function clinical_ann_metadata() { - $header = array("Clinical Annotation Id","Location","Gene","Level of Evidence","Clinical Annotation Types","Genotype-Phenotype IDs","Annotation Text","Variant Annotations IDs","Variant Annotations","PMIDs","Evidence Count","Related Drugs","Related Diseases","Race"); + $header = array("Clinical Annotation Id","Location","Gene","Level of Evidence","Clinical Annotation Types","Genotype-Phenotype IDs","Annotation Text","Variant Annotations IDs","Variant Annotations","PMIDs","Evidence Count","Related Drugs","Related Diseases","Biogeographical groups", "Chromosome"); $this_header = explode("\t",$this->getReadFile()->read()); if(count($this_header) != count($header)) { trigger_error("Change in the number of columns. Expected ".count($header).", but found ".count($this_header),E_USER_ERROR); return (-1); } - while($l = $this->GetReadFile()->Read(2000000)) { + while($l = $this->GetReadFile()->Read(20000000)) { $a = explode("\t",$l); $id = parent::getNamespace().$a[0]; + # fixing bad file formatting + if($a[0] == "982040598" or $a[0] == "982037603") { + $a[8] .= $a[11]; + $a[9] = $a[12]; + $a[10] = $a[13]; + $a[11]= $a[14]; + $a[12] = $a[15]; + $a[13] = $a[16]; + $a[14] = $a[17]; + } + + $label = "clinical genotype to phenotype annotations for ".$a[1]; // [0] => Clinical Annotation Id parent::addRDF( @@ -929,6 +932,7 @@ function clinical_ann_metadata() // [11] => Related Drugs if($a[11]) { + //print_r($a);exit; $b = $this->parseList($a[11]); foreach($b AS $drug_label) { preg_match('/\(PA(.*)\)/',$drug_label,$m); @@ -938,7 +942,7 @@ function clinical_ann_metadata() parent::triplify($id, parent::getVoc()."related-drug", "pharmgkb:PA".$m[1]) ); } else { - echo "Error in parsing drug label - ".$drug_label." ".PHP_EOL; + echo "Error in parsing drug label for $id - ".$drug_label." ".PHP_EOL; } } parent::addRDF( @@ -955,18 +959,20 @@ function clinical_ann_metadata() parent::triplify($id, parent::getVoc()."related-disease", "pharmgkb:PA".$m[1]) ); } else { - echo "Error in parsing disease label - ".$disease_label." ".PHP_EOL; + print_r($a); + echo $l.PHP_EOL; + echo "Error in parsing disease label for $id - ".$disease_label." ".PHP_EOL; } } parent::addRDF( parent::describeProperty(parent::getVoc()."related-disease", "Relationship between a PharmGKB annotation and a related disease") ); } - // [13] => OMB Races + // [13] => Biogeographical groupss if($a[13]) { parent::addRDF( - parent::triplifyString($id, parent::getVoc()."race", $a[13]). - parent::describeProperty(parent::getVoc()."race", "Relationship between a PharmGKB annotation and a race") + parent::triplifyString($id, parent::getVoc()."biogeographical-group", $a[13]). + parent::describeProperty(parent::getVoc()."biogeographical-group", "Relationship between a PharmGKB annotation and a biogeographical group") ); } } @@ -980,7 +986,7 @@ function var_pheno_ann() {return $this->variant_annotation();} function variant_annotation() { - $canonical_header = array("Annotation ID","Variant","Gene","Chemical","Literature Id","Phenotype Category","Significance","Notes","Sentence","StudyParameters","Alleles"); + $canonical_header = array("Annotation ID","Variant","Gene","Chemical","PMID","Phenotype Category","Significance","Notes","Sentence","StudyParameters","Alleles","Chromosome"); $header = explode("\t",$this->getReadFile()->read(20000)); if(count($header) != count($canonical_header)) { trigger_error("column mismatch! Expected ".count($canonical_header).",but found ".count($header),E_USER_ERROR); @@ -1138,7 +1144,7 @@ function pathways() parent::describeClass(parent::getVoc()."Pathway","PharmGKB Pathway") ); - $fields = array('From','To','Reaction Type','Controller','Control Type','Cell Type','PubMed Id','Genes','Drugs','Diseases'); + $fields = array('From','To','Reaction Type','Controller','Control Type','Cell Type','PMIDs','Genes','Drugs','Diseases'); $h = explode("\t", $this->getReadFile()->read(50000)); // @todo check that the fields match @@ -1155,7 +1161,7 @@ function pathways() parent::addRDF( parent::describeIndividual($uri, $label, $type). parent::describeClass($type, $a[2]). - parent::describeIndividual($from, $a[0], parent::getVoc()."Resource"). + parent::describeIndividual($from, str_replace('"', '', $a[0]), parent::getVoc()."Resource"). parent::describeIndividual($to, $a[1], parent::getVoc()."Resource"). parent::triplify($uri, parent::getVoc()."from", $from). parent::triplify($uri, parent::getVoc()."to", $to). @@ -1190,7 +1196,7 @@ function pathways() $pmids = explode(",",$a[6]); foreach($pmids AS $pmid) { parent::addRDF( - parent::triplify($uri, parent::getVoc()."x-pubmed", "pubmed:$pmid") + parent::triplify($uri, parent::getVoc()."x-pubmed", "pubmed:".trim($pmid)) ); } } @@ -1243,8 +1249,6 @@ function pathways() } }} - - parent::writeRDFBufferToWriteFile(); } } From 6fc18ae3f1e4a17f66afd987181aceea1bc33944 Mon Sep 17 00:00:00 2001 From: Michel Dumontier Date: Tue, 26 May 2020 16:45:24 +0200 Subject: [PATCH 39/64] update the orphanet disease processor --- orphanet/orphanet.php | 111 ++++++++++++++++++++++++++---------------- 1 file changed, 70 insertions(+), 41 deletions(-) diff --git a/orphanet/orphanet.php b/orphanet/orphanet.php index ae5651e..8bc7675 100644 --- a/orphanet/orphanet.php +++ b/orphanet/orphanet.php @@ -35,14 +35,14 @@ class ORPHANETParser extends Bio2RDFizer { private $filemap = array( 'disease' => 'en_product1.xml', - 'epi' => 'en_product2.xml', - 'd2s' => 'en_product4.xml', - 'signs' => 'en_product5.xml', + 'epi' => 'en_product9_prev.xml', + # 'd2s' => 'en_product4.xml', + # 'signs' => 'en_product5.xml', 'genes' => 'en_product6.xml' ); function __construct($argv) { parent::__construct($argv, "orphanet"); - parent::addParameter('files',true,'all|disease|epi|d2s|signs|genes','all','all or comma-separated list of ontology short names to process'); + parent::addParameter('files',true,'all|disease|genes','all','all or comma-separated list of ontology short names to process'); parent::addParameter('download_url',false,null,'http://www.orphadata.org/data/xml/'); parent::initialize(); } @@ -79,43 +79,43 @@ function run() $ofile = "orphanet-".$file.'.'.$suffix; $gz = strstr(parent::getParameterValue('output_format'), "gz")?($gz=true):($gz=false); -/* parent::setWriteFile($odir.$ofile, $gz); + parent::setWriteFile($odir.$ofile, $gz); $this->$file($lfile); parent::getWriteFile()->close(); -*/ parent::getReadFile()->close(); + parent::getReadFile()->close(); parent::clear(); echo "done!".PHP_EOL; // dataset description - $source_file = (new DataResource($this)) - ->setURI($rfile) - ->setTitle("Orphanet: $file") - ->setRetrievedDate(parent::getDate(filemtime($lfile))) - ->setFormat("application/xml") - ->setPublisher("http://www.orpha.net") - ->setHomepage("http://www.orpha.net/") - ->setRights("use") - ->setRights("sharing-modified-version-needs-permission") - ->setLicense("http://creativecommons.org/licenses/by-nd/3.0/") - ->setDataset("http://identifiers.org/orphanet/"); + $source_file = (new DataResource($this)) + ->setURI($rfile) + ->setTitle("Orphanet: $file") + ->setRetrievedDate(parent::getDate(filemtime($lfile))) + ->setFormat("application/xml") + ->setPublisher("http://www.orpha.net") + ->setHomepage("http://www.orpha.net/") + ->setRights("use") + ->setRights("sharing-modified-version-needs-permission") + ->setLicense("http://creativecommons.org/licenses/by-nd/3.0/") + ->setDataset("http://identifiers.org/orphanet/"); - $prefix = parent::getPrefix(); - $bVersion = parent::getParameterValue('bio2rdf_release'); - $date = parent::getDate(filemtime($odir.$ofile)); + $prefix = parent::getPrefix(); + $bVersion = parent::getParameterValue('bio2rdf_release'); + $date = parent::getDate(filemtime($odir.$ofile)); - $output_file = (new DataResource($this)) - ->setURI("http://download.bio2rdf.org/release/$bVersion/$prefix/$ofile") - ->setTitle("Bio2RDF v$bVersion RDF version of $prefix") - ->setSource($source_file->getURI()) - ->setCreator("https://github.com/bio2rdf/bio2rdf-scripts/blob/master/orphanet/orphanet.php") - ->setCreateDate($date) - ->setHomepage("http://download.bio2rdf.org/release/$bVersion/$prefix/$prefix.html") - ->setPublisher("http://bio2rdf.org") - ->setRights("use-share-modify") - ->setRights("by-attribution") - ->setRights("restricted-by-source-license") - ->setLicense("http://creativecommons.org/licenses/by/3.0/") - ->setDataset(parent::getDatasetURI()); + $output_file = (new DataResource($this)) + ->setURI("http://download.bio2rdf.org/release/$bVersion/$prefix/$ofile") + ->setTitle("Bio2RDF v$bVersion RDF version of $prefix") + ->setSource($source_file->getURI()) + ->setCreator("https://github.com/bio2rdf/bio2rdf-scripts/blob/master/orphanet/orphanet.php") + ->setCreateDate($date) + ->setHomepage("http://download.bio2rdf.org/release/$bVersion/$prefix/$prefix.html") + ->setPublisher("http://bio2rdf.org") + ->setRights("use-share-modify") + ->setRights("by-attribution") + ->setRights("restricted-by-source-license") + ->setLicense("http://creativecommons.org/licenses/by/3.0/") + ->setDataset(parent::getDatasetURI()); $gz = (strstr(parent::getParameterValue('output_format'),".gz") === FALSE)?false:true; if($gz) $output_file->setFormat("application/gzip"); @@ -137,6 +137,7 @@ function disease($file) foreach($x->Disorder AS $d) { // var_dump($d);exit; + $internal_id = (string) $d->attributes()->id; $orphanet_id = parent::getNamespace().((string)$d->OrphaNumber); $name = (string) $d->Name; @@ -150,7 +151,7 @@ function disease($file) ); foreach($d->SynonymList AS $s) { - $synonym = (string) $s->Synonym; + $synonym = str_replace('"','', (string) $s->Synonym); parent::addRDF( parent::triplifyString($orphanet_id, parent::getVoc()."synonym", $synonym) ); @@ -165,13 +166,41 @@ function disease($file) } } foreach($d->ExternalReferenceList AS $erl) { - $er = $erl->ExternalReference; - $source = (string) $er->Source; - $db = parent::getRegistry()->getPreferredPrefix($source); - $id = (string) $er->Reference; - parent::addRDF( - parent::triplify($orphanet_id, parent::getVoc()."x-$db", "$db:$id") - ); + foreach($erl->ExternalReference AS $er) { + $source = (string) $er->Source; + $db = parent::getRegistry()->getPreferredPrefix($source); + $id = (string) $er->Reference; + parent::addRDF( + parent::triplify($orphanet_id, parent::getVoc()."x-$db", "$db:$id") + ); + } + } + /* + + + + + + Definition + + Multiple epiphyseal dysplasia, Al-Gazali type is a skeletal dysplasia characterized by multiple epiphyseal dysplasia (see this term), macrocephaly and facial dysmorphism. + + + + + */ + foreach($d->TextualInformationList AS $til) { + foreach($til->TextualInformation As $ti) { + foreach($ti->TextSectionList AS $tsl) { + foreach($tsl->TextSection AS $ts) { + if(((string) $ts->TextSectionType->Name) == "Definition") { + parent::addRDF( + parent::triplifyString($orphanet_id, parent::getVoc()."definition", (string) $ts->Contents) + ); + }; + } + } + } } parent::writeRDFBufferToWriteFile(); } From e26d8d50d0fc6f08653ef3f37f5fbe30c92eff13 Mon Sep 17 00:00:00 2001 From: Michel Dumontier Date: Tue, 26 May 2020 16:48:24 +0200 Subject: [PATCH 40/64] escape the definition --- orphanet/orphanet.php | 20 ++++---------------- 1 file changed, 4 insertions(+), 16 deletions(-) diff --git a/orphanet/orphanet.php b/orphanet/orphanet.php index 8bc7675..3fd6e15 100644 --- a/orphanet/orphanet.php +++ b/orphanet/orphanet.php @@ -149,7 +149,7 @@ function disease($file) parent::triplifyString($orphanet_id, parent::getVoc()."internal-id", $internal_id). parent::triplify($orphanet_id, parent::getVoc()."expert-link-url", $expert_link) ); - + // get the synonyms foreach($d->SynonymList AS $s) { $synonym = str_replace('"','', (string) $s->Synonym); parent::addRDF( @@ -165,6 +165,7 @@ function disease($file) ); } } + // get external references foreach($d->ExternalReferenceList AS $erl) { foreach($erl->ExternalReference AS $er) { $source = (string) $er->Source; @@ -175,27 +176,14 @@ function disease($file) ); } } - /* - - - - - - Definition - - Multiple epiphyseal dysplasia, Al-Gazali type is a skeletal dysplasia characterized by multiple epiphyseal dysplasia (see this term), macrocephaly and facial dysmorphism. - - - - - */ + // get the definition foreach($d->TextualInformationList AS $til) { foreach($til->TextualInformation As $ti) { foreach($ti->TextSectionList AS $tsl) { foreach($tsl->TextSection AS $ts) { if(((string) $ts->TextSectionType->Name) == "Definition") { parent::addRDF( - parent::triplifyString($orphanet_id, parent::getVoc()."definition", (string) $ts->Contents) + parent::triplifyString($orphanet_id, parent::getVoc()."definition", addslashes((string) $ts->Contents)) ); }; } From 103506db43c034f634276d5b1b24c7ff9c0f872f Mon Sep 17 00:00:00 2001 From: Michel Dumontier Date: Tue, 26 May 2020 16:58:33 +0200 Subject: [PATCH 41/64] fixes for orphanet genes --- orphanet/orphanet.php | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/orphanet/orphanet.php b/orphanet/orphanet.php index 3fd6e15..bf79d3c 100644 --- a/orphanet/orphanet.php +++ b/orphanet/orphanet.php @@ -384,15 +384,16 @@ function genes($file) foreach($d->DisorderGeneAssociationList->DisorderGeneAssociation AS $dga) { // gene $gene = $dga->Gene; - $gene_id = parent::getNamespace().((string) $gene->OrphaNumber); - $gene_internal_id = ((string) $gene->attributes()->id); + $gid = ((string) $gene->attributes()->id); + $gene_id = parent::getNamespace().$gid; $gene_label = (string) $gene->Name; $gene_symbol = (string) $gene->Symbol; parent::addRDF( parent::describeIndividual($gene_id,$gene_label,parent::getVoc()."Gene"). - parent::describeClass(parent::getVoc()."Gene","orphanet gene"). + parent::describeClass(parent::getVoc()."Gene","Orphanet Gene"). parent::triplifyString($gene_id,parent::getVoc()."symbol",$gene_symbol) ); + foreach($gene->SynonymList AS $s) { $synonym = (string) $s->Synonym; parent::addRDF( From b021002760aa30c77c7ad92e023e1b2e83187204 Mon Sep 17 00:00:00 2001 From: Michel Dumontier Date: Tue, 26 May 2020 17:00:40 +0200 Subject: [PATCH 42/64] correctly parse the list of external references for the genes --- orphanet/orphanet.php | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/orphanet/orphanet.php b/orphanet/orphanet.php index bf79d3c..0860986 100644 --- a/orphanet/orphanet.php +++ b/orphanet/orphanet.php @@ -401,14 +401,15 @@ function genes($file) ); } foreach($gene->ExternalReferenceList AS $erl) { - $er = $erl->ExternalReference; - $db = (string) $er->Source; - $db = parent::getRegistry()->getPreferredPrefix($db); - $id = (string) $er->Reference; - $xref = "$db:$id"; - parent::addRDF( - parent::triplify($gene_id, parent::getVoc()."x-$db", $xref) - ); + foreach($erl->ExternalReference AS $er) { + $db = (string) $er->Source; + $db = parent::getRegistry()->getPreferredPrefix($db); + $id = (string) $er->Reference; + $xref = "$db:$id"; + parent::addRDF( + parent::triplify($gene_id, parent::getVoc()."x-$db", $xref) + ); + } } $dga_id = parent::getRes().((string)$d->OrphaNumber)."_".md5($dga->asXML()); From 777a99b1f0db186d27fd936b5f1fc2c595d84f2e Mon Sep 17 00:00:00 2001 From: Michel Dumontier Date: Tue, 26 May 2020 17:21:17 +0200 Subject: [PATCH 43/64] added source of validation to gene disease association --- orphanet/orphanet.php | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/orphanet/orphanet.php b/orphanet/orphanet.php index 0860986..1cbcb66 100644 --- a/orphanet/orphanet.php +++ b/orphanet/orphanet.php @@ -412,6 +412,19 @@ function genes($file) } } + // parse the sources of validation + //16150725[PMID]_16150725[PMID]_21771795[PMID] + $sources = explode("_",$dga->SourceOfValidation); + foreach($sources AS $source) { + preg_match_all("/([0-9]*)\[([^\]]*)?\]/",$source, $m, PREG_PATTERN_ORDER ); + if(isset($m[1][0])) { + $prefix = parent::getRegistry()->getPreferredPrefix($m[2][0]); + parent::addRDF( + parent::triplify($gene_id,parent::getVoc()."source-of-validation", "$prefix:".$m[1][0]) + ); + } + } + $dga_id = parent::getRes().((string)$d->OrphaNumber)."_".md5($dga->asXML()); $ga = $dga->DisorderGeneAssociationType; $ga_id = parent::getNamespace().((string) $ga->attributes()->id); From 85d2f7c1862fd74ceb5df7c3309b7b7bf3c972d7 Mon Sep 17 00:00:00 2001 From: Michel Dumontier Date: Tue, 26 May 2020 18:19:02 +0200 Subject: [PATCH 44/64] revised processing of orphanet signs and frequencies --- orphanet/orphanet.php | 83 +++++++++++++++++++++++++++---------------- 1 file changed, 53 insertions(+), 30 deletions(-) diff --git a/orphanet/orphanet.php b/orphanet/orphanet.php index 1cbcb66..a1c6010 100644 --- a/orphanet/orphanet.php +++ b/orphanet/orphanet.php @@ -36,13 +36,13 @@ class ORPHANETParser extends Bio2RDFizer private $filemap = array( 'disease' => 'en_product1.xml', 'epi' => 'en_product9_prev.xml', - # 'd2s' => 'en_product4.xml', + 'phenotypefreq' => 'en_product4.xml', # 'signs' => 'en_product5.xml', 'genes' => 'en_product6.xml' ); function __construct($argv) { parent::__construct($argv, "orphanet"); - parent::addParameter('files',true,'all|disease|genes','all','all or comma-separated list of ontology short names to process'); + parent::addParameter('files',true,'all|disease|phenotypefreq|genes','all','all or comma-separated list of ontology short names to process'); parent::addParameter('download_url',false,null,'http://www.orphadata.org/data/xml/'); parent::initialize(); } @@ -282,41 +282,64 @@ function epi ($file) unset($xml); } - function d2s($file) + function phenotypefreq($file) { /* - - - - Macrocephaly/macrocrania/megalocephaly/megacephaly - - - Very frequent - - - */ + + + 558 + http://www.orpha.net/consor/cgi-bin/OC_Exp.php?lng=en&Expert=558 + Marfan syndrome + + Disease + + + Disorder + + + + + + HP:0000768 + Pectus carinatum + + + Very frequent (99-80%) + + + Diagnostic criterion + + + */ $xml = new CXML($file); - while($xml->parse("DisorderList") == TRUE) { + while($xml->parse("HPODisorderSetStatus") == TRUE) { $x = $xml->GetXMLRoot(); foreach($x->Disorder AS $d) { $orphanet_id = parent::getNamespace().((string)$d->OrphaNumber); - foreach($d->DisorderSignList->DisorderSign AS $ds) { - $sfid = parent::getRes().md5($ds->asXML()); - if($ds->ClinicalSign) { - $sid = parent::getVoc().((string)$ds->ClinicalSign->attributes()->id); - $s = (string) $ds->ClinicalSign->Name; - $fid = parent::getRes().((string) $ds->SignFreq->attributes()->id); - $f = (string) $ds->SignFreq->Name; - parent::addRDF( - parent::describeIndividual($sfid, "$f $s",parent::getVoc()."Clinical-Sign-And-Frequency"). - parent::describeClass(parent::getVoc()."Clinical-Sign-And-Frequency","Clinical Sign and Frequency"). - parent::triplify($orphanet_id, parent::getVoc()."sign-freq", $sfid). - parent::triplify($sfid,parent::getVoc()."sign", $sid). - parent::describeClass($sid,$s,parent::getVoc()."Clinical-Sign"). - parent::triplify($sfid,parent::getVoc()."frequency",$fid). - parent::describeClass($fid,$f,parent::getVoc()."Frequency") - ); + $disease_name = ((string)$d->Name); + foreach($d->HPODisorderAssociationList->HPODisorderAssociation AS $ds) { + $sfid = parent::getNamespace().((string)$ds->attributes()->id); + $s = (string) $ds->HPO->HPOTerm; + $sid = $ds->HPO->HPOId; + $f = (string) $ds->HPOFrequency->Name; + $fid = parent::getRes().((string) $ds->HPOFrequency->attributes()->id); + + $diagnostic = false; + if($ds->DiagnosticCriteria->Name) { + $diagnostic = true; } + $sflabel = "$f $s".(($diagnostic == true)?" that is diagnostic":"")." for ".$disease_name; + + parent::addRDF( + parent::describeIndividual($sfid, $sflabel, parent::getVoc()."Clinical-Sign-And-Frequency"). + parent::describeClass(parent::getVoc()."Clinical-Sign-And-Frequency","Clinical Sign and Frequency"). + parent::triplify($orphanet_id, parent::getVoc()."sign-freq", $sfid). + parent::triplify($sfid,parent::getVoc()."sign", $sid). + parent::triplify($sfid,parent::getVoc()."frequency",$fid). + parent::triplifyString($sfid, parent::getVoc()."is-diagnostic", (isset($diagnostic)?"true":"false")). + parent::triplifyString($fid, "rdfs:label", $fid). + parent::describeClass($fid,$f,parent::getVoc()."Frequency") + ); } parent::writeRDFBufferToWriteFile(); } From 103b82675f3c9af25a337f5036ffd3845f47dbd7 Mon Sep 17 00:00:00 2001 From: Michel Dumontier Date: Wed, 27 May 2020 14:49:14 +0200 Subject: [PATCH 45/64] id fixes; addition of prevalence parser --- orphanet/orphanet.php | 147 +++++++++++++++++++++++++++++++----------- 1 file changed, 108 insertions(+), 39 deletions(-) diff --git a/orphanet/orphanet.php b/orphanet/orphanet.php index a1c6010..54a9029 100644 --- a/orphanet/orphanet.php +++ b/orphanet/orphanet.php @@ -35,14 +35,13 @@ class ORPHANETParser extends Bio2RDFizer { private $filemap = array( 'disease' => 'en_product1.xml', - 'epi' => 'en_product9_prev.xml', + 'prevalence' => 'en_product9_prev.xml', 'phenotypefreq' => 'en_product4.xml', - # 'signs' => 'en_product5.xml', 'genes' => 'en_product6.xml' ); function __construct($argv) { parent::__construct($argv, "orphanet"); - parent::addParameter('files',true,'all|disease|phenotypefreq|genes','all','all or comma-separated list of ontology short names to process'); + parent::addParameter('files',true,'all|disease|genes|phenotypefreq|prevalence','all','all or comma-separated list of ontology short names to process'); parent::addParameter('download_url',false,null,'http://www.orphadata.org/data/xml/'); parent::initialize(); } @@ -196,52 +195,121 @@ function disease($file) unset($xml); } - function epi ($file) + function prevalence ($file) { $seen = ''; $xml = new CXML($file); while($xml->parse("DisorderList") == TRUE) { $x = $xml->GetXMLRoot(); foreach($x->Disorder AS $d) { - // var_dump($d);exit; + $orphanet_id = parent::getNamespace().((string)$d->OrphaNumber); - if(isset($d->ClassOfPrevalence)) { - $id = parent::getNamespace().((string) $d->ClassOfPrevalence->attributes()->id); - $name = (string) $d->ClassOfPrevalence->Name; - if($name != '' && $name != 'Unknown' && $name != 'No data available') { - if(!isset($seen[$name])) { - $seen[$name] = true; - $a = explode (" / ", $name); - $size = str_replace(" ","",$a[1]); - $upper_bound = $lower_bound = ''; - if($a[0][0] == '<') { - $upper_bound = substr($a[0],1) / $size; - } else if($a[0][0] == '>') { - $lower_bound = substr($a[0],1) / $size; - } else { - $b = explode("-",$a[0]); - $lower_bound = $b[0] / $size; - $upper_bound = $b[1] / $size; - } - if($upper_bound) { + $disease_name = (string) $d->Name; + + foreach($d->PrevalenceList->Prevalence AS $pl) { + $id = parent::getRes()."pl".((string) $pl->attributes()->id); + parent::addRDF( + parent::describeClass($id,"Prevalence",parent::getVoc()."Prevalence"). + parent::describeIndividual($id, "Prevalence for $disease_name", parent::getVoc()."Prevalence") + ); + $type_id = parent::getRes()."pt".(string) $pl->PrevalenceType->attributes()->id; + $type_label = (string) $pl->PrevalenceType->Name; + if($type_label != "") { + parent::addRDF( + parent::describeIndividual($type_id, $type_label, parent::getVoc()."Prevalence-Type"). + parent::triplify($id, parent::getVoc()."prevalence-type", $type_id). + parent::triplify($orphanet_id, parent::getVoc()."prevalence", $id) + ); + } + + $qual_id = parent::getRes()."qu".(string) $pl->PrevalenceQualification->attributes()->id; + $qual_label = (string) $pl->PrevalenceQualification->Name; + if($qual_label != "") { + parent::addRDF( + parent::describeIndividual($qual_id, $qual_label, parent::getVoc()."Prevalence-Qualification"). + parent::triplify($id, parent::getVoc()."prevalence-qualification", $qual_id) + ); + } + + $prev_id = parent::getRes()."pr".(string) $pl->PrevalenceClass->attributes()->id; + $prev_label = (string) $pl->PrevalenceClass->Name; + if($prev_label != "") { + parent::addRDF( + parent::describeIndividual($prev_id, $prev_label, parent::getVoc()."Prevalence-Value"). + parent::triplify($id, parent::getVoc()."prevalence-value", $prev_id) + ); + } + + $geo_id = parent::getRes()."geo".(string) $pl->PrevalenceGeographic->attributes()->id; + $geo_label = (string) $pl->PrevalenceGeographic->Name; + if($geo_label != "") { + parent::addRDF( + parent::describeIndividual($geo_id, $geo_label, parent::getVoc()."Geographic-Prevalence"). + parent::triplify($id, parent::getVoc()."prevalence-geo", $geo_id) + ); + } + + $val_id = parent::getRes()."val".(string) $pl->PrevalenceValidationStatus->attributes()->id; + $val_label = (string) $pl->PrevalenceValidationStatus->Name; + if($val_label != "") { + parent::addRDF( + parent::describeIndividual($val_id, $val_label, parent::getVoc()."Prevalence-Validation-Status"). + parent::triplify($id, parent::getVoc()."prevalence-status", $val_id) + ); + } + $valmoy = (string) $pl->ValMoy; + if($valmoy != "") { + parent::addRDF( + parent::triplifyString($id, parent::getVoc()."val-moy", $valmoy) + ); + } + + + $source = trim((string) $pl->Source); + if($source and (strlen($source) != 0)) { + //23712425[PMID] + preg_match_all("/([0-9]*)\[([^\]]*)?\]/",$source, $m, PREG_SET_ORDER ); + foreach($m AS $i) { + if(isset($i[2]) and ($i[2] == "PMID")) { + $source_id = "PMID:".$i[1]; parent::addRDF( - parent::triplifyString($id,parent::getVoc()."upper-bound",$upper_bound, "xsd:float") + parent::triplify($id, parent::getVoc()."source", $source_id) ); - } - if($lower_bound) { + } else { parent::addRDF( - parent::triplifyString($id,parent::getVoc()."lower-bound",$lower_bound, "xsd:float") + parent::triplifyString($id, parent::getVoc()."source", $i[0]) ); } - } - parent::addRDF( - parent::triplify($orphanet_id, parent::getVoc()."prevalence", $id). - parent::describeClass($id,$name,parent::getVoc()."Prevalence") - ); - //echo parent::getRDF();exit; + } + } } + parent::writeRDFBufferToWriteFile(); + } + } + unset($xml); + } + + function onset ($file) + { + $seen = ''; + $xml = new CXML($file); + while($xml->parse("DisorderList") == TRUE) { + $x = $xml->GetXMLRoot(); + foreach($x->Disorder AS $d) { + // var_dump($d);exit; + $orphanet_id = parent::getNamespace().((string)$d->OrphaNumber); + $disease_name = (string) $d->Name; + foreach($d->PrevalanceList AS $pl) { + $id = parent::getNamespace().((string) $pl->attributes()->id); + + parent::addRDF( + parent::triplify($orphanet_id, parent::getVoc()."prevalence", $id). + parent::describeClass($id,$name,parent::getVoc()."Prevalence") + ); + + } if(isset($d->AverageAgeofOnset)) { $id = parent::getNamespace().((string) $d->AverageAgeOfOnset->attributes()->id); $name = (string) $d->AverageAgeOfOnset->Name; @@ -281,7 +349,8 @@ function epi ($file) } unset($xml); } - + + function phenotypefreq($file) { /* @@ -318,11 +387,11 @@ function phenotypefreq($file) $orphanet_id = parent::getNamespace().((string)$d->OrphaNumber); $disease_name = ((string)$d->Name); foreach($d->HPODisorderAssociationList->HPODisorderAssociation AS $ds) { - $sfid = parent::getNamespace().((string)$ds->attributes()->id); + $sfid = parent::getRes()."sf".((string)$ds->attributes()->id); $s = (string) $ds->HPO->HPOTerm; $sid = $ds->HPO->HPOId; $f = (string) $ds->HPOFrequency->Name; - $fid = parent::getRes().((string) $ds->HPOFrequency->attributes()->id); + $fid = parent::getRes()."f".((string) $ds->HPOFrequency->attributes()->id); $diagnostic = false; if($ds->DiagnosticCriteria->Name) { @@ -450,11 +519,11 @@ function genes($file) $dga_id = parent::getRes().((string)$d->OrphaNumber)."_".md5($dga->asXML()); $ga = $dga->DisorderGeneAssociationType; - $ga_id = parent::getNamespace().((string) $ga->attributes()->id); + $ga_id = parent::getRes()."ga".((string) $ga->attributes()->id); $ga_label = (string) $ga->Name; $s = $dga->DisorderGeneAssociationStatus; - $s_id = parent::getNamespace().((string) $s->attributes()->id); + $s_id = parent::getRes()."st".((string) $s->attributes()->id); $s_label = (string) $s->Name; parent::addRDF( From c62841fb9fec952c73d49bc6854ef9d464ac6457 Mon Sep 17 00:00:00 2001 From: Michel Dumontier Date: Sun, 31 May 2020 22:04:25 +0200 Subject: [PATCH 46/64] fix for weird character exceptions and multiple entries --- kegg/kegg.php | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/kegg/kegg.php b/kegg/kegg.php index e144e08..01a7404 100644 --- a/kegg/kegg.php +++ b/kegg/kegg.php @@ -315,13 +315,16 @@ function parseEntry($lfile) parent::addRDF( parent::triplifyString($uri,"dc:description",$v) ); - } else if($k == "DEFINITION" and $e['type'] == "KO") { - preg_match("/\[([^\]]+)\]/",$v,$m); - if(isset($m[1])) { + } else if($k == "DEFINITION" and $e['type'] == "KO") { + preg_match("/\[EC:([^\]]+)/",$v,$m); + if(isset($m[1])) { + $a = explode(" ", $m[1]); + foreach($a AS $b) { parent::addRDF( - parent::triplify($uri,parent::getVoc()."x-ec",$m[1]) - ); + parent::triplify($uri,parent::getVoc()."x-ec","ec:".$b) + ); } + } } else if($k == "COMMENT") { preg_match("/ICD-O: ([^,]+),/",$v,$m); if(isset($m[1])) { @@ -471,12 +474,12 @@ function parseEntry($lfile) echo "parse error: ".$k." ".$v.PHP_EOL;continue; } $str = $a[1]; - + foreach($ids AS $id) { - $o = ''; - $o['id'] = $id; - $o['label'] = $str; - $o['type'] = strtolower($k); + #$o = ''; + #$o['id'] = $id; + #$o['label'] = $str; + #$o['type'] = strtolower($k); parent::addRDF( parent::triplify($uri,parent::getVoc().strtolower($k),"kegg:$id") ); From 1d2803189814b86a95082d7524cb9bc3f7b207c2 Mon Sep 17 00:00:00 2001 From: Michel Dumontier Date: Mon, 1 Jun 2020 17:20:46 +0200 Subject: [PATCH 47/64] added check for entries without abstract --- interpro/interpro.php | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/interpro/interpro.php b/interpro/interpro.php index d4342ca..69a893b 100644 --- a/interpro/interpro.php +++ b/interpro/interpro.php @@ -173,15 +173,16 @@ function Parse($xml) } } } - $abstract = (string) $o->abstract->p->asXML(); - if(isset($pubs)) { - $abstract = str_replace($pubs['pid'],$pubs['pmid'],$abstract); - } - - parent::addRDF( - parent::triplifyString($s,"dc:description",$abstract) - ); + if(isset($o->abstract)) { + $abstract = (string) $o->abstract->p->asXML(); + if(isset($pubs)) { + $abstract = str_replace($pubs['pid'],$pubs['pmid'],$abstract); + } + parent::addRDF( + parent::triplifyString($s,"dc:description",$abstract) + ); + } if(isset($o->example_list)) { foreach($o->example_list->example AS $example) { $db = (string) $example->db_xref->attributes()->db; From b7b5382aa3231cccb86c4f979572b80ba7a83cb7 Mon Sep 17 00:00:00 2001 From: Michel Dumontier Date: Mon, 1 Jun 2020 17:36:10 +0200 Subject: [PATCH 48/64] changed output file names --- pharmgkb/pharmgkb.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pharmgkb/pharmgkb.php b/pharmgkb/pharmgkb.php index cf3fa30..fec5e8a 100644 --- a/pharmgkb/pharmgkb.php +++ b/pharmgkb/pharmgkb.php @@ -120,7 +120,7 @@ function run() // set the write file, parse, write and close $suffix = parent::getParameterValue('output_format'); - $outfile = $file.'.'.$suffix; + $outfile = "pharmgkb-".$file.'.'.$suffix; $gz=false; if(strstr(parent::getParameterValue('output_format'), "gz")) { From 9164b906532e6ff3ba164199664e52699b1ed621 Mon Sep 17 00:00:00 2001 From: Michel Dumontier Date: Mon, 1 Jun 2020 17:36:58 +0200 Subject: [PATCH 49/64] removed html tags, extra spaces, and escaped special chars in abstract --- interpro/interpro.php | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/interpro/interpro.php b/interpro/interpro.php index 69a893b..8d074bb 100644 --- a/interpro/interpro.php +++ b/interpro/interpro.php @@ -179,6 +179,10 @@ function Parse($xml) if(isset($pubs)) { $abstract = str_replace($pubs['pid'],$pubs['pmid'],$abstract); } + $abstract= preg_replace('/(?i)<[^>]*>/', ' ', $abstract); #remove html tags + $abstract = trim(preg_replace("/\s+/",' ',$abstract)); # remove extra spaces + $abstract = addslashes($abstract); + parent::addRDF( parent::triplifyString($s,"dc:description",$abstract) ); From b5bff4e082368f54630681196334f8bf2f4197a1 Mon Sep 17 00:00:00 2001 From: Michel Dumontier Date: Mon, 1 Jun 2020 17:51:06 +0200 Subject: [PATCH 50/64] update the path and version number of the latest entry --- irefindex/irefindex.php | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/irefindex/irefindex.php b/irefindex/irefindex.php index 176dd09..db27c55 100644 --- a/irefindex/irefindex.php +++ b/irefindex/irefindex.php @@ -34,8 +34,8 @@ class irefindexParser extends Bio2RDFizer function __construct($argv) { // parent::__construct($argv,"irefindex"); parent::addParameter('files',true,'all|10090|10116|4932|559292|562|6239|7227|9606','all','all or comma-separated list of files to process'); - parent::addParameter('version',false,'07042015|08122013|03022013|10182011','07042015','dated version of files to download'); - parent::addParameter('download_url',false,null,'http://irefindex.org/download/irefindex/data/current/psi_mitab/MITAB2.6/'); + parent::addParameter('version',false,'07042015|08122013|03022013|10182011','05-29-2019','dated version of files to download'); + parent::addParameter('download_url',false,null,'https://irefindex.vib.be/download/irefindex/data/archive/release_16.0/psi_mitab/MITAB2.6/'); parent::initialize(); } From f9e22c20e7dedce5ea5bcec0d876dc5d431828f9 Mon Sep 17 00:00:00 2001 From: Michel Dumontier Date: Mon, 1 Jun 2020 18:49:02 +0200 Subject: [PATCH 51/64] added domain namespaces; new url for goa; changed output file format --- sgd/sgd.php | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/sgd/sgd.php b/sgd/sgd.php index 013a325..88adbe6 100644 --- a/sgd/sgd.php +++ b/sgd/sgd.php @@ -40,7 +40,7 @@ function __construct($argv) { parent::addParameter('download_url',false,null,'http://downloads.yeastgenome.org/'); parent::addParameter('ncbo_download_dir', false, null, '/data/download/bioportal/', 'directory of bioportal ontologies'); parent::addParameter('ncbo_api_key',true,null,null,'your NCBO API key'); - parent::addParameter('one_file',false,'true|false','true',"whether to produce a single file output"); + parent::addParameter('one_file',false,'true|false','false',"whether to produce a single file output"); parent::initialize(); } @@ -74,7 +74,7 @@ function download(){ "features" => "curation/chromosomal_feature/SGD_features.tab", "domains" => "curation/calculated_protein_info/domains/domains.tab", "protein" => "curation/calculated_protein_info/protein_properties.tab", - "goa" => "curation/literature/gene_association.sgd.gz", + "goa" => "curation/literature/gene_association.sgd.gaf.gz", "goslim" => "curation/literature/go_slim_mapping.tab", "complex" => "curation/literature/go_protein_complex_slim.tab", "interaction" => "curation/literature/interaction_data.tab", @@ -121,7 +121,7 @@ function process(){ "features" => "curation/chromosomal_feature/SGD_features.tab", "domains" => "curation/calculated_protein_info/domains/domains.tab", "protein" => "curation/calculated_protein_info/protein_properties.tab", - "goa" => "curation/literature/gene_association.sgd.gz", + "goa" => "curation/literature/gene_association.sgd.gaf.gz", "goslim" => "curation/literature/go_slim_mapping.tab", "complex" => "curation/literature/go_protein_complex_slim.tab", "interaction" => "curation/literature/interaction_data.tab", @@ -135,7 +135,7 @@ function process(){ $gz = false;if(strstr(parent::getParameterValue('output_format'), "gz")) $gz = true; if(parent::getParameterValue('one_file') == true) { - $ofile = "sgd.".parent::getParameterValue('output_format'); + $ofile = "bio2rdf-sgd-".parent::getParameterValue('output_format'); parent::setWriteFile($odir.$ofile, $gz); } $dataset_description = ''; @@ -156,7 +156,7 @@ function process(){ } if(parent::getParameterValue('one_file') == false) { - $ofile = "sgd_".$file.'.'.parent::getParameterValue('output_format'); + $ofile = "bio2rdf-sgd-".$file.'.'.parent::getParameterValue('output_format'); parent::setWriteFile($odir.$ofile, $gz); } @@ -619,6 +619,7 @@ function domains(){ "BlastProDom" => "prodom", "FPrintScan" => "fprintscan", "Gene3D" => "gene3d", + "CDD" => "cdd", "Coil" => "coil", "Coils" => "coil", "Pfam" => "pfam", @@ -627,6 +628,7 @@ function domains(){ "PIRSF" => "pirsf", "PRINTS" => "prints", "Seg" => "seg", + "SFLD" => "sfld", "SMART" => "smart", "SUPERFAMILY" => "superfamily", "TIGRPFAM" => "pfam", @@ -636,6 +638,7 @@ function domains(){ "HMMPfam" => "pfam", "HMMPIR" => "pir", "HMMTigr" => "tigr", + "signalp" => "signalp", "SignalP_GRAM_POSITIVE" => "signalp", "SignalP_GRAM_NEGATIVE" => "signalp", "SignalP_EUK" => "signalp", From 15e100f8abdad5d0c7209f10e63675915d7e8b0f Mon Sep 17 00:00:00 2001 From: Michel Dumontier Date: Mon, 1 Jun 2020 18:58:49 +0200 Subject: [PATCH 52/64] made it so you can create the bioportal download directory for apo.obo --- sgd/sgd.php | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/sgd/sgd.php b/sgd/sgd.php index 88adbe6..a4e3211 100644 --- a/sgd/sgd.php +++ b/sgd/sgd.php @@ -1347,7 +1347,10 @@ function GetMethodID($label, &$id, &$type) { }//GetMethodID function GetLatestNCBOOntology($ontology_id,$apikey,$target_filepath){ - Utils::DownloadSingle('http://data.bioontology.org/ontologies/'.$ontology_id.'/download?apikey='.$apikey, $target_filepath); + $url = 'http://data.bioontology.org/ontologies/'.$ontology_id.'/download?apikey='.$apikey; + $path = pathinfo($target_filepath); + @mkdir($path['dirname'],'0777'); + Utils::DownloadSingle($url, $target_filepath); } }//SGDParser From dc77276dc35d9d9aff08e7477ac067c34f59f57c Mon Sep 17 00:00:00 2001 From: Michel Dumontier Date: Mon, 1 Jun 2020 19:14:52 +0200 Subject: [PATCH 53/64] replaced empty string initialisation to array --- common/php/oboparser.php | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/common/php/oboparser.php b/common/php/oboparser.php index fe0c83e..11c27ec 100644 --- a/common/php/oboparser.php +++ b/common/php/oboparser.php @@ -9,9 +9,9 @@ function OBOParser($in) if(strstr($l,"[Term]")) { if(isset($term)) { - $terms[$term['id'][0]] = $term; + $terms[$term['id'][0]] = $term; } - $term = ''; + $term = array(); } else if(strstr($l,"[Typedef]")) { if(isset($term)) { $terms[$term['id'][0]] = $term; @@ -27,7 +27,7 @@ function OBOParser($in) if(count($m)) { $a[1] = $m[1]; } - $term[$a[0]][] = $a[1]; + $term[$a[0]][] = $a[1]; } else if(isset($typedef)) { From 62a1b478c850cbeb522facbe1811adc6eae28a02 Mon Sep 17 00:00:00 2001 From: Michel Dumontier Date: Mon, 1 Jun 2020 19:22:14 +0200 Subject: [PATCH 54/64] fixed string initialisation of array --- sgd/sgd.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sgd/sgd.php b/sgd/sgd.php index a4e3211..27decb7 100644 --- a/sgd/sgd.php +++ b/sgd/sgd.php @@ -1092,7 +1092,7 @@ function phenotype(){ function pathways(){ $sp = false; - $e = ''; + $e = array(); while($l = $this->GetReadFile()->Read(96000)) { $a = explode("\t",$l); From 0d1b76b24e9e30b2fda42d0936599fd995455466 Mon Sep 17 00:00:00 2001 From: Michel Dumontier Date: Mon, 1 Jun 2020 20:54:03 +0200 Subject: [PATCH 55/64] removed files no longer available; changed output file names --- ncbigene/ncbigene.php | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/ncbigene/ncbigene.php b/ncbigene/ncbigene.php index 5943d2c..31c7f5a 100644 --- a/ncbigene/ncbigene.php +++ b/ncbigene/ncbigene.php @@ -39,10 +39,10 @@ class NCBIGeneParser extends Bio2RDFizer "gene2ensembl" => "gene2ensembl.gz", "gene2go" => "gene2go.gz", "gene2pubmed" => "gene2pubmed.gz", - "gene2refseq" => "gene2refseq.gz", - "gene2sts" => "gene2sts", - "gene2unigene" => "gene2unigene", - "gene2vega" => "gene2vega.gz", + "gene2refseq" => "gene2refseq.gz" + #"gene2sts" => "gene2sts", + #"gene2unigene" => "gene2unigene", + #"gene2vega" => "gene2vega.gz", ); private $taxids = null; private $default_taxids = array( @@ -64,7 +64,7 @@ function __construct($argv) { parent::__construct($argv,"ncbigene"); // set and print application parameters - parent::addParameter('files',true,'all|geneinfo|gene2accession|gene2ensembl|gene2go|gene2pubmed|gene2refseq|gene2sts|gene2unigene|gene2vega','','files to process'); + parent::addParameter('files',true,'all|geneinfo|gene2accession|gene2ensembl|gene2go|gene2pubmed|gene2refseq','','files to process'); # |gene2sts|gene2unigene|gene2vega were removed parent::addParameter('download_url',false,null,'ftp://ftp.ncbi.nih.gov/gene/DATA/'); parent::addParameter('limit_organisms',false,'true|false','true','flag to use specified organisms'); parent::addParameter('organisms',false,null,implode(",",array_keys($this->default_taxids)),'taxonomy ids for organisms to process'); @@ -129,7 +129,7 @@ function process() $file = $module.".gz"; $lfile = $ldir.$file; $rfile = $rdir.$rfilename; - $ofile = $module.".".parent::getParameterValue('output_format'); + $ofile = "bio2rdf-".$module.".".parent::getParameterValue('output_format'); $gz = false; if(strstr(parent::getParameterValue('output_format'), "gz")) $gz = true; @@ -140,6 +140,7 @@ function process() $fnx = $module; if($module == 'gene2refseq') $fnx = 'gene2accession'; $this->$fnx(); + parent::clear(); echo 'done!'.PHP_EOL; @@ -343,7 +344,7 @@ private function gene2accession() $z = 1; while($l = $this->getReadFile()->read(200000)){ if($l[0] == "#") continue; - if(($z++) % 10000 == 0) {echo $z.PHP_EOL;parent::clear();} + if(($z++) % 100000 == 0) {echo $z.PHP_EOL;parent::clear();} $a = explode("\t",rtrim($l)); if(count($a) != 16) { trigger_error("gene2accession: expecting 16 columns, found ".count($a)." instead", E_USER_ERROR);} $taxid = $a[0]; From 2570f3a352eca76f47820cffe8412eaa68bff72c Mon Sep 17 00:00:00 2001 From: Michel Dumontier Date: Mon, 1 Jun 2020 20:54:41 +0200 Subject: [PATCH 56/64] escape allele label --- sgd/sgd.php | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sgd/sgd.php b/sgd/sgd.php index 27decb7..9dd23da 100644 --- a/sgd/sgd.php +++ b/sgd/sgd.php @@ -1052,8 +1052,9 @@ function phenotype(){ */ if(trim($a[7]) != ''){ + $allele = addslashes($a[7]); $this->AddRDF( - parent::triplifyString($this->getRes().$eid, $this->getVoc()."allele", $a[7]). + parent::triplifyString($this->getRes().$eid, $this->getVoc()."allele", $allele). parent::describeProperty($this->getVoc()."allele", "Relationship between an SGD experiment and an allele") ); } From c16a16ac6a7ceacfc12305b70191fb0ff76eea0a Mon Sep 17 00:00:00 2001 From: Michel Dumontier Date: Mon, 1 Jun 2020 20:56:00 +0200 Subject: [PATCH 57/64] downloads zip file with all entries; fixes xml string processing problems --- clinicaltrials/clinicaltrials.php | 339 ++++++++++++++---------------- 1 file changed, 157 insertions(+), 182 deletions(-) diff --git a/clinicaltrials/clinicaltrials.php b/clinicaltrials/clinicaltrials.php index 2f0d480..3875a29 100644 --- a/clinicaltrials/clinicaltrials.php +++ b/clinicaltrials/clinicaltrials.php @@ -22,7 +22,7 @@ @author :: Dana Klasen @author :: Michel Dumontier - @version :: 0.3 + @version :: 0.4 @description :: clinicaltrials.gov parser */ @@ -35,167 +35,103 @@ function __construct($argv) { parent::__construct($argv,"clinicaltrials"); parent::addParameter('files',true,'all','all','files to process'); - parent::addParameter('download_url',false,null,'http://clinicaltrials.gov/ct2/crawl'); - parent::addParameter('overwrite',false,'true|false','false','overwrite existing files with download option'); + parent::addParameter('download_url',false,null,'https://clinicaltrials.gov/AllPublicXML.zip'); parent::initialize(); } function run() { - if(parent::getParameterValue('download') === true) $this->crawl(); - $this->parse_dir(); - } - - - /** - * generate the proper subdir based on the file name - **/ - function get_sub_dir($entry) - { - $bin_range = 10; - - preg_match('/NCT[0]+(\d+)\.xml$/', $entry,$matches); - $record_number = $matches[1]; - - // find last multiple of bin_range - $count = -strlen($bin_range); - $marker = substr($record_number, $count); - - $curr_bin = substr($marker, 0,1). str_repeat(0,intval(strlen($bin_range))-1); - - $sub_dir = substr($record_number, 0,$count).$curr_bin; - - return $sub_dir; - } - /** - * scape the clinical gov site for the links to invididual records - **/ - function crawl(){ - $crawl_url = parent::getParameterValue("download_url"); //"http://clinicaltrials.gov/ct2/crawl"; - echo "Fetching clinical trial list...".PHP_EOL; - $html = file_get_contents($crawl_url); - if($html === FALSE) { - trigger_error("unable to get crawl file"); - return false; - } - echo "done.".PHP_EOL; - - $dom = new DOMDocument(); - @$dom->loadHTML($html); - - // grab all the links on the page - $xpath = new DOMXPath($dom); - $hrefs = $xpath->evaluate("/html/body//a"); - - for ($i = 0; $i < $hrefs->length; $i++) { - $href = $hrefs->item($i); - if(preg_match("/crawl\/([0-9]+)/",$href->getAttribute('href'))){ - $record_block_url = "http://clinicaltrials.gov".$href->getAttribute('href'); - $this->fetch_record_block($record_block_url); + $ldir = parent::getParameterValue('indir'); + $tdir = $ldir."clinicaltrials"; + $odir = parent::getParameterValue('outdir'); + + $lfile = $ldir.'clinicaltrials.zip'; # giving it this local file name + $rfile = parent::getParameterValue('download_url'); + if(!file_exists($lfile) || parent::getParameterValue('download') == 'true') { + #download and extract to temp dir + $ret = utils::downloadSingle($rfile,$lfile); + if($ret === false) { + trigger_error("unable to download $file", E_USER_ERROR); } - } - } - - /** - * Fetch the page holding a block of records - **/ - function fetch_record_block($url){ - echo "Fetching record block...".PHP_EOL; - $html = file_get_contents($url); - if($html === FALSE) { - trigger_error("unable to fetch record block at $url",E_USER_ERROR); - return false; - } - echo "done.".PHP_EOL; - - $dom = new DOMDocument(); - @$dom->loadHTML($html); - - $xpath = new DOMXPath($dom); - $hrefs = $xpath->evaluate("/html/body//a"); - - for ($i = 0; $i < $hrefs->length; $i++) { - $href = $hrefs->item($i); - if(preg_match("/ct2\/show\//",$href->getAttribute('href'))){ - $page_uri = "http://clinicaltrials.gov/".$href->getAttribute('href')."?resultsxml=true"; - $this->fetch_page($page_uri); + $zip = new ZipArchive(); + if ($zip->open($lfile) === FALSE) { + trigger_error("Unable to open $lfile"); + exit; } + $zip->extractTo($tdir); + $zip->close(); } - } - /** - * fetch the individual record page using - **/ - function fetch_page($url){ - preg_match("/show\/(NCT[0-9]+)/",$url,$m); - $file = $m[1]; - $outfile = parent::getParameterValue("indir")."/".$file.".xml"; - if(!file_exists($outfile) or - ((parent::getParameterValue("download") === true) and (parent::getParameterValue('overwrite') === true))) { - echo "fetching $url".PHP_EOL; - $xml = file_get_contents($url); - - # save the file - $ret = file_put_contents($outfile,$xml); - if($ret === FALSE) { - trigger_error("unable to save $outfile"); - return false; - } + $file_set = false; + $gz = (strstr(parent::getParameterValue('output_format'),".gz") === FALSE)?false:true; + if(parent::getParameterValue('id_list') != '') { + $id_list = explode(",",parent::getParameterValue('id_list')); + $ofile = "bio2rdf-clinicaltrials-selected-ids.".parent::getParameterValue('output_format'); + parent::setWriteFile($odir.$ofile, $gz); + $file_set = true; } - } - - - /** parse directory of files */ - function parse_dir(){ - $ignore = array("..",'.','.DS_STORE',"0"); - $this->setCheckPoint('dataset'); - $prefix = parent::getPrefix(); - $bVersion = parent::getParameterValue('bio2rdf_release'); - $date = date ("Y-m-d\TG:i:s\Z"); + #$ofile = "bio2rdf-clinicaltrials.".parent::getParameterValue('output_format'); + #parent::setWriteFile($odir.$ofile, $gz); - $dataset_file = parent::getParameterValue("outdir").parent::getBio2RDFReleaseFile(); - $fp = fopen($dataset_file,"w"); - if($fp === FALSE) { - trigger_error("Unable to open $dataset_file",E_USER_ERROR); - return false; - } - $ids = explode(",",parent::getParameterValue('id_list')); - - $indir = parent::getParameterValue('indir'); - echo "Processing $indir\n"; + $finished = false; + $d = dir($tdir); + $n = 0; $ftotal = 0; + while (false !== ($dir = $d->read())) { + if($dir == '.' or $dir == '..' or $dir == "Contents.txt") continue; - $outfile = "clinicaltrials.".parent::getParameterValue('output_format'); - $gz = (strstr(parent::getParameterValue('output_format'),".gz") === FALSE)?false:true; - parent::setWriteFile(parent::getParameterValue("outdir").$outfile,$gz); - - $files = glob($indir."NCT*"); - foreach($files AS $i => $file) { - if($i % 10000 == 0) {parent::clear();} - $trial_id = basename($file,'.xml'); - if(parent::getParameterValue('id_list') == '' || in_array($trial_id, $ids)) { - if(filesize($file)!=0) { - echo "Processing $trial_id".PHP_EOL; - $this->process_file($file); - } else{ - echo "Processing $trial_id -> Empty!".PHP_EOL; + $edir = $tdir."/".$dir; + + $d2 = dir($edir); + while (false !== ($e2 = $d2->read())) { + if($e2 == '.' or $e2 == '..') continue; + + $f = $edir."/$e2"; + $e = basename($e2,'.xml'); + if(!isset($id_list)) { + $n++; + if(($n % 10000) == 1) { + if(parent::getWriteFile() != null) { + #if($ftotal == 3) {$finished=true;break;} + parent::getWriteFile()->close(); + } + $ftotal ++; + $ofile = "bio2rdf-clinicaltrials-".str_pad($ftotal, 3, "0", STR_PAD_LEFT).".".parent::getParameterValue('output_format'); + parent::setWriteFile($odir.$ofile, $gz); + echo $ofile.PHP_EOL; + } + $this->process_file($f); + } else if(in_array($e, $id_list)) { + echo "processing $e2".PHP_EOL; + $this->process_file($f); + $key = array_search($e, $id_list); + unset($id_list[$key]); + if(count($id_list) == 0) $finished = true; } + if($finished == true) break; } + $d2->close(); + if($finished == true) break; } + $d->close(); + + echo "Finished.".PHP_EOL; parent::getWriteFile()->close(); - + exit; // make the dataset description parent::setGraphURI(parent::getDatasetURI()); - $rfile = "http://clinicaltrials.gov/ct2/show/NCT_ID?resultsxml=true"; $source_version = parent::getDatasetVersion(); + $prefix = parent::getPrefix(); + $bVersion = parent::getParameterValue('bio2rdf_release'); + $date = date ("Y-m-d\TG:i:s\Z"); // dataset description $source_file = (new DataResource($this)) ->setURI($rfile) ->setTitle("Clinicaltrials") - ->setRetrievedDate( date ("Y-m-d\TG:i:s\Z", filemtime($file))) + ->setRetrievedDate( date ("Y-m-d\TG:i:s\Z", filemtime($lfile))) ->setFormat("application/xml") ->setPublisher("http://clinicaltrials.gov/") ->setHomepage("http://clinicaltrials.gov/") @@ -207,7 +143,7 @@ function parse_dir(){ parent::writeToReleaseFile($source_file->toRDF()); $output_file = (new DataResource($this)) - ->setURI("http://download.bio2rdf.org/release/$bVersion/$prefix/$outfile") + ->setURI("http://download.bio2rdf.org/release/$bVersion/$prefix/$ofile") ->setTitle("Bio2RDF v$bVersion RDF version of $prefix v$source_version") ->setSource($source_file->getURI()) ->setCreator("https://github.com/bio2rdf/bio2rdf-scripts/blob/master/clinicaltrials/clinicaltrials.php") @@ -229,15 +165,15 @@ function parse_dir(){ parent::closeReleaseFile(); // write the dataset description file - fclose($fp); + fclose($fp); } + /** * process a results xml file from the download directory **/ - function process_file($infile) { - $indir = parent::getParameterValue('indir'); - $xml = new CXML($infile); + function process_file($entry) { + $xml = new CXML($entry); $this->setCheckPoint('file'); while($xml->Parse("clinical_study") == TRUE) { @@ -250,6 +186,8 @@ function process_file($infile) { $label = $this->getString("//brief_title"); if(!$label) $label = $this->getString("//official_title"); if(!$label) $label = "Clinical trial #".$nct_id; + + $label = trim(preg_replace("/\s+/",' ',$label)); parent::addRDF( parent::describeIndividual($study_id, $label, parent::getVoc()."Clinical-Study"). parent::describeClass(parent::getVoc()."Clinical-Study","Clinical Study") @@ -276,7 +214,7 @@ function process_file($infile) { if(isset($sids)) { foreach($sids AS $id) { parent::addRDF( - parent::triplifyString($study_id, parent::getVoc()."secondary-id", (string)$id, "xsd:string") + parent::triplifyString($study_id, parent::getVoc()."secondary-id", $this->safeString($id), "xsd:string") ); } } @@ -284,7 +222,7 @@ function process_file($infile) { if(isset($nctaliases)) { foreach($nctaliases AS $id) { parent::addRDF( - parent::triplifyString($study_id, parent::getVoc()."nct-alias", (string)$id, "xsd:string") + parent::triplifyString($study_id, parent::getVoc()."nct-alias", $this->safeString($id), "xsd:string") ); } } @@ -292,15 +230,23 @@ function process_file($infile) { ########################################################################################## #titles ########################################################################################## + $x = $this->getString("//brief_title"); + $brief_title = trim(preg_replace("/\s+/",' ',$x)); + + $x = $this->getString("//official_title"); + $official_title = trim(preg_replace("/\s+/",' ',$x)); + parent::addRDF( - parent::triplifyString($study_id, parent::getVoc()."brief-title",$this->getString("//brief_title")). - parent::triplifyString($study_id,parent::getVoc()."official-title",$this->getString("//official_title")) + parent::triplifyString($study_id, parent::getVoc()."brief-title",$brief_title). + parent::triplifyString($study_id,parent::getVoc()."official-title",$official_title) ); ################################################################################### #brief summary ################################################################################### - $brief_summary =str_replace( array("\r","\n","\t"), array(" "," "," "), $this->getString('//brief_summary/textblock')); + $x = $this->getString('//brief_summary/textblock'); + $brief_summary = trim(preg_replace("/\s+/",' ',$x)); + parent::addRDF( parent::triplifyString($study_id,$this->getVoc()."brief-summary",$brief_summary) ); @@ -309,7 +255,9 @@ function process_file($infile) { #################################################################################### # detailed description #################################################################################### - $d = str_replace( array("\r","\n","\t"), array(" "," "," "), $this->getString('//detailed_description/textblock')); + $x = $this->getString('//detailed_description/textblock'); + $d = trim(preg_replace("/\s+/",' ',$x)); + parent::addRDF( parent::triplifyString($study_id,parent::getVoc()."detailed-description",$d) ); @@ -466,7 +414,7 @@ function process_file($infile) { $key = parent::getRes().md5($c[0]); $value = parent::getRes().md5($c[1]); parent::addRDF( - parent::describeIndividual($sdp,$b,parent::getVoc()."Study-Design-Parameter"). + parent::describeIndividual($sdp,$this->safeString($b),parent::getVoc()."Study-Design-Parameter"). parent::describeClass(parent::getVoc()."Study-Design-Parameter","Study Design Parameter"). parent::triplify($sdp,parent::getVoc()."key",$key). parent::describeClass($key,$c[0]). @@ -568,7 +516,7 @@ function process_file($infile) { $mesh_label_id = parent::getRes().md5($condition); parent::addRDF( parent::triplify($study_id,parent::getVoc()."condition",$mesh_label_id). - parent::describeClass($mesh_label_id,$condition,parent::getVoc()."Condition"). + parent::describeClass($mesh_label_id,$this->safeString($condition),parent::getVoc()."Condition"). parent::describeClass(parent::getVoc()."Condition","Condition") ); } @@ -626,10 +574,12 @@ function process_file($infile) { ); $agl = $intervention->xpath("./arm_group_label"); foreach($agl AS $a) { + $label = $this->safeString($a); + $arm_group_id = md5($a); $ag = parent::getRes().$this->nct_id."/arm-group/".$arm_group_id; parent::addRDF( - parent::describeIndividual($ag,$a,parent::getVoc()."Arm-Group"). + parent::describeIndividual($ag,$label,parent::getVoc()."Arm-Group"). parent::describeClass(parent::getVoc()."Arm-Group","Arm Group"). parent::triplify($intervention_id, parent::getVoc()."arm-group",$ag) ); @@ -657,6 +607,9 @@ function process_file($infile) { if($criteria = @array_shift($eligibility->xpath('./criteria'))){ $text = @array_shift($criteria->xpath('./textblock')); + $x = str_replace(array('"',"'",'\\','�'),'', $text); + $text = trim(preg_replace("/\s+/",' ',$x)); + parent::addRDF( parent::triplifyString($eligibility_id, parent::getVoc()."text",$text) ); @@ -680,7 +633,7 @@ function process_file($infile) { if(isset($c[2])) { $d = explode(" - ",$c[1]); foreach($d AS $exclusion) { - $exc = trim($exclusion); + $exc = $this->safeString($exclusion); if($exc != '') { $exc_id = parent::getRes().md5($exc); parent::addRDF( @@ -706,7 +659,7 @@ function process_file($infile) { if($s != 'N/A') { $age = trim(str_replace("Years","",$s)); parent::addRDF( - parent::triplifyString($eligibility_id,parent::getVoc().str_replace("_","-",$a),$age) + parent::triplifyString($eligibility_id,parent::getVoc().str_replace("_","-",$this->safeString($a)),$age) ); } } @@ -753,7 +706,7 @@ function process_file($infile) { $d = @array_shift($root->xpath('//'.$c)); if($d) { parent::addRDF( - parent::triplify($study_id, parent::getVoc().str_replace("_","-",$c), $this->makeContact($d)) + parent::triplify($study_id, parent::getVoc().str_replace("_","-",$this->safeString($c)), $this->makeContact($d)) ); } } @@ -831,12 +784,16 @@ function process_file($infile) { $references = $root->xpath('//'.$ref_type); foreach($references as $reference){ $p = $this->getString('./PMID',$reference); + + $ref = $this->getString('./citation',$reference); + $ref = trim(preg_replace("/\s+/",' ',$x)); + if($p) { $pmid = "pubmed:$p"; parent::addRDF( parent::describeIndividual($pmid,$p,parent::getVoc()."Reference"). parent::describeClass(parent::getVoc()."Reference", "Reference"). - parent::triplifyString($pmid, parent::getVoc()."citation", $this->getString('./citation',$reference)). + parent::triplifyString($pmid, parent::getVoc()."citation", $ref). parent::triplify($study_id,parent::getVoc().str_replace("_","-",$ref_type),$pmid) ); } @@ -904,7 +861,7 @@ function process_file($infile) { $keywords = $root->xpath('//keyword'); foreach($keywords as $keyword){ parent::addRDF( - parent::triplifyString($study_id,parent::getVoc()."keyword",(string)$keyword) + parent::triplifyString($study_id,parent::getVoc()."keyword",$this->safeString($keyword)) ); } }catch(Exception $e){ @@ -916,7 +873,7 @@ function process_file($infile) { try{ $mesh_terms = $root->xpath('//condition_browse/mesh_term'); foreach($mesh_terms as $mesh_term){ - $term = (string)$mesh_term; + $term = $this->safeString($mesh_term); $mesh_id = parent::getRes().md5($term); parent::addRDF(parent::triplify($study_id,parent::getVoc()."condition-mesh",$mesh_id)); parent::addRDF(parent::triplifyString($mesh_id,"rdfs:label",$term)); @@ -946,7 +903,7 @@ function process_file($infile) { foreach($a AS $browse_type) { $terms = $root->xpath("//$browse_type/mesh_term"); foreach($terms as $term){ - $term_label = (string)$term; + $term_label = $this->safeString($term); $term_id = parent::getRes().md5($term); parent::addRDF( parent::describeIndividual($term_id,$term_label,parent::getVoc()."Term"). @@ -1111,10 +1068,12 @@ function process_file($infile) { # outcomes ################################################################################ try { + $o_n = 1; $outcomes = @array_shift($root->xpath('//outcome_list')); if($outcomes) { + foreach($outcomes AS $i => $outcome) { - $outcome_id = $this->nct_id."/outcome/".($i+1); + $outcome_id = $this->nct_id."/outcome/".($o_n++); $outcome_uri = parent::getRes().$outcome_id; $outcome_label = $this->getString("./title",$outcome); if(!$outcome_label) $outcome_label = "outcome for ".$this->nct_id; @@ -1137,7 +1096,7 @@ function process_file($infile) { } } - // measure list + // measure list # this has changed $measures = @array_shift($outcome->xpath('./measure_list')); if($measures) { foreach($measures AS $measure) { @@ -1146,7 +1105,14 @@ function process_file($infile) { ); } } + $measure = @array_shift($outcome->xpath('./measure')); + if($measure) { + parent::addRDF( + parent::triplify($outcome_uri,parent::getVoc()."measure", $this->makeMeasure($measure)) + ); + } + // analysis list $analyses = @array_shift($outcome->xpath('./analysis_list')); if($analyses) { @@ -1187,7 +1153,7 @@ function process_file($infile) { foreach($event_list AS $ev => $ev_label) { $et = @array_shift($reported_events->xpath('./'.$ev)); if(!$et) continue; - $ev_uri = parent::getVoc().str_replace(" ","-",$ev_label); + $ev_uri = parent::getVoc().str_replace(" ","-",$this->safeString($ev_label)); $categories = @array_shift($et->xpath('./category_list')); foreach($categories AS $category) { @@ -1236,9 +1202,9 @@ function process_file($infile) { } catch(Exception $e) { echo "Error in parsing reported events".PHP_EOL; } - parent::writeRDFBufferToWriteFile(); } + parent::writeRDFBufferToWriteFile(); $this->setCheckPoint('record'); $this->setCheckPoint('dataset'); } @@ -1248,9 +1214,14 @@ function getString($xpath,$element = null) $o = $this->root; if(isset($element)) $o = $element; $r = @array_shift($o->xpath($xpath)); - return ((string)$r[0]); + return $this->safeString($r[0]); } + function safeString($string) + { + return str_replace(array('"','\\'),array('','/'),(string)$string); + } + public function getMonthNumber($month) { $months = array( @@ -1346,29 +1317,33 @@ public function makeMeasure($measure) parent::triplifyString($measure_id, parent::getVoc()."dispersion", $this->getString('./dispersion', $measure)) ); - $categories = @array_shift($measure->xpath('./category_list')); - foreach($categories AS $category) { - $cid = parent::getRes().$this->nct_id."/category/".md5($category->asXML()); - $cat_label = $this->getString('./sub_title', $category); - if(!$cat_label) $cat_label = "category for measure"; - parent::addRDF( - parent::describeIndividual($cid, $cat_label, parent::getVoc()."Category"). - parent::describeClass(parent::getVoc()."Category","Category"). - parent::triplify($measure_id,parent::getVoc()."category",$cid) - ); - $ml = @array_shift($category->xpath('./measurement_list')); - foreach($ml AS $m) { - $mid = parent::getRes().$this->nct_id."/measurement/".md5($m->asXML()); + $categories = @array_shift($measure->xpath('./class_list/class/category_list')); + if(isset($categories)) { + foreach($categories AS $category) { + $cid = parent::getRes().$this->nct_id."/category/".md5($category->asXML()); + $cat_label = $this->getString('./sub_title', $category); + if(!$cat_label) $cat_label = "category for measure"; parent::addRDF( - parent::describeIndividual($mid, $this->nct_id." measurement", parent::getVoc()."Measurement"). - parent::describeClass(parent::getVoc()."Measurement","Measurement"). - parent::triplify($mid, parent::getVoc()."group-id", parent::getRes().$this->nct_id."/group/".$m->attributes()->group_id). - parent::triplifyString($mid, parent::getVoc()."value", $m->attributes()->value). - parent::triplifyString($mid, parent::getVoc()."spread", $m->attributes()->spread). - parent::triplifyString($mid, parent::getVoc()."lower-limit", $m->attributes()->lower_limit). - parent::triplifyString($mid, parent::getVoc()."upper-limit", $m->attributes()->upper_limit). - parent::triplify($cid, parent::getVoc()."measurement",$mid) + parent::describeIndividual($cid, $cat_label, parent::getVoc()."Category"). + parent::describeClass(parent::getVoc()."Category","Category"). + parent::triplify($measure_id,parent::getVoc()."category",$cid) ); + $ml = @array_shift($category->xpath('./measurement_list')); + if(isset($ml)) { + foreach($ml AS $m) { + $mid = parent::getRes().$this->nct_id."/measurement/".md5($m->asXML()); + parent::addRDF( + parent::describeIndividual($mid, $this->nct_id." measurement", parent::getVoc()."Measurement"). + parent::describeClass(parent::getVoc()."Measurement","Measurement"). + parent::triplify($mid, parent::getVoc()."group-id", parent::getRes().$this->nct_id."/group/".$m->attributes()->group_id). + parent::triplifyString($mid, parent::getVoc()."value", $m->attributes()->value). + parent::triplifyString($mid, parent::getVoc()."spread", $m->attributes()->spread). + parent::triplifyString($mid, parent::getVoc()."lower-limit", $m->attributes()->lower_limit). + parent::triplifyString($mid, parent::getVoc()."upper-limit", $m->attributes()->upper_limit). + parent::triplify($cid, parent::getVoc()."measurement",$mid) + ); + } + } } } return $measure_id; From 748a4804747cb3eec8614f47294a905d69a6ae24 Mon Sep 17 00:00:00 2001 From: Michel Dumontier Date: Mon, 1 Jun 2020 21:47:51 +0200 Subject: [PATCH 58/64] updated download file location --- ndc/ndc.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ndc/ndc.php b/ndc/ndc.php index 3c0f3fd..7b1f93e 100644 --- a/ndc/ndc.php +++ b/ndc/ndc.php @@ -36,7 +36,7 @@ function __construct($argv) { parent::__construct($argv, "ndc"); $this->AddParameter('files',true,'all|product|package','all','files to process'); - $this->AddParameter('download_url',false,null,'http://www.fda.gov/downloads/Drugs/DevelopmentApprovalProcess/UCM070838.zip'); + $this->AddParameter('download_url',false,null,'https://www.accessdata.fda.gov/cder/ndctext.zip'); parent::initialize(); } From 416c2127e7bbc6ab7c76ea886b3825275e214a0f Mon Sep 17 00:00:00 2001 From: Michel Dumontier Date: Mon, 1 Jun 2020 22:37:50 +0200 Subject: [PATCH 59/64] updates for mgi inc. download location, column mappings, and identifiers --- mgi/mgi.php | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/mgi/mgi.php b/mgi/mgi.php index 5b6f55d..3ceed95 100644 --- a/mgi/mgi.php +++ b/mgi/mgi.php @@ -36,7 +36,7 @@ class MGIParser extends Bio2RDFizer function __construct($argv) { parent::__construct($argv, "mgi"); parent::addParameter('files',true,'all|MGI_Strain|MGI_PhenotypicAllele|MGI_GenePheno|MRK_Sequence|MGI_Geno_Disease|MGI_Geno_NotDisease','all','all or comma-separated list to process'); - parent::addParameter('download_url', false, null,'ftp://ftp.informatics.jax.org/pub/reports/' ); + parent::addParameter('download_url', false, null,'http://www.informatics.jax.org/downloads/reports/' ); parent::initialize(); } @@ -66,7 +66,7 @@ function Run() parent::setReadFile($lfile,true); echo "Processing $item..."; - $ofile = $odir.$item.'.'.parent::getParameterValue('output_format'); + $ofile = $odir."bio2rdf-".$item.'.'.parent::getParameterValue('output_format'); $gz= strstr(parent::getParameterValue('output_format'), "gz")?true:false; parent::setWriteFile($ofile, $gz); @@ -141,8 +141,9 @@ function MGI_PhenotypicAllele($qtl = false) $a = explode("\t",$l); $line++; if($a[0][0] == "#") continue; - if(count($a) != 12) { - echo "Expecting 12 columns, but found ".count($a)." at line $line. skipping!".PHP_EOL; + $expected_columns = 13; + if(count($a) != $expected_columns) { + echo "Expecting $expected_columns columns, but found ".count($a)." at line $line. skipping!".PHP_EOL; if($errors++ == 25) {echo 'stopping'.PHP_EOL;break;} continue; } @@ -232,19 +233,19 @@ function MGI_PhenotypicAllele($qtl = false) 4 Mammalian Phenotype ID - MP:0000364 5 PubMed ID - 15466160 6 MGI Marker Accession ID (comma-delimited) - MGI:96522 - 7 blank - 8 MGI Genotype ID (comma-delimted) + 7 MGI Genotype ID (comma-delimted) */ function MGI_GenePheno() { $line = 1; while($l = $this->getReadFile()->read(248000)) { $a = explode("\t",$l); - if(count($a) != 9) { - trigger_error("Incorrect number of columns",E_USER_WARNING); - continue; + $exp = 8; + if(count($a) != $exp) { + trigger_error("Incorrect number of columns: Found ".count($a)." and was expecting $exp",E_USER_WARNING); + exit(); } - $id = trim($a[8]); + $id = trim($a[7]); $label = $a[0]." ".$a[3]; parent::addRDF( @@ -310,7 +311,7 @@ function MGI_GenePheno() function MRK_Sequence() { - $cols = 21; + $cols = 19; $line = 0; $h = $this->getReadFile()->read(500000); $o = $this->getReadFile()->read(500000); // extra feature header on a separate line...if you can imagine @@ -335,11 +336,10 @@ function MRK_Sequence() parent::triplifyString($id, parent::getVoc()."chromosome", $a[6], "xsd:string"). parent::triplifyString($id, parent::getVoc()."genome-start", $a[7], "xsd:string"). parent::triplifyString($id, parent::getVoc()."genome-end", $a[8], "xsd:string"). - parent::triplifyString($id, parent::getVoc()."strand", $a[7], "xsd:string"). - parent::triplifyString($id, parent::getVoc()."feature-type", $a[20], "xsd:string") + parent::triplifyString($id, parent::getVoc()."strand", $a[7], "xsd:string") ); $start_pos = 10; - $list = array("genbank","refseq-transcript","vega-transcript","ensembl-transcript","uniprot","trembl","vega-protein","ensembl-protein","refseq-protein","unigene"); + $list = array("genbank","refseq-transcript","ensembl-transcript","uniprot","trembl","ensembl-protein","refseq-protein","unigene"); $list_len = count($list); for($i=0;$i<$list_len;$i++) { $value = trim($a[$i+$start_pos]); @@ -413,7 +413,7 @@ function MGI_Geno_Disease() $genotype = $a[0]; $diseases = explode(",",$a[7]); foreach($diseases AS $d) { - $disease = "omim:$d"; + $disease = "$d"; foreach($alleles AS $allele) { $id = parent::getRes().md5($allele.$disease); $label = "$allele $disease association"; @@ -462,7 +462,7 @@ function MGI_Geno_NotDisease() $alleles = explode("|",strtolower($a[2])); $diseases = explode(",",$a[7]); foreach($diseases AS $d) { - $disease = "omim:$d"; + $disease = "$d"; foreach($alleles AS $allele) { $id = parent::getRes().md5($allele.$disease); From 1fc4163d3a0ef740cdd6fd6f61f67651f3041618 Mon Sep 17 00:00:00 2001 From: Michel Dumontier Date: Mon, 1 Jun 2020 22:50:21 +0200 Subject: [PATCH 60/64] fixed comparator error --- wormbase/wormbase.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wormbase/wormbase.php b/wormbase/wormbase.php index 023fff2..a604e05 100644 --- a/wormbase/wormbase.php +++ b/wormbase/wormbase.php @@ -156,7 +156,7 @@ function geneIDs() // taxon, gene id, symbol, cosmid, status $data = explode(",",trim($l)); if($first) { - if(($c = count($data) != 5)) { + if((($c = count($data)) != 5)) { trigger_error("WormBase function expects 5 fields, found $c!".PHP_EOL, E_USER_WARNING); } $first = false; From df74a3a5d087bf6f715fba72ca3039465cad24a9 Mon Sep 17 00:00:00 2001 From: Michel Dumontier Date: Mon, 1 Jun 2020 23:20:14 +0200 Subject: [PATCH 61/64] updates to wormbase --- wormbase/wormbase.php | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/wormbase/wormbase.php b/wormbase/wormbase.php index a604e05..89912ed 100644 --- a/wormbase/wormbase.php +++ b/wormbase/wormbase.php @@ -34,7 +34,7 @@ class WormbaseParser extends Bio2RDFizer { function __construct($argv) { parent::__construct($argv, "wormbase"); - parent::addParameter('files', true, 'all|geneIDs|functional_descriptions|gene_associations|gene_interactions|phenotype_associations','all','files to process'); + parent::addParameter('files', true, 'all|geneIDs|gene_associations|gene_interactions|phenotype_associations','all','files to process'); #functional_descriptions turned into flatfile, needs work parent::addParameter('release', false, null, 'current', 'Release version of WormBase'); parent::addParameter('download_url', false, null,'ftp://ftp.wormbase.org/pub/wormbase/'); parent::initialize(); @@ -49,10 +49,10 @@ public function run() $files = explode(",",parent::getParameterValue('files')); } $release = parent::getParameterValue('release'); - $releaseb = "WS249"; + $releaseb = "WS276"; $remote_files = array( "geneIDs" => "species/c_elegans/annotation/geneIDs/c_elegans.PRJNA13758.".$release.".geneIDs.txt.gz", - "functional_descriptions" => "species/c_elegans/annotation/functional_descriptions/c_elegans.PRJNA13758.".$release.".functional_descriptions.txt.gz", + #"functional_descriptions" => "species/c_elegans/annotation/functional_descriptions/c_elegans.PRJNA13758.".$release.".functional_descriptions.txt.gz", "gene_interactions" => "species/c_elegans/annotation/gene_interactions/c_elegans.PRJNA13758.".$release.".gene_interactions.txt.gz", "gene_associations" => "releases/current-production-release/ONTOLOGY/gene_association.".$releaseb.".wb", "phenotype_associations" => "releases/current-production-release/ONTOLOGY/phenotype_association.".$releaseb.".wb" @@ -92,7 +92,7 @@ public function run() } $suffix = parent::getParameterValue('output_format'); - $ofile = "wormbase.".$file.".".$suffix; + $ofile = "bio2rdf-wormbase-".$file.".".$suffix; $gz = strstr(parent::getParameterValue('output_format'), "gz")?true:false; parent::setWriteFile($odir.$ofile, $gz); @@ -153,11 +153,12 @@ function geneIDs() $first = true; while($l = $this->getReadFile()->read()){ if($l[0] == '#') continue; - // taxon, gene id, symbol, cosmid, status + // taxon, gene id, symbol, cosmid, status, type $data = explode(",",trim($l)); if($first) { - if((($c = count($data)) != 5)) { - trigger_error("WormBase function expects 5 fields, found $c!".PHP_EOL, E_USER_WARNING); + $exp = 6; + if((($c = count($data)) != $exp)) { + trigger_error("WormBase function expects $exp fields, found $c!".PHP_EOL, E_USER_WARNING); } $first = false; } @@ -194,7 +195,10 @@ function functional_descriptions() // gene_id public_name molecular_name concise_description provisional_description detailed_description automated_description gene_class_description $a = explode("\t",$l); - if(count($a) != 8) {trigger_error("Found one row that only has ".count($a)." columns, expecting 8",E_USER_ERROR);continue;} + if(count($a) != 8) { + trigger_error("Found one row that only has ".count($a)." columns, expecting 8",E_USER_ERROR); + continue; + } $id = parent::getNamespace().$a[0]; $label = $a[1].($a[2]?" (".$a[2].")":""); @@ -230,7 +234,9 @@ function gene_associations(){ 'NAS'=>'eco:0000034', 'ND'=>'eco:0000035', 'RCA'=>'eco:0000245', - 'TAS'=>'eco:0000033' + 'TAS'=>'eco:0000033', + 'HEP'=>'eco:0007007', + 'HDA'=>'eco:0007005' ); From 75e0743eb1633774b177d348362637e8ae9eb4f2 Mon Sep 17 00:00:00 2001 From: Michel Dumontier Date: Tue, 2 Jun 2020 22:18:01 +0200 Subject: [PATCH 62/64] fixes for taxonomy --- taxonomy/taxonomy.php | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/taxonomy/taxonomy.php b/taxonomy/taxonomy.php index c1bbac8..c2d7b76 100644 --- a/taxonomy/taxonomy.php +++ b/taxonomy/taxonomy.php @@ -164,7 +164,7 @@ public function Run(){ trigger_error("Unable to get pointer to $fn in $zinfile"); exit("failed\n"); } - $gzoutfile = $odir."taxonomy-$k".".".parent::getParameterValue('output_format'); + $gzoutfile = $odir."bio2rdf-taxonomy-$k".".".parent::getParameterValue('output_format'); //set the write file $gz= strstr(parent::getParameterValue('output_format'), 'gz')?true:false; @@ -218,14 +218,14 @@ private function names(){ $rel = parent::getVoc().str_replace(" ","-",$a[3]); parent::addRDF( - parent::triplifyString($taxid, $rel, $name). - parent::triplifyString($taxid, parent::getVoc()."unique-name", utf8_encode($a[2])) + parent::triplifyString($taxid, $rel, addslashes($name)). + parent::triplifyString($taxid, parent::getVoc()."unique-name", addslashes(utf8_encode($a[2]))) ); if($rel == "scientific-name") { parent::addRDF( - parent::triplifyString($taxid, "dc:title", $name). - parent::triplifyString($taxid, "rdfs:label", $name) + parent::triplifyString($taxid, "dc:title", addslashes($name)). + parent::triplifyString($taxid, "rdfs:label", addslashes($name)) ); } @@ -327,20 +327,26 @@ private function citations() continue; } $c = parent::getRes()."citation-id-".$a[0]; - $seealso = isset($a[4])?trim($a[4]):""; +/* $seealso = isset($a[4])?trim($a[4]):""; if($seealso) { + echo $seealso.PHP_EOL; $seealso = str_replace(array("lx: DOI ","http;//"), array("https://doi.org/","http://"), $seealso); if(strlen($seealso) > 2 and !strstr($seealso,"http")) $seealso = "http://".$seealso; - $seealso = parent::triplify($c, "rdfs:seeAlso", $seealso); + $seealso = parent::triplifyString($c, "rdfs:seeAlso", addslashes($seealso)); # all kinds of garbarge in this field + } +*/ + $text = ''; + if(isset($a[5])) { + $text = str_replace(array('"',"'","",'\\',),'',$a[5]); # get rid of garbage characters } - + parent::addRDF( parent::describeIndividual($c, $a[1], $this->getVoc()."Citation"). parent::describeClass($this->getVoc()."Citation", "Citation"). parent::triplifyString($c, parent::getVoc()."citation-key", $a[1]). ($a[2]=="0"?"":parent::triplify($c, parent::getVoc()."x-pubmed", "pubmed:".$a[2])). - $seealso. - ((isset($a[5]) and $a[5])?parent::triplifyString($c, parent::getVoc()."text", str_replace("\"","", $a[5])):"") +# $seealso. + $text?parent::triplifyString($c, parent::getVoc()."text", $text):"" ); if(isset($a[6])) { $taxids = explode(" ", trim($a[6])); From fc9bea99912f78cf3ff16b2eae2c1c8cd6e33f0b Mon Sep 17 00:00:00 2001 From: Michel Dumontier Date: Tue, 2 Jun 2020 23:03:24 +0200 Subject: [PATCH 63/64] changes to download urls; fixes to record processor --- mesh/mesh.php | 35 ++++++++++++++++------------------- 1 file changed, 16 insertions(+), 19 deletions(-) diff --git a/mesh/mesh.php b/mesh/mesh.php index d40313f..f1b726e 100644 --- a/mesh/mesh.php +++ b/mesh/mesh.php @@ -150,8 +150,8 @@ class MeshParser extends Bio2RDFizer{ function __construct($argv) { parent::__construct($argv, "mesh"); parent::addParameter('files', true, 'all|descriptors|qualifiers|supplementary', 'all', 'all or comma-separated list of files to process'); - parent::addParameter('download_url',false,'','ftp://nlmpubs.nlm.nih.gov/online/mesh/.asciimesh/','default ftp location'); - parent::addParameter('year', false, '','2014',"Year to process"); + parent::addParameter('download_url',false,'','ftp://nlmpubs.nlm.nih.gov/online/mesh/YEAR/asciimesh/','default ftp location'); + parent::addParameter('year', false, '','2019',"Year to process"); parent::initialize(); }//constructor @@ -180,7 +180,7 @@ function Run(){ $file = str_replace("YEAR",$year,$fpattern); $lfile = $ldir.$file; $rfile = parent::getParameterValue("download_url").$file; - + $rfile = str_replace("YEAR",$year,$rfile); // download if necessary if(!file_exists($lfile) || parent::getParameterValue('download') == "true") { echo "Downloading $file ... "; @@ -193,7 +193,7 @@ function Run(){ } //set the outfile - $ofile = "mesh_".$k.".".parent::getParameterValue('output_format'); + $ofile = "bio2rdf-mesh-".$k.".".parent::getParameterValue('output_format'); $gz= strstr(parent::getParameterValue('output_format'), "gz")?true:false; echo "processing $k ..."; @@ -249,9 +249,8 @@ function Run(){ private function supplementary(){ $sup_rec = ""; - while($aLine = $this->GetReadFile()->Read(200000)){ - preg_match("/^\n$/", $aLine, $matches); - if(count($matches)){ + while(FALSE !== ($aLine = $this->GetReadFile()->Read(200000))){ + if(strlen($aLine) == 0){ $dR = $this->readRecord($sup_rec); $this->makeSupplementaryRecord($dR); $sup_rec = ""; @@ -259,15 +258,14 @@ private function supplementary(){ } preg_match("/\*NEWRECORD/", $aLine, $matches); if(count($matches) == 0){ - $sup_rec .= $aLine; + $sup_rec .= $aLine.PHP_EOL; } } } private function descriptors(){ $descriptor_record = ""; - while($aLine = $this->GetReadFile()->Read(200000)){ - preg_match("/^\n$/", $aLine, $matches); - if(count($matches)){ + while(FALSE !== ($aLine = $this->GetReadFile()->Read(200000))){ + if(strlen($aLine) == 0){ $dR = $this->readRecord($descriptor_record); $this->makeDescriptorRecord($dR); $descriptor_record = ""; @@ -275,16 +273,15 @@ private function descriptors(){ } preg_match("/\*NEWRECORD/", $aLine, $matches); if(count($matches) == 0){ - $descriptor_record .= $aLine; + $descriptor_record .= $aLine.PHP_EOL; } } } private function qualifiers(){ $qualifier_record = ""; - while($aLine = $this->GetReadFile()->Read(200000)){ - preg_match("/^\n$/", $aLine, $matches); - if(count($matches)){ + while(FALSE !== ($aLine = $this->GetReadFile()->Read(200000))){ + if(strlen($aLine) == 0){ $qR = $this->readRecord($qualifier_record); $this->makeQualifierRecordRDF($qR); $qualifier_record = ""; @@ -292,7 +289,7 @@ private function qualifiers(){ } preg_match("/\*NEWRECORD/", $aLine, $matches); if(count($matches) == 0){ - $qualifier_record .= $aLine; + $qualifier_record .= $aLine.PHP_EOL; } } } @@ -437,7 +434,7 @@ private function makeSupplementaryRecord($sup_record_arr){ if($k == "SO"){ foreach($v as $kv => $vv){ parent::AddRDF( - parent::triplifyString($sr_res, $this->getVoc().$sde['SO'], utf8_encode(htmlspecialchars($vv))). + parent::triplifyString($sr_res, $this->getVoc().$sde['SO'], addslashes(utf8_encode(htmlspecialchars($vv)))). parent::describeProperty($this->getVoc().$sde['SO'], "Relationship between a supplementary record and its source") ); } @@ -499,7 +496,7 @@ private function makeDescriptorRecord($desc_record_arr){ $vvrar = explode(";", $vv); foreach($vvrar as $anAn){ parent::AddRDF( - parent::triplifyString($dr_res, $this->getVoc().$qde["AN"], $anAn). + parent::triplifyString($dr_res, $this->getVoc().$qde["AN"], addslashes($anAn)). parent::describeProperty($this->getVoc().$qde["AN"], "Relationship between a descriptor and its annotation") ); }//foreach @@ -866,7 +863,7 @@ private function makeQualifierRecordRDF($qual_record_arr){ $vvrar = explode(";", $vv); foreach($vvrar as $anAn){ parent::AddRDF( - parent::triplifyString($qr_res, $this->getVoc().$qde["AN"], $anAn). + parent::triplifyString($qr_res, $this->getVoc().$qde["AN"], addslashes($anAn)). parent::describeProperty($this->getVoc().$qde["AN"], "Relationship between a qualifier record and its annotation") ); }//foreach From 84944dc4d0adfc3effbab21e5f925e0f7b4333d4 Mon Sep 17 00:00:00 2001 From: Michel Dumontier Date: Tue, 9 Jun 2020 20:27:48 +0200 Subject: [PATCH 64/64] fix for namespace not provided in obo file --- bioportal/bioportal.php | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/bioportal/bioportal.php b/bioportal/bioportal.php index 136853c..83266c7 100644 --- a/bioportal/bioportal.php +++ b/bioportal/bioportal.php @@ -36,7 +36,7 @@ function __construct($argv) { parent::__construct($argv,'bioportal'); parent::addParameter('files',true,null,'all','all or comma-separated list of ontology short names to process'); parent::addParameter('download_url',false,null,'http://data.bioontology.org/'); - parent::addParameter('exclude',false,null,"AURA",'ontologies to exclude - use acronyms'); + parent::addParameter('exclude',false,null,"AURA,HOOM",'ontologies to exclude - use acronyms'); parent::addParameter('continue_from',false,null,"",'the ontology abbreviation to restart from'); parent::addParameter('ncbo_api_key',false,null,null,'BioPortal API key (please use your own)'); parent::addParameter('ncbo_api_key_file',false,null,'ncbo.api.key','BioPortal API key file'); @@ -123,7 +123,6 @@ function Run() if(isset($ls['description'])) $description = $ls['description']; $rfile = $ls['ontology']['links']['download']; - $lfile = $abbv.".".$format.".gz"; if(!file_exists($idir.$lfile) or parent::getParameterValue('download') == 'true') { echo "downloading ... "; @@ -134,7 +133,7 @@ function Run() $ret = curl_setopt($ch, CURLOPT_HEADER, 1); $ret = curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1); $ret = curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1); - $ret = curl_setopt($ch, CURLOPT_TIMEOUT, 300); + $ret = curl_setopt($ch, CURLOPT_TIMEOUT, 600); $ret = curl_exec($ch); if(!$ret) {echo "no content";continue;} @@ -167,12 +166,13 @@ function Run() // process echo "converting ... "; - set_time_limit(0); + // let's double check the format $fp = gzopen($idir.$lfile,"r"); $l = gzgets($fp); if(strstr($l,"xml")) $format= "owl"; gzclose($fp); + if($format == 'obo') { $this->OBO2RDF($abbv); } else if($format == 'owl') { @@ -182,6 +182,7 @@ function Run() } else { echo "no processor for $label (format $format)".PHP_EOL; } + if(!file_exists($odir.$ofile)) { echo "no output".PHP_EOL;continue;} parent::getWriteFile()->close(); parent::clear(); @@ -366,7 +367,7 @@ public function TriplifyMap($a, $prefix) } else { parent::addRDF( - parent::triplifyString($s_uri,$p_uri,$a['o'],(($a['o_datatype'] == '')?null:$a['o_datatype']),(($a['o_lang'] == '')?null:$a['o_lang'])) + parent::triplifyString($s_uri,$p_uri,addslashes($a['o']),(($a['o_datatype'] == '')?null:$a['o_datatype']),(($a['o_lang'] == '')?null:$a['o_lang'])) ); } @@ -461,6 +462,7 @@ function OBO2RDF($abbv) else {$ns = strtolower($c[0]);$id=$c[1];} $id = str_replace( array("(",")"), array("_",""), $id); $tid = $ns.":".$id; + echo $tid.PHP_EOL; } else if($a[0] == "name") { $buf .= parent::describeClass($tid,addslashes(stripslashes($a[1]))); } else if($a[0] == "is_a") { @@ -483,7 +485,8 @@ function OBO2RDF($abbv) $buf .= $t; $is_deprecated = true; } else if($a[0] == "id") { - parent::getRegistry()->parseQName($a[1],$ns,$id); + parent::getRegistry()->parseQName($a[1],$ns,$id); + if(trim($ns) == '') $ns = "unspecified"; $tid = "$ns:$id"; // $buf .= parent::describeClass($tid,null,"owl:Class"); // $buf .= parent::triplify($tid,"rdfs:isDefinedBy",$ouri); @@ -610,6 +613,7 @@ function OBO2RDF($abbv) } else if($a[0] == "is_a") { // do subclassing parent::getRegistry()->parseQName($a[1],$ns,$id); + if(trim($ns) == '') $ns = "unspecified"; $t = parent::triplify($tid,"rdfs:subClassOf","$ns:$id"); $buf .= $t; $min .= $t; @@ -657,17 +661,19 @@ function OBO2RDF($abbv) $c = explode(" ",$a[1]); if(count($c) == 1) { // just a class parent::getRegistry()->parseQName($c[0],$ns,$id); + if(trim($ns) == '') $ns = "unspecified"; $relationship .= parent::getRegistry()->getFQURI("$ns:$id"); $buf .= parent::triplify($tid,"rdfs:subClassOf","$ns:$id"); } else if(count($c) == 2) { // an expression parent::getRegistry()->parseQName($c[0],$pred_ns,$pred_id); parent::getRegistry()->parseQName($c[1],$obj_ns,$obj_id); + if(trim($obj_ns) == '') $obj_ns = "unspecified"; $relationship .= '_:b'.$bid.' <'.parent::getRegistry()->getFQURI('owl:onProperty').'> <'.parent::getRegistry()->getFQURI("obo_vocabulary:".$pred_id)."> $graph_uri .".PHP_EOL; $relationship .= '_:b'.$bid.' <'.parent::getRegistry()->getFQURI('owl:someValuesFrom').'> <'.parent::getRegistry()->getFQURI("$obj_ns:$obj_id")."> $graph_uri .".PHP_EOL; - $buf .= parent::triplify($tid,"obo_vocabulary:$pred_id","$obj_ns:$obj_id"); + $buf .= parent::triplify($tid,"obo_vocabulary:$pred_id","$obj_ns:$obj_id"); #@todo this causes problem with OGG-MM } } else { // default handler