diff --git a/MIT-LICENSE.txt b/MIT-LICENSE.txt
new file mode 100644
index 0000000..4639eef
--- /dev/null
+++ b/MIT-LICENSE.txt
@@ -0,0 +1,21 @@
+Copyright 2014 Bio2RDF project team and other contributors
+http://bio2rdf.org
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..1840c13
--- /dev/null
+++ b/README.md
@@ -0,0 +1,10 @@
+Bio2RDF-scripts
+===============
+This Git repository holds all of the RDF converter scripts used to generate Bio2RDF linked data.
+
+Requirements
+-------------
+See the [wiki](https://github.com/bio2rdf/bio2rdf-scripts/wiki) for details.
+
+---
+Licensed under the [MIT License](http://en.wikipedia.org/wiki/MIT_License); see the [license page](https://github.com/bio2rdf/bio2rdf-scripts/wiki/MIT-License) for details.
diff --git a/bioportal/bioportal.php b/bioportal/bioportal.php
index 19e1065..83266c7 100644
--- a/bioportal/bioportal.php
+++ b/bioportal/bioportal.php
@@ -36,7 +36,7 @@ function __construct($argv) {
parent::__construct($argv,'bioportal');
parent::addParameter('files',true,null,'all','all or comma-separated list of ontology short names to process');
parent::addParameter('download_url',false,null,'http://data.bioontology.org/');
- parent::addParameter('exclude',false,null,"AURA",'ontologies to exclude - use acronyms');
+ parent::addParameter('exclude',false,null,"AURA,HOOM",'ontologies to exclude - use acronyms');
parent::addParameter('continue_from',false,null,"",'the ontology abbreviation to restart from');
parent::addParameter('ncbo_api_key',false,null,null,'BioPortal API key (please use your own)');
parent::addParameter('ncbo_api_key_file',false,null,'ncbo.api.key','BioPortal API key file');
@@ -123,7 +123,6 @@ function Run()
if(isset($ls['description'])) $description = $ls['description'];
$rfile = $ls['ontology']['links']['download'];
-
$lfile = $abbv.".".$format.".gz";
if(!file_exists($idir.$lfile) or parent::getParameterValue('download') == 'true') {
echo "downloading ... ";
@@ -134,7 +133,7 @@ function Run()
$ret = curl_setopt($ch, CURLOPT_HEADER, 1);
$ret = curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
$ret = curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
- $ret = curl_setopt($ch, CURLOPT_TIMEOUT, 300);
+ $ret = curl_setopt($ch, CURLOPT_TIMEOUT, 600);
$ret = curl_exec($ch);
if(!$ret) {echo "no content";continue;}
@@ -167,12 +166,13 @@ function Run()
// process
echo "converting ... ";
- set_time_limit(0);
+
// let's double check the format
$fp = gzopen($idir.$lfile,"r");
$l = gzgets($fp);
if(strstr($l,"xml")) $format= "owl";
gzclose($fp);
+
if($format == 'obo') {
$this->OBO2RDF($abbv);
} else if($format == 'owl') {
@@ -182,6 +182,7 @@ function Run()
} else {
echo "no processor for $label (format $format)".PHP_EOL;
}
+
if(!file_exists($odir.$ofile)) { echo "no output".PHP_EOL;continue;}
parent::getWriteFile()->close();
parent::clear();
@@ -366,7 +367,7 @@ public function TriplifyMap($a, $prefix)
} else {
parent::addRDF(
- parent::triplifyString($s_uri,$p_uri,$a['o'],(($a['o_datatype'] == '')?null:$a['o_datatype']),(($a['o_lang'] == '')?null:$a['o_lang']))
+ parent::triplifyString($s_uri,$p_uri,addslashes($a['o']),(($a['o_datatype'] == '')?null:$a['o_datatype']),(($a['o_lang'] == '')?null:$a['o_lang']))
);
}
@@ -394,7 +395,7 @@ function OBO2RDF($abbv)
$graph_uri = '<'.parent::getRegistry()->getFQURI(parent::getGraphURI()).'>';
$bid = 1;
- while($l = parent::getReadFile()->read()) {
+ while(FALSE !== ($l = parent::getReadFile()->read())) {
$lt = trim($l);
if(strlen($lt) == 0) continue;
if($lt[0] == '!') continue;
@@ -461,6 +462,7 @@ function OBO2RDF($abbv)
else {$ns = strtolower($c[0]);$id=$c[1];}
$id = str_replace( array("(",")"), array("_",""), $id);
$tid = $ns.":".$id;
+ echo $tid.PHP_EOL;
} else if($a[0] == "name") {
$buf .= parent::describeClass($tid,addslashes(stripslashes($a[1])));
} else if($a[0] == "is_a") {
@@ -483,7 +485,8 @@ function OBO2RDF($abbv)
$buf .= $t;
$is_deprecated = true;
} else if($a[0] == "id") {
- parent::getRegistry()->parseQName($a[1],$ns,$id);
+ parent::getRegistry()->parseQName($a[1],$ns,$id);
+ if(trim($ns) == '') $ns = "unspecified";
$tid = "$ns:$id";
// $buf .= parent::describeClass($tid,null,"owl:Class");
// $buf .= parent::triplify($tid,"rdfs:isDefinedBy",$ouri);
@@ -610,6 +613,7 @@ function OBO2RDF($abbv)
} else if($a[0] == "is_a") {
// do subclassing
parent::getRegistry()->parseQName($a[1],$ns,$id);
+ if(trim($ns) == '') $ns = "unspecified";
$t = parent::triplify($tid,"rdfs:subClassOf","$ns:$id");
$buf .= $t;
$min .= $t;
@@ -657,17 +661,19 @@ function OBO2RDF($abbv)
$c = explode(" ",$a[1]);
if(count($c) == 1) { // just a class
parent::getRegistry()->parseQName($c[0],$ns,$id);
+ if(trim($ns) == '') $ns = "unspecified";
$relationship .= parent::getRegistry()->getFQURI("$ns:$id");
$buf .= parent::triplify($tid,"rdfs:subClassOf","$ns:$id");
} else if(count($c) == 2) { // an expression
parent::getRegistry()->parseQName($c[0],$pred_ns,$pred_id);
parent::getRegistry()->parseQName($c[1],$obj_ns,$obj_id);
+ if(trim($obj_ns) == '') $obj_ns = "unspecified";
$relationship .= '_:b'.$bid.' <'.parent::getRegistry()->getFQURI('owl:onProperty').'> <'.parent::getRegistry()->getFQURI("obo_vocabulary:".$pred_id)."> $graph_uri .".PHP_EOL;
$relationship .= '_:b'.$bid.' <'.parent::getRegistry()->getFQURI('owl:someValuesFrom').'> <'.parent::getRegistry()->getFQURI("$obj_ns:$obj_id")."> $graph_uri .".PHP_EOL;
- $buf .= parent::triplify($tid,"obo_vocabulary:$pred_id","$obj_ns:$obj_id");
+ $buf .= parent::triplify($tid,"obo_vocabulary:$pred_id","$obj_ns:$obj_id"); #@todo this causes problem with OGG-MM
}
} else {
// default handler
@@ -676,7 +682,8 @@ function OBO2RDF($abbv)
} else {
//header
//format-version: 1.0
- $buf .= parent::triplifyString($ouri,"obo_vocabulary:$a[0]",str_replace( array('"','\:'), array('\"',':'), isset($a[1])?$a[1]:""));
+ $buf .= parent::triplifyString($ouri,"obo_vocabulary:$a[0]",
+ str_replace( array('"','\:'), array('\"',':'), isset($a[1])?$a[1]:""));
}
if($minimal || $minimalp) parent::getWriteFile()->write($min);
diff --git a/chembl/chembl.php b/chembl/chembl.php
index c4b3605..b0f5646 100644
--- a/chembl/chembl.php
+++ b/chembl/chembl.php
@@ -1017,8 +1017,68 @@ function compounds($connection) {
}
parent::writeRDFBufferToWriteFile();
}
+ }
+ $result->free();
+ }
- $result->free();
+ /*
+ * parse the assays tables
+ */
+ function process_assays() {
+
+ $this->set_write_file("assays");
+
+ $allIDs = mysql_query(
+ "SELECT DISTINCT * FROM assays, assay_type " .
+ "WHERE assays.assay_type = assay_type.assay_type"
+ );
+
+ $num = mysql_numrows($allIDs);
+
+ while ($row = mysql_fetch_assoc($allIDs)) {
+
+ $assay = "chembl:assay_".$row['assay_id'];
+ $this->AddRDF($this->QQuad($assay,"rdf:type","chembl_vocabulary:Assay"));
+
+ //chembl assay id
+ $chembl = "chembl:". $row['chembl_id'];
+ $this->AddRDF($this->QQuadl($assay,"dc:identifier",$row['chembl_id']));
+ $this->AddRDF($this->QQuad($assay,"owl:equivalentClass",$chembl));
+ $this->AddRDF($this->QQuad($chembl,"owl:equivalentClass",$assay));
+ $this->WriteRDFBufferToWriteFile();
+
+ if ($row['description']) {
+ # clean up description
+ $description = $row['description'];
+ $description = str_replace("\\", "\\\\", $description);
+ $description = str_replace("\"", "\\\"", $description);
+ $this->AddRDF($this->QQuadl($assay,"chembl_vocabulary:hasDescription",$description));
+ }
+
+ if ($row['doc_id']){
+ $this->AddRDF($this->QQuad($assay,"chembl_vocabulary:citesAsDataSource","chembl:reference_".$row['doc_id']));
+ }
+
+ $props = mysql_query("SELECT DISTINCT * FROM assay2target WHERE assay_id = " . $row['assay_id']);
+
+ while ($prop = mysql_fetch_assoc($props)) {
+ if ($prop['tid']) {
+ $target = "chembl:target_".$prop['tid'];
+ $this->AddRDF($this->QQuad($assay,"chembl_vocabulary:hasTarget",$target));
+
+ if ($prop['confidence_score']) {
+ $targetScore = "chembl:tscore_".md5($assay.$prop['tid']);
+ $this->AddRDF($this->QQuad($assay,"chembl_vocabulary:hasTargetScore",$targetScore));
+ $this->AddRDF($this->QQuad($targetScore,"chembl_vocabulary:forTarget",$target));
+ $this->AddRDF($this->QQuadl($targetScore,"rdf:value",$prop['confidence_score']));
+ }
+ }
+
+ $this->WriteRDFBufferToWriteFile();
+
+ }
+ $this->AddRDF($this->QQuad($assay,"chembl_vocabulary:hasAssayType","chembl_vocabulary:".$row['assay_desc']));
+ $this->WriteRDFBufferToWriteFile();
}
}
@@ -1287,4 +1347,5 @@ function protein_families($connection){
}
}
}
+
?>
diff --git a/clinicaltrials/clinicaltrials.php b/clinicaltrials/clinicaltrials.php
index 0025506..047cba9 100644
--- a/clinicaltrials/clinicaltrials.php
+++ b/clinicaltrials/clinicaltrials.php
@@ -22,7 +22,7 @@
@author :: Dana Klasen
@author :: Michel Dumontier
- @version :: 0.3
+ @version :: 0.4
@description :: clinicaltrials.gov parser
*/
@@ -35,167 +35,103 @@ function __construct($argv)
{
parent::__construct($argv,"clinicaltrials");
parent::addParameter('files',true,'all','all','files to process');
- parent::addParameter('download_url',false,null,'http://clinicaltrials.gov/ct2/crawl');
- parent::addParameter('overwrite',false,'true|false','false','overwrite existing files with download option');
+ parent::addParameter('download_url',false,null,'https://clinicaltrials.gov/AllPublicXML.zip');
parent::initialize();
}
function run()
{
- if(parent::getParameterValue('download') === true) $this->crawl();
- $this->parse_dir();
- }
-
-
- /**
- * generate the proper subdir based on the file name
- **/
- function get_sub_dir($entry)
- {
- $bin_range = 10;
-
- preg_match('/NCT[0]+(\d+)\.xml$/', $entry,$matches);
- $record_number = $matches[1];
-
- // find last multiple of bin_range
- $count = -strlen($bin_range);
- $marker = substr($record_number, $count);
-
- $curr_bin = substr($marker, 0,1). str_repeat(0,intval(strlen($bin_range))-1);
-
- $sub_dir = substr($record_number, 0,$count).$curr_bin;
-
- return $sub_dir;
- }
- /**
- * scape the clinical gov site for the links to invididual records
- **/
- function crawl(){
- $crawl_url = parent::getParameterValue("download_url"); //"http://clinicaltrials.gov/ct2/crawl";
- echo "Fetching clinical trial list...".PHP_EOL;
- $html = file_get_contents($crawl_url);
- if($html === FALSE) {
- trigger_error("unable to get crawl file");
- return false;
- }
- echo "done.".PHP_EOL;
-
- $dom = new DOMDocument();
- @$dom->loadHTML($html);
-
- // grab all the links on the page
- $xpath = new DOMXPath($dom);
- $hrefs = $xpath->evaluate("/html/body//a");
-
- for ($i = 0; $i < $hrefs->length; $i++) {
- $href = $hrefs->item($i);
- if(preg_match("/crawl\/([0-9]+)/",$href->getAttribute('href'))){
- $record_block_url = "http://clinicaltrials.gov".$href->getAttribute('href');
- $this->fetch_record_block($record_block_url);
+ $ldir = parent::getParameterValue('indir');
+ $tdir = $ldir."clinicaltrials";
+ $odir = parent::getParameterValue('outdir');
+
+ $lfile = $ldir.'clinicaltrials.zip'; # giving it this local file name
+ $rfile = parent::getParameterValue('download_url');
+ if(!file_exists($lfile) || parent::getParameterValue('download') == 'true') {
+ #download and extract to temp dir
+ $ret = utils::downloadSingle($rfile,$lfile);
+ if($ret === false) {
+ trigger_error("unable to download $rfile", E_USER_ERROR); # name the remote URL that failed ($file is not defined in this method)
}
- }
- }
-
- /**
- * Fetch the page holding a block of records
- **/
- function fetch_record_block($url){
- echo "Fetching record block...".PHP_EOL;
- $html = file_get_contents($url);
- if($html === FALSE) {
- trigger_error("unable to fetch record block at $url",E_USER_ERROR);
- return false;
- }
- echo "done.".PHP_EOL;
-
- $dom = new DOMDocument();
- @$dom->loadHTML($html);
-
- $xpath = new DOMXPath($dom);
- $hrefs = $xpath->evaluate("/html/body//a");
-
- for ($i = 0; $i < $hrefs->length; $i++) {
- $href = $hrefs->item($i);
- if(preg_match("/ct2\/show\//",$href->getAttribute('href'))){
- $page_uri = "http://clinicaltrials.gov/".$href->getAttribute('href')."?resultsxml=true";
- $this->fetch_page($page_uri);
+ $zip = new ZipArchive();
+ if ($zip->open($lfile) !== TRUE) { # open() yields TRUE on success and an error code otherwise, so compare against TRUE
+ trigger_error("Unable to open $lfile");
+ exit;
}
+ $zip->extractTo($tdir);
+ $zip->close();
}
- }
- /**
- * fetch the individual record page using
- **/
- function fetch_page($url){
- preg_match("/show\/(NCT[0-9]+)/",$url,$m);
- $file = $m[1];
- $outfile = parent::getParameterValue("indir")."/".$file.".xml";
- if(!file_exists($outfile) or
- ((parent::getParameterValue("download") === true) and (parent::getParameterValue('overwrite') === true))) {
- echo "fetching $url".PHP_EOL;
- $xml = file_get_contents($url);
-
- # save the file
- $ret = file_put_contents($outfile,$xml);
- if($ret === FALSE) {
- trigger_error("unable to save $outfile");
- return false;
- }
+ $file_set = false;
+ $gz = (strstr(parent::getParameterValue('output_format'),".gz") === FALSE)?false:true;
+ if(parent::getParameterValue('id_list') != '') {
+ $id_list = explode(",",parent::getParameterValue('id_list'));
+ $ofile = "bio2rdf-clinicaltrials-selected-ids.".parent::getParameterValue('output_format');
+ parent::setWriteFile($odir.$ofile, $gz);
+ $file_set = true;
}
- }
-
-
- /** parse directory of files */
- function parse_dir(){
- $ignore = array("..",'.','.DS_STORE',"0");
- $this->setCheckPoint('dataset');
- $prefix = parent::getPrefix();
- $bVersion = parent::getParameterValue('bio2rdf_release');
- $date = date ("Y-m-d\TG:i:s\Z");
-
- $dataset_file = parent::getParameterValue("outdir").parent::getBio2RDFReleaseFile();
- $fp = fopen($dataset_file,"w");
- if($fp === FALSE) {
- trigger_error("Unable to open $dataset_file",E_USER_ERROR);
- return false;
- }
- $ids = explode(",",parent::getParameterValue('id_list'));
+ #$ofile = "bio2rdf-clinicaltrials.".parent::getParameterValue('output_format');
+ #parent::setWriteFile($odir.$ofile, $gz);
- $indir = parent::getParameterValue('indir');
- echo "Processing $indir\n";
+ $finished = false;
+ $d = dir($tdir);
+ $n = 0; $ftotal = 0;
+ while (false !== ($dir = $d->read())) {
+ if($dir == '.' or $dir == '..' or $dir == "Contents.txt") continue;
- $outfile = "clinicaltrials.".parent::getParameterValue('output_format');
- $gz = (strstr(parent::getParameterValue('output_format'),".gz") === FALSE)?false:true;
- parent::setWriteFile(parent::getParameterValue("outdir").$outfile,$gz);
-
- $files = glob($indir."NCT*");
- foreach($files AS $i => $file) {
- if($i % 10000 == 0) {parent::clear();}
- $trial_id = basename($file,'.xml');
- if(parent::getParameterValue('id_list') == '' || in_array($trial_id, $ids)) {
- if(filesize($file)!=0) {
- echo "Processing $trial_id".PHP_EOL;
- $this->process_file($file);
- } else{
- echo "Processing $trial_id -> Empty!".PHP_EOL;
+ $edir = $tdir."/".$dir;
+
+ $d2 = dir($edir);
+ while (false !== ($e2 = $d2->read())) {
+ if($e2 == '.' or $e2 == '..') continue;
+
+ $f = $edir."/$e2";
+ $e = basename($e2,'.xml');
+ if(!isset($id_list)) {
+ $n++;
+ if(($n % 10000) == 1) {
+ if(parent::getWriteFile() != null) {
+ #if($ftotal == 3) {$finished=true;break;}
+ parent::getWriteFile()->close();
+ }
+ $ftotal ++;
+ $ofile = "bio2rdf-clinicaltrials-".str_pad($ftotal, 3, "0", STR_PAD_LEFT).".".parent::getParameterValue('output_format');
+ parent::setWriteFile($odir.$ofile, $gz);
+ echo $ofile.PHP_EOL;
+ }
+ $this->process_file($f);
+ } else if(in_array($e, $id_list)) {
+ echo "processing $e2".PHP_EOL;
+ $this->process_file($f);
+ $key = array_search($e, $id_list);
+ unset($id_list[$key]);
+ if(count($id_list) == 0) $finished = true;
}
+ if($finished == true) break;
}
+ $d2->close();
+ if($finished == true) break;
}
+ $d->close();
+
+
echo "Finished.".PHP_EOL;
parent::getWriteFile()->close();
-
+ exit;
// make the dataset description
parent::setGraphURI(parent::getDatasetURI());
- $rfile = "http://clinicaltrials.gov/ct2/show/NCT_ID?resultsxml=true";
$source_version = parent::getDatasetVersion();
+ $prefix = parent::getPrefix();
+ $bVersion = parent::getParameterValue('bio2rdf_release');
+ $date = date ("Y-m-d\TG:i:s\Z");
// dataset description
$source_file = (new DataResource($this))
->setURI($rfile)
->setTitle("Clinicaltrials")
- ->setRetrievedDate( date ("Y-m-d\TG:i:s\Z", filemtime($file)))
+ ->setRetrievedDate( date ("Y-m-d\TG:i:s\Z", filemtime($lfile)))
->setFormat("application/xml")
->setPublisher("http://clinicaltrials.gov/")
->setHomepage("http://clinicaltrials.gov/")
@@ -207,7 +143,7 @@ function parse_dir(){
parent::writeToReleaseFile($source_file->toRDF());
$output_file = (new DataResource($this))
- ->setURI("http://download.bio2rdf.org/release/$bVersion/$prefix/$outfile")
+ ->setURI("http://download.bio2rdf.org/release/$bVersion/$prefix/$ofile")
->setTitle("Bio2RDF v$bVersion RDF version of $prefix v$source_version")
->setSource($source_file->getURI())
->setCreator("https://github.com/bio2rdf/bio2rdf-scripts/blob/master/clinicaltrials/clinicaltrials.php")
@@ -229,15 +165,16 @@ function parse_dir(){
parent::closeReleaseFile();
// write the dataset description file
- fclose($fp);
+ fclose($fp);
}
+
/**
* process a results xml file from the download directory
**/
- function process_file($infile) {
- $indir = parent::getParameterValue('indir');
- $xml = new CXML($infile);
+ function process_file($entry) {
+ $xml = new CXML($entry);
+
$this->setCheckPoint('file');
while($xml->Parse("clinical_study") == TRUE) {
$this->setCheckPoint('record');
@@ -249,6 +186,8 @@ function process_file($infile) {
$label = $this->getString("//brief_title");
if(!$label) $label = $this->getString("//official_title");
if(!$label) $label = "Clinical trial #".$nct_id;
+
+ $label = trim(preg_replace("/\s+/",' ',$label));
parent::addRDF(
parent::describeIndividual($study_id, $label, parent::getVoc()."Clinical-Study").
parent::describeClass(parent::getVoc()."Clinical-Study","Clinical Study")
@@ -275,7 +214,7 @@ function process_file($infile) {
if(isset($sids)) {
foreach($sids AS $id) {
parent::addRDF(
- parent::triplifyString($study_id, parent::getVoc()."secondary-id", (string)$id, "xsd:string")
+ parent::triplifyString($study_id, parent::getVoc()."secondary-id", $this->safeString($id), "xsd:string")
);
}
}
@@ -283,7 +222,7 @@ function process_file($infile) {
if(isset($nctaliases)) {
foreach($nctaliases AS $id) {
parent::addRDF(
- parent::triplifyString($study_id, parent::getVoc()."nct-alias", (string)$id, "xsd:string")
+ parent::triplifyString($study_id, parent::getVoc()."nct-alias", $this->safeString($id), "xsd:string")
);
}
}
@@ -291,15 +230,23 @@ function process_file($infile) {
##########################################################################################
#titles
##########################################################################################
+ $x = $this->getString("//brief_title");
+ $brief_title = trim(preg_replace("/\s+/",' ',$x));
+
+ $x = $this->getString("//official_title");
+ $official_title = trim(preg_replace("/\s+/",' ',$x));
+
parent::addRDF(
- parent::triplifyString($study_id, parent::getVoc()."brief-title",$this->getString("//brief_title")).
- parent::triplifyString($study_id,parent::getVoc()."official-title",$this->getString("//official_title"))
+ parent::triplifyString($study_id, parent::getVoc()."brief-title",$brief_title).
+ parent::triplifyString($study_id,parent::getVoc()."official-title",$official_title)
);
###################################################################################
#brief summary
###################################################################################
- $brief_summary =str_replace( array("\r","\n","\t"), array("
","
"," "), $this->getString('//brief_summary/textblock'));
+ $x = $this->getString('//brief_summary/textblock');
+ $brief_summary = trim(preg_replace("/\s+/",' ',$x));
+
parent::addRDF(
parent::triplifyString($study_id,$this->getVoc()."brief-summary",$brief_summary)
);
@@ -308,7 +255,9 @@ function process_file($infile) {
####################################################################################
# detailed description
####################################################################################
- $d = str_replace( array("\r","\n","\t"), array("
","
"," "), $this->getString('//detailed_description/textblock'));
+ $x = $this->getString('//detailed_description/textblock');
+ $d = trim(preg_replace("/\s+/",' ',$x));
+
parent::addRDF(
parent::triplifyString($study_id,parent::getVoc()."detailed-description",$d)
);
@@ -364,16 +313,18 @@ function process_file($infile) {
######################################################################################
try {
$oversight = @array_shift($root->xpath('//oversight_info'));
- $oversight_id = parent::getRes().md5($oversight->asXML());
+ if($oversight !== null) {
+ $oversight_id = parent::getRes().md5($oversight->asXML());
- $authority = $this->getString('//authority', $oversight);
- $authority_id = parent::getRes().md5($authority);
- parent::addRDF(
- parent::describeIndividual($oversight_id,$authority,parent::getVoc()."Organization").
- parent::triplify($study_id,$this->getVoc()."oversight",$oversight_id).
- parent::triplify($study_id,$this->getVoc()."authority",$authority_id).
- parent::triplifyString($oversight_id, parent::getVoc()."has-dmc", $this->getString('//has_dmc', $oversight))
- );
+ $authority = $this->getString('//authority', $oversight);
+ $authority_id = parent::getRes().md5($authority);
+ parent::addRDF(
+ parent::describeIndividual($oversight_id,$authority,parent::getVoc()."Organization").
+ parent::triplify($study_id,$this->getVoc()."oversight",$oversight_id).
+ parent::triplify($study_id,$this->getVoc()."authority",$authority_id).
+ parent::triplifyString($oversight_id, parent::getVoc()."has-dmc", $this->getString('//has_dmc', $oversight))
+ );
+ }
} catch(Exception $e){
echo "There was an error in the oversight info element: $e\n";
@@ -463,7 +414,7 @@ function process_file($infile) {
$key = parent::getRes().md5($c[0]);
$value = parent::getRes().md5($c[1]);
parent::addRDF(
- parent::describeIndividual($sdp,$b,parent::getVoc()."Study-Design-Parameter").
+ parent::describeIndividual($sdp,$this->safeString($b),parent::getVoc()."Study-Design-Parameter").
parent::describeClass(parent::getVoc()."Study-Design-Parameter","Study Design Parameter").
parent::triplify($sdp,parent::getVoc()."key",$key).
parent::describeClass($key,$c[0]).
@@ -565,7 +516,7 @@ function process_file($infile) {
$mesh_label_id = parent::getRes().md5($condition);
parent::addRDF(
parent::triplify($study_id,parent::getVoc()."condition",$mesh_label_id).
- parent::describeClass($mesh_label_id,$condition,parent::getVoc()."Condition").
+ parent::describeClass($mesh_label_id,$this->safeString($condition),parent::getVoc()."Condition").
parent::describeClass(parent::getVoc()."Condition","Condition")
);
}
@@ -623,10 +574,12 @@ function process_file($infile) {
);
$agl = $intervention->xpath("./arm_group_label");
foreach($agl AS $a) {
+ $label = $this->safeString($a);
+
$arm_group_id = md5($a);
$ag = parent::getRes().$this->nct_id."/arm-group/".$arm_group_id;
parent::addRDF(
- parent::describeIndividual($ag,$a,parent::getVoc()."Arm-Group").
+ parent::describeIndividual($ag,$label,parent::getVoc()."Arm-Group").
parent::describeClass(parent::getVoc()."Arm-Group","Arm Group").
parent::triplify($intervention_id, parent::getVoc()."arm-group",$ag)
);
@@ -654,6 +607,9 @@ function process_file($infile) {
if($criteria = @array_shift($eligibility->xpath('./criteria'))){
$text = @array_shift($criteria->xpath('./textblock'));
+ $x = str_replace(array('"',"'",'\\','�'),'', $text);
+ $text = trim(preg_replace("/\s+/",' ',$x));
+
parent::addRDF(
parent::triplifyString($eligibility_id, parent::getVoc()."text",$text)
);
@@ -677,7 +633,7 @@ function process_file($infile) {
if(isset($c[2])) {
$d = explode(" - ",$c[1]);
foreach($d AS $exclusion) {
- $exc = trim($exclusion);
+ $exc = $this->safeString($exclusion);
if($exc != '') {
$exc_id = parent::getRes().md5($exc);
parent::addRDF(
@@ -703,7 +659,7 @@ function process_file($infile) {
if($s != 'N/A') {
$age = trim(str_replace("Years","",$s));
parent::addRDF(
- parent::triplifyString($eligibility_id,parent::getVoc().str_replace("_","-",$a),$age)
+ parent::triplifyString($eligibility_id,parent::getVoc().str_replace("_","-",$this->safeString($a)),$age)
);
}
}
@@ -750,7 +706,7 @@ function process_file($infile) {
$d = @array_shift($root->xpath('//'.$c));
if($d) {
parent::addRDF(
- parent::triplify($study_id, parent::getVoc().str_replace("_","-",$c), $this->makeContact($d))
+ parent::triplify($study_id, parent::getVoc().str_replace("_","-",$this->safeString($c)), $this->makeContact($d))
);
}
}
@@ -762,11 +718,12 @@ function process_file($infile) {
# location of facility doing the testing
##############################################################
try {
- $location = @array_shift($root->xpath('//location'));
- if($location){
+ $locations = $root->xpath('//location');
+ foreach($locations AS $location) {
$location_uri = parent::getRes().md5($location->asXML());
- $name = $this->getString('//facility/name',$location);
- $address = @array_shift($location->xpath('//facility/address'));
+ $facility = $location->facility;
+ $name = $facility->name[0];
+ $address = $facility->address;//@array_shift($location->xpath('//facility/address'));
$contact = @array_shift($location->xpath('//contact'));
$backups = @array_shift($location->xpath('//contact_backup'));
$investigators = @array_shift($location->xpath('//investigator'));
@@ -828,12 +785,16 @@ function process_file($infile) {
$references = $root->xpath('//'.$ref_type);
foreach($references as $reference){
$p = $this->getString('./PMID',$reference);
+
+ $ref = $this->getString('./citation',$reference);
+ $ref = trim(preg_replace("/\s+/",' ',$ref)); # collapse whitespace in the citation itself, not the stale $x from an earlier section
+
if($p) {
$pmid = "pubmed:$p";
parent::addRDF(
parent::describeIndividual($pmid,$p,parent::getVoc()."Reference").
parent::describeClass(parent::getVoc()."Reference", "Reference").
- parent::triplifyString($pmid, parent::getVoc()."citation", $this->getString('./citation',$reference)).
+ parent::triplifyString($pmid, parent::getVoc()."citation", $ref).
parent::triplify($study_id,parent::getVoc().str_replace("_","-",$ref_type),$pmid)
);
}
@@ -901,7 +862,7 @@ function process_file($infile) {
$keywords = $root->xpath('//keyword');
foreach($keywords as $keyword){
parent::addRDF(
- parent::triplifyString($study_id,parent::getVoc()."keyword",(string)$keyword)
+ parent::triplifyString($study_id,parent::getVoc()."keyword",$this->safeString($keyword))
);
}
}catch(Exception $e){
@@ -913,7 +874,7 @@ function process_file($infile) {
try{
$mesh_terms = $root->xpath('//condition_browse/mesh_term');
foreach($mesh_terms as $mesh_term){
- $term = (string)$mesh_term;
+ $term = $this->safeString($mesh_term);
$mesh_id = parent::getRes().md5($term);
parent::addRDF(parent::triplify($study_id,parent::getVoc()."condition-mesh",$mesh_id));
parent::addRDF(parent::triplifyString($mesh_id,"rdfs:label",$term));
@@ -943,7 +904,7 @@ function process_file($infile) {
foreach($a AS $browse_type) {
$terms = $root->xpath("//$browse_type/mesh_term");
foreach($terms as $term){
- $term_label = (string)$term;
+ $term_label = $this->safeString($term);
$term_id = parent::getRes().md5($term);
parent::addRDF(
parent::describeIndividual($term_id,$term_label,parent::getVoc()."Term").
@@ -1108,10 +1069,12 @@ function process_file($infile) {
# outcomes
################################################################################
try {
+ $o_n = 1;
$outcomes = @array_shift($root->xpath('//outcome_list'));
if($outcomes) {
+
foreach($outcomes AS $i => $outcome) {
- $outcome_id = $this->nct_id."/outcome/".($i+1);
+ $outcome_id = $this->nct_id."/outcome/".($o_n++);
$outcome_uri = parent::getRes().$outcome_id;
$outcome_label = $this->getString("./title",$outcome);
if(!$outcome_label) $outcome_label = "outcome for ".$this->nct_id;
@@ -1134,7 +1097,7 @@ function process_file($infile) {
}
}
- // measure list
+ // measure list # this has changed
$measures = @array_shift($outcome->xpath('./measure_list'));
if($measures) {
foreach($measures AS $measure) {
@@ -1143,7 +1106,14 @@ function process_file($infile) {
);
}
}
+ $measure = @array_shift($outcome->xpath('./measure'));
+ if($measure) {
+ parent::addRDF(
+ parent::triplify($outcome_uri,parent::getVoc()."measure", $this->makeMeasure($measure))
+ );
+ }
+
// analysis list
$analyses = @array_shift($outcome->xpath('./analysis_list'));
if($analyses) {
@@ -1184,7 +1154,7 @@ function process_file($infile) {
foreach($event_list AS $ev => $ev_label) {
$et = @array_shift($reported_events->xpath('./'.$ev));
if(!$et) continue;
- $ev_uri = parent::getVoc().str_replace(" ","-",$ev_label);
+ $ev_uri = parent::getVoc().str_replace(" ","-",$this->safeString($ev_label));
$categories = @array_shift($et->xpath('./category_list'));
foreach($categories AS $category) {
@@ -1233,9 +1203,9 @@ function process_file($infile) {
} catch(Exception $e) {
echo "Error in parsing reported events".PHP_EOL;
}
-
parent::writeRDFBufferToWriteFile();
}
+ parent::writeRDFBufferToWriteFile();
$this->setCheckPoint('record');
$this->setCheckPoint('dataset');
}
@@ -1245,9 +1215,14 @@ function getString($xpath,$element = null)
$o = $this->root;
if(isset($element)) $o = $element;
$r = @array_shift($o->xpath($xpath));
- return ((string)$r[0]);
+ return $this->safeString($r[0]);
}
+ function safeString($string)
+ {
+ return str_replace(array('"','\\'),array('','/'),(string)$string);
+ }
+
public function getMonthNumber($month)
{
$months = array(
@@ -1343,29 +1318,33 @@ public function makeMeasure($measure)
parent::triplifyString($measure_id, parent::getVoc()."dispersion", $this->getString('./dispersion', $measure))
);
- $categories = @array_shift($measure->xpath('./category_list'));
- foreach($categories AS $category) {
- $cid = parent::getRes().$this->nct_id."/category/".md5($category->asXML());
- $cat_label = $this->getString('./sub_title', $category);
- if(!$cat_label) $cat_label = "category for measure";
- parent::addRDF(
- parent::describeIndividual($cid, $cat_label, parent::getVoc()."Category").
- parent::describeClass(parent::getVoc()."Category","Category").
- parent::triplify($measure_id,parent::getVoc()."category",$cid)
- );
- $ml = @array_shift($category->xpath('./measurement_list'));
- foreach($ml AS $m) {
- $mid = parent::getRes().$this->nct_id."/measurement/".md5($m->asXML());
+ $categories = @array_shift($measure->xpath('./class_list/class/category_list'));
+ if(isset($categories)) {
+ foreach($categories AS $category) {
+ $cid = parent::getRes().$this->nct_id."/category/".md5($category->asXML());
+ $cat_label = $this->getString('./sub_title', $category);
+ if(!$cat_label) $cat_label = "category for measure";
parent::addRDF(
- parent::describeIndividual($mid, $this->nct_id." measurement", parent::getVoc()."Measurement").
- parent::describeClass(parent::getVoc()."Measurement","Measurement").
- parent::triplify($mid, parent::getVoc()."group-id", parent::getRes().$this->nct_id."/group/".$m->attributes()->group_id).
- parent::triplifyString($mid, parent::getVoc()."value", $m->attributes()->value).
- parent::triplifyString($mid, parent::getVoc()."spread", $m->attributes()->spread).
- parent::triplifyString($mid, parent::getVoc()."lower-limit", $m->attributes()->lower_limit).
- parent::triplifyString($mid, parent::getVoc()."upper-limit", $m->attributes()->upper_limit).
- parent::triplify($cid, parent::getVoc()."measurement",$mid)
+ parent::describeIndividual($cid, $cat_label, parent::getVoc()."Category").
+ parent::describeClass(parent::getVoc()."Category","Category").
+ parent::triplify($measure_id,parent::getVoc()."category",$cid)
);
+ $ml = @array_shift($category->xpath('./measurement_list'));
+ if(isset($ml)) {
+ foreach($ml AS $m) {
+ $mid = parent::getRes().$this->nct_id."/measurement/".md5($m->asXML());
+ parent::addRDF(
+ parent::describeIndividual($mid, $this->nct_id." measurement", parent::getVoc()."Measurement").
+ parent::describeClass(parent::getVoc()."Measurement","Measurement").
+ parent::triplify($mid, parent::getVoc()."group-id", parent::getRes().$this->nct_id."/group/".$m->attributes()->group_id).
+ parent::triplifyString($mid, parent::getVoc()."value", $m->attributes()->value).
+ parent::triplifyString($mid, parent::getVoc()."spread", $m->attributes()->spread).
+ parent::triplifyString($mid, parent::getVoc()."lower-limit", $m->attributes()->lower_limit).
+ parent::triplifyString($mid, parent::getVoc()."upper-limit", $m->attributes()->upper_limit).
+ parent::triplify($cid, parent::getVoc()."measurement",$mid)
+ );
+ }
+ }
}
}
return $measure_id;
diff --git a/common/php/oboparser.php b/common/php/oboparser.php
index fe0c83e..11c27ec 100644
--- a/common/php/oboparser.php
+++ b/common/php/oboparser.php
@@ -9,9 +9,9 @@ function OBOParser($in)
if(strstr($l,"[Term]")) {
if(isset($term)) {
- $terms[$term['id'][0]] = $term;
+ $terms[$term['id'][0]] = $term;
}
- $term = '';
+ $term = array();
} else if(strstr($l,"[Typedef]")) {
if(isset($term)) {
$terms[$term['id'][0]] = $term;
@@ -27,7 +27,7 @@ function OBOParser($in)
if(count($m)) {
$a[1] = $m[1];
}
- $term[$a[0]][] = $a[1];
+ $term[$a[0]][] = $a[1];
} else if(isset($typedef)) {
diff --git a/composer.json b/composer.json
new file mode 100644
index 0000000..bf9a794
--- /dev/null
+++ b/composer.json
@@ -0,0 +1,5 @@
+{
+ "require": {
+ "semsol/arc2": "2.3.*"
+ }
+}
\ No newline at end of file
diff --git a/ctd/ctd.php b/ctd/ctd.php
index 7fc4260..2fc474f 100644
--- a/ctd/ctd.php
+++ b/ctd/ctd.php
@@ -420,13 +420,13 @@ function CTD_chem_pathways_enriched()
return FALSE;
}
$first = false;
- }
-
+ }
$chemical_id = $a[1];
$this->getRegistry()->parseQName($a[4], $pathway_ns, $pathway_id);
if($pathway_ns == "react") $pathway_ns = "reactome";
-
+ if($pathway_ns == "kegg") $pathway_id = "map".$pathway_id;
+
$pathway_resource_id = parent::getRes().md5($chemical_id.$pathway_ns.$pathway_id.$a[6]);
$pathway_resource_label = "Chemical-pathway association between mesh:".$chemical_id." and ".$pathway_ns.":".$pathway_id." with p-value ".$a[6];
@@ -509,7 +509,8 @@ function CTD_diseases_pathways()
$this->getRegistry()->parseQName($a[1],$disease_ns,$disease_id);
$this->getRegistry()->parseQName($a[3],$pathway_ns,$pathway_id);
if($pathway_ns == 'react') $pathway_ns = 'reactome';
-
+ if($pathway_ns == "kegg") $pathway_id = "map".$pathway_id;
+
$this->AddRDF(
parent::triplify($disease_ns.":".$disease_id, $this->getVoc()."pathway", $pathway_ns.":".$pathway_id).
parent::triplifyString($disease_ns.":".$disease_id, "rdfs:label", $a[0]." [$disease_ns:$disease_id]").
@@ -612,6 +613,7 @@ function CTD_genes_pathways()
$this->getRegistry()->parseQName($a[3],$pathway_ns,$pathway_id);
$pathway_id = trim($pathway_id);
if($pathway_ns == "react") $pathway_ns = "reactome";
+ if($pathway_ns == "kegg") $pathway_id = "map".$pathway_id;
$this->ADDRDF(
parent::triplify($gene_ns.":".$gene_id, $this->getVoc()."pathway", $pathway_ns.":".$pathway_id).
@@ -645,7 +647,8 @@ function CTD_Pathways()
$this->getRegistry()->parseQName(trim($a[1]),$pathway_ns,$pathway_id);
if($pathway_ns == "react") $pathway_ns = "reactome";
-
+ if($pathway_ns == "kegg") $pathway_id = "map".$pathway_id;
+
$this->AddRDF(
parent::describeIndividual($pathway_ns.":".$pathway_id, $a[0], $this->getVoc()."Pathway").
parent::describeClass($this->getVoc()."Pathway", "CTD Pathway")
diff --git a/drugbank/drugbank.php b/drugbank/drugbank.php
index 3254e99..3f26ad9 100755
--- a/drugbank/drugbank.php
+++ b/drugbank/drugbank.php
@@ -56,6 +56,16 @@ function Run()
if(parent::getParameterValue("id_list")) {
$this->id_list = array_flip(explode(",",parent::getParameterValue('id_list')));
}
+
+ $go_cache_file = parent::getParameterValue('indir')."go.cache.json";
+ #unlink($go_cache_file);
+ if(!file_exists($go_cache_file) or parent::getParameterValue('download') == true) {
+ $this->getGO();
+ file_put_contents($go_cache_file,json_encode($this->go));
+ } else {
+ // read the file
+ $this->go = json_decode( file_get_contents($go_cache_file), true);
+ }
$dataset_description = '';
foreach($files AS $f) {
@@ -241,9 +251,16 @@ function parsePartnerEntry($did, $pid, $x)
parent::triplify($pid, parent::getVoc()."x-pfam","pfam:"."".$v2->identifier)
);
} else if($k2 == "go-classifier") {
- parent::addRDF(
- parent::triplifyString($pid, parent::getVoc()."go-".$v2->category, $v2->description)
- );
+ $e = is_array($this->go) ? array_search((string)$v2->description, $this->go, true) : FALSE; // exact title match; no lookup table means fall back to the literal
+ if($e !== FALSE) {
+ parent::addRDF(
+ parent::triplify($pid, parent::getVoc()."go-".$v2->category, $e)
+ );
+ } else {
+ parent::addRDF(
+ parent::triplifyString($pid, parent::getVoc()."go-".$v2->category, $v2->description)
+ );
+ }
} else {
trigger_error("no handler for $k2",E_USER_WARNING);
/* parent::addRDF(
@@ -437,7 +454,7 @@ function parseDrugEntry(&$xml)
foreach($x->mixtures->mixture AS $item) {
if(isset($item)) {
$o = $item;
- $mid = parent::getRes().str_replace(" ","-",$o->name[0]);
+ $mid = parent::getRes().md5(str_replace(" ","-",$o->name[0]));
parent::addRDF(
parent::triplify($did,parent::getVoc()."mixture",$mid).
@@ -773,6 +790,35 @@ function AddList(&$x, $id, $list_name, $item_name, $predicate, $list_item_name =
}
}
+ function getGO()
+ {
+ // Query the SPARQL endpoint for id/title pairs and store them in $this->go as id => title; returns true on success, false if the download fails.
+ $this->go = array(); // start from an empty array (not null) so the json_encode()/array_search() users of $this->go always get an array
+ $server = "http://bio2rdf.org/sparql";
+ $sparql = "PREFIX dct:
+SELECT distinct ?id ?title
+{
+ ?go a .
+ ?go dct:identifier ?id.
+ ?go dct:title ?title .
+} ";
+ $url = $server."?query=".urlencode($sparql)."&format=".urlencode("text/tab-separated-values");
+ // NOTE(review): the query text looks like it lost its <...> IRIs (PREFIX target, rdf:type object) - confirm against the working copy
+ $results = file_get_contents($url);
+ if($results === FALSE) {
+ trigger_error("Unable to get Gene Ontology labels",E_USER_WARNING);
+ return false;
+ }
+ $list = explode("\n",$results);
+ array_shift($list); array_pop($list); // remove first and last (presumably the header row and the empty piece after the final newline)
+ // double quotes are stripped from each row; rows without a tab-separated second column are skipped
+ foreach($list AS $v) {
+ $b = explode("\t",str_replace('"','',$v));
+ if(isset($b[1])) $this->go[$b[0]] = $b[1];
+ }
+ return true;
+ }
+
} // end class
?>
diff --git a/hgnc/hgnc.php b/hgnc/hgnc.php
index 164bcf5..168d63d 100755
--- a/hgnc/hgnc.php
+++ b/hgnc/hgnc.php
@@ -36,12 +36,12 @@ class HGNCParser extends Bio2RDFizer {
function __construct($argv){
parent::__construct($argv, "hgnc");
parent::addParameter('files',true,'all','all','files to process');
- parent::addParameter('download_url',false,null,'ftp://ftp.ebi.ac.uk/pub/databases/genenames/hgnc_complete_set.txt.gz');
+ parent::addParameter('download_url',false,null,'ftp://ftp.ebi.ac.uk/pub/databases/genenames/hgnc/tsv/hgnc_complete_set.txt');
parent::initialize();
}//constructor
function Run(){
- $file = "hgnc_complete_set.txt.gz";
+ $file = "hgnc_complete_set.txt";
$ldir = parent::getParameterValue('indir');
$odir = parent::getParameterValue('outdir');
$rdir = parent::getParameterValue('download_url');
@@ -80,7 +80,7 @@ function Run(){
->setFormat('text/tab-separated-value')
->setFormat('application/zip')
->setPublisher('http://www.genenames.org/')
- ->setHomepage('http://www.genenames.org/data/gdlw_columndef.html')
+ ->setHomepage('https://www.genenames.org/help/statistics-and-downloads/')
->setRights('use')
->setRights('attribution')
->setLicense('http://www.genenames.org/about/overview')
@@ -115,400 +115,227 @@ function Run(){
}//Run
function process(){
- $header = $this->GetReadFile()->Read(200000);
+ $header = $this->getReadFile()->read(200000);
$header_arr = explode("\t", $header);
- $n = 41;
- $c = count($header_arr);
+ $h = array_flip($header_arr);
+
+ $c = count($h);
+ $n = 52;
if ($c != $n)
{
echo PHP_EOL;
print_r($header_arr);
- trigger_error ("Expected $n columns, found $c . please update the script",E_USER_ERROR);
- exit;
+ trigger_error ("Expected $n columns, found $c . some fields may not be properly processed. update the script",E_USER_WARNING); // warn and continue: fields are looked up by header name via $h
}
+ $this->getReadFile()->read(200000); // skip a line
- while($l = $this->GetReadFile()->Read(4096)) {
- $fields = explode("\t", $l);
- $id = strtolower($fields[0]);
- $approved_symbol = $fields[1];
- $approved_name = $fields[2];
- $status = $fields[3];
- $locus_type = $fields[4];
- $locus_group = $fields[5];
- $previous_symbols = $fields[6];
- $previous_names = $fields[7];
- $synonyms = $fields[8];
- $name_synonyms = $fields[9];
- $chromosome = $fields[10];
- $date_approved = $fields[11];
- $date_modified = $fields[12];
- $date_symbol_changed = $fields[13];
- $date_name_changed = $fields[14];
- $accession_numbers = $fields[15];
- $enzyme_ids = $fields[16];
- $entrez_gene_id = $fields[17];
- $ensembl_gene_id = $fields[18];
- $mouse_genome_database_id = $fields[19];
- $specialist_database_links = $fields[20];
- $specialist_database_ids = $fields[21];
- $pubmed_ids = $fields[22];
- $refseq_ids = $fields[23];
- $gene_family_tag = $fields[24];
- $gene_family_description = $fields[25];
- $record_type = $fields[26];
- $primary_ids = $fields[27];
- $secondary_ids = $fields [28];
- $ccd_ids = $fields[29];
- $vega_ids = $fields[30];
- $locus_specific_databases = $fields[31];
- $entrez_gene_id_mappeddatasuppliedbyNCBI = $fields[32];
- $omim_id_mappeddatasuppliedbyNCBI = $fields[33];
- $refseq_mappeddatasuppliedbyNCBI = $fields[34];
- $uniprot_id_mappeddatasuppliedbyUniProt = $fields[35];
- $ensembl_id_mappeddatasuppliedbyEnsembl = $fields[36];
- $vega_id_mappeddatasuppliedbyVega = $fields[37];
- $ucsc_id_mappeddatasuppliedbyUCSC = $fields[38];
- $mouse_genome_database_id_mappeddatasuppliedbyMGI = $fields[39];
- $rat_genome_database_id_mappeddatasuppliedbyRGD = $fields[40];
+ while($l = $this->getReadFile()->read(4096)) {
+ $l = str_replace('"','',$l);
+ $r = explode("\t", $l);
+
+ $id = strtolower($r[$h['hgnc_id']]);
+ $uid = str_replace(":","_",$id);
+ $symbol = $r[$h['symbol']];
- $id_res = $id;
- $id_label = "Gene Symbol for ".$approved_symbol;
- parent::AddRDF(
- parent::triplify($id_res, "rdf:type", $this->getVoc()."Gene-Symbol").
- parent::describeIndividual($id_res, $id_label, $this->getVoc()."Gene-Symbol").
+ parent::addRDF(
+ parent::triplify($id, "rdf:type", $this->getVoc()."Gene-Symbol").
+ parent::describeIndividual($id, "Gene symbol for ".$symbol, $this->getVoc()."Gene-Symbol").
parent::describeClass($this->getVoc()."Gene-Symbol", "HGNC Official Gene Symbol")
);
- if(!empty($approved_symbol)){
- $s = "hgnc.symbol:".$approved_symbol;
- parent::AddRDF(
- parent::triplifyString($id_res, $this->getVoc()."approved-symbol",utf8_encode(htmlspecialchars($approved_symbol))).
+
+ if(!empty($symbol)){
+ $s = "hgnc.symbol:".$symbol;
+ parent::addRDF(
+ parent::triplifyString($id, $this->getVoc()."approved-symbol",utf8_encode(htmlspecialchars($symbol))).
parent::describeProperty($this->getVoc()."approved-symbol", "HGNC approved gene symbol","The official gene symbol that has been approved by the HGNC and is publicly available. Symbols are approved based on specific HGNC nomenclature guidelines. In the HTML results page this ID links to the HGNC Symbol Report for that gene").
- parent::describeIndividual($s, $approved_symbol, parent::getVoc()."Approved-Gene-Symbol").
+ parent::describeIndividual($s, $symbol, parent::getVoc()."Approved-Gene-Symbol").
parent::describeClass(parent::getVoc()."Approved-Gene-Symbol","Approved Gene Symbol").
- parent::triplify($id_res, parent::getVoc()."has-approved-symbol", $s).
- parent::triplify($s, parent::getVoc()."is-approved-symbol-of", $id_res)
+ parent::triplify($id, parent::getVoc()."has-approved-symbol", $s).
+ parent::triplify($s, parent::getVoc()."is-approved-symbol-of", $id)
);
-
}
- if(!empty($approved_name)){
- parent::AddRDF(
- parent::triplifyString($id_res, $this->getVoc()."approved-name",utf8_encode(htmlspecialchars($approved_name))).
+ if(!empty($r[$h['name']])){
+ parent::addRDF(
+ parent::triplifyString($id, $this->getVoc()."approved-name",utf8_encode(htmlspecialchars($r[$h['name']]))).
parent::describeProperty($this->getVoc()."approved-name","HGNC approved name", "The official gene name that has been approved by the HGNC and is publicly available. Names are approved based on specific HGNC nomenclature guidelines.")
);
}
- if(!empty($status)){
- $s = $this->getVoc().str_replace(" ","-",$status);
- parent::AddRDF(
- parent::triplify($id_res, $this->getVoc()."status",$s).
+ if(!empty($r[$h['status']])){
+ $s = $this->getVoc().str_replace(" ","-",$r[$h['status']]);
+ parent::addRDF(
+ parent::triplify($id, $this->getVoc()."status",$s).
parent::describeProperty($this->getVoc()."status","HGNC status", "Indicates whether the gene is classified as: Approved - these genes have HGNC-approved gene symbols. Entry withdrawn - these previously approved genes are no longer thought to exist. Symbol withdrawn - a previously approved record that has since been merged into a another record.").
- parent::describeClass($s,$status,$this->getVoc()."Status")
+ parent::describeClass($s,$r[$h['status']],$this->getVoc()."Status")
);
}
- if(!empty($locus_id)){
- $locus_res = $this->getRes().$id."_LOCUS";
- parent::AddRDF(
- parent::triplify($id_res, $this->getVoc()."locus", $locus_res).
- parent::triplifyString($locus_res, $this->getVoc()."locus-type",utf8_encode(htmlspecialchars($locus_type))).
- parent::triplifyString($locus_res, $this->getVoc()."locus-group", utf8_encode(htmlspecialchars($locus_group))).
+ if(!empty($r[$h['locus_group']])){
+ $locus_res = $this->getRes().$uid."_locus";
+ parent::addRDF(
+ parent::triplify($id, $this->getVoc()."locus", $locus_res).
+ parent::triplifyString($locus_res, $this->getVoc()."locus-type",utf8_encode(htmlspecialchars($r[$h['locus_type']]))).
+ parent::triplifyString($locus_res, $this->getVoc()."locus-group", utf8_encode(htmlspecialchars($r[$h['locus_group']]))).
parent::describeProperty($this->getVoc()."locus-type", "locus type","Specifies the type of locus described by the given entry").
parent::describeProperty($this->getVoc()."locus-group", "locus group", "Groups locus types together into related sets. Below is a list of groups and the locus types within the group")
);
}
- if(!empty($previous_symbols)){
- $previous_symbols = explode(", ", $previous_symbols);
+ if(!empty($r[$h['prev_symbol']])){
+ $s = $r[$h['prev_symbol']];
+ $previous_symbols = explode("|", $s);
foreach($previous_symbols as $previous_symbol){
$previous_symbol_uri = "hgnc.symbol:".$previous_symbol;
- parent::AddRDF(
+ parent::addRDF(
parent::describeIndividual($previous_symbol_uri, $previous_symbol, parent::getVoc()."Previous-Symbol").
parent::describeClass(parent::getVoc()."Previous-Symbol","Previous Symbol").
- parent::triplify($id_res, $this->getVoc()."previous-symbol", $previous_symbol_uri).
+ parent::triplify($id, $this->getVoc()."previous-symbol", $previous_symbol_uri).
parent::describeProperty($this->getVoc()."previous-symbol", "HGNC previous symbol","Symbols previously approved by the HGNC for this gene")
);
}
}
- if(!empty($previous_names)){
- $previous_names = explode(", ", $previous_names);
+ if(!empty($r[$h['prev_name']])){
+ $s = $r[$h['prev_name']];
+ $previous_names = explode("|", $s);
foreach($previous_names as $previous_name){
- $previous_name = str_replace("\"", "", $previous_name);
- parent::AddRDF(
- parent::triplifyString($id_res, $this->getVoc()."previous-name", utf8_encode(htmlspecialchars($previous_name))).
+ parent::addRDF(
+ parent::triplifyString($id, $this->getVoc()."previous-name", utf8_encode(htmlspecialchars($previous_name))).
parent::describeProperty($this->getVoc()."previous-name", "HGNC previous name","Gene names previously approved by the HGNC for this gene")
);
}
}
- if(!empty($synonyms)){
- $synonyms = explode(", ", $synonyms);
- foreach ($synonyms as $synonym) {
- parent::AddRDF(
- parent::triplifyString($id_res, $this->getVoc()."synonym", utf8_encode(htmlspecialchars($synonym))).
- parent::describeProperty($this->getVoc()."synonym", "synonym","Other symbols used to refer to this gene")
+ if(!empty($r[$h['prev_symbol']])){
+ $s = $r[$h['prev_symbol']];
+ $prev_symbols = explode('|',$s);
+ foreach ($prev_symbols as $prev_symbol) {
+ parent::addRDF(
+ parent::triplifyString($id, $this->getVoc()."prev-symbol", utf8_encode(htmlspecialchars($prev_symbol))).
+ parent::describeProperty($this->getVoc()."prev-symbol", "previous symbol","Previously used symbols used to refer to this gene")
);
}
}
- if(!empty($name_synonyms)){
- $name_synonyms = explode(", ", $name_synonyms);
- foreach ($name_synonyms as $name_synonym) {
- $name_synonym = str_replace("\"", "", $name_synonym);
- parent::AddRDF(
- parent::triplifyString($id_res, $this->getVoc()."name-synonym", utf8_encode(htmlspecialchars($name_synonym))).
- parent::describeProperty($this->getVoc()."name-synonym", "name synonym","Other names used to refer to this gene")
+ if(!empty($r[$h['alias_name']])){
+ $s = $r[$h['alias_name']];
+ $alias_names = explode("|", $s);
+ foreach ($alias_names as $alias_name) {
+ parent::addRDF(
+ parent::triplifyString($id, $this->getVoc()."alias-name", utf8_encode(htmlspecialchars($alias_name))).
+ parent::describeProperty($this->getVoc()."alias-name", "alias name","Other names used to refer to this gene")
);
}
}
- if(!empty($chromosome)){
- parent::AddRDF(
- parent::triplifyString($id_res, $this->getVoc()."chromosome", utf8_encode(htmlspecialchars($chromosome))).
- parent::describeProperty($this->getVoc()."chromosome", "chromosome", "Indicates the location of the gene or region on the chromosome")
+ if(!empty($r[$h['alias_symbol']])){
+ $s = $r[$h['alias_symbol']];
+ $alias_symbols = explode("|", $s);
+ foreach ($alias_symbols as $alias_symbol) {
+ parent::addRDF(
+ parent::triplifyString($id, $this->getVoc()."alias-symbol", utf8_encode(htmlspecialchars($alias_symbol))).
+ parent::describeProperty($this->getVoc()."alias-symbol", "alias symbol","Other symbols used to refer to this gene")
+ );
+ }
+ }
+ if(!empty($r[$h['location']])){
+ $s = $r[$h['location']];
+ parent::addRDF(
+ parent::triplifyString($id, $this->getVoc()."location", utf8_encode(htmlspecialchars($s))).
+ parent::describeProperty($this->getVoc()."location", "location", "Indicates the location of the gene or region on the chromosome")
);
}
- if(!empty($date_approved)){
- parent::AddRDF(
- parent::triplifyString($id_res, $this->getVoc()."date-approved", $date_approved, "xsd:date").
+ if(!empty($r[$h['date_approved_reserved']])){
+ $s = $r[$h['date_approved_reserved']];
+ parent::addRDF(
+ parent::triplifyString($id, $this->getVoc()."date-approved", $s, "xsd:date").
parent::describeProperty($this->getVoc()."date-approved", "date approved","Date the gene symbol and name were approved by the HGNC")
);
}
- if(!empty($date_modified)){
+ if(!empty($r[$h['date_modified']])){
+ $s = $r[$h['date_modified']];
parent::AddRDF(
- parent::triplifyString($id_res, $this->getVoc()."date-modified", $date_modified, "xsd:date").
+ parent::triplifyString($id, $this->getVoc()."date-modified", $s, "xsd:date").
parent::describeProperty($this->getVoc()."date-modified", "date modified", "the date the entry was modified by the HGNC")
);
}
- if(!empty($date_symbol_changed)){
+ if(!empty($r[$h['date_symbol_changed']])){
+ $s = $r[$h['date_symbol_changed']];
parent::AddRDF(
- parent::triplifyString($id_res, $this->getVoc()."date-symbol-changed", $date_symbol_changed, "xsd:date").
+ parent::triplifyString($id, $this->getVoc()."date-symbol-changed", $s, "xsd:date").
parent::describeProperty($this->getVoc()."date-symbol-changed", "date symbol changed","The date the gene symbol was last changed by the HGNC from a previously approved symbol. Many genes receive approved symbols and names which are viewed as temporary (eg C2orf#) or are non-ideal when considered in the light of subsequent information. In the case of individual genes a change to the name (and subsequently the symbol) is only made if the original name is seriously misleading")
);
}
- if(!empty($date_name_changed)){
- parent::AddRDF(
- parent::triplifyString($id_res, $this->getVoc()."date-name-changed", $date_name_changed, "xsd:date").
+ if(!empty($r[$h['date_name_changed']])){
+ $s = $r[$h['date_name_changed']];
+ parent::addRDF(
+ parent::triplifyString($id, $this->getVoc()."date-name-changed", $s, "xsd:date").
parent::describeProperty($this->getVoc()."date-name-changed", "date name changed", "The date the gene name was last changed by the HGNC from a previously approved name")
);
}
- if(!empty($accession_numbers)){
- $accession_numbers = explode(", ", $accession_numbers);
- foreach ($accession_numbers as $accession_number) {
- parent::AddRDF(
- parent::triplifyString($id_res, $this->getVoc()."accession", utf8_encode(htmlspecialchars($accession_number))).
- parent::describeProperty($this->getVoc()."accession", "accession number", "Accession numbers for each entry selected by the HGNC")
- );
- }
- }
- if(!empty($enzyme_ids)){
- $enzyme_ids = explode(", ", $enzyme_ids);
- foreach ($enzyme_ids as $enzyme_id) {
- parent::AddRDF(
- parent::triplifyString($id_res, $this->getVoc()."x-ec", utf8_encode(htmlspecialchars($enzyme_id))).
- parent::describeProperty($this->getVoc()."x-ec","Enzyme Commission (EC) number", "Enzyme entries have Enzyme Commission (EC) numbers associated with them that indicate the hierarchical functional classes to which they belong")
- );
- }
- }
- if(!empty($entrez_gene_id)){
- parent::AddRDF(
- parent::triplify($id_res, $this->getVoc()."x-ncbigene", "ncbigene:$entrez_gene_id").
- parent::describeProperty($this->getVoc()."x-ncbigene", "NCBI Gene", "NCBI Gene provides curated sequence and descriptive information about genetic loci including official nomenclature, synonyms, sequence accessions, phenotypes, EC numbers, MIM numbers, UniGene clusters, homology, map locations, and related web sites")
- );
- }
- if(!empty($ensembl_gene_id)){
- parent::AddRDF(
- parent::triplify($id_res, $this->getVoc()."x-ensembl", "ensembl:$ensembl_gene_id").
- parent::describeProperty($this->getVoc()."x-ensembl", "Ensembl Gene")
- );
- }
- if(!empty($mouse_genome_database_id)){
- if(strpos($mouse_genome_database_id, "MGI:") !== FALSE){
- $mouse_genome_database_id = substr($mouse_genome_database_id, 4);
- parent::AddRDF(
- parent::triplify($id_res, $this->getVoc()."x-mgi", "mgi:$mouse_genome_database_id").
- parent::describeProperty($this->getVoc()."x-mgi", "MGI entry")
- );
- }
- }
- if(!empty($specialist_database_links)){
- $specialist_database_links = explode(", ", $specialist_database_links);
- foreach ($specialist_database_links as $specialist_database_link) {
- preg_match('/href="(\S+)"/', $specialist_database_link, $matches);
- if(!empty($matches[1])){
- parent::AddRDF(
- parent::QQuadO_URL($id_res, $this->getVoc()."xref", $matches[1]).
- parent::describeProperty($this->getVoc()."xref", "Specialist database references.")
+ $idmap = array(
+ "entrez_id" => "ncbigene",
+ "ensembl_gene_id" => "ensembl",
+ "vega_id" => "vega",
+ "ucsc_id" => "ucsc",
+ "ena" => "ena",
+ "refseq_accession" => "refseq",
+ "ccds_id" => "ccds",
+ "uniprot_ids" => "uniprot",
+ "pubmed_id" => "pubmed",
+ "mgd_id" => "mgd",
+ "rgd_id" => "rgd",
+ // "lsdb" => "lsdb", # special structure
+ "cosmic" => "cosmic",
+ "omim_id" => "omim",
+ "mirbase" => "mirbase",
+ "homeodb" => "homeodb",
+ "snornabase" => "snornabase",
+ "bioparadigms_slc" => "bioparadigms_slc",
+ "orphanet" =>"orphanet",
+ "pseudogene.org" => "pseudogene",
+ "horde_id" => "horde",
+ "merops" => "merops",
+ "imgt" => "imgt",
+ "iuphar" => "iuphar",
+ "kznf_gene_catalog" => "kznf",
+ "mamit-trnadb" => "mamit",
+ "cd" => "hcdm",
+ "lncrnadb" => "lncrnadb",
+ "enzyme_id" => "ec",
+ "intermediate_filament_db" => "intermediate_filament_db",
+ "rna_central_ids" => "rna_central_ids",
+ "lncipedia" => "lncipedia",
+ "gtrnadb" => "gtrnadb",
+ // "agr" => "agr" #uses hgnc?
+ );
+ foreach($idmap AS $fieldname => $prefix) {
+ if(!empty($r[$h[$fieldname]])){
+ $s = $r[$h[$fieldname]];
+ $identifiers = explode("|", $s);
+ foreach ($identifiers as $identifier) {
+ // some identifiers are prefixed...
+ $pos = strpos($identifier,":");
+ if($pos !== FALSE) {
+ $identifier = substr($identifier, strpos($identifier, ":")+1);
+ }
+
+ parent::addRDF(
+ parent::triplify($id, $this->getVoc()."x-".$prefix, $prefix.":".$identifier).
+ parent::describeProperty($this->getVoc()."x-".$prefix, $prefix, "reference to an entry in the $prefix database")
);
}
}
}
- if(!empty($pubmed_ids)){
- $pubmed_ids = explode(", ", $pubmed_ids);
- foreach ($pubmed_ids as $pubmed_id) {
- parent::AddRDF(
- parent::triplify($id_res, $this->getVoc()."x-pubmed", "pubmed:".trim($pubmed_id)).
- parent::describeProperty($this->getVoc()."x-pubmed", "NCBI PubMed entry","Identifier that links to published articles relevant to the entry in the NCBI's PubMed database.")
- );
- }
- }
- if(!empty($refseq_ids)){
- $refseq_ids = explode(", ", $refseq_ids);
- foreach ($refseq_ids as $refseq_id) {
- parent::AddRDF(
- parent::triplify($id_res, $this->getVoc()."x-refseq", "refseq:".trim($refseq_id)).
- parent::describeProperty($this->getVoc()."x-refseq", "NCBI Refseq entry","The Reference Sequence (RefSeq) identifier for that entry, provided by the NCBI. As we do not aim to curate all variants of a gene only one selected RefSeq is displayed per gene report. RefSeq aims to provide a comprehensive, integrated, non-redundant set of sequences, including genomic DNA, transcript (RNA), and protein products. RefSeq identifiers are designed to provide a stable reference for gene identification and characterization, mutation analysis, expression studies, polymorphism discovery, and comparative analyses. In the HTML results page this ID links to the RefSeq page for that entry.")
- );
- }
- }
- if(!empty($gene_family_tag)){
+ if(!empty($r[$h['gene_family_id']])){
+ $s = $r[$h['gene_family_id']];
parent::AddRDF(
- parent::triplifyString($id_res, $this->getVoc()."gene-family-tag", utf8_encode(htmlspecialchars($gene_family_tag))).
+ parent::triplifyString($id, $this->getVoc()."gene-family-tag", utf8_encode(htmlspecialchars($s))).
parent::describeProperty($this->getVoc()."gene-family-tag", "Gene Family Tag","Tag used to designate a gene family or group the gene has been assigned to, according to either sequence similarity or information from publications, specialist advisors for that family or other databases. Families/groups may be either structural or functional, therefore a gene may belong to more than one family/group. These tags are used to generate gene family or grouping specific pages at genenames.org and do not necessarily reflect an official nomenclature. Each gene family has an associated gene family tag and gene family description. If a particular gene is a member of more than one gene family, the tags and the descriptions will be shown in the same order.")
);
}
- if(!empty($gene_family_description)){
- $gene_family_description = str_replace("\"", "", $gene_family_description);
+ if(!empty($r[$h['gene_family']])){
+ $s = $r[$h['gene_family']];
parent::AddRDF(
- parent::triplifyString($id_res, $this->getVoc()."gene-family-description", utf8_encode(htmlspecialchars($gene_family_description))).
+ parent::triplifyString($id, $this->getVoc()."gene-family-description", utf8_encode(htmlspecialchars($s))).
parent::describeProperty($this->getVoc()."gene-family-description", "gene family name","Name given to a particular gene family. The gene family description has an associated gene family tag. Gene families are used to group genes according to either sequence similarity or information from publications, specialist advisors for that family or other databases. Families/groups may be either structural or functional, therefore a gene may belong to more than one family/group.")
);
}
- if(!empty($record_type)){
- parent::AddRDF(
- parent::triplifyString($id_res, $this->getVoc()."record-type", utf8_encode(htmlspecialchars($record_type)))
- );
- }
- if(!empty($primary_ids)){
- $primary_ids = explode(", ", $primary_ids);
- foreach ($primary_ids as $primary_id) {
- parent::AddRDF(
- parent::triplifyString($id_res, $this->getVoc()."primary-id", utf8_encode(htmlspecialchars($primary_id))).
- parent::describeProperty($this->getVoc()."primary-id", "primary identifier")
- );
- }
- }
- if(!empty($secondary_ids)){
- $secondary_ids = explode(", ", $secondary_ids);
- foreach ($secondary_ids as $secondary_id) {
- parent::AddRDF(
- parent::triplifyString($id_res, $this->getVoc()."secondary-id", utf8_encode(htmlspecialchars($secondary_id))).
- parent::describeProperty($this->getVoc()."secondary-id", "secondary identifier")
- );
- }
- }
- if(!empty($ccd_ids)){
- $ccd_ids = explode(", ", $ccd_ids);
- foreach ($ccd_ids as $ccd_id) {
- parent::AddRDF(
- parent::triplify($id_res, $this->getVoc()."x-ccds", "ccds:".trim($ccd_id)).
- parent::describeProperty($this->getVoc()."x-ccds", "consensus CDS entry","The Consensus CDS (CCDS) project is a collaborative effort to identify a core set of human and mouse protein coding regions that are consistently annotated and of high quality. The long term goal is to support convergence towards a standard set of gene annotations.")
- );
- }
- }
- if(!empty($vega_ids)){
- $vega_ids = explode(", ", $vega_ids);
- foreach ($vega_ids as $vega_id) {
- parent::AddRDF(
- parent::triplify($id_res, $this->getVoc()."x-vega", "vega:".trim($vega_id)).
- parent::describeProperty($this->getVoc()."x-vega", "VEGA gene entry")
- );
- }
- }
- if(!empty($locus_specific_databases)){
- parent::AddRDF(
- parent::triplifyString($id_res, $this->getVoc()."locus-specific-xref", utf8_encode(htmlspecialchars($locus_specific_databases))).
- parent::describeProperty($this->getVoc()."locus-specific-xref", "locus specific xref", "This contains a list of links to databases or database entries pertinent to the gene")
- );
- }
- if(!empty($entrez_gene_id_mappeddatasuppliedbyNCBI)){
- $entrez_gene_id_mappeddatasuppliedbyNCBI = explode(", ", $entrez_gene_id_mappeddatasuppliedbyNCBI);
- foreach ($entrez_gene_id_mappeddatasuppliedbyNCBI as $gene_id) {
- if(strstr($gene_id, ":") !== FALSE){
- $a = explode(":", $gene_id);
- $gene_id = $a[1];
- }
- parent::AddRDF(
- parent::triplify($id_res, $this->getVoc()."x-ncbigene", "ncbigene:".trim($gene_id)).
- parent::describeProperty($this->getVoc()."x-ncbigene", "NCBI Gene entry")
- );
-
- }
- }
- if(!empty($omim_id_mappeddatasuppliedbyNCBI)){
- $omim_id_mappeddatasuppliedbyNCBI = explode(", ", $omim_id_mappeddatasuppliedbyNCBI);
- foreach ($omim_id_mappeddatasuppliedbyNCBI as $omim_id) {
- parent::AddRDF(
- parent::triplify($id_res, $this->getVoc()."x-omim", "omim:".trim($omim_id)).
- parent::describeProperty($this->getVoc()."x-omim", "OMIM entry","Identifier provided by Online Mendelian Inheritance in Man (OMIM) at the NCBI. This database is described as a catalog of human genes and genetic disorders containing textual information and links to MEDLINE and sequence records in the Entrez system, and links to additional related resources at NCBI and elsewhere. In the HTML results page this ID links to the OMIM page for that entry.")
- );
- }
- }
- if(!empty($refseq_mappeddatasuppliedbyNCBI)){
- $refseq_mappeddatasuppliedbyNCBI = explode(", ", $refseq_mappeddatasuppliedbyNCBI);
- foreach ($refseq_mappeddatasuppliedbyNCBI as $refseq_id) {
- parent::AddRDF(
- parent::triplify($id_res, $this->getVoc()."x-refseq", "refseq:".trim($refseq_id)).
- parent::describeProperty($this->getVoc()."x-refseq", "NCBI Refseq entry","The Reference Sequence (RefSeq) identifier for that entry, provided by the NCBI. As we do not aim to curate all variants of a gene only one selected RefSeq is displayed per gene report. RefSeq aims to provide a comprehensive, integrated, non-redundant set of sequences, including genomic DNA, transcript (RNA), and protein products. RefSeq identifiers are designed to provide a stable reference for gene identification and characterization, mutation analysis, expression studies, polymorphism discovery, and comparative analyses. In the HTML results page this ID links to the RefSeq page for that entry.")
- );
- }
- }
- if(!empty($uniprot_id_mappeddatasuppliedbyUniProt)){
- $uniprot_id_mappeddatasuppliedbyUniProt = explode(", ", $uniprot_id_mappeddatasuppliedbyUniProt);
- foreach ($uniprot_id_mappeddatasuppliedbyUniProt as $uniprot_id) {
- parent::AddRDF(
- parent::triplify($id_res, $this->getVoc()."x-uniprot", "uniprot:".trim($uniprot_id)).
- parent::describeProperty($this->getVoc()."x-uniprot", "Uniprot entry","The UniProt identifier, provided by the EBI. The UniProt Protein Knowledgebase is described as a curated protein sequence database that provides a high level of annotation, a minimal level of redundancy and high level of integration with other databases. In the HTML results page this ID links to the UniProt page for that entry.")
- );
- }
- }
- if(!empty($ensembl_id_mappeddatasuppliedbyEnsembl)){
- $ensembl_id_mappeddatasuppliedbyEnsembl = explode(", ", $ensembl_id_mappeddatasuppliedbyEnsembl);
- foreach ($ensembl_id_mappeddatasuppliedbyEnsembl as $ensembl_id) {
- parent::AddRDF(
- parent::triplify($id_res, $this->getVoc()."x-ensembl", "ensembl:".trim($refseq_id)).
- parent::describeProperty($this->getVoc()."x-ensembl", "Ensembl entry","The Ensembl ID is derived from the current build of the Ensembl database and provided by the Ensembl team.")
- );
- }
- }
-
- if(!empty($ucsc_id_mappeddatasuppliedbyVega)){
- $ucsc_id_mappeddatasuppliedbyVega = explode(", ", $ucsc_id_mappeddatasuppliedbyVega);
- foreach ($ucsc_id_mappeddatasuppliedbyVega as $vega_id) {
- parent::AddRDF(
- parent::triplify($id_res, $this->getVoc()."x-vega", "vega:".trim($vega_id)).
- parent::describeProperty($this->getVoc()."x-vega", "Vega entry")
- );
- }
- }
- if(!empty($ucsc_id_mappeddatasuppliedbyUCSC)){
- $ucsc_id_mappeddatasuppliedbyUCSC = explode(", ", $ucsc_id_mappeddatasuppliedbyUCSC);
- foreach ($ucsc_id_mappeddatasuppliedbyUCSC as $ucsc_id) {
- parent::AddRDF(
- parent::triplify($id_res, $this->getVoc()."x-ucsc", "ucsc:".trim($ucsc_id)).
- parent::describeProperty($this->getVoc()."x-ucsc", "UCSC entry")
- );
- }
- }
- if(!empty($mouse_genome_database_id_mappeddatasuppliedbyMGI)){
- $mouse_genome_database_id_mappeddatasuppliedbyMGI = explode(", ", $mouse_genome_database_id_mappeddatasuppliedbyMGI);
- foreach ($mouse_genome_database_id_mappeddatasuppliedbyMGI as $mgi_id) {
- if(strpos($mgi_id, "MGI:") !== FALSE){
- $mgi_id = substr($mgi_id, 4);
- }
- parent::AddRDF(
- parent::triplify($id_res, $this->getVoc()."x-mgi", "mgi:".trim($mgi_id)).
- parent::describeProperty($this->getVoc()."x-mgi", "MGI entry")
- );
- }
- }
- if(!empty($rat_genome_database_id_mappeddatasuppliedbyRGD)){
- $rat_genome_database_id_mappeddatasuppliedbyRGD = explode(", ", trim($rat_genome_database_id_mappeddatasuppliedbyRGD));
- foreach ($rat_genome_database_id_mappeddatasuppliedbyRGD as $rgd_id) {
- $rgd_id = trim($rgd_id);
- if(!empty($rgd_id)){
- parent::AddRDF(
- parent::triplify($id_res, $this->getVoc()."x-rgd", trim($rgd_id)).
- parent::describeProperty($this->getVoc()."x-rgd", "RGD entry")
- );
- }
- }
- }
//write RDF to file
$this->WriteRDFBufferToWriteFile();
}//while
diff --git a/interpro/interpro.php b/interpro/interpro.php
index d4342ca..8d074bb 100644
--- a/interpro/interpro.php
+++ b/interpro/interpro.php
@@ -173,15 +173,20 @@ function Parse($xml)
}
}
}
- $abstract = (string) $o->abstract->p->asXML();
- if(isset($pubs)) {
- $abstract = str_replace($pubs['pid'],$pubs['pmid'],$abstract);
- }
-
- parent::addRDF(
- parent::triplifyString($s,"dc:description",$abstract)
- );
+ if(isset($o->abstract)) {
+ $abstract = (string) $o->abstract->p->asXML();
+ if(isset($pubs)) {
+ $abstract = str_replace($pubs['pid'],$pubs['pmid'],$abstract);
+ }
+ $abstract= preg_replace('/(?i)<[^>]*>/', ' ', $abstract); #remove html tags
+ $abstract = trim(preg_replace("/\s+/",' ',$abstract)); # remove extra spaces
+ $abstract = addslashes($abstract);
+
+ parent::addRDF(
+ parent::triplifyString($s,"dc:description",$abstract)
+ );
+ }
if(isset($o->example_list)) {
foreach($o->example_list->example AS $example) {
$db = (string) $example->db_xref->attributes()->db;
diff --git a/irefindex/irefindex.php b/irefindex/irefindex.php
index 227f6b1..db27c55 100644
--- a/irefindex/irefindex.php
+++ b/irefindex/irefindex.php
@@ -34,8 +34,8 @@ class irefindexParser extends Bio2RDFizer
function __construct($argv) { //
parent::__construct($argv,"irefindex");
parent::addParameter('files',true,'all|10090|10116|4932|559292|562|6239|7227|9606','all','all or comma-separated list of files to process');
- parent::addParameter('version',false,'07042015|08122013|03022013|10182011','07042015','dated version of files to download');
- parent::addParameter('download_url',false,null,'http://irefindex.org/download/irefindex/data/current/psi_mitab/MITAB2.6/');
+ parent::addParameter('version',false,'07042015|08122013|03022013|10182011','05-29-2019','dated version of files to download');
+ parent::addParameter('download_url',false,null,'https://irefindex.vib.be/download/irefindex/data/archive/release_16.0/psi_mitab/MITAB2.6/');
parent::initialize();
}
@@ -163,7 +163,7 @@ function Parse()
trigger_erorr("Expecting 54 columns, found $c!");
return FALSE;
}
-
+ #print_r($header);exit;
// check # of columns
while($l = parent::getReadFile()->read(500000)) {
$a = explode("\t",trim($l));
@@ -280,14 +280,14 @@ function Parse()
}
// add the alternatives through the taxon + seq redundant group
- for($i=2;$i<=3;$i++) {
+ for($i=0;$i<1;$i++) {
$taxid = '';
- $rogid = "irefindex.".$a[32+($i-2)];
+ $rogid = "irefindex.".$a[32+$i];
parent::addRDF(
parent::describeIndividual($rogid,"",parent::getVoc()."Taxon-Sequence-Identical-Group").
parent::describeClass(parent::getVoc()."Taxon-Sequence-Identical-Group","Taxon + Sequence Identical Group")
);
- $tax = $a[9+($i-2)];
+ $tax = $a[9+$i];
if($tax && $tax != '-' && $tax != '-1') {
$data = $this->ParseStringArray($tax);
$taxid = trim($data["ns"]).":".trim($data["id"]);
@@ -296,7 +296,8 @@ function Parse()
);
}
- $list = explode("|",$a[3+($i-2)]);
+ // parse the alternative identifiers
+ $list = explode("|",$a[2+$i]);
foreach($list AS $item) {
$data = $this->ParseStringArray($item);
$ns = trim($data["ns"]);
@@ -311,6 +312,24 @@ function Parse()
);
}
}
+
+ // parse the aliases
+ $list = explode("|",$a[4+$i]);
+ foreach($list AS $item) {
+ $data = $this->ParseStringArray($item);
+ $ns = trim($data["ns"]);
+ $id = trim($data["id"]);
+ $qname = $ns.":".$id;
+ if($ns && $ns != 'rogid' && $ns != 'irogid' and $ns != 'icrogid' and $id != '-') {
+ parent::addRDF(
+ parent::triplify($rogid,parent::getVoc()."has-member",$qname)
+ );
+ if($taxid && $taxid != '-' && $taxid != '-1') parent::addRDF(
+ parent::triplify($qname,parent::getVoc()."x-taxonomy",$taxid)
+ );
+ }
+ }
+
}
// publications
diff --git a/kegg/kegg.php b/kegg/kegg.php
index fe393e9..01a7404 100644
--- a/kegg/kegg.php
+++ b/kegg/kegg.php
@@ -59,7 +59,8 @@ function run()
// handle genes separately
if(in_array("genes",$files)) {
- $orgs = array("hsa"); //,"mmu","eco","dre","dme","ath","sce","ddi");
+ $orgs = array("hsa","mmu","eco","dre","dme","ath","sce","ddi");
+ //$orgs = array("hsa");
echo "processing genes".PHP_EOL;
$ofile = "kegg-genes.".parent::getParameterValue('output_format');
@@ -82,7 +83,7 @@ function run()
// get the list of genes for this organims
echo "processing $org".PHP_EOL;
- $this->org = $org; // local variable
+ $this->org = strtoupper($org); // local variable
$lfile = $ldir.$org.".txt";
$rfile = parent::getParameterValue("download_url")."list/$org";
@@ -223,9 +224,10 @@ function process($db)
if(isset($this->idlist) and !in_array($id,$this->idlist)) continue;
if(isset($this->org)) {
- $id = $ns."_".$id;
+ $id = strtoupper($ns)."_".$id;
}
$uri = $this->getNamespace().$id;
+
parent::addRDF(
parent::describeIndividual($uri,$name,parent::getVoc().ucfirst($db)).
parent::describeClass(parent::getVoc().ucfirst($db),"KEGG $db").
@@ -293,6 +295,7 @@ function parseEntry($lfile)
$uri = parent::getNamespace().$e['id'];
continue;
}
+
// key with value
if(in_array($k, array("NAME","DESCRIPTION","DEFINITION","EQUATION","COMMENT"))) {
if($k == "NAME") {
@@ -312,13 +315,16 @@ function parseEntry($lfile)
parent::addRDF(
parent::triplifyString($uri,"dc:description",$v)
);
- } else if($k == "DEFINITION" and $e['type'] == "KO") {
- preg_match("/\[([^\]]+)\]/",$v,$m);
- if(isset($m[1])) {
+ } else if($k == "DEFINITION" and $e['type'] == "KO") {
+ preg_match("/\[EC:([^\]]+)/",$v,$m);
+ if(isset($m[1])) {
+ $a = explode(" ", $m[1]);
+ foreach($a AS $b) {
parent::addRDF(
- parent::triplify($uri,parent::getVoc()."x-ec",$m[1])
- );
+ parent::triplify($uri,parent::getVoc()."x-ec","ec:".$b)
+ );
}
+ }
} else if($k == "COMMENT") {
preg_match("/ICD-O: ([^,]+),/",$v,$m);
if(isset($m[1])) {
@@ -468,12 +474,12 @@ function parseEntry($lfile)
echo "parse error: ".$k." ".$v.PHP_EOL;continue;
}
$str = $a[1];
-
+
foreach($ids AS $id) {
- $o = '';
- $o['id'] = $id;
- $o['label'] = $str;
- $o['type'] = strtolower($k);
+ #$o = '';
+ #$o['id'] = $id;
+ #$o['label'] = $str;
+ #$o['type'] = strtolower($k);
parent::addRDF(
parent::triplify($uri,parent::getVoc().strtolower($k),"kegg:$id")
);
@@ -688,14 +694,19 @@ function parseEntry($lfile)
parent::triplify($uri,parent::getVoc().strtolower($k),$id)
);
preg_match_all("/ \[([^\]]+)\]/",$v,$m);
- if(isset($m[1])) {
- foreach($m[1] AS $item) {
- if(!strstr($item,"KO")) $item = "kegg:".str_replace(":","_",$item);
- else $item = str_replace("KO:","kegg:",$item);
- parent::addRDF(
- parent::triplify($id,parent::getVoc()."link",$item)
- );
+ if(isset($m[1]) and !empty($m[1])) {
+ foreach($m[1] AS $item) {
+ $a = explode(':',$item); // get the namespace
+ $b = explode(' ',$a[1]);
+ foreach($b AS $c) {
+ if(!strstr($item,"KO")) $i = "kegg:".$a[0].'_'.$c;
+ else $i = "kegg:".$c;
+ parent::addRDF(
+ parent::triplify($id,parent::getVoc()."link",$i)
+ );
+ }
}
+ $test = true;
}
continue;
}
diff --git a/mesh/mesh.php b/mesh/mesh.php
index d40313f..f1b726e 100644
--- a/mesh/mesh.php
+++ b/mesh/mesh.php
@@ -150,8 +150,8 @@ class MeshParser extends Bio2RDFizer{
function __construct($argv) {
parent::__construct($argv, "mesh");
parent::addParameter('files', true, 'all|descriptors|qualifiers|supplementary', 'all', 'all or comma-separated list of files to process');
- parent::addParameter('download_url',false,'','ftp://nlmpubs.nlm.nih.gov/online/mesh/.asciimesh/','default ftp location');
- parent::addParameter('year', false, '','2014',"Year to process");
+ parent::addParameter('download_url',false,'','ftp://nlmpubs.nlm.nih.gov/online/mesh/YEAR/asciimesh/','default ftp location');
+ parent::addParameter('year', false, '','2019',"Year to process");
parent::initialize();
}//constructor
@@ -180,7 +180,7 @@ function Run(){
$file = str_replace("YEAR",$year,$fpattern);
$lfile = $ldir.$file;
$rfile = parent::getParameterValue("download_url").$file;
-
+ $rfile = str_replace("YEAR",$year,$rfile);
// download if necessary
if(!file_exists($lfile) || parent::getParameterValue('download') == "true") {
echo "Downloading $file ... ";
@@ -193,7 +193,7 @@ function Run(){
}
//set the outfile
- $ofile = "mesh_".$k.".".parent::getParameterValue('output_format');
+ $ofile = "bio2rdf-mesh-".$k.".".parent::getParameterValue('output_format');
$gz= strstr(parent::getParameterValue('output_format'), "gz")?true:false;
echo "processing $k ...";
@@ -249,9 +249,8 @@ function Run(){
private function supplementary(){
$sup_rec = "";
- while($aLine = $this->GetReadFile()->Read(200000)){
- preg_match("/^\n$/", $aLine, $matches);
- if(count($matches)){
+ while(FALSE !== ($aLine = $this->GetReadFile()->Read(200000))){
+ if(strlen($aLine) == 0){
$dR = $this->readRecord($sup_rec);
$this->makeSupplementaryRecord($dR);
$sup_rec = "";
@@ -259,15 +258,14 @@ private function supplementary(){
}
preg_match("/\*NEWRECORD/", $aLine, $matches);
if(count($matches) == 0){
- $sup_rec .= $aLine;
+ $sup_rec .= $aLine.PHP_EOL;
}
}
}
private function descriptors(){
$descriptor_record = "";
- while($aLine = $this->GetReadFile()->Read(200000)){
- preg_match("/^\n$/", $aLine, $matches);
- if(count($matches)){
+ while(FALSE !== ($aLine = $this->GetReadFile()->Read(200000))){
+ if(strlen($aLine) == 0){
$dR = $this->readRecord($descriptor_record);
$this->makeDescriptorRecord($dR);
$descriptor_record = "";
@@ -275,16 +273,15 @@ private function descriptors(){
}
preg_match("/\*NEWRECORD/", $aLine, $matches);
if(count($matches) == 0){
- $descriptor_record .= $aLine;
+ $descriptor_record .= $aLine.PHP_EOL;
}
}
}
private function qualifiers(){
$qualifier_record = "";
- while($aLine = $this->GetReadFile()->Read(200000)){
- preg_match("/^\n$/", $aLine, $matches);
- if(count($matches)){
+ while(FALSE !== ($aLine = $this->GetReadFile()->Read(200000))){
+ if(strlen($aLine) == 0){
$qR = $this->readRecord($qualifier_record);
$this->makeQualifierRecordRDF($qR);
$qualifier_record = "";
@@ -292,7 +289,7 @@ private function qualifiers(){
}
preg_match("/\*NEWRECORD/", $aLine, $matches);
if(count($matches) == 0){
- $qualifier_record .= $aLine;
+ $qualifier_record .= $aLine.PHP_EOL;
}
}
}
@@ -437,7 +434,7 @@ private function makeSupplementaryRecord($sup_record_arr){
if($k == "SO"){
foreach($v as $kv => $vv){
parent::AddRDF(
- parent::triplifyString($sr_res, $this->getVoc().$sde['SO'], utf8_encode(htmlspecialchars($vv))).
+ parent::triplifyString($sr_res, $this->getVoc().$sde['SO'], addslashes(utf8_encode(htmlspecialchars($vv)))).
parent::describeProperty($this->getVoc().$sde['SO'], "Relationship between a supplementary record and its source")
);
}
@@ -499,7 +496,7 @@ private function makeDescriptorRecord($desc_record_arr){
$vvrar = explode(";", $vv);
foreach($vvrar as $anAn){
parent::AddRDF(
- parent::triplifyString($dr_res, $this->getVoc().$qde["AN"], $anAn).
+ parent::triplifyString($dr_res, $this->getVoc().$qde["AN"], addslashes($anAn)).
parent::describeProperty($this->getVoc().$qde["AN"], "Relationship between a descriptor and its annotation")
);
}//foreach
@@ -866,7 +863,7 @@ private function makeQualifierRecordRDF($qual_record_arr){
$vvrar = explode(";", $vv);
foreach($vvrar as $anAn){
parent::AddRDF(
- parent::triplifyString($qr_res, $this->getVoc().$qde["AN"], $anAn).
+ parent::triplifyString($qr_res, $this->getVoc().$qde["AN"], addslashes($anAn)).
parent::describeProperty($this->getVoc().$qde["AN"], "Relationship between a qualifier record and its annotation")
);
}//foreach
diff --git a/mgi/mgi.php b/mgi/mgi.php
index 5b6f55d..3ceed95 100644
--- a/mgi/mgi.php
+++ b/mgi/mgi.php
@@ -36,7 +36,7 @@ class MGIParser extends Bio2RDFizer
function __construct($argv) {
parent::__construct($argv, "mgi");
parent::addParameter('files',true,'all|MGI_Strain|MGI_PhenotypicAllele|MGI_GenePheno|MRK_Sequence|MGI_Geno_Disease|MGI_Geno_NotDisease','all','all or comma-separated list to process');
- parent::addParameter('download_url', false, null,'ftp://ftp.informatics.jax.org/pub/reports/' );
+ parent::addParameter('download_url', false, null,'http://www.informatics.jax.org/downloads/reports/' );
parent::initialize();
}
@@ -66,7 +66,7 @@ function Run()
parent::setReadFile($lfile,true);
echo "Processing $item...";
- $ofile = $odir.$item.'.'.parent::getParameterValue('output_format');
+ $ofile = $odir."bio2rdf-".$item.'.'.parent::getParameterValue('output_format');
$gz= strstr(parent::getParameterValue('output_format'), "gz")?true:false;
parent::setWriteFile($ofile, $gz);
@@ -141,8 +141,9 @@ function MGI_PhenotypicAllele($qtl = false)
$a = explode("\t",$l);
$line++;
if($a[0][0] == "#") continue;
- if(count($a) != 12) {
- echo "Expecting 12 columns, but found ".count($a)." at line $line. skipping!".PHP_EOL;
+ $expected_columns = 13;
+ if(count($a) != $expected_columns) {
+ echo "Expecting $expected_columns columns, but found ".count($a)." at line $line. skipping!".PHP_EOL;
if($errors++ == 25) {echo 'stopping'.PHP_EOL;break;}
continue;
}
@@ -232,19 +233,19 @@ function MGI_PhenotypicAllele($qtl = false)
4 Mammalian Phenotype ID - MP:0000364
5 PubMed ID - 15466160
6 MGI Marker Accession ID (comma-delimited) - MGI:96522
- 7 blank
- 8 MGI Genotype ID (comma-delimted)
+ 7 MGI Genotype ID (comma-delimited)
*/
function MGI_GenePheno()
{
$line = 1;
while($l = $this->getReadFile()->read(248000)) {
$a = explode("\t",$l);
- if(count($a) != 9) {
- trigger_error("Incorrect number of columns",E_USER_WARNING);
- continue;
+ $exp = 8;
+ if(count($a) != $exp) {
+ trigger_error("Incorrect number of columns: Found ".count($a)." and was expecting $exp",E_USER_WARNING);
+ exit();
}
- $id = trim($a[8]);
+ $id = trim($a[7]);
$label = $a[0]." ".$a[3];
parent::addRDF(
@@ -310,7 +311,7 @@ function MGI_GenePheno()
function MRK_Sequence()
{
- $cols = 21;
+ $cols = 19;
$line = 0;
$h = $this->getReadFile()->read(500000);
$o = $this->getReadFile()->read(500000); // extra feature header on a separate line...if you can imagine
@@ -335,11 +336,10 @@ function MRK_Sequence()
parent::triplifyString($id, parent::getVoc()."chromosome", $a[6], "xsd:string").
parent::triplifyString($id, parent::getVoc()."genome-start", $a[7], "xsd:string").
parent::triplifyString($id, parent::getVoc()."genome-end", $a[8], "xsd:string").
- parent::triplifyString($id, parent::getVoc()."strand", $a[7], "xsd:string").
- parent::triplifyString($id, parent::getVoc()."feature-type", $a[20], "xsd:string")
+ parent::triplifyString($id, parent::getVoc()."strand", $a[9], "xsd:string") // strand is column 9; column 7 is the genome start already emitted above
);
$start_pos = 10;
- $list = array("genbank","refseq-transcript","vega-transcript","ensembl-transcript","uniprot","trembl","vega-protein","ensembl-protein","refseq-protein","unigene");
+ $list = array("genbank","refseq-transcript","ensembl-transcript","uniprot","trembl","ensembl-protein","refseq-protein","unigene");
$list_len = count($list);
for($i=0;$i<$list_len;$i++) {
$value = trim($a[$i+$start_pos]);
@@ -413,7 +413,7 @@ function MGI_Geno_Disease()
$genotype = $a[0];
$diseases = explode(",",$a[7]);
foreach($diseases AS $d) {
- $disease = "omim:$d";
+ $disease = "$d";
foreach($alleles AS $allele) {
$id = parent::getRes().md5($allele.$disease);
$label = "$allele $disease association";
@@ -462,7 +462,7 @@ function MGI_Geno_NotDisease()
$alleles = explode("|",strtolower($a[2]));
$diseases = explode(",",$a[7]);
foreach($diseases AS $d) {
- $disease = "omim:$d";
+ $disease = "$d";
foreach($alleles AS $allele) {
$id = parent::getRes().md5($allele.$disease);
diff --git a/miriam/miriam.php b/miriam/miriam.php
index 31bd80d..5c740ab 100644
--- a/miriam/miriam.php
+++ b/miriam/miriam.php
@@ -159,7 +159,7 @@ function parseItem($item)
else $mylist = $i;
foreach($mylist AS $myitem) {
if(strstr($myitem, "pubmed")) $uri = "pubmed:".substr($myitem, strrpos($myitem, ":")+1);
- else if(strstr($myitem, "doi")) $uri = "http://dx.doi.org/".substr($myitem, strpos($myitem, "doi:"));
+ else if(strstr($myitem, "doi")) $uri = "https://doi.org/".substr($myitem, strpos($myitem, "doi:"));
else $uri = $myitem;
parent::addRDF(
diff --git a/ncbigene/ncbigene.php b/ncbigene/ncbigene.php
index 5943d2c..31c7f5a 100644
--- a/ncbigene/ncbigene.php
+++ b/ncbigene/ncbigene.php
@@ -39,10 +39,10 @@ class NCBIGeneParser extends Bio2RDFizer
"gene2ensembl" => "gene2ensembl.gz",
"gene2go" => "gene2go.gz",
"gene2pubmed" => "gene2pubmed.gz",
- "gene2refseq" => "gene2refseq.gz",
- "gene2sts" => "gene2sts",
- "gene2unigene" => "gene2unigene",
- "gene2vega" => "gene2vega.gz",
+ "gene2refseq" => "gene2refseq.gz"
+ #"gene2sts" => "gene2sts",
+ #"gene2unigene" => "gene2unigene",
+ #"gene2vega" => "gene2vega.gz",
);
private $taxids = null;
private $default_taxids = array(
@@ -64,7 +64,7 @@ function __construct($argv) {
parent::__construct($argv,"ncbigene");
// set and print application parameters
- parent::addParameter('files',true,'all|geneinfo|gene2accession|gene2ensembl|gene2go|gene2pubmed|gene2refseq|gene2sts|gene2unigene|gene2vega','','files to process');
+ parent::addParameter('files',true,'all|geneinfo|gene2accession|gene2ensembl|gene2go|gene2pubmed|gene2refseq','','files to process'); # |gene2sts|gene2unigene|gene2vega were removed
parent::addParameter('download_url',false,null,'ftp://ftp.ncbi.nih.gov/gene/DATA/');
parent::addParameter('limit_organisms',false,'true|false','true','flag to use specified organisms');
parent::addParameter('organisms',false,null,implode(",",array_keys($this->default_taxids)),'taxonomy ids for organisms to process');
@@ -129,7 +129,7 @@ function process()
$file = $module.".gz";
$lfile = $ldir.$file;
$rfile = $rdir.$rfilename;
- $ofile = $module.".".parent::getParameterValue('output_format');
+ $ofile = "bio2rdf-".$module.".".parent::getParameterValue('output_format');
$gz = false;
if(strstr(parent::getParameterValue('output_format'), "gz")) $gz = true;
@@ -140,6 +140,7 @@ function process()
$fnx = $module;
if($module == 'gene2refseq') $fnx = 'gene2accession';
$this->$fnx();
+
parent::clear();
echo 'done!'.PHP_EOL;
@@ -343,7 +344,7 @@ private function gene2accession()
$z = 1;
while($l = $this->getReadFile()->read(200000)){
if($l[0] == "#") continue;
- if(($z++) % 10000 == 0) {echo $z.PHP_EOL;parent::clear();}
+ if(($z++) % 100000 == 0) {echo $z.PHP_EOL;parent::clear();}
$a = explode("\t",rtrim($l));
if(count($a) != 16) { trigger_error("gene2accession: expecting 16 columns, found ".count($a)." instead", E_USER_ERROR);}
$taxid = $a[0];
diff --git a/ndc/ndc.php b/ndc/ndc.php
index 3c0f3fd..7b1f93e 100644
--- a/ndc/ndc.php
+++ b/ndc/ndc.php
@@ -36,7 +36,7 @@ function __construct($argv) {
parent::__construct($argv, "ndc");
$this->AddParameter('files',true,'all|product|package','all','files to process');
- $this->AddParameter('download_url',false,null,'http://www.fda.gov/downloads/Drugs/DevelopmentApprovalProcess/UCM070838.zip');
+ $this->AddParameter('download_url',false,null,'https://www.accessdata.fda.gov/cder/ndctext.zip');
parent::initialize();
}
diff --git a/orphanet/orphanet.php b/orphanet/orphanet.php
index ae5651e..54a9029 100644
--- a/orphanet/orphanet.php
+++ b/orphanet/orphanet.php
@@ -35,14 +35,13 @@ class ORPHANETParser extends Bio2RDFizer
{
private $filemap = array(
'disease' => 'en_product1.xml',
- 'epi' => 'en_product2.xml',
- 'd2s' => 'en_product4.xml',
- 'signs' => 'en_product5.xml',
+ 'prevalence' => 'en_product9_prev.xml',
+ 'phenotypefreq' => 'en_product4.xml',
'genes' => 'en_product6.xml'
);
function __construct($argv) {
parent::__construct($argv, "orphanet");
- parent::addParameter('files',true,'all|disease|epi|d2s|signs|genes','all','all or comma-separated list of ontology short names to process');
+ parent::addParameter('files',true,'all|disease|genes|phenotypefreq|prevalence','all','all or comma-separated list of ontology short names to process');
parent::addParameter('download_url',false,null,'http://www.orphadata.org/data/xml/');
parent::initialize();
}
@@ -79,43 +78,43 @@ function run()
$ofile = "orphanet-".$file.'.'.$suffix;
$gz = strstr(parent::getParameterValue('output_format'), "gz")?($gz=true):($gz=false);
-/* parent::setWriteFile($odir.$ofile, $gz);
+ parent::setWriteFile($odir.$ofile, $gz);
$this->$file($lfile);
parent::getWriteFile()->close();
-*/ parent::getReadFile()->close();
+ parent::getReadFile()->close();
parent::clear();
echo "done!".PHP_EOL;
// dataset description
- $source_file = (new DataResource($this))
- ->setURI($rfile)
- ->setTitle("Orphanet: $file")
- ->setRetrievedDate(parent::getDate(filemtime($lfile)))
- ->setFormat("application/xml")
- ->setPublisher("http://www.orpha.net")
- ->setHomepage("http://www.orpha.net/")
- ->setRights("use")
- ->setRights("sharing-modified-version-needs-permission")
- ->setLicense("http://creativecommons.org/licenses/by-nd/3.0/")
- ->setDataset("http://identifiers.org/orphanet/");
-
- $prefix = parent::getPrefix();
- $bVersion = parent::getParameterValue('bio2rdf_release');
- $date = parent::getDate(filemtime($odir.$ofile));
-
- $output_file = (new DataResource($this))
- ->setURI("http://download.bio2rdf.org/release/$bVersion/$prefix/$ofile")
- ->setTitle("Bio2RDF v$bVersion RDF version of $prefix")
- ->setSource($source_file->getURI())
- ->setCreator("https://github.com/bio2rdf/bio2rdf-scripts/blob/master/orphanet/orphanet.php")
- ->setCreateDate($date)
- ->setHomepage("http://download.bio2rdf.org/release/$bVersion/$prefix/$prefix.html")
- ->setPublisher("http://bio2rdf.org")
- ->setRights("use-share-modify")
- ->setRights("by-attribution")
- ->setRights("restricted-by-source-license")
- ->setLicense("http://creativecommons.org/licenses/by/3.0/")
- ->setDataset(parent::getDatasetURI());
+ $source_file = (new DataResource($this))
+ ->setURI($rfile)
+ ->setTitle("Orphanet: $file")
+ ->setRetrievedDate(parent::getDate(filemtime($lfile)))
+ ->setFormat("application/xml")
+ ->setPublisher("http://www.orpha.net")
+ ->setHomepage("http://www.orpha.net/")
+ ->setRights("use")
+ ->setRights("sharing-modified-version-needs-permission")
+ ->setLicense("http://creativecommons.org/licenses/by-nd/3.0/")
+ ->setDataset("http://identifiers.org/orphanet/");
+
+ $prefix = parent::getPrefix();
+ $bVersion = parent::getParameterValue('bio2rdf_release');
+ $date = parent::getDate(filemtime($odir.$ofile));
+
+ $output_file = (new DataResource($this))
+ ->setURI("http://download.bio2rdf.org/release/$bVersion/$prefix/$ofile")
+ ->setTitle("Bio2RDF v$bVersion RDF version of $prefix")
+ ->setSource($source_file->getURI())
+ ->setCreator("https://github.com/bio2rdf/bio2rdf-scripts/blob/master/orphanet/orphanet.php")
+ ->setCreateDate($date)
+ ->setHomepage("http://download.bio2rdf.org/release/$bVersion/$prefix/$prefix.html")
+ ->setPublisher("http://bio2rdf.org")
+ ->setRights("use-share-modify")
+ ->setRights("by-attribution")
+ ->setRights("restricted-by-source-license")
+ ->setLicense("http://creativecommons.org/licenses/by/3.0/")
+ ->setDataset(parent::getDatasetURI());
$gz = (strstr(parent::getParameterValue('output_format'),".gz") === FALSE)?false:true;
if($gz) $output_file->setFormat("application/gzip");
@@ -137,6 +136,7 @@ function disease($file)
foreach($x->Disorder AS $d) {
// var_dump($d);exit;
+
$internal_id = (string) $d->attributes()->id;
$orphanet_id = parent::getNamespace().((string)$d->OrphaNumber);
$name = (string) $d->Name;
@@ -148,9 +148,9 @@ function disease($file)
parent::triplifyString($orphanet_id, parent::getVoc()."internal-id", $internal_id).
parent::triplify($orphanet_id, parent::getVoc()."expert-link-url", $expert_link)
);
-
+ // get the synonyms
foreach($d->SynonymList AS $s) {
- $synonym = (string) $s->Synonym;
+ $synonym = str_replace('"','', (string) $s->Synonym);
parent::addRDF(
parent::triplifyString($orphanet_id, parent::getVoc()."synonym", $synonym)
);
@@ -164,14 +164,30 @@ function disease($file)
);
}
}
+ // get external references
foreach($d->ExternalReferenceList AS $erl) {
- $er = $erl->ExternalReference;
- $source = (string) $er->Source;
- $db = parent::getRegistry()->getPreferredPrefix($source);
- $id = (string) $er->Reference;
- parent::addRDF(
- parent::triplify($orphanet_id, parent::getVoc()."x-$db", "$db:$id")
- );
+ foreach($erl->ExternalReference AS $er) {
+ $source = (string) $er->Source;
+ $db = parent::getRegistry()->getPreferredPrefix($source);
+ $id = (string) $er->Reference;
+ parent::addRDF(
+ parent::triplify($orphanet_id, parent::getVoc()."x-$db", "$db:$id")
+ );
+ }
+ }
+ // get the definition
+ foreach($d->TextualInformationList AS $til) {
+ foreach($til->TextualInformation As $ti) {
+ foreach($ti->TextSectionList AS $tsl) {
+ foreach($tsl->TextSection AS $ts) {
+ if(((string) $ts->TextSectionType->Name) == "Definition") {
+ parent::addRDF(
+ parent::triplifyString($orphanet_id, parent::getVoc()."definition", addslashes((string) $ts->Contents))
+ );
+ };
+ }
+ }
+ }
}
parent::writeRDFBufferToWriteFile();
}
@@ -179,52 +195,121 @@ function disease($file)
unset($xml);
}
- function epi ($file)
+ function prevalence ($file)
{
$seen = '';
$xml = new CXML($file);
while($xml->parse("DisorderList") == TRUE) {
$x = $xml->GetXMLRoot();
foreach($x->Disorder AS $d) {
- // var_dump($d);exit;
+
$orphanet_id = parent::getNamespace().((string)$d->OrphaNumber);
- if(isset($d->ClassOfPrevalence)) {
- $id = parent::getNamespace().((string) $d->ClassOfPrevalence->attributes()->id);
- $name = (string) $d->ClassOfPrevalence->Name;
- if($name != '' && $name != 'Unknown' && $name != 'No data available') {
- if(!isset($seen[$name])) {
- $seen[$name] = true;
- $a = explode (" / ", $name);
- $size = str_replace(" ","",$a[1]);
- $upper_bound = $lower_bound = '';
- if($a[0][0] == '<') {
- $upper_bound = substr($a[0],1) / $size;
- } else if($a[0][0] == '>') {
- $lower_bound = substr($a[0],1) / $size;
- } else {
- $b = explode("-",$a[0]);
- $lower_bound = $b[0] / $size;
- $upper_bound = $b[1] / $size;
- }
- if($upper_bound) {
+ $disease_name = (string) $d->Name;
+
+ foreach($d->PrevalenceList->Prevalence AS $pl) {
+ $id = parent::getRes()."pl".((string) $pl->attributes()->id);
+ parent::addRDF(
+ parent::describeClass($id,"Prevalence",parent::getVoc()."Prevalence").
+ parent::describeIndividual($id, "Prevalence for $disease_name", parent::getVoc()."Prevalence")
+ );
+ $type_id = parent::getRes()."pt".(string) $pl->PrevalenceType->attributes()->id;
+ $type_label = (string) $pl->PrevalenceType->Name;
+ if($type_label != "") {
+ parent::addRDF(
+ parent::describeIndividual($type_id, $type_label, parent::getVoc()."Prevalence-Type").
+ parent::triplify($id, parent::getVoc()."prevalence-type", $type_id).
+ parent::triplify($orphanet_id, parent::getVoc()."prevalence", $id)
+ );
+ }
+
+ $qual_id = parent::getRes()."qu".(string) $pl->PrevalenceQualification->attributes()->id;
+ $qual_label = (string) $pl->PrevalenceQualification->Name;
+ if($qual_label != "") {
+ parent::addRDF(
+ parent::describeIndividual($qual_id, $qual_label, parent::getVoc()."Prevalence-Qualification").
+ parent::triplify($id, parent::getVoc()."prevalence-qualification", $qual_id)
+ );
+ }
+
+ $prev_id = parent::getRes()."pr".(string) $pl->PrevalenceClass->attributes()->id;
+ $prev_label = (string) $pl->PrevalenceClass->Name;
+ if($prev_label != "") {
+ parent::addRDF(
+ parent::describeIndividual($prev_id, $prev_label, parent::getVoc()."Prevalence-Value").
+ parent::triplify($id, parent::getVoc()."prevalence-value", $prev_id)
+ );
+ }
+
+ $geo_id = parent::getRes()."geo".(string) $pl->PrevalenceGeographic->attributes()->id;
+ $geo_label = (string) $pl->PrevalenceGeographic->Name;
+ if($geo_label != "") {
+ parent::addRDF(
+ parent::describeIndividual($geo_id, $geo_label, parent::getVoc()."Geographic-Prevalence").
+ parent::triplify($id, parent::getVoc()."prevalence-geo", $geo_id)
+ );
+ }
+
+ $val_id = parent::getRes()."val".(string) $pl->PrevalenceValidationStatus->attributes()->id;
+ $val_label = (string) $pl->PrevalenceValidationStatus->Name;
+ if($val_label != "") {
+ parent::addRDF(
+ parent::describeIndividual($val_id, $val_label, parent::getVoc()."Prevalence-Validation-Status").
+ parent::triplify($id, parent::getVoc()."prevalence-status", $val_id)
+ );
+ }
+ $valmoy = (string) $pl->ValMoy;
+ if($valmoy != "") {
+ parent::addRDF(
+ parent::triplifyString($id, parent::getVoc()."val-moy", $valmoy)
+ );
+ }
+
+
+ $source = trim((string) $pl->Source);
+ if($source and (strlen($source) != 0)) {
+ //23712425[PMID]
+ preg_match_all("/([0-9]*)\[([^\]]*)?\]/",$source, $m, PREG_SET_ORDER );
+ foreach($m AS $i) {
+ if(isset($i[2]) and ($i[2] == "PMID")) {
+ $source_id = "PMID:".$i[1];
parent::addRDF(
- parent::triplifyString($id,parent::getVoc()."upper-bound",$upper_bound, "xsd:float")
+ parent::triplify($id, parent::getVoc()."source", $source_id)
);
- }
- if($lower_bound) {
+ } else {
parent::addRDF(
- parent::triplifyString($id,parent::getVoc()."lower-bound",$lower_bound, "xsd:float")
+ parent::triplifyString($id, parent::getVoc()."source", $i[0])
);
}
- }
- parent::addRDF(
- parent::triplify($orphanet_id, parent::getVoc()."prevalence", $id).
- parent::describeClass($id,$name,parent::getVoc()."Prevalence")
- );
- //echo parent::getRDF();exit;
+ }
+
}
}
+ parent::writeRDFBufferToWriteFile();
+ }
+ }
+ unset($xml);
+ }
+
+ function onset ($file)
+ {
+ $seen = '';
+ $xml = new CXML($file);
+ while($xml->parse("DisorderList") == TRUE) {
+ $x = $xml->GetXMLRoot();
+ foreach($x->Disorder AS $d) {
+ // var_dump($d);exit;
+ $orphanet_id = parent::getNamespace().((string)$d->OrphaNumber);
+ $disease_name = (string) $d->Name;
+ foreach($d->PrevalanceList AS $pl) {
+ $id = parent::getNamespace().((string) $pl->attributes()->id);
+
+ parent::addRDF(
+ parent::triplify($orphanet_id, parent::getVoc()."prevalence", $id).
+ parent::describeClass($id,$name,parent::getVoc()."Prevalence")
+ );
+
+ }
if(isset($d->AverageAgeofOnset)) {
$id = parent::getNamespace().((string) $d->AverageAgeOfOnset->attributes()->id);
$name = (string) $d->AverageAgeOfOnset->Name;
@@ -264,42 +349,66 @@ function epi ($file)
}
unset($xml);
}
-
- function d2s($file)
+
+
+ function phenotypefreq($file)
{
/*
-
-
-
- Macrocephaly/macrocrania/megalocephaly/megacephaly
-
-
- Very frequent
-
-
- */
+
+
+ 558
+ http://www.orpha.net/consor/cgi-bin/OC_Exp.php?lng=en&Expert=558
+ Marfan syndrome
+
+ Disease
+
+
+ Disorder
+
+
+
+
+
+ HP:0000768
+ Pectus carinatum
+
+
+ Very frequent (99-80%)
+
+
+ Diagnostic criterion
+
+
+ */
$xml = new CXML($file);
- while($xml->parse("DisorderList") == TRUE) {
+ while($xml->parse("HPODisorderSetStatus") == TRUE) {
$x = $xml->GetXMLRoot();
foreach($x->Disorder AS $d) {
$orphanet_id = parent::getNamespace().((string)$d->OrphaNumber);
- foreach($d->DisorderSignList->DisorderSign AS $ds) {
- $sfid = parent::getRes().md5($ds->asXML());
- if($ds->ClinicalSign) {
- $sid = parent::getVoc().((string)$ds->ClinicalSign->attributes()->id);
- $s = (string) $ds->ClinicalSign->Name;
- $fid = parent::getRes().((string) $ds->SignFreq->attributes()->id);
- $f = (string) $ds->SignFreq->Name;
- parent::addRDF(
- parent::describeIndividual($sfid, "$f $s",parent::getVoc()."Clinical-Sign-And-Frequency").
- parent::describeClass(parent::getVoc()."Clinical-Sign-And-Frequency","Clinical Sign and Frequency").
- parent::triplify($orphanet_id, parent::getVoc()."sign-freq", $sfid).
- parent::triplify($sfid,parent::getVoc()."sign", $sid).
- parent::describeClass($sid,$s,parent::getVoc()."Clinical-Sign").
- parent::triplify($sfid,parent::getVoc()."frequency",$fid).
- parent::describeClass($fid,$f,parent::getVoc()."Frequency")
- );
+ $disease_name = ((string)$d->Name);
+ foreach($d->HPODisorderAssociationList->HPODisorderAssociation AS $ds) {
+ $sfid = parent::getRes()."sf".((string)$ds->attributes()->id);
+ $s = (string) $ds->HPO->HPOTerm;
+ $sid = $ds->HPO->HPOId;
+ $f = (string) $ds->HPOFrequency->Name;
+ $fid = parent::getRes()."f".((string) $ds->HPOFrequency->attributes()->id);
+
+ $diagnostic = false;
+ if($ds->DiagnosticCriteria->Name) {
+ $diagnostic = true;
}
+ $sflabel = "$f $s".(($diagnostic == true)?" that is diagnostic":"")." for ".$disease_name;
+
+ parent::addRDF(
+ parent::describeIndividual($sfid, $sflabel, parent::getVoc()."Clinical-Sign-And-Frequency").
+ parent::describeClass(parent::getVoc()."Clinical-Sign-And-Frequency","Clinical Sign and Frequency").
+ parent::triplify($orphanet_id, parent::getVoc()."sign-freq", $sfid).
+ parent::triplify($sfid,parent::getVoc()."sign", $sid).
+ parent::triplify($sfid,parent::getVoc()."frequency",$fid).
+ parent::triplifyString($sfid, parent::getVoc()."is-diagnostic", ($diagnostic?"true":"false")). // $diagnostic is always assigned above, so test its value; isset() was true for every sign
+ parent::triplifyString($fid, "rdfs:label", $fid).
+ parent::describeClass($fid,$f,parent::getVoc()."Frequency")
+ );
}
parent::writeRDFBufferToWriteFile();
}
@@ -367,15 +476,16 @@ function genes($file)
foreach($d->DisorderGeneAssociationList->DisorderGeneAssociation AS $dga) {
// gene
$gene = $dga->Gene;
- $gene_id = parent::getNamespace().((string) $gene->OrphaNumber);
- $gene_internal_id = ((string) $gene->attributes()->id);
+ $gid = ((string) $gene->attributes()->id);
+ $gene_id = parent::getNamespace().$gid;
$gene_label = (string) $gene->Name;
$gene_symbol = (string) $gene->Symbol;
parent::addRDF(
parent::describeIndividual($gene_id,$gene_label,parent::getVoc()."Gene").
- parent::describeClass(parent::getVoc()."Gene","orphanet gene").
+ parent::describeClass(parent::getVoc()."Gene","Orphanet Gene").
parent::triplifyString($gene_id,parent::getVoc()."symbol",$gene_symbol)
);
+
foreach($gene->SynonymList AS $s) {
$synonym = (string) $s->Synonym;
parent::addRDF(
@@ -383,23 +493,37 @@ function genes($file)
);
}
foreach($gene->ExternalReferenceList AS $erl) {
- $er = $erl->ExternalReference;
- $db = (string) $er->Source;
- $db = parent::getRegistry()->getPreferredPrefix($db);
- $id = (string) $er->Reference;
- $xref = "$db:$id";
- parent::addRDF(
- parent::triplify($gene_id, parent::getVoc()."x-$db", $xref)
- );
+ foreach($erl->ExternalReference AS $er) {
+ $db = (string) $er->Source;
+ $db = parent::getRegistry()->getPreferredPrefix($db);
+ $id = (string) $er->Reference;
+ $xref = "$db:$id";
+ parent::addRDF(
+ parent::triplify($gene_id, parent::getVoc()."x-$db", $xref)
+ );
+ }
+ }
+
+ // parse the sources of validation
+ //16150725[PMID]_16150725[PMID]_21771795[PMID]
+ $sources = explode("_",$dga->SourceOfValidation);
+ foreach($sources AS $source) {
+ preg_match_all("/([0-9]*)\[([^\]]*)?\]/",$source, $m, PREG_PATTERN_ORDER );
+ if(isset($m[1][0])) {
+ $prefix = parent::getRegistry()->getPreferredPrefix($m[2][0]);
+ parent::addRDF(
+ parent::triplify($gene_id,parent::getVoc()."source-of-validation", "$prefix:".$m[1][0])
+ );
+ }
}
$dga_id = parent::getRes().((string)$d->OrphaNumber)."_".md5($dga->asXML());
$ga = $dga->DisorderGeneAssociationType;
- $ga_id = parent::getNamespace().((string) $ga->attributes()->id);
+ $ga_id = parent::getRes()."ga".((string) $ga->attributes()->id);
$ga_label = (string) $ga->Name;
$s = $dga->DisorderGeneAssociationStatus;
- $s_id = parent::getNamespace().((string) $s->attributes()->id);
+ $s_id = parent::getRes()."st".((string) $s->attributes()->id);
$s_label = (string) $s->Name;
parent::addRDF(
diff --git a/pdb/pom.xml b/pdb/pom.xml
index d757bed..0a28a4d 100644
--- a/pdb/pom.xml
+++ b/pdb/pom.xml
@@ -119,7 +119,7 @@
virtuoso-repo
- http://maven.aksw.org/repository/internal/
+ https://maven.aksw.org/repository/internal/
true
diff --git a/pharmgkb/pharmgkb.php b/pharmgkb/pharmgkb.php
index 117323a..fec5e8a 100644
--- a/pharmgkb/pharmgkb.php
+++ b/pharmgkb/pharmgkb.php
@@ -37,9 +37,9 @@ class PharmGKBParser extends Bio2RDFizer
function __construct($argv) {
parent::__construct($argv, "pharmgkb");
- $this->AddParameter('files',true,'all|drugs|genes|diseases|pathways|relationships|annotations|rsid','all','all or comma-separated list of files to process'); /** pathways **/
+ $this->AddParameter('files',true,'all|drugs|genes|phenotypes|pathways|relationships|annotations|variants','all','all or comma-separated list of files to process'); /** pathways **/
$this->addParameter('additional',false,'none|offsides|twosides','none','process offsides and/or twosides');
- $this->AddParameter('download_url',false,null,'https://www.pharmgkb.org/download.do?dlCls=common&objId=');
+ $this->AddParameter('download_url',false,null,'https://www.pharmgkb.org/downloads');
parent::initialize();
}
@@ -60,33 +60,7 @@ function download()
$ldir = $this->GetParameterValue('indir');
$rdir = $this->GetParameterValue('download_url');
- foreach($files AS $file) {
- $lfile = $ldir.$file.".zip";
- if($file == 'annotations' or $file == 'relationships') {
- if(!file_exists($lfile)) {
- echo "Unable to file $lfile . Contact PharmGKB to get access to license-restricted data".PHP_EOL;
- continue;
- }
- }
-
- // download
- $rfile = $rdir.$file.".zip";
- echo "Downloading $file ...";
- if($file == 'offsides') {
- if(!file_exists($lfile)) {
- Utils::DownloadSingle('https://www.pharmgkb.org/redirect.jsp?p=ftp%3A%2F%2Fftpuserd%3AGKB4ftp%40ftp.pharmgkb.org%2Fdownload%2Ftatonetti%2F3003377s-offsides.zip', $lfile);
- }
- } elseif($file == 'twosides') {
- if(!file_exists($lfile)) {
- Utils::DownloadSingle('https://www.pharmgkb.org/redirect.jsp?p=ftp%3A%2F%2Fftpuserd%3AGKB4ftp%40ftp.pharmgkb.org%2Fdownload%2Ftatonetti%2F3003377s-twosides.zip', $lfile);
- }
- } elseif($file == 'pathways') {
- Utils::DownloadSingle('https://www.pharmgkb.org/download.do?dlCls=common&objId='.$file.'-tsv.zip', $lfile);
- } else {
- Utils::DownloadSingle('https://www.pharmgkb.org/download.do?dlCls=common&objId='.$file.'.zip', $lfile);
- }
- echo "done.".PHP_EOL;
- }
+ echo "Download the data from https://www.pharmgkb.org/downloads.".PHP_EOL;
}
function run()
@@ -109,33 +83,11 @@ function run()
$dataset_description = '';
foreach($files AS $file) {
+ if($file == "pathways") $file = "pathways-tsv";
$suffix = ".zip";
$lfile = $ldir.$file.$suffix;
$rfile = $rdir.$file.$suffix;
- if($file == "offsides" and !file_exists($lfile)){
- echo "downloading twosides...";
- $rfile = "http://www.pharmgkb.org/redirect.jsp?p=ftp%3A%2F%2Fftpuserd%3AGKB4ftp%40ftp.pharmgkb.org%2Fdownload%2Ftatonetti%2F3003377s-offsides.zip";
- utils::DownloadSingle($rfile,$lfile);
- echo "done".PHP_EOL;
- } elseif($file == "twosides" and !file_exists($lfile)){
- echo "downloading $file ...";
- $rfile = "http://www.pharmgkb.org/redirect.jsp?p=ftp%3A%2F%2Fftpuserd%3AGKB4ftp%40ftp.pharmgkb.org%2Fdownload%2Ftatonetti%2F3003377s-twosides.zip";
- utils::DownloadSingle($rfile,$lfile);
- echo "done".PHP_EOL;
- } elseif($file == 'annotations' or $file == 'relationships') {
- if(!file_exists($lfile)) {
- echo "Contact PharmGKB to get access to variants/clinical variants; save file as annotations.zip".PHP_EOL;
- continue;
- }
- } else {
- if(!file_exists($lfile) or parent::getParameterValue('download') == true) {
- echo "Downloading $lfile ... ";
- Utils::DownloadSingle('https://www.pharmgkb.org/download.do?objId='.$file.'.zip&dlCls=common', $lfile);
- echo "done".PHP_EOL;
- }
- }
-
// get a pointer to the file in the zip archive
if(!file_exists($lfile)) {echo "no local copy of $lfile . skipping".PHP_EOL;continue;}
@@ -147,8 +99,11 @@ function run()
$zipentries = array();
if($file == "annotations") {
// exclude: 'clinical_ann.tsv','study_parameters.tsv'
- $zipentries = array('clinical_ann_metadata.tsv','var_drug_ann.tsv','var_pheno_ann.tsv','var_fa_ann.tsv');
- } else if($file == "pathways") {
+ $zipentries = array(
+ 'clinical_ann_metadata.tsv',
+ 'var_drug_ann.tsv','var_pheno_ann.tsv','var_fa_ann.tsv'
+ );
+ } else if($file == "pathways-tsv") {
for( $i = 0; $i < $zin->numFiles; $i++ ){
$stat = $zin->statIndex( $i );
$entry = $stat['name'];
@@ -165,7 +120,7 @@ function run()
// set the write file, parse, write and close
$suffix = parent::getParameterValue('output_format');
- $outfile = $file.'.'.$suffix;
+ $outfile = "pharmgkb-".$file.'.'.$suffix;
$gz=false;
if(strstr(parent::getParameterValue('output_format'), "gz")) {
@@ -183,10 +138,11 @@ function run()
$this->GetReadFile()->SetFilePointer($fp);
if($file == "annotations") {
- $fnx = substr($zipentry,0,strpos($zipentry,".tsv"));
+ $fnx = substr($zipentry,0,strpos($zipentry,".tsv"));
echo "processing $zipentry..";
- } else if($file == 'pathways') {
+ } else if($file == 'pathways-tsv') {
$fnx = 'pathways';
+ $this->pathway_name = $zipentry;
echo "processing $fnx ($zipentry)... ";
} else {
$fnx = $file;
@@ -247,7 +203,8 @@ function run()
/*
0 PharmGKB Accession Id
- 1 Entrez Id
+ 1 NCBI Gene Id
+ 1.a HGNC Id
2 Ensembl Id
3 Name
4 Symbol
@@ -264,17 +221,18 @@ function run()
function genes()
{
$h = explode("\t",parent::getReadFile()->read());
- $expected_columns = 14;
+ $expected_columns = 17;
if(($n = count($h)) != $expected_columns) {
trigger_error("Found $n columns in gene file - expecting $expected_columns!", E_USER_WARNING);
+ //print_r($h);
return false;
}
- while($l = parent::getReadFile()->read(200000)) {
+ while($l = parent::getReadFile()->read(2000000)) {
$a = explode("\t",$l);
$id = parent::getNamespace().$a[0];
- $label = $a[3];
- $this->genes[$a[0]] = $a[3];
+ $label = $a[4];
+ $this->genes[$a[0]] = $a[4];
parent::addRDF(
parent::describeIndividual($id, $label, parent::getVoc()."Gene").
@@ -289,33 +247,47 @@ function genes()
);
if($a[1]){
- parent::addRDF(
- parent::triplify($id, parent::getVoc()."x-ncbigene", "ncbigene:".$a[1])
- );
+ $list = $this->parseList($a[1]);
+ foreach($list as $c) {
+ parent::addRDF(
+ parent::triplify($id, parent::getVoc()."x-ncbigene", "ncbigene:".$c)
+ );
+ }
}
-
if($a[2]){
- parent::addRDF(
- parent::triplify($id, parent::getVoc()."x-ensembl", "ensembl:".$a[2])
- );
- }
+ $list = $this->parseList($a[2]);
+ foreach($list as $c) {
+ parent::addRDF(
+ parent::triplify($id, parent::getVoc()."x-hgnc", "hgnc:".$c)
+ );
+ }
+ }
if($a[3]){
+ $list = $this->parseList($a[3]);
+ foreach($list as $c) {
+ parent::addRDF(
+ parent::triplify($id, parent::getVoc()."x-ensembl", "ensembl:".$c)
+ );
+ }
+ }
+
+ if($a[4]){
parent::addRDF(
- parent::triplifyString($id, parent::getVoc()."name", $a[3]).
+ parent::triplifyString($id, parent::getVoc()."name", $a[4]).
parent::describeProperty(parent::getVoc()."name", "Relationship between a PharmGKB entity and its name")
);
}
- if($a[4]){
+ if($a[5]){
parent::addRDF(
- parent::triplify($id, parent::getVoc()."symbol", "symbol:".$a[4]).
+ parent::triplify($id, parent::getVoc()."symbol", "symbol:".$a[5]).
parent::describeProperty(parent::getVoc()."symbol", "Relationship between a PharmGKB gene and a gene symbol")
);
}
- if($a[5]) {
- $b = explode('","',substr($a[5],1,-2));
- foreach($b AS $alt_name) {
+ if($a[6]) {
+ $list = $this->parseList($a[6]);
+ foreach($list AS $alt_name) {
parent::addRDF(
parent::triplifyString($id, parent::getVoc()."alternative-name", parent::safeLiteral(trim(stripslashes($alt_name))))
);
@@ -324,9 +296,9 @@ function genes()
parent::describeProperty(parent::getVoc()."alternative-name", "Relationship between a PharmGKB gene and an alternative name")
);
}
- if($a[6]) { // these are not hgnc symbols
- $b = explode('","',substr($a[6],1,-2));
- foreach($b as $alt_symbol) {
+ if($a[7]) { // these are not hgnc symbols
+ $list = $this->parseList($a[7]);
+ foreach($list as $alt_symbol) {
parent::addRDF(
parent::triplifyString($id, parent::getVoc()."alternate-symbol", trim($alt_symbol))
);
@@ -336,22 +308,22 @@ function genes()
);
}
- if($a[7]){
+ if($a[8]){
parent::addRDF(
- parent::triplifyString($id, parent::getVoc()."is-vip", $a[7]).
+ parent::triplifyString($id, parent::getVoc()."is-vip", $a[8]).
parent::describeProperty(parent::getVoc()."is-vip", "Relationship between a PharmGKB gene and its vip status")
);
}
- if($a[8]){
+ if($a[9]){
parent::addRDF(
- parent::triplifyString($id, parent::getVoc()."has-variant-annotation", $a[8]).
+ parent::triplifyString($id, parent::getVoc()."has-variant-annotation", $a[9]).
parent::describeProperty(parent::getVoc()."has-variant-annotation", "Relationship between a PharmGKB gene and whether it has a variant annotation")
);
}
- if($a[9]) {
- $b = explode(",",$a[9]);
- foreach($b AS $xref) {
+ if($a[10]) {
+ $list = $this->parseList($a[10]);
+ foreach($list AS $xref) {
$xref = trim($xref);
if(!$xref) continue;
@@ -362,7 +334,6 @@ function genes()
parent::addRDF(
parent::QQuadO_URL($id, parent::getVoc()."x-$ns", $x)
);
-
} else {
parent::addRDF(
parent::triplify($id, parent::getVoc()."x-$ns", $x)
@@ -370,25 +341,43 @@ function genes()
}
}
}
- if($a[10]) {
+
+ if($a[11]) {
parent::addRDF(
- parent::triplifyString($id,parent::getVoc()."cpic-dosing-guideline",$a[10])
+ parent::triplifyString($id,parent::getVoc()."cpic-dosing-guideline",$a[11])
);
}
- if($a[11]) {
+ if($a[12]) {
parent::addRDF(
- parent::triplifyString($id,parent::getVoc()."chromosome",$a[11]).
- parent::describeProperty(parent::getVoc()."chrosomome","Relationship between a PharmGKB gene and its chromosomal position").
- parent::triplifyString($id,parent::getVoc()."chromosome-start",$a[12]).
- parent::triplifyString($id,parent::getVoc()."chromosome-end",$a[13])
+ parent::triplifyString($id,parent::getVoc()."chromosome",$a[12]).
+ parent::describeProperty(parent::getVoc()."chrosomome","Relationship between a PharmGKB gene and its chromosomal position")
);
+ if($a[13] != '-1' and $a[14] != '-1') {
+ parent::addRDF(
+ parent::triplifyString($id,parent::getVoc()."grch37.p13-chromosome-start",$a[13]).
+ parent::triplifyString($id,parent::getVoc()."grch37.p13-chromosome-end",$a[14])
+ );
+ }
+ if($a[15] != '-1' and $a[16] != '-1') { // columns 15/16 are the pair tested for GRCh38; 13/14 are the GRCh37 pair emitted just above
+ parent::addRDF(
+ parent::triplifyString($id,parent::getVoc()."grch38.p7-chromosome-start",$a[15]).
+ parent::triplifyString($id,parent::getVoc()."grch38.p7-chromosome-end",$a[16])
+ );
+ }
}
- parent::WriteRDFBufferToWriteFile();
-
+ parent::writeRDFBufferToWriteFile();
}
}
+ function parseList($str)
+ {
+ // Split a multi-valued cell: a value of the form "a","b","c" yields its unquoted items; any other non-empty value yields a one-item list.
+ if((string) $str === '') return array(); // empty cell: no items (also avoids an out-of-range $str[0] read)
+ if($str[0] == '"') return explode('","', substr($str,1,-1)); // drop the outer quotes, then split on the "," separators
+ return array($str);
+ }
+
function MapXrefs($xref, &$url = false, &$ns = null, &$id = null)
{
$xrefs = array(
@@ -405,13 +394,16 @@ function MapXrefs($xref, &$url = false, &$ns = null, &$id = null)
'refseqprotein' => 'refseq',
'refseqdna' => 'refseq',
'comparativetoxicogenomicsdatabase' => 'ctd',
- 'humancycgene' => 'humancyc'
+ 'humancycgene' => 'humancyc',
+ 'chemicalabstractsservice' => 'cas',
+ 'chebi:chebi' => 'chebi'
);
$this->getRegistry()->ParseQName($xref,$ns,$id);
$ns = str_replace(array('"',' '),'',$ns);
if(isset($xrefs[$ns])) {
$ns = $xrefs[$ns];
}
+
$url = false;
if($ns == "url") {
$url = true;
@@ -438,16 +430,20 @@ function MapXrefs($xref, &$url = false, &$ns = null, &$id = null)
*/
function drugs()
{
- $declared = '';
- $h = explode("\t",$this->GetReadFile()->Read(1000)); // first line is header
- if(count($h) != 10) {
- trigger_error("Change in number of columns for drugs file",E_USER_ERROR);
- return FALSE;
+ $declared = array();
+ $h = explode("\t",$this->GetReadFile()->Read(10000)); // first line is header
+ $ncols = count($h);
+ $nexp = 24;
+ if($ncols != $nexp) {
+ trigger_error("Change in number of columns for drugs file. Expected $nexp but found $ncols.",E_USER_ERROR);
+ #return FALSE;
}
+ $this->GetReadFile()->Read(200000);
+
while($l = $this->GetReadFile()->Read(200000)) {
$a = explode("\t",$l);
+
$id = parent::getNamespace().$a[0];
-
$this->drugs[$a[0]] = $a[1];
parent::addRDF(
@@ -458,8 +454,8 @@ function drugs()
if(trim($a[2])) {
// generic names
// Entacapona [INN-Spanish],Entacapone [Usan:Inn],Entacaponum [INN-Latin],entacapone
- $b = explode(',',trim($a[2]));
- foreach($b AS $c) {
+ $list = $this->parseList(trim($a[2]));
+ foreach($list AS $c) {
parent::addRDF(
parent::triplifyString($id, parent::getVoc()."generic_name", str_replace('"','',$c))
);
@@ -471,8 +467,8 @@ function drugs()
if(trim($a[3])) {
// trade names
//Disorat,OptiPranolol,Trimepranol
- $b = explode(',',trim($a[3]));
- foreach($b as $c) {
+ $list = $this->parseList(trim($a[3]));
+ foreach($list as $c) {
parent::addRDF(
parent::triplifyString($id, parent::getVoc()."trade_name", str_replace(array("'", "\""), array("\\\'", "") ,$c))
);
@@ -484,8 +480,8 @@ function drugs()
if(trim($a[4])) {
// Brand Mixtures
// Benzyl benzoate 99+ %,"Dermadex Crm (Benzoic Acid + Benzyl Benzoate + Lindane + Salicylic Acid + Zinc Oxide + Zinc Undecylenate)",
- $b = explode(',',trim($a[4]));
- foreach($b as $c) {
+ $list = $this->parseList(trim($a[4]));
+ foreach($list as $c) {
parent::addRDF(
parent::triplifyString($id, parent::getVoc()."brand_mixture", str_replace(array("'", "\""),array("\\\'",""), $c))
);
@@ -504,16 +500,19 @@ function drugs()
if(trim($a[6])) {
// Cross References
// drugBank:DB00789,keggDrug:D01707,pubChemCompound:55466,pubChemSubstance:192903,url:http://en.wikipedia.org/wiki/Gadopentetate_dimeglumine
- $b = explode(',',trim(str_replace('"','',$a[6])));
- foreach($b as $c) {
+ $list = $this->parseList(trim($a[6]));
+ foreach($list as $c) {
$this->getRegistry()->parseQName($c,$ns,$id1);
- $ns = str_replace(array('"',' '),'',$ns);
- $ns = str_replace(array('keggcompound','keggdrug','drugbank','uniprotkb','clinicaltrials.gov','drugsproductdatabase(dpd)','nationaldrugcodedirectory','therapeutictargetsdatabase','fdadruglabelatdailymed'),
- array('kegg','kegg','drugbank', 'uniprot','clinicaltrials','dpd','ndc','ttd','dailymed'),
- strtolower(str_replace('"','',$ns)));
+ if($ns == "chebi") $id1 = substr($id1, 6);
+ $ns = str_replace(
+ array('chemicalabstractsservice','keggcompound','keggdrug','drugbank','uniprotkb','clinicaltrials.gov','drugsproductdatabase(dpd)','nationaldrugcodedirectory','therapeutictargetsdatabase','fdadruglabelatdailymed'),
+ array('cas','kegg','kegg','drugbank', 'uniprot','clinicaltrials','dpd','ndc','ttd','dailymed'),
+ strtolower(str_replace(' ','',$ns)));
+
+ #echo $ns." ".$id1.PHP_EOL;
if($ns == "url") {
parent::addRDF(
- parent::QQuadO_URL($id, "rdfs:seeAlso", $id)
+ parent::QQuad($id, "rdfs:seeAlso", $id)
);
} else {
parent::addRDF(
@@ -522,28 +521,60 @@ function drugs()
}
}
}
- if(trim($a[9])) {
+
+ if(trim($a[7])) {
+ parent::addRDF(
+ parent::triplifyString($id, parent::getVoc()."smiles", addslashes(substr($a[7],1,-1))).
+ parent::describeProperty(parent::getVoc()."smiles", "Relationship between a PharmGKB drug and its SMILES string")
+ );
+ }
+ if(trim($a[8])) {
+ parent::addRDF(
+ parent::triplifyString($id, parent::getVoc()."inchi", $a[8]).
+ parent::describeProperty(parent::getVoc()."inchi", "Relationship between a PharmGKB drug and its InChI string") // describe the inchi property itself, not smiles again
+ );
+ }
+
+ if($a[9]) {
+ parent::addRDF(
+ parent::triplifyString($id,parent::getVoc()."cpic-dosing-guideline",$a[9])
+ );
+ }
+ if(trim($a[10])) {
// External Vocabulary
// ATC:H01AC(Somatropin and somatropin agonists),ATC:V04CD(Tests for pituitary function)
// ATC:D07AB(Corticosteroids, moderately potent (group II)) => this is why you don't use brackets and commas as separators.
- $b = explode(',',trim($a[9]),2);
- foreach($b as $c) {
- preg_match_all("/ATC:([A-Z0-9]+)\((.*)\)$/",$c,$m);
- if(isset($m[1][0])) {
- $atc = "atc:".$m[1][0];
+ $list = $this->parseList(trim($a[10]));
+ foreach($list as $c) {
+ preg_match("/([^\(]+)?\((.*)\)/", $c, $m);
+ if(isset($m[1])) {
+ $this->getRegistry()->parseQName($m[1],$ns,$id1);
+ $myid = $ns.":".$id1;
+ $label = $m[2];
+
parent::addRDF(
- parent::triplify($id, parent::getVoc()."x-atc", $atc)
+ parent::triplify($id, parent::getVoc()."x-$ns", $myid)
);
- if(!isset($declared[$atc])) {
- $declared[$atc] = '';
+ if(!isset($declared[$myid])) {
+ $declared[$myid] = '';
parent::addRDF(
- parent::triplifyString($atc, "rdfs:label", $m[2][0])
+ parent::triplifyString($myid, "rdfs:label", $m[2])
);
}
}
}
}
- parent::WriteRDFBufferToWriteFile();
+ if(trim($a[22])) {
+ // ATC identifiers
+ $list = $this->parseList(trim($a[22]));
+ foreach($list as $c) {
+ parent::addRDF(
+ parent::triplify($id, parent::getVoc()."x-atc", "atc:".$c)
+ );
+ }
+ }
+
+ parent::writeRDFBufferToWriteFile();
}
}
@@ -554,7 +585,7 @@ function drugs()
[3] => Cross-references
[4] => External Vocabulary
*/
- function diseases()
+ function phenotypes()
{
$h = explode("\t",$this->GetReadFile()->Read(10000)); // first line is header
if(count($h) != 5) {
@@ -578,9 +609,8 @@ function diseases()
parent::describeProperty(parent::getVoc()."name", "Relationship between a PharmGKB entity and its name")
);
- if(!isset($a[2])) continue;
if($a[2] != '') {
- $names = explode('",',$a[2]);
+ $names = $this->parseList($a[2]);
foreach($names AS $name) {
if($name != ''){
parent::addRDF(
@@ -591,30 +621,26 @@ function diseases()
}
}
- // MeSH:D001145(Arrhythmias, Cardiac),SnoMedCT:195107004(Cardiac dysrhythmia NOS),UMLS:C0003811(C0003811)
+ // $a[3] appears to be null.
- $sameID = parent::getRes().md5($a[1]);
- parent::addRDF(
- parent::triplify($id, "owl:sameAs", $sameID)
- );
+ // MeSH:D001145(Arrhythmias, Cardiac),SnoMedCT:195107004(Cardiac dysrhythmia NOS),UMLS:C0003811(C0003811)
if(isset($a[4]) && trim($a[4]) != '') {
- $xrefs = explode('","', $a[4]);
+ $xrefs = $this->parseList($a[4]);
foreach($xrefs AS $xref) {
- $xref = str_replace('"','',$xref);
- $d = preg_match_all('/[,]?([^\:]+):([A-Za-z0-9]+)\(([^\)]+)\)/',$xref,$m, PREG_SET_ORDER);
- foreach($m AS $n) {
- if(isset($n[1]) && isset($n[2]) && !strstr($n[1]," ")) {
- $n[1] = str_replace("),","",strtolower($n[1]));
- $id2 = $n[1].':'.$n[2];
+ preg_match("/([^\(]+)?\((.*)\)/", str_replace('"','',$xref), $m);
+ if(isset($m[1])) {
+ $this->getRegistry()->parseQName($m[1],$ns,$id1);
+ $myid = $ns.":".$id1;
+ $label = $m[2];
+ parent::addRDF(
+ parent::triplify($id, "pharmgkb_vocabulary:x-".$ns, $myid)
+ );
+ if(!isset($declared[$myid]) and $id1 != $label) {
+ $declared[$myid] = '';
parent::addRDF(
- parent::triplify($id, "pharmgkb_vocabulary:x-".$n[1], $id2)
+ parent::triplifyString($myid, "rdfs:label", $label)
);
- if(isset($n[3]) && $n[2] != $n[3]){
- parent::addRDF(
- parent::triplifyString($id2, "rdfs:label", str_replace(array("\'", "\""),array("\\\'", ""),$n[3]))
- );
- }
- }
+ }
}
}
}
@@ -640,7 +666,6 @@ function relationships()
$declared = '';
$hash = ''; // md5 hash list
$h = explode("\t", $this->GetReadFile()->Read());
-
if(count($h) != 11) {
trigger_error("Change in number of columns for relationships file (again)", E_USER_ERROR);
return FALSE;
@@ -649,7 +674,7 @@ function relationships()
while($l = $this->getReadFile()->read(100000)) {
$a = explode("\t",$l);
-
+
$id1_list = explode(",",trim($a[0]));
$id1_names = explode(",",trim($a[1]));
$type1 = $a[2];
@@ -677,14 +702,21 @@ function relationships()
// association
$z++;
$id = parent::getRes().$z;
- $label = $id1_names[$i]." - ".$id2_names[$j]." association";
+ if($type1 < $type2) { // put the two entity types in string order; presumably so A-B and B-A rows share one association class
+ $type = $type1.'-'.$type2.'-Association';
+ $label = $id1_names[$i]." - ".$id2_names[$j]." association";
+ } else {
+ $type = $type2.'-'.$type1.'-Association';
+ $label = $id2_names[$j]." - ".$id1_names[$i]." association"; // $i indexes the first entity's names, $j the second's
+ }
+
parent::addRDF(
- parent::describeIndividual($id, $label, parent::getVoc().strtolower($type1)."-".strtolower($type2)."-Association").
+ parent::describeIndividual($id, $label, parent::getVoc().$type).
parent::triplify($id, parent::getVoc().strtolower($type1), $i1).
parent::triplify($id, parent::getVoc().strtolower($type2), $i2).
parent::triplify($i1, parent::getVoc().strtolower($type2), $i2).
parent::triplify($i2, parent::getVoc().strtolower($type1), $i1).
- parent::describeClass(parent::getVoc().strtolower($type1)."-".strtolower($type2)."-Association", "PharmGKB $type1 $type2 Association").
+ parent::describeClass(parent::getVoc().$type, "PharmGKB $type").
parent::describeProperty(parent::getVoc().strtolower($type1), "Relationship between a PharmGKB association and a $type1").
parent::describeProperty(parent::getVoc().strtolower($type2), "Relationship between a PharmGKB association and a $type2")
);
@@ -734,49 +766,65 @@ function relationships()
/*
- THIS FILE ONLY INCLUDES RSIDs IN GENES
- RSID Gene IDs Gene Symbols
- rs8331 PA27674;PA162375713 EGR2;ADO
+ THIS FILE ONLY INCLUDES variants IN GENES
+
+Variant ID Variant Name Gene IDs Gene Symbols Location Variant Annotation count Clinical Annotation count Level 1/2 Clinical Annotation count Guideline Annotation count Label Annotation count Synonyms
+PA166156302 rs1000002 PA395 ABCC5 NC_000003.11:183635768 1 0 0 0 0 rs17623022, NC_000003.12:g.183917980C>T, rs386508637, rs1000002, 1000002, [GRCh37]chr3:183635768, rs60664316, NC_000003.11:g.183635768C>T
+
*/
- function rsid()
+ function variants()
{
$z = 0;
- $this->GetReadFile()->Read();
- $this->GetReadFile()->Read();
+ $header = $this->GetReadFile()->Read();
parent::addRDF(
- parent::describeClass(parent::getVoc()."Variation", "PharmGKB Variation")
+ parent::describeClass(parent::getVoc()."Variant", "PharmGKB Variant")
);
while($l = $this->GetReadFile()->Read()) {
- if($z % 10000 == 0) {
- parent::writeRDFBufferToWriteFile();
- }
$a = explode("\t",$l);
- $rsid = "dbsnp:".$a[0];
- $genes = explode(";",$a[1]);
- parent::addRDF(
- parent::describeIndividual($rsid, $rsid, parent::getVoc()."Variation")
- );
- foreach($genes AS $gene) {
+ if(isset($a[1])) {
+ $id = parent::getNamespace().$a[0];
+ $rsid = "dbsnp:".$a[1];
+ $genes = explode(",",$a[2]);
parent::addRDF(
- parent::triplify($rsid, parent::getVoc()."gene", parent::getNamespace().$gene)
+ parent::describeIndividual($id, $id, parent::getVoc()."Variant").
+ parent::triplify($id, parent::getVoc()."x-dbsnp", $rsid)
);
+ foreach($genes AS $gene) {
+ parent::addRDF(
+ parent::triplify($id, parent::getVoc()."gene", parent::getNamespace().$gene)
+ );
+ }
}
}
+ parent::writeRDFBufferToWriteFile();
}
function clinical_ann_metadata()
{
- $header = array("Clinical Annotation Id","Location","Gene","Level of Evidence","Clinical Annotation Types","Genotype-Phenotype IDs","Annotation Text","Variant Annotations IDs","Variant Annotations","PMIDs","Evidence Count","Related Drugs","Related Diseases","Race");
- $this_header = explode("\t",$this->GetReadFile()->Read());
+ $header = array("Clinical Annotation Id","Location","Gene","Level of Evidence","Clinical Annotation Types","Genotype-Phenotype IDs","Annotation Text","Variant Annotations IDs","Variant Annotations","PMIDs","Evidence Count","Related Drugs","Related Diseases","Biogeographical groups", "Chromosome");
+ $this_header = explode("\t",$this->getReadFile()->read());
if(count($this_header) != count($header)) {
trigger_error("Change in the number of columns. Expected ".count($header).", but found ".count($this_header),E_USER_ERROR);
return (-1);
}
- while($l = $this->GetReadFile()->Read(200000)) {
+ while($l = $this->GetReadFile()->Read(20000000)) {
$a = explode("\t",$l);
+
$id = parent::getNamespace().$a[0];
- $label = "clinical annotation for ".$a[1];
+ # fixing bad file formatting
+ if($a[0] == "982040598" or $a[0] == "982037603") {
+ $a[8] .= $a[11];
+ $a[9] = $a[12];
+ $a[10] = $a[13];
+ $a[11]= $a[14];
+ $a[12] = $a[15];
+ $a[13] = $a[16];
+ $a[14] = $a[17];
+ }
+
+
+ $label = "clinical genotype to phenotype annotations for ".$a[1];
// [0] => Clinical Annotation Id
parent::addRDF(
parent::describeIndividual($id, $label, parent::getVoc()."Clinical-Annotation").
@@ -818,30 +866,31 @@ function clinical_ann_metadata()
);
}
- // [6] => Clinical Annotation Types
+ // [4] => Clinical Annotation Types
if($a[4]) {
- $types = explode(";",$a[4]);
+ $types = $this->parseList($a[4]);
foreach($types AS $t) {
+ $t = strtolower($t);
parent::addRDF(
- parent::triplifyString($id, parent::getVoc()."annotation-type", strtolower($t))
+ parent::triplifyString($id, parent::getVoc()."annotation-type", $t)
);
}
}
// [5] => Genotype-Phenotypes IDs
// [6] => Text
if($a[5]) {
- $gps = explode(";",$a[5]);
- $gps_texts = explode(";",$a[6]);
+ $gps = explode('","',$a[5]);
+ $gps_texts = explode('","',$a[6]);
foreach($gps AS $i => $gp) {
- $gp = trim($gp);
- $gp_text = trim($gps_texts[$i]);
+ $gp = str_replace('"','',trim($gp));
+ $gp_text = str_replace('"','',trim($gps_texts[$i]));
$b = explode(":",$gp_text,2);
parent::addRDF(
- parent::describeIndividual(parent::getNamespace().$gp, $gp_text, parent::getVoc()."Genotype-Phenotype").
+ parent::describeIndividual(parent::getNamespace().$gp, $gp_text, parent::getVoc()."Genotype-Phenotype-Association").
parent::triplify($id, parent::getVoc()."genotype_phenotype", parent::getNamespace().$gp).
parent::triplifyString(parent::getNamespace().$gp, parent::getVoc()."genotype", trim($b[0])).
- parent::describeClass(parent::getVoc()."Genotype-Phenotype", "PharmGKB Genotype Phenotype").
+ parent::describeClass(parent::getVoc()."Genotype-Phenotype-Association", "PharmGKB Genotype Phenotype Association").
parent::describeProperty(parent::getVoc()."genotype_phenotype", "Relationship between a PharmGKB entity and a Genotype Phenotype").
parent::describeProperty(parent::getVoc()."genotype", "Relationship between a PharmGKB Genotype Phenotype and a genotype")
);
@@ -851,13 +900,13 @@ function clinical_ann_metadata()
// [7] => Variant Annotations IDs
// [8] => Variant Annotations
if($a[7]) {
- $b = explode(";",$a[7]);
- $b_texts = explode(";",$a[8]);
+ $b = explode('","',$a[7]);
+ $b_texts = explode('","',$a[8]);
foreach($b AS $i => $variant) {
- $variant = trim($variant);
- $variant_text = trim ($b_texts[$i]);
+ $variant = str_replace('"','',trim($variant));
+ $variant_text = str_replace('"','',trim ($b_texts[$i]));
parent::addRDF(
- parent::describeIndividual(parent::getNamespace().$variant, $variant_text, parent::getVoc()."Variant").
+ parent::describeIndividual(parent::getNamespace().$variant, $variant_text, parent::getVoc()."Variant-Annotation").
parent::triplify($id, parent::getVoc()."variant", parent::getNamespace().$variant)
);
}
@@ -865,9 +914,8 @@ function clinical_ann_metadata()
// [9] => PMIDs
if($a[9]) {
- $b = explode(";",$a[9]);
+ $b = $this->parseList($a[9]);
foreach($b AS $i => $pmid) {
- $pmid = trim($pmid);
parent::addRDF(
parent::triplify($id, parent::getVoc()."article", "pubmed:".$pmid)
);
@@ -884,21 +932,18 @@ function clinical_ann_metadata()
// [11] => Related Drugs
if($a[11]) {
- $b = explode(";",$a[11]);
+ //print_r($a);exit;
+ $b = $this->parseList($a[11]);
foreach($b AS $drug_label) {
- // find the id from the label
- $find = @array_search($drug_label, $this->drug_names_array);
- if($find !== FALSE and $find !== NULL){
+ preg_match('/\(PA(.*)\)/',$drug_label,$m);
+
+ if(isset($m[1])) {
parent::addRDF(
- parent::triplify($id, parent::getVoc()."related-drug", $find)
+ parent::triplify($id, parent::getVoc()."related-drug", "pharmgkb:PA".$m[1])
);
} else {
- $drug_id = parent::getRes().md5($drug_label);
- parent::addRDF(
- parent::describeIndividual($drug_id, $drug_label, parent::getVoc()."Drug").
- parent::triplify($id, parent::getVoc()."related-drug", $drug_id)
- );
- }
+ echo "Error in parsing drug label for $id - ".$drug_label." ".PHP_EOL;
+ }
}
parent::addRDF(
parent::describeProperty(parent::getVoc()."related-drug", "Relationship between a PharmGKB annotation and a related drug")
@@ -906,31 +951,28 @@ function clinical_ann_metadata()
}
// [12] => Related Diseases
if($a[12]) {
- $b = explode(";",$a[12]);
+ $b = $this->parseList($a[12]);
foreach($b AS $disease_label) {
- // find the id from the label
- $find = @array_search($disease_label, $this->disease_names_array);
- if($find !== FALSE and $find !== NULL){
- parent::addRDF(
- parent::triplify($id, parent::getVoc()."related-disease", $find)
- );
- }else {
- $disease_id = parent::getRes().md5($disease_label);
+ preg_match('/\(PA(.*)\)/',$disease_label,$m);
+ if(isset($m[1])) {
parent::addRDF(
- parent::describeIndividual($disease_id, $disease_label, parent::getVoc()."Disease").
- parent::triplify($id, parent::getVoc()."related-disease", $disease_id)
+ parent::triplify($id, parent::getVoc()."related-disease", "pharmgkb:PA".$m[1])
);
+ } else {
+ print_r($a);
+ echo $l.PHP_EOL;
+ echo "Error in parsing disease label for $id - ".$disease_label." ".PHP_EOL;
}
}
parent::addRDF(
parent::describeProperty(parent::getVoc()."related-disease", "Relationship between a PharmGKB annotation and a related disease")
);
}
- // [13] => OMB Races
+ // [13] => Biogeographical groups
if($a[13]) {
parent::addRDF(
- parent::triplifyString($id, parent::getVoc()."race", $a[13]).
- parent::describeProperty(parent::getVoc()."race", "Relationship between a PharmGKB annotation and a race")
+ parent::triplifyString($id, parent::getVoc()."biogeographical-group", $a[13]).
+ parent::describeProperty(parent::getVoc()."biogeographical-group", "Relationship between a PharmGKB annotation and a biogeographical group")
);
}
}
@@ -944,20 +986,29 @@ function var_pheno_ann() {return $this->variant_annotation();}
function variant_annotation()
{
- $canonical_header = array("Annotation ID","Variant","Gene","Drug","Literature Id","Phenotype Category","Significance","Notes","Sentence","StudyParameters"," Alleles");
+ $canonical_header = array("Annotation ID","Variant","Gene","Chemical","PMID","Phenotype Category","Significance","Notes","Sentence","StudyParameters","Alleles","Chromosome");
$header = explode("\t",$this->getReadFile()->read(20000));
if(count($header) != count($canonical_header)) {
trigger_error("column mismatch! Expected ".count($canonical_header).",but found ".count($header),E_USER_ERROR);
return (-1);
}
+ foreach($canonical_header AS $i => $ch) {
+ if($header[$i] != $ch) {
+ trigger_error("Change in the column header. Expecting $ch and found $header[$i] instead.",E_USER_ERROR);
+ return (-1);
+ }
+ }
$declaration = '';
- while($l = $this->GetReadFile()->Read(20000)) {
+ while($l = $this->getReadFile()->read(20000)) {
$a = explode("\t",$l);
+
//[0] => Annotation ID
$id = parent::getNamespace().$a[0];
+ $label = "Variant annotation $a[0]";
+ if($a[8]) $label = $a[8];
parent::addRDF(
- parent::describeIndividual($id, "Variant Annotation $a[0]", parent::getVoc()."Variant-Annotation").
+ parent::describeIndividual($id, $label, parent::getVoc()."Variant-Annotation").
parent::describeClass(parent::getVoc()."Variant-Annotation", "PharmGKB Variant Annotation")
);
@@ -979,7 +1030,7 @@ function variant_annotation()
//[2] => Gene
//CYP3A (PA27114),CYP3A4 (PA130)
if($a[2]) {
- $genes = explode(",",$a[2]);
+ $genes = $this->parseList($a[2]);
foreach($genes AS $gene) {
preg_match("/\((PA[A-Za-z0-9]+)\)/",$gene,$m);
if(isset($m[1])) {
@@ -993,7 +1044,7 @@ function variant_annotation()
//[3] => Drug
if($a[3]) {
- $drugs = explode(",",$a[3]);
+ $drugs = $this->parseList($a[3]);
foreach($drugs AS $drug) {
preg_match("/\((PA[A-Za-z0-9]+)\)/",$drug,$m);
if(isset($m[1])) {
@@ -1007,6 +1058,11 @@ function variant_annotation()
// [4] => Literature Id
if($a[4]) {
+ if($a[4][0] == 'h') {
+ // occurs in var_pheno_ann for 2 entries. 10-04-2016
+ $a[4] = str_replace('http://sfx.stanford.edu/local?sid=Entrez:PubMed&id=pmid:','',$a[4]);
+ }
+
$b = explode(";",$a[4]);
foreach($b AS $i => $pmid) {
$pmid = trim($pmid);
@@ -1019,7 +1075,7 @@ function variant_annotation()
//[5] => Phenotype
if($a[5]) {
- $types = explode(";",$a[5]);
+ $types = $this->parseList($a[5]);
foreach($types AS $t) {
parent::addRDF(
parent::triplifyString($id, parent::getVoc()."annotation-type", strtolower($t))
@@ -1052,11 +1108,12 @@ function variant_annotation()
//[9] => StudyParameters
if($a[9]) {
- $sps = explode(";",$a[9]);
+ $sps = $this->parseList($a[9]);
foreach($sps AS $sp) {
$t = parent::getNamespace().trim($sp);
parent::addRDF(
parent::describeIndividual($t, $sp, parent::getVoc()."Study-Parameter").
+ parent::describeClass(parent::getVoc()."Study-Parameter", "PharmGKB Study Parameter").
parent::triplify($id, parent::getVoc()."study-parameter", $t)
);
}
@@ -1074,40 +1131,123 @@ function variant_annotation()
function pathways()
{
- // needs to be finished
- return;
-
+ preg_match('/(PA[0-9]+)-([^\.]+)\.tsv/',$this->pathway_name,$m);
+ if(!isset($m[1]) and !isset($m[2])) {
+ trigger_error("unable to find pathway identifier in ".$this->pathway_name);
+ return false;
+ }
+ $pathway_id = parent::getNamespace().$m[1];
+ $pathway_name = $m[2];
+
+ parent::addRDF(
+ parent::describeIndividual($pathway_id,$pathway_name,parent::getVoc()."Pathway").
+ parent::describeClass(parent::getVoc()."Pathway","PharmGKB Pathway")
+ );
+
+ $fields = array('From','To','Reaction Type','Controller','Control Type','Cell Type','PMIDs','Genes','Drugs','Diseases');
+ $h = explode("\t", $this->getReadFile()->read(50000));
+ // @todo check that the fields match
+
while($l = $this->getReadFile()->read(50000)) {
- $a = explode("\t",trim($l));
+ $a = explode("\t",$l);
- // From To Reaction Type Controller Control Type Cell Type PubMed Id Genes Drugs Diseases
- // hmg coa reductase inhibitors Active & Inactive metabolites Biochemical Reaction CYP2C19,CYP2C8,CYP2C9,CYP2D6,CYP3A4,CYP3A5,UGT1A1,UGT1A3,UGT2B7 Catalysis hepatocyte CYP3A4,CYP3A5,UGT1A3,CYP2C19,CYP2C9,CYP2C8,CYP2D6,UGT1A1,UGT2B7 hmg coa reductase inhibitors
-
- $c1 = array_search($a[0],$this->drugs);
- if($c1 === FALSE) {
- $c1 = array_search($a[0],$this->genes);
- if($c1 === FALSE) {
- $c1 = parent::getRes().url_encode($c1);
- } else {
- $c1 = parent::getNamespace().$c1;
+ $id = md5($l);
+ $uri = parent::getRes().$id;
+ $label = $a[2]." in ".$pathway_name;
+ $type = parent::getVoc().urlencode(str_replace(' ','-',$a[2]));
+ $from = parent::getRes().md5($a[0]);
+ $to = parent::getRes().md5($a[1]);
+
+ parent::addRDF(
+ parent::describeIndividual($uri, $label, $type).
+ parent::describeClass($type, $a[2]).
+ parent::describeIndividual($from, str_replace('"', '', $a[0]), parent::getVoc()."Resource").
+ parent::describeIndividual($to, $a[1], parent::getVoc()."Resource").
+ parent::triplify($uri, parent::getVoc()."from", $from).
+ parent::triplify($uri, parent::getVoc()."to", $to).
+ parent::triplify($uri, parent::getVoc()."pathway", $pathway_id).
+ parent::triplify($pathway_id, parent::getVoc()."pathway-component", $uri)
+ );
+
+ if($a[4]) {
+ // control type
+ $types = explode(',',$a[4]);
+ foreach($types as $type) {
+ $ctid= parent::getRes().md5($type);
+ parent::addRDF(
+ parent::describeIndividual($ctid, $type, parent::getVoc()."Control-Type").
+ parent::describeClass(parent::getVoc()."Control-Type", "PharmGKB Control Type").
+ parent::triplify($uri, parent::getVoc()."control-type",$ctid)
+ );
+ }}
+ if($a[5]) {
+ // cell type
+ $list = $this->parseList($a[5]);
+ foreach($list AS $item) {
+ $ctid= parent::getRes().md5($item);
+ parent::addRDF(
+ parent::describeIndividual($ctid, $item, parent::getVoc()."Cell-Type").
+ parent::describeClass(parent::getVoc()."Cell-Type", "PharmGKB Cell Type").
+ parent::triplify($uri, parent::getVoc()."cell-type",$ctid)
+ );
}
}
-
- $c2 = array_search($a[1],$this->drugs);
- if($c2 === FALSE) {
- $c2 = array_search($a[1],$this->genes);
- if($c2 === FALSE) {
- // not found
- $c2 = parent::getRes().url_encode($c2);
- } else {
- // actual id
- $c2 = parent::getNamespace().$c2;
+ if($a[6]) {
+ $pmids = explode(",",$a[6]);
+ foreach($pmids AS $pmid) {
+ parent::addRDF(
+ parent::triplify($uri, parent::getVoc()."x-pubmed", "pubmed:".trim($pmid))
+ );
}
}
- $id = md5($l);
- $uri = parent::getRes().$id;
-
+ if($a[7]) {
+ $genes = $this->parseList($a[7]);
+ foreach($genes AS $gene) {
+ $c1 = array_search($gene,$this->genes);
+ if(!$c1) {
+ $c1 = parent::getRes().urlencode($gene);
+ } else {
+ $c1 = parent::getNamespace().$c1;
+ }
+
+ if($c1 !== FALSE) {
+ parent::addRDF(
+ parent::triplify($uri, parent::getVoc()."gene", $c1)
+ );
+ }
+ }}
+
+ if($a[8]) {
+ $drugs = $this->parseList($a[8]);
+ foreach($drugs AS $drug) {
+ $c2 = array_search($drug,$this->drugs);
+ if(!$c2) {
+ $c2 = parent::getRes().urlencode($drug);
+ } else {
+ $c2 = parent::getNamespace().$c2;
+ }
+ if($c2 !== FALSE) {
+ parent::addRDF(
+ parent::triplify($uri, parent::getVoc()."drug", $c2)
+ );
+ }
+ }}
+ if($a[9]) {
+ $diseases = $this->parseList($a[9]);
+ foreach($diseases AS $disease) {
+ $c2 = array_search($disease,$this->diseases);
+ if(!$c2) {
+ $c2 = parent::getRes().urlencode($disease);
+ } else {
+ $c2 = parent::getNamespace().$c2;
+ }
+ if($c2 !== FALSE) {
+ parent::addRDF(
+ parent::triplify($uri, parent::getVoc()."disease", $c2)
+ );
+ }
+ }}
parent::writeRDFBufferToWriteFile();
}
diff --git a/pubmed/pubmed.php b/pubmed/pubmed.php
index 92dd9eb..106828f 100644
--- a/pubmed/pubmed.php
+++ b/pubmed/pubmed.php
@@ -65,6 +65,8 @@ function process_dir(){
$files = glob($ldir."*.xml.gz");
foreach($files AS $i => $file) {
+ // if($file != '/data/download/pubmed/medline16n0345.xml.gz') continue;
+
echo "Processing $file (".($i+1)."/".count($files).") ...";
$this->process_file($file);
parent::clear();
@@ -185,19 +187,17 @@ function pubmed()
$i = 0;
foreach($citation->MeshHeadingList->MeshHeading AS $mh){
$id = parent::getRes().$pmid."_mh_".++$i;
- $did = parent::getRes().md5($mh->DescriptorName);
+ $did = "mesh:".$mh->DescriptorName['UI'];
parent::addRDF(
parent::describeIndividual($id, $mh->DescriptorName, parent::getVoc()."MeshHeading").
- parent::describeClass(parent::getVoc()."MeshHeading","MeSH Heading").
- parent::triplify($pmid_uri, parent::getVoc()."mesh-heading", $id).
-
+ parent::triplify($id, parent::getVoc()."x-mesh", $did).
parent::triplifyString($id, parent::getVoc()."descriptor-major-topic", "".$mh->DescriptorName['MajorTopicYN']).
- parent::describeIndividual($did, "".$mh->DescriptorName, parent::getVoc()."Mesh-Descriptor").
- parent::triplify($id, parent::getVoc()."mesh-descriptor", $did)
+ parent::describeClass(parent::getVoc()."MeshHeading","MeSH Heading").
+ parent::triplify($pmid_uri, parent::getVoc()."mesh-heading", $id)
);
if(!empty($mh->QualifierName)){
foreach($mh->QualifierName AS $qualifier_name) {
- $qid = parent::getRes().md5($qualifier_name);
+ $qid = "mesh:".$qualifier_name['UI']; // UI of the qualifier being iterated; $mh->QualifierName['UI'] always reads the first qualifier's attribute
parent::addRDF(
parent::describeIndividual($qid, $qualifier_name, parent::getVoc()."Mesh-Qualifier").
parent::triplify($id, parent::getVoc()."mesh-qualifier", $qid)
@@ -211,8 +211,10 @@ function pubmed()
$i = 0;
foreach($citation->ChemicalList->Chemical as $chemical){
$id = parent::getRes().$pmid."_ch_".++$i;
+ $mesh_id = "mesh:".$chemical->NameOfSubstance['UI'];
parent::addRDF(
parent::describeIndividual($id, $chemical->NameOfSubstance, parent::getVoc()."Chemical").
+ parent::triplify($id,parent::getVoc()."x-mesh",$mesh_id).
parent::describeClass(parent::getVoc()."Chemical","Chemical").
parent::triplify($pmid_uri, parent::getVoc()."chemical", $id)
);
@@ -255,7 +257,8 @@ function pubmed()
$label = str_replace(" ","-",$publicationType);
parent::addRDF(
parent::triplify($pmid_uri, parent::getVoc()."publication-type", $id).
- parent::describeClass($id, $publicationType)
+ parent::describeClass($id, $publicationType).
+ parent::triplify($id,parent::getVoc()."x-mesh","mesh:".$publicationType['UI'])
);
}
diff --git a/sgd/sgd.php b/sgd/sgd.php
index 013a325..9dd23da 100644
--- a/sgd/sgd.php
+++ b/sgd/sgd.php
@@ -40,7 +40,7 @@ function __construct($argv) {
parent::addParameter('download_url',false,null,'http://downloads.yeastgenome.org/');
parent::addParameter('ncbo_download_dir', false, null, '/data/download/bioportal/', 'directory of bioportal ontologies');
parent::addParameter('ncbo_api_key',true,null,null,'your NCBO API key');
- parent::addParameter('one_file',false,'true|false','true',"whether to produce a single file output");
+ parent::addParameter('one_file',false,'true|false','false',"whether to produce a single file output");
parent::initialize();
}
@@ -74,7 +74,7 @@ function download(){
"features" => "curation/chromosomal_feature/SGD_features.tab",
"domains" => "curation/calculated_protein_info/domains/domains.tab",
"protein" => "curation/calculated_protein_info/protein_properties.tab",
- "goa" => "curation/literature/gene_association.sgd.gz",
+ "goa" => "curation/literature/gene_association.sgd.gaf.gz",
"goslim" => "curation/literature/go_slim_mapping.tab",
"complex" => "curation/literature/go_protein_complex_slim.tab",
"interaction" => "curation/literature/interaction_data.tab",
@@ -121,7 +121,7 @@ function process(){
"features" => "curation/chromosomal_feature/SGD_features.tab",
"domains" => "curation/calculated_protein_info/domains/domains.tab",
"protein" => "curation/calculated_protein_info/protein_properties.tab",
- "goa" => "curation/literature/gene_association.sgd.gz",
+ "goa" => "curation/literature/gene_association.sgd.gaf.gz",
"goslim" => "curation/literature/go_slim_mapping.tab",
"complex" => "curation/literature/go_protein_complex_slim.tab",
"interaction" => "curation/literature/interaction_data.tab",
@@ -135,7 +135,7 @@ function process(){
$gz = false;if(strstr(parent::getParameterValue('output_format'), "gz")) $gz = true;
if(parent::getParameterValue('one_file') == true) {
- $ofile = "sgd.".parent::getParameterValue('output_format');
+ $ofile = "bio2rdf-sgd.".parent::getParameterValue('output_format'); // "." (not "-") before the format so the single-file name keeps a proper extension
parent::setWriteFile($odir.$ofile, $gz);
}
$dataset_description = '';
@@ -156,7 +156,7 @@ function process(){
}
if(parent::getParameterValue('one_file') == false) {
- $ofile = "sgd_".$file.'.'.parent::getParameterValue('output_format');
+ $ofile = "bio2rdf-sgd-".$file.'.'.parent::getParameterValue('output_format');
parent::setWriteFile($odir.$ofile, $gz);
}
@@ -619,6 +619,7 @@ function domains(){
"BlastProDom" => "prodom",
"FPrintScan" => "fprintscan",
"Gene3D" => "gene3d",
+ "CDD" => "cdd",
"Coil" => "coil",
"Coils" => "coil",
"Pfam" => "pfam",
@@ -627,6 +628,7 @@ function domains(){
"PIRSF" => "pirsf",
"PRINTS" => "prints",
"Seg" => "seg",
+ "SFLD" => "sfld",
"SMART" => "smart",
"SUPERFAMILY" => "superfamily",
"TIGRPFAM" => "pfam",
@@ -636,6 +638,7 @@ function domains(){
"HMMPfam" => "pfam",
"HMMPIR" => "pir",
"HMMTigr" => "tigr",
+ "signalp" => "signalp",
"SignalP_GRAM_POSITIVE" => "signalp",
"SignalP_GRAM_NEGATIVE" => "signalp",
"SignalP_EUK" => "signalp",
@@ -1049,8 +1052,9 @@ function phenotype(){
*/
if(trim($a[7]) != ''){
+ $allele = addslashes($a[7]);
$this->AddRDF(
- parent::triplifyString($this->getRes().$eid, $this->getVoc()."allele", $a[7]).
+ parent::triplifyString($this->getRes().$eid, $this->getVoc()."allele", $allele).
parent::describeProperty($this->getVoc()."allele", "Relationship between an SGD experiment and an allele")
);
}
@@ -1089,7 +1093,7 @@ function phenotype(){
function pathways(){
$sp = false;
- $e = '';
+ $e = array();
while($l = $this->GetReadFile()->Read(96000)) {
$a = explode("\t",$l);
@@ -1344,7 +1348,10 @@ function GetMethodID($label, &$id, &$type) {
}//GetMethodID
function GetLatestNCBOOntology($ontology_id,$apikey,$target_filepath){
- Utils::DownloadSingle('http://data.bioontology.org/ontologies/'.$ontology_id.'/download?apikey='.$apikey, $target_filepath);
+ $url = 'http://data.bioontology.org/ontologies/'.$ontology_id.'/download?apikey='.$apikey;
+ $path = pathinfo($target_filepath); // split the target path so its directory can be created before downloading
+ @mkdir($path['dirname'], 0777, true); // octal int mode (the string '0777' is coerced to decimal 777); recursive; @ ignores "already exists"
+ Utils::DownloadSingle($url, $target_filepath);
}
}//SGDParser
diff --git a/taxonomy/taxonomy.php b/taxonomy/taxonomy.php
index 5807007..c2d7b76 100644
--- a/taxonomy/taxonomy.php
+++ b/taxonomy/taxonomy.php
@@ -164,7 +164,7 @@ public function Run(){
trigger_error("Unable to get pointer to $fn in $zinfile");
exit("failed\n");
}
- $gzoutfile = $odir."taxonomy-$k".".".parent::getParameterValue('output_format');
+ $gzoutfile = $odir."bio2rdf-taxonomy-$k".".".parent::getParameterValue('output_format');
//set the write file
$gz= strstr(parent::getParameterValue('output_format'), 'gz')?true:false;
@@ -218,14 +218,14 @@ private function names(){
$rel = parent::getVoc().str_replace(" ","-",$a[3]);
parent::addRDF(
- parent::triplifyString($taxid, $rel, $name).
- parent::triplifyString($taxid, parent::getVoc()."unique-name", utf8_encode($a[2]))
+ parent::triplifyString($taxid, $rel, addslashes($name)).
+ parent::triplifyString($taxid, parent::getVoc()."unique-name", addslashes(utf8_encode($a[2])))
);
if($rel == "scientific-name") {
parent::addRDF(
- parent::triplifyString($taxid, "dc:title", $name).
- parent::triplifyString($taxid, "rdfs:label", $name)
+ parent::triplifyString($taxid, "dc:title", addslashes($name)).
+ parent::triplifyString($taxid, "rdfs:label", addslashes($name))
);
}
@@ -327,20 +327,26 @@ private function citations()
continue;
}
$c = parent::getRes()."citation-id-".$a[0];
- $seealso = isset($a[4])?trim($a[4]):"";
+/* $seealso = isset($a[4])?trim($a[4]):"";
if($seealso) {
- $seealso = str_replace(array("lx: DOI ","http;//"), array("http://dx.doi.org/","http://"), $seealso);
+ echo $seealso.PHP_EOL;
+ $seealso = str_replace(array("lx: DOI ","http;//"), array("https://doi.org/","http://"), $seealso);
if(strlen($seealso) > 2 and !strstr($seealso,"http")) $seealso = "http://".$seealso;
- $seealso = parent::triplify($c, "rdfs:seeAlso", $seealso);
+ $seealso = parent::triplifyString($c, "rdfs:seeAlso", addslashes($seealso)); # all kinds of garbarge in this field
+ }
+*/
+ $text = '';
+ if(isset($a[5])) {
+ $text = str_replace(array('"',"'","",'\\',),'',$a[5]); # get rid of garbage characters
}
-
+
parent::addRDF(
parent::describeIndividual($c, $a[1], $this->getVoc()."Citation").
parent::describeClass($this->getVoc()."Citation", "Citation").
parent::triplifyString($c, parent::getVoc()."citation-key", $a[1]).
($a[2]=="0"?"":parent::triplify($c, parent::getVoc()."x-pubmed", "pubmed:".$a[2])).
- $seealso.
- ((isset($a[5]) and $a[5])?parent::triplifyString($c, parent::getVoc()."text", str_replace("\"","", $a[5])):"")
+# $seealso.
+ ($text?parent::triplifyString($c, parent::getVoc()."text", $text):"") // parenthesised: "." binds tighter than "?:", so a bare ternary would use the whole concatenation as its condition and drop the triples above
);
if(isset($a[6])) {
$taxids = explode(" ", trim($a[6]));
diff --git a/wormbase/wormbase.php b/wormbase/wormbase.php
index 023fff2..89912ed 100644
--- a/wormbase/wormbase.php
+++ b/wormbase/wormbase.php
@@ -34,7 +34,7 @@ class WormbaseParser extends Bio2RDFizer {
function __construct($argv) {
parent::__construct($argv, "wormbase");
- parent::addParameter('files', true, 'all|geneIDs|functional_descriptions|gene_associations|gene_interactions|phenotype_associations','all','files to process');
+ parent::addParameter('files', true, 'all|geneIDs|gene_associations|gene_interactions|phenotype_associations','all','files to process'); #functional_descriptions turned into flatfile, needs work
parent::addParameter('release', false, null, 'current', 'Release version of WormBase');
parent::addParameter('download_url', false, null,'ftp://ftp.wormbase.org/pub/wormbase/');
parent::initialize();
@@ -49,10 +49,10 @@ public function run()
$files = explode(",",parent::getParameterValue('files'));
}
$release = parent::getParameterValue('release');
- $releaseb = "WS249";
+ $releaseb = "WS276";
$remote_files = array(
"geneIDs" => "species/c_elegans/annotation/geneIDs/c_elegans.PRJNA13758.".$release.".geneIDs.txt.gz",
- "functional_descriptions" => "species/c_elegans/annotation/functional_descriptions/c_elegans.PRJNA13758.".$release.".functional_descriptions.txt.gz",
+ #"functional_descriptions" => "species/c_elegans/annotation/functional_descriptions/c_elegans.PRJNA13758.".$release.".functional_descriptions.txt.gz",
"gene_interactions" => "species/c_elegans/annotation/gene_interactions/c_elegans.PRJNA13758.".$release.".gene_interactions.txt.gz",
"gene_associations" => "releases/current-production-release/ONTOLOGY/gene_association.".$releaseb.".wb",
"phenotype_associations" => "releases/current-production-release/ONTOLOGY/phenotype_association.".$releaseb.".wb"
@@ -92,7 +92,7 @@ public function run()
}
$suffix = parent::getParameterValue('output_format');
- $ofile = "wormbase.".$file.".".$suffix;
+ $ofile = "bio2rdf-wormbase-".$file.".".$suffix;
$gz = strstr(parent::getParameterValue('output_format'), "gz")?true:false;
parent::setWriteFile($odir.$ofile, $gz);
@@ -153,11 +153,12 @@ function geneIDs()
$first = true;
while($l = $this->getReadFile()->read()){
if($l[0] == '#') continue;
- // taxon, gene id, symbol, cosmid, status
+ // taxon, gene id, symbol, cosmid, status, type
$data = explode(",",trim($l));
if($first) {
- if(($c = count($data) != 5)) {
- trigger_error("WormBase function expects 5 fields, found $c!".PHP_EOL, E_USER_WARNING);
+ $exp = 6;
+ if((($c = count($data)) != $exp)) {
+ trigger_error("WormBase function expects $exp fields, found $c!".PHP_EOL, E_USER_WARNING);
}
$first = false;
}
@@ -194,7 +195,10 @@ function functional_descriptions()
// gene_id public_name molecular_name concise_description provisional_description detailed_description automated_description gene_class_description
$a = explode("\t",$l);
- if(count($a) != 8) {trigger_error("Found one row that only has ".count($a)." columns, expecting 8",E_USER_ERROR);continue;}
+ if(count($a) != 8) {
+ trigger_error("Found one row that only has ".count($a)." columns, expecting 8",E_USER_ERROR);
+ continue;
+ }
$id = parent::getNamespace().$a[0];
$label = $a[1].($a[2]?" (".$a[2].")":"");
@@ -230,7 +234,9 @@ function gene_associations(){
'NAS'=>'eco:0000034',
'ND'=>'eco:0000035',
'RCA'=>'eco:0000245',
- 'TAS'=>'eco:0000033'
+ 'TAS'=>'eco:0000033',
+ 'HEP'=>'eco:0007007',
+ 'HDA'=>'eco:0007005'
);