|
1 | 1 | <?php
|
2 | 2 | /**
|
3 |
| -Copyright (C) 2012 Michel Dumontier |
| 3 | +Copyright (C) 2012 Michel Dumontier, Alison Callahan |
4 | 4 |
|
5 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of
|
6 | 6 | this software and associated documentation files (the "Software"), to deal in
|
|
21 | 21 | SOFTWARE.
|
22 | 22 | */
|
23 | 23 |
|
24 |
| -require('../../php-lib/biopax2bio2rdf.php'); |
25 |
| - |
| 24 | +require_once(__DIR__.'/../../php-lib/biopax2bio2rdf.php'); |
26 | 25 | /**
|
27 | 26 | * Pathwaycommons RDFizer
|
28 |
| - * @version 1.0 |
| 27 | + * @version 2.0 |
29 | 28 | * @author Michel Dumontier
|
| 29 | + * @author Alison Callahan |
30 | 30 | * @description http://www.pathwaycommons.org
|
31 | 31 | */
|
32 |
| -class PathwaycommonsParser extends RDFFactory |
| 32 | +class PathwaycommonsParser extends Bio2RDFizer |
33 | 33 | {
|
/**
 * Set up the Pathway Commons RDFizer.
 *
 * Hands the command-line arguments and the "pathwaycommons" prefix to the
 * parent constructor, registers the two script-specific parameters
 * ('files' and 'download_url') and then calls the parent's initialize().
 *
 * @param array $argv command-line arguments, passed through to the parent
 */
function __construct($argv) {
    parent::__construct($argv, "pathwaycommons");

    // choices accepted by the mandatory 'files' parameter; 'all' is the default
    $file_choices = 'all|homo-sapiens|hprd|humancyc|nci-nature|panther-pathway|phosphositeplus|reactome';
    parent::addParameter('files', true, $file_choices, 'all', 'biopax OWL files to process');

    // base URL the per-source BioPAX archives are fetched from
    $download_base = 'http://www.pathwaycommons.org/pc2/downloads/';
    parent::addParameter('download_url', false, null, $download_base);

    // NOTE(review): presumably parses $argv against the registered parameters - confirm in Bio2RDFizer
    parent::initialize();
}
|
56 | 40 |
|
/**
 * Convert the requested Pathway Commons BioPAX files to Bio2RDF RDF.
 *
 * For every requested source this method
 *  - downloads the gzipped BioPAX OWL file when it is missing locally or
 *    when the 'download' parameter is 'true',
 *  - reads the decompressed content into memory,
 *  - passes it to BioPAX2Bio2RDF and writes the resulting RDF to
 *    <outdir><source>.<output_format>,
 *  - appends a DataResource description of the source file.
 * Finally a description of the generated Bio2RDF dataset is appended and
 * the combined description is written to the Bio2RDF release file.
 *
 * Unknown source names are reported with a warning and skipped. A failed
 * download or an unreadable local file raises E_USER_ERROR and stops the
 * script.
 */
function Run()
{
    // get the work
    if($this->GetParameterValue('files') == 'all') {
        $sources = explode("|", parent::getParameterList('files'));
        // the first entry of the declared choice list is 'all' itself
        array_shift($sources);
    } else {
        // comma separated list; tolerate blanks around the commas
        $sources = array_map('trim', explode(",", parent::getParameterValue('files')));
    }

    // remote file name for each supported source
    $download_files = array(
        "homo-sapiens" => "Pathway%20Commons%202%20homo%20sapiens.BIOPAX.owl.gz",
        "hprd" => "Pathway%20Commons%202%20HPRD.BIOPAX.owl.gz",
        "humancyc" => "Pathway%20Commons%202%20HumanCyc.BIOPAX.owl.gz",
        "nci-nature" => "Pathway%20Commons%202%20NCI_Nature.BIOPAX.owl.gz",
        "panther-pathway" => "Pathway%20Commons%202%20PANTHER%20Pathway.BIOPAX.owl.gz",
        "phosphositeplus" => "Pathway%20Commons%202%20PhosphoSitePlus.BIOPAX.owl.gz",
        "reactome" => "Pathway%20Commons%202%20Reactome.BIOPAX.owl.gz",
    );

    // remember the configured graph so it can be restored before the
    // dataset description is written
    $graph_uri = parent::getGraphURI();
    if(parent::getParameterValue('dataset_graph') == true) parent::setGraphURI(parent::getDatasetURI());

    // these do not change between sources, so look them up once
    $ldir = parent::getParameterValue('indir');
    $odir = parent::getParameterValue('outdir');
    $rdir = parent::getParameterValue('download_url');
    $suffix = parent::getParameterValue('output_format');
    $gz = (strstr($suffix, "gz") !== false);

    // xsd:dateTime in UTC: two-digit hour (H) and gmdate() so that the
    // trailing 'Z' is actually true
    $date_format = 'Y-m-d\TH:i:s\Z';

    $dataset_description = '';
    $source_file = null;

    // iterate over the requested data
    foreach($sources AS $source) {
        if(!isset($download_files[$source])) {
            // without this guard the bare download directory URL would be fetched
            trigger_error("Unknown source '$source', skipping", E_USER_WARNING);
            continue;
        }

        echo "processing $source... ";

        // set the remote and input files
        $zfile = $source.".owl.gz";
        $rfile = $rdir.$download_files[$source];
        $lfile = $ldir.$zfile;

        // download if the file doesn't exist locally or we are told to
        if(!file_exists($lfile) || $this->GetParameterValue('download') == 'true') {
            echo "downloading... ";
            $remote = file_get_contents($rfile);
            // never leave an empty local file behind: it would be taken
            // for a finished download on the next run
            if($remote === FALSE || file_put_contents($lfile, $remote) === FALSE) {
                trigger_error("Unable to download $rfile to $lfile", E_USER_ERROR);
                exit;
            }
            unset($remote);
        }

        // decompress the file and load it into a buffer
        echo 'extracting... ';

        if (($fpin = gzopen($lfile, "r")) === FALSE) {
            trigger_error("Unable to open $lfile", E_USER_ERROR);
            exit;
        }

        $data = '';
        while (!gzeof($fpin)) {
            $buffer = gzgets($fpin, 4096);
            // gzgets() returns FALSE on a read error; stop instead of spinning
            if ($buffer === FALSE) break;
            $data .= $buffer;
        }
        gzclose($fpin);

        // set the output file
        $outfile = $source.'.'.$suffix;
        parent::setWriteFile($odir.$outfile, $gz);

        // send for parsing
        $p = new BioPAX2Bio2RDF($this);
        $p->SetBuffer($data)
            ->SetBioPAXVersion(3)
            ->SetBaseNamespace("http://purl.org/pc2/3/")
            ->SetBio2RDFNamespace("http://bio2rdf.org/pathwaycommons:")
            ->SetDatasetURI(parent::getDatasetURI());
        $rdf = $p->Parse();
        parent::addRDF($rdf);

        // write to output
        parent::writeRDFBufferToWriteFile();
        parent::getWriteFile()->Close();

        echo "done!".PHP_EOL;

        //generate dataset description
        echo "Generating dataset description for $zfile... ";
        $source_file = (new DataResource($this))
            ->setURI($rfile)
            ->setTitle("Pathway Commons")
            ->setRetrievedDate(gmdate($date_format, filemtime($lfile)))
            ->setFormat("rdf/xml")
            ->setPublisher("http://www.pathwaycommons.org/")
            ->setHomepage("http://www.pathwaycommons.org/")
            ->setRights("use")
            ->setRights("restricted-by-source-license")
            ->setLicense("http://www.pathwaycommons.org/pc2/home.html#data_sources")
            ->setDataset("http://identifiers.org/pathwaycommons/");

        $dataset_description .= $source_file->toRDF();
        echo "done!".PHP_EOL;
    }

    echo "Generating dataset description for Bio2RDF Pathways Commons dataset... ";

    $prefix = parent::getPrefix();
    $bVersion = parent::getParameterValue('bio2rdf_release');
    $date = gmdate($date_format);
    $output_file = (new DataResource($this))
        ->setURI("http://download.bio2rdf.org/release/$bVersion/$prefix/")
        ->setTitle("Bio2RDF v$bVersion RDF version of $prefix (generated at $date)");

    // NOTE(review): only the last processed source is recorded as the
    // dataset source; confirm whether DataResource::setSource() may be
    // called once per source before changing this.
    if($source_file !== null) $output_file->setSource($source_file->getURI());

    $output_file
        ->setCreator("https://github.com/bio2rdf/bio2rdf-scripts/blob/master/pathwaycommons/pathwaycommons.php")
        ->setCreateDate($date)
        ->setHomepage("http://download.bio2rdf.org/release/$bVersion/$prefix/$prefix.html")
        ->setPublisher("http://bio2rdf.org")
        ->setRights("use-share-modify")
        ->setRights("by-attribution")
        ->setRights("restricted-by-source-license")
        ->setLicense("http://creativecommons.org/licenses/by/3.0/")
        ->setDataset(parent::getDatasetURI());

    if($gz) $output_file->setFormat("application/gzip");
    if(strstr($suffix, "nt")) $output_file->setFormat("application/n-triples");
    else $output_file->setFormat("application/n-quads");

    $dataset_description .= $output_file->toRDF();

    //write dataset description to file
    parent::setGraphURI($graph_uri);
    parent::setWriteFile($odir.parent::getBio2RDFReleaseFile());
    parent::getWriteFile()->write($dataset_description);
    parent::getWriteFile()->close();
    echo "done!".PHP_EOL;
}
|
119 | 181 |
|
120 | 182 |
|
|
0 commit comments