Skip to content

Commit e8a828c

Browse files
updated interpro parser to bio2rdfapi
1 parent d34b0d6 commit e8a828c

File tree

1 file changed

+79
-86
lines changed

1 file changed

+79
-86
lines changed

interpro/interpro.php

Lines changed: 79 additions & 86 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
<?php
22
/**
3-
Copyright (C) 2012 Michel Dumontier
3+
Copyright (C) 2012-2013 Michel Dumontier
44
55
Permission is hereby granted, free of charge, to any person obtaining a copy of
66
this software and associated documentation files (the "Software"), to deal in
@@ -20,54 +20,36 @@
2020
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
2121
SOFTWARE.
2222
*/
23-
require('../../php-lib/rdfapi.php');
24-
require('../../php-lib/xmlapi.php');
23+
require_once(__DIR__.'/../../php-lib/bio2rdfapi.php');
24+
require_once(__DIR__.'/../../php-lib/xmlapi.php');
2525
/**
2626
* InterPro RDFizer
27-
* @version 1.0
27+
* @version 2.0
2828
* @author Michel Dumontier
2929
* @description http://www.ebi.ac.uk/interpro/
3030
*/
31-
class AffymetrixParser extends RDFFactory
31+
class InterproParser extends Bio2RDFizer
3232
{
3333
private $version = null;
34-
35-
function __construct($argv) {
36-
parent::__construct();
37-
$this->SetDefaultNamespace("interpro");
38-
39-
// set and print application parameters
40-
$this->AddParameter('files',true,'all','all','');
41-
$this->AddParameter('indir',false,null,'/data/download/'.$this->GetNamespace().'/','directory to download into and parse from');
42-
$this->AddParameter('outdir',false,null,'/data/rdf/'.$this->GetNamespace().'/','directory to place rdfized files');
43-
$this->AddParameter('graph_uri',false,null,null,'provide the graph uri to generate n-quads instead of n-triples');
44-
$this->AddParameter('gzip',false,'true|false','true','gzip the output');
45-
$this->AddParameter('download',false,'true|false','false','set true to download files');
46-
$this->AddParameter('download_url',false,null,'ftp://ftp.ebi.ac.uk/pub/databases/interpro/interpro.xml.gz','');
4734

48-
if($this->SetParameters($argv) == FALSE) {
49-
$this->PrintParameters($argv);
50-
exit;
51-
}
52-
if($this->CreateDirectory($this->GetParameterValue('indir')) === FALSE) exit;
53-
if($this->CreateDirectory($this->GetParameterValue('outdir')) === FALSE) exit;
54-
if($this->GetParameterValue('graph_uri')) $this->SetGraphURI($this->GetParameterValue('graph_uri'));
55-
56-
return TRUE;
35+
function __construct($argv) {
36+
parent::__construct($argv,"interpro");
37+
parent::addParameter('files',true,'all','all','');
38+
parent::addParameter('download_url',false,null,'ftp://ftp.ebi.ac.uk/pub/databases/interpro/interpro.xml.gz','');
39+
parent::initialize();
5740
}
5841

5942
function Run()
6043
{
6144
// directory shortcuts
62-
$ldir = $this->GetParameterValue('indir');
63-
$odir = $this->GetParameterValue('outdir');
64-
45+
$ldir = parent::getParameterValue('indir');
46+
$odir = parent::getParameterValue('outdir');
6547

6648
// get the listings page
67-
$rfile = trim($this->GetParameterValue('download_url'));
49+
$rfile = trim(parent::getParameterValue('download_url'));
6850
$file = "interpro.xml.gz";
6951
$lfile = $ldir.$file;
70-
if(!file_exists($lfile) || $this->GetParameterValue("download") == "true") {
52+
if(!file_exists($lfile) || parent::getParameterValue("download") == "true") {
7153
echo "Downloading $lfile".PHP_EOL;
7254
$ret = file_get_contents($rfile);
7355
if($ret === FALSE) {
@@ -76,41 +58,37 @@ function Run()
7658
}
7759
file_put_contents($lfile,$ret);
7860
}
61+
echo "Loading XML file...";
7962
$cxml = new CXML($ldir,$file);
8063
$cxml->Parse();
81-
$xml = $cxml->GetXMLRoot();
82-
64+
$xml = $cxml->GetXMLRoot();
65+
echo "Done".PHP_EOL;
8366

8467
// set the write file
85-
$outfile = 'interpro.nt'; $gz=false;
86-
if($this->GetParameterValue('graph_uri')) {$outfile = 'interpro.nq';}
87-
if($this->GetParameterValue('gzip')) {
88-
$outfile .= '.gz';
89-
$gz = true;
90-
}
91-
$this->SetWriteFile($odir.$outfile, $gz);
68+
$gz = (strstr(parent::getParameterValue('output_format'),".gz") === FALSE)?false:true;
69+
$outfile = "interpro.".parent::getParameterValue('output_format');
70+
parent::setWriteFile($odir.$outfile, $gz);
9271

9372
echo "Parsing interpro xml file".PHP_EOL;
94-
$this->Parse($xml);
95-
$this->WriteRDFBufferToWriteFile();
96-
$this->GetWriteFile()->Close();
73+
$this->parse($xml);
74+
parent::writeRDFBufferToWriteFile();
75+
parent::getWriteFile()->close();
9776
echo "Done!".PHP_EOL;
9877

9978
// generate the release file
100-
$this->DeleteBio2RDFReleaseFiles($odir);
101-
$desc = $this->GetBio2RDFDatasetDescription(
102-
$this->GetNamespace(),
79+
$desc = parent::getBio2RDFDatasetDescription(
80+
parent::getPrefix(),
10381
"https://github.com/bio2rdf/bio2rdf-scripts/blob/master/interpro/intepro.php",
104-
$this->GetBio2RDFDownloadURL($this->GetNamespace()).$outfile,
82+
parent::getBio2RDFDownloadURL(parent::getPrefix()).$outfile,
10583
"http://www.ebi.ac.uk/interpro/",
10684
array("use-share-modify"),
10785
null, // license
108-
$this->GetParameterValue('download_url'),
86+
parent::getParameterValue('download_url'),
10987
$this->version
11088
);
111-
$this->SetWriteFile($odir.$this->GetBio2RDFReleaseFile($this->GetNamespace()));
112-
$this->GetWriteFile()->Write($desc);
113-
$this->GetWriteFile()->Close();
89+
parent::setWriteFile($odir.parent::getBio2RDFReleaseFile(parent::getPrefix()));
90+
parent::getWriteFile()->write($desc);
91+
parent::getWriteFile()->close();
11492

11593
return true;
11694
}
@@ -120,24 +98,29 @@ function Parse($xml)
12098
// state the dataset info
12199
foreach($xml->release->dbinfo AS $o) {
122100
$db = $o->attributes()->dbname." v".$o->attributes()->version." (".$o->attributes()->entry_count." entries) [".$o->attributes()->file_date."]";
123-
$this->AddRDF($this->QQuadL($this->GetDatasetURI(), "interpro_vocabulary:contains", $db));
101+
parent::addRDF(
102+
parent::triplifyString(parent::getDatasetURI(), parent::getVoc()."contains", $db)
103+
);
104+
if(((string)$o->attributes()->dbname) === "INTERPRO") {
105+
parent::setDatasetVersion($o->attributes()->version);
106+
}
124107
}
125108
// now interate over the entries
126109
foreach($xml->interpro AS $o) {
127-
$this->WriteRDFBufferToWriteFile();
110+
parent::writeRDFBufferToWriteFile();
128111

129112
$interpro_id = $o->attributes()->id;
130-
echo "Processing id... $interpro_id".PHP_EOL;
113+
echo "Processing $interpro_id".PHP_EOL;
131114

132115
$name = $o->name;
133116
$short_name = $o->attributes()->short_name;
134117
$type = $o->attributes()->type;
135-
$s = "interpro:$interpro_id";
118+
$s = parent::getNamespace().$interpro_id;
136119

137-
echo "Adding... $s rdfs:label $name ($short_name) $type [$s]".PHP_EOL;
138-
$this->AddRDF($this->QQuadL($s,"rdfs:label","$name ($short_name) $type [$s]"));
139-
$this->AddRDF($this->QQuad($s,"rdf:type","interpro_vocabulary:$type"));
140-
$this->AddRDF($this->QQuad($s,"void:inDataset",$this->GetDatasetURI()));
120+
//echo "Adding... $s rdfs:label $name ($short_name) $type [$s]".PHP_EOL;
121+
parent::addRDF(
122+
parent::describeIndividual($s,"$name ($short_name) $type", parent::getVoc().$type)
123+
);
141124

142125
// get the pubs
143126
unset($pubs);
@@ -148,7 +131,9 @@ function Parse($xml)
148131
$pmid = (string) $p->db_xref->attributes()->dbkey;
149132
$pubs['pid'][] = '<cite idref="'.$pid.'"/>';
150133
$pubs['pmid'][] = '<a href="http://www.ncbi.nlm.nih.gov/pubmed/'.$pmid.'">pubmed:'.$pmid.'</a>';
151-
$this->AddRDF($this->QQuad($s,"interpro_vocabulary:x-pubmed","pubmed:$pmid"));
134+
parent::addRDF(
135+
parent::triplify($s,parent::getVoc()."x-pubmed","pubmed:$pmid")
136+
);
152137
}
153138
}
154139
}
@@ -157,42 +142,56 @@ function Parse($xml)
157142
$abstract = str_replace($pubs['pid'],$pubs['pmid'],$abstract);
158143
}
159144

160-
$this->AddRDF($this->QQuadL($s,"dc:description",$this->SafeLiteral($abstract)));
145+
parent::addRDF(
146+
parent::triplifyString($s,"dc:description",$abstract)
147+
);
161148

162149
foreach($o->example_list->example AS $example) {
163150
$db = (string) $example->db_xref->attributes()->db;
164151
$id = (string) $example->db_xref->attributes()->dbkey;
165-
$this->AddRDF($this->QQuad($s,"interpro_vocabulary:example-entry", $this->GetNS()->MapQName("$db:$id")));
152+
parent::addRDF(
153+
parent::triplify($s,parent::getVoc()."example-entry", "$db:$id")
154+
);
166155
}
167156

168157
if(isset($o->parent_list->rel_ref)) {
169158
foreach($o->parent_list->rel_ref AS $parent) {
170159
$id = (string) $parent->attributes()->ipr_ref;
171-
$this->AddRDF($this->QQuad($s,"interpro_vocabulary:parent", "interpro:$id"));
160+
parent::addRDF(
161+
parent::triplify($s,parent::getVoc()."parent", "interpro:$id")
162+
);
172163
}
173164
}
174165
if(isset($o->child->rel_ref)) {
175166
foreach($o->child->rel_ref AS $child) {
176167
$id = (string) $child->attributes()->ipr_ref;
177-
$this->AddRDF($this->QQuad($s,"interpro_vocabulary:child", "interpro:$id"));
168+
parent::addRDF(
169+
parent::triplify($s,parent::getVoc()."child", "interpro:$id")
170+
);
178171
}
179172
}
180173
if(isset($o->contains->rel_ref)) {
181174
foreach($o->contains->rel_ref AS $contains) {
182175
$id = (string) $contains->attributes()->ipr_ref;
183-
$this->AddRDF($this->QQuad($s,"interpro_vocabulary:contains", "interpro:$id"));
176+
parent::addRDF(
177+
parent::triplify($s,parent::getVoc()."contains", "interpro:$id")
178+
);
184179
}
185180
}
186181
if(isset($o->found_in->rel_ref)) {
187182
foreach($o->found_in->rel_ref AS $f) {
188183
$id = (string) $f->attributes()->ipr_ref;
189-
$this->AddRDF($this->QQuad($s,"interpro_vocabulary:found-in", "interpro:$id"));
184+
parent::addRDF(
185+
parent::triplify($s,parent::getVoc()."found-in", "interpro:$id")
186+
);
190187
}
191188
}
192189
if(isset($o->sec_list->sec_ac)) {
193190
foreach($o->sec_ac AS $s) {
194191
$id = (string) $s->attributes()->acc;
195-
$this->AddRDF($this->QQuad($s,"interpro_vocabulary:secondary-accession", "interpro:$id"));
192+
parent::addRDF(
193+
parent::triplify($s,parent::getVoc()."secondary-accession", "interpro:$id")
194+
);
196195
}
197196
}
198197

@@ -202,45 +201,39 @@ function Parse($xml)
202201
foreach($o->member_list->db_xref AS $dbxref) {
203202
$db = (string) $dbxref->attributes()->db;
204203
$id = (string) $dbxref->attributes()->dbkey;
205-
$this->AddRDF($this->QQuad($s,"interpro_vocabulary:x-".strtolower($db), "$db:$id"));
204+
parent::addRDF(
205+
parent::triplify($s,parent::getVoc()."x-".strtolower($db), "$db:$id")
206+
);
206207
}
207208
}
208209
if(isset($o->external_doc_list)) {
209210
foreach($o->external_doc_list->db_xref AS $dbxref) {
210211
$db = (string) $dbxref->attributes()->db;
211212
$id = (string) $dbxref->attributes()->dbkey;
212-
$this->AddRDF($this->QQuad($s,"interpro_vocabulary:x-".strtolower($db), "$db:$id"));
213+
parent::addRDF(
214+
parent::triplify($s,parent::getVoc()."x-".strtolower($db), "$db:$id")
215+
);
213216
}
214217
}
215218
if(isset($o->structure_db_links->db_xref)) {
216219
foreach($o->structure_db_links->db_xref AS $dbxref) {
217220
$db = (string) $dbxref->attributes()->db;
218221
$id = (string) $dbxref->attributes()->dbkey;
219-
$this->AddRDF($this->QQuad($s,"interpro_vocabulary:x-".strtolower($db), "$db:$id"));
222+
parent::addRDF(
223+
parent::triplify($s,parent::getVoc()."x-".strtolower($db), "$db:$id")
224+
);
220225
}
221226
}
222227

223228
// taxon distribution
224229
foreach($o->taxonomy_distribution->taxon_data AS $t) {
225230
$organism = (string) $t->attributes()->name;
226231
$number = (string) $t->attributes()->proteins_count;
227-
$this->AddRDF($this->QQuadL($s,"interpro_vocabulary:taxon-distribution", "$organism ($number)"));
232+
parent::addRDF(
233+
parent::triplifyString($s,parent::getVoc()."taxon-distribution", "$organism ($number)")
234+
);
228235
}
229236
}
230237
}
231-
232238
}
233-
$start = microtime(true);
234-
235-
set_error_handler('error_handler');
236-
$parser = new AffymetrixParser($argv);
237-
$parser->Run();
238-
239-
$end = microtime(true);
240-
$time_taken = $end - $start;
241-
print "Started: ".date("l jS F \@ g:i:s a", $start)."\n";
242-
print "Finished: ".date("l jS F \@ g:i:s a", $end)."\n";
243-
print "Took: ".$time_taken." seconds\n"
244239
?>
245-
246-

0 commit comments

Comments
 (0)