Skip to content

Commit 96e57f2

Browse files
Merge pull request #268 from micheldumontier/interpro
update of Interpro to release 3
2 parents c971c6c + 79c0e26 commit 96e57f2

File tree

1 file changed

+122
-93
lines changed

1 file changed

+122
-93
lines changed

interpro/interpro.php

+122-93
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
<?php
22
/**
3-
Copyright (C) 2012 Michel Dumontier
3+
Copyright (C) 2012-2013 Michel Dumontier
44
55
Permission is hereby granted, free of charge, to any person obtaining a copy of
66
this software and associated documentation files (the "Software"), to deal in
@@ -20,54 +20,36 @@
2020
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
2121
SOFTWARE.
2222
*/
23-
require('../../php-lib/rdfapi.php');
24-
require('../../php-lib/xmlapi.php');
23+
require_once(__DIR__.'/../../php-lib/bio2rdfapi.php');
24+
require_once(__DIR__.'/../../php-lib/xmlapi.php');
2525
/**
2626
* InterPro RDFizer
27-
* @version 1.0
27+
* @version 2.0
2828
* @author Michel Dumontier
2929
* @description http://www.ebi.ac.uk/interpro/
3030
*/
31-
class AffymetrixParser extends RDFFactory
31+
class InterproParser extends Bio2RDFizer
3232
{
3333
private $version = null;
34-
35-
function __construct($argv) {
36-
parent::__construct();
37-
$this->SetDefaultNamespace("interpro");
38-
39-
// set and print application parameters
40-
$this->AddParameter('files',true,'all','all','');
41-
$this->AddParameter('indir',false,null,'/data/download/'.$this->GetNamespace().'/','directory to download into and parse from');
42-
$this->AddParameter('outdir',false,null,'/data/rdf/'.$this->GetNamespace().'/','directory to place rdfized files');
43-
$this->AddParameter('graph_uri',false,null,null,'provide the graph uri to generate n-quads instead of n-triples');
44-
$this->AddParameter('gzip',false,'true|false','true','gzip the output');
45-
$this->AddParameter('download',false,'true|false','false','set true to download files');
46-
$this->AddParameter('download_url',false,null,'ftp://ftp.ebi.ac.uk/pub/databases/interpro/interpro.xml.gz','');
4734

48-
if($this->SetParameters($argv) == FALSE) {
49-
$this->PrintParameters($argv);
50-
exit;
51-
}
52-
if($this->CreateDirectory($this->GetParameterValue('indir')) === FALSE) exit;
53-
if($this->CreateDirectory($this->GetParameterValue('outdir')) === FALSE) exit;
54-
if($this->GetParameterValue('graph_uri')) $this->SetGraphURI($this->GetParameterValue('graph_uri'));
55-
56-
return TRUE;
35+
function __construct($argv) {
36+
parent::__construct($argv,"interpro");
37+
parent::addParameter('files',true,'all','all','');
38+
parent::addParameter('download_url',false,null,'ftp://ftp.ebi.ac.uk/pub/databases/interpro/interpro.xml.gz','');
39+
parent::initialize();
5740
}
5841

5942
function Run()
6043
{
6144
// directory shortcuts
62-
$ldir = $this->GetParameterValue('indir');
63-
$odir = $this->GetParameterValue('outdir');
64-
45+
$ldir = parent::getParameterValue('indir');
46+
$odir = parent::getParameterValue('outdir');
6547

6648
// resolve the remote InterPro XML archive and its local copy
67-
$rfile = trim($this->GetParameterValue('download_url'));
49+
$rfile = trim(parent::getParameterValue('download_url'));
6850
$file = "interpro.xml.gz";
6951
$lfile = $ldir.$file;
70-
if(!file_exists($lfile) || $this->GetParameterValue("download") == "true") {
52+
if(!file_exists($lfile) || parent::getParameterValue("download") == "true") {
7153
echo "Downloading $lfile".PHP_EOL;
7254
$ret = file_get_contents($rfile);
7355
if($ret === FALSE) {
@@ -76,41 +58,67 @@ function Run()
7658
}
7759
file_put_contents($lfile,$ret);
7860
}
61+
echo "Loading XML file...";
7962
$cxml = new CXML($ldir,$file);
8063
$cxml->Parse();
81-
$xml = $cxml->GetXMLRoot();
82-
64+
$xml = $cxml->GetXMLRoot();
65+
echo "Done".PHP_EOL;
8366

8467
// set the write file
85-
$outfile = 'interpro.nt'; $gz=false;
86-
if($this->GetParameterValue('graph_uri')) {$outfile = 'interpro.nq';}
87-
if($this->GetParameterValue('gzip')) {
88-
$outfile .= '.gz';
89-
$gz = true;
90-
}
91-
$this->SetWriteFile($odir.$outfile, $gz);
68+
$gz = (strstr(parent::getParameterValue('output_format'),".gz") === FALSE)?false:true;
69+
$outfile = "interpro.".parent::getParameterValue('output_format');
70+
parent::setWriteFile($odir.$outfile, $gz);
9271

9372
echo "Parsing interpro xml file".PHP_EOL;
94-
$this->Parse($xml);
95-
$this->WriteRDFBufferToWriteFile();
96-
$this->GetWriteFile()->Close();
73+
$this->parse($xml);
74+
parent::writeRDFBufferToWriteFile();
75+
parent::getWriteFile()->close();
9776
echo "Done!".PHP_EOL;
77+
78+
79+
// let's make an nq file
80+
parent::setGraphURI(parent::getDatasetURI());
81+
82+
// dataset description
83+
$source_version = parent::getDatasetVersion();
84+
$source_file = (new DataResource($this))
85+
->setURI($rfile)
86+
->setTitle("InterPro v$source_version")
87+
->setRetrievedDate( date ("Y-m-d\TG:i:s\Z", filemtime($lfile)))
88+
->setFormat("application/xml")
89+
->setFormat("application/g-zip")
90+
->setPublisher("http://www.ebi.ac.uk/")
91+
->setHomepage("http://www.ebi.ac.uk/interpro/")
92+
->setRights("InterPro - Integrated Resource Of Protein Domains And Functional Sites. Copyright (C) 2001 The InterPro Consortium")
93+
->setLicense("http://www.ebi.ac.uk/interpro/faqs.html")
94+
->setDataset("http://identifiers.org/interpro/");
9895

99-
// generate the release file
100-
$this->DeleteBio2RDFReleaseFiles($odir);
101-
$desc = $this->GetBio2RDFDatasetDescription(
102-
$this->GetNamespace(),
103-
"https://github.com/bio2rdf/bio2rdf-scripts/blob/master/interpro/intepro.php",
104-
$this->GetBio2RDFDownloadURL($this->GetNamespace()).$outfile,
105-
"http://www.ebi.ac.uk/interpro/",
106-
array("use-share-modify"),
107-
null, // license
108-
$this->GetParameterValue('download_url'),
109-
$this->version
110-
);
111-
$this->SetWriteFile($odir.$this->GetBio2RDFReleaseFile($this->GetNamespace()));
112-
$this->GetWriteFile()->Write($desc);
113-
$this->GetWriteFile()->Close();
96+
$prefix = parent::getPrefix();
97+
$bVersion = parent::getParameterValue('bio2rdf_release');
98+
$date = date ("Y-m-d\TG:i:s\Z");
99+
$output_file = (new DataResource($this))
100+
->setURI("http://download.bio2df.org/release/$bVersion/$prefix/$outfile")
101+
->setTitle("Bio2RDF v$bVersion RDF version of $prefix v$source_version")
102+
->setSource($source_file->getURI())
103+
->setCreator("https://github.com/bio2rdf/bio2rdf-scripts/blob/master/interpro/interpro.php")
104+
->setCreateDate($date)
105+
->setHomepage("http://download.bio2rdf.org/release/$bVersion/$prefix/$prefix.html")
106+
->setPublisher("http://bio2rdf.org")
107+
->setRights("use-share-modify")
108+
->setRights("by-attribution")
109+
->setRights("restricted-by-source-license")
110+
->setLicense("http://creativecommons.org/licenses/by/3.0/")
111+
->setDataset(parent::getDatasetURI());
112+
113+
if($gz) $output_file->setFormat("application/gzip");
114+
if(strstr(parent::getParameterValue('output_format'),"nt")) $output_file->setFormat("application/n-triples");
115+
else $output_file->setFormat("application/n-quads");
116+
117+
$dataset_description = $source_file->toRDF().$output_file->toRDF();
118+
119+
parent::setWriteFile($odir.parent::getBio2RDFReleaseFile());
120+
parent::getWriteFile()->write($dataset_description);
121+
parent::getWriteFile()->close();
114122

115123
return true;
116124
}
@@ -120,24 +128,35 @@ function Parse($xml)
120128
// state the dataset info
121129
foreach($xml->release->dbinfo AS $o) {
122130
$db = $o->attributes()->dbname." v".$o->attributes()->version." (".$o->attributes()->entry_count." entries) [".$o->attributes()->file_date."]";
123-
$this->AddRDF($this->QQuadL($this->GetDatasetURI(), "interpro_vocabulary:contains", $db));
131+
parent::addRDF(
132+
parent::triplifyString(parent::getDatasetURI(), parent::getVoc()."contains", $db)
133+
);
134+
if(((string)$o->attributes()->dbname) === "INTERPRO") {
135+
parent::setDatasetVersion($o->attributes()->version);
136+
}
124137
}
138+
// get a potential id list
139+
$id_list = explode(",",parent::getParameterValue("id_list"));
140+
125141
// now iterate over the entries
126142
foreach($xml->interpro AS $o) {
127-
$this->WriteRDFBufferToWriteFile();
143+
parent::writeRDFBufferToWriteFile();
128144

129145
$interpro_id = $o->attributes()->id;
130-
echo "Processing id... $interpro_id".PHP_EOL;
146+
if(isset($id_list) && !in_array($interpro_id,$id_list)) {
147+
continue;
148+
}
149+
echo "Processing $interpro_id".PHP_EOL;
131150

132151
$name = $o->name;
133152
$short_name = $o->attributes()->short_name;
134153
$type = $o->attributes()->type;
135-
$s = "interpro:$interpro_id";
154+
$s = parent::getNamespace().$interpro_id;
136155

137-
echo "Adding... $s rdfs:label $name ($short_name) $type [$s]".PHP_EOL;
138-
$this->AddRDF($this->QQuadL($s,"rdfs:label","$name ($short_name) $type [$s]"));
139-
$this->AddRDF($this->QQuad($s,"rdf:type","interpro_vocabulary:$type"));
140-
$this->AddRDF($this->QQuad($s,"void:inDataset",$this->GetDatasetURI()));
156+
//echo "Adding... $s rdfs:label $name ($short_name) $type [$s]".PHP_EOL;
157+
parent::addRDF(
158+
parent::describeIndividual($s,"$name ($short_name) $type", parent::getVoc().$type)
159+
);
141160

142161
// get the pubs
143162
unset($pubs);
@@ -148,7 +167,9 @@ function Parse($xml)
148167
$pmid = (string) $p->db_xref->attributes()->dbkey;
149168
$pubs['pid'][] = '<cite idref="'.$pid.'"/>';
150169
$pubs['pmid'][] = '<a href="http://www.ncbi.nlm.nih.gov/pubmed/'.$pmid.'">pubmed:'.$pmid.'</a>';
151-
$this->AddRDF($this->QQuad($s,"interpro_vocabulary:x-pubmed","pubmed:$pmid"));
170+
parent::addRDF(
171+
parent::triplify($s,parent::getVoc()."x-pubmed","pubmed:$pmid")
172+
);
152173
}
153174
}
154175
}
@@ -157,42 +178,56 @@ function Parse($xml)
157178
$abstract = str_replace($pubs['pid'],$pubs['pmid'],$abstract);
158179
}
159180

160-
$this->AddRDF($this->QQuadL($s,"dc:description",$this->SafeLiteral($abstract)));
181+
parent::addRDF(
182+
parent::triplifyString($s,"dc:description",$abstract)
183+
);
161184

162185
foreach($o->example_list->example AS $example) {
163186
$db = (string) $example->db_xref->attributes()->db;
164187
$id = (string) $example->db_xref->attributes()->dbkey;
165-
$this->AddRDF($this->QQuad($s,"interpro_vocabulary:example-entry", $this->GetNS()->MapQName("$db:$id")));
188+
parent::addRDF(
189+
parent::triplify($s,parent::getVoc()."example-entry", "$db:$id")
190+
);
166191
}
167192

168193
if(isset($o->parent_list->rel_ref)) {
169194
foreach($o->parent_list->rel_ref AS $parent) {
170195
$id = (string) $parent->attributes()->ipr_ref;
171-
$this->AddRDF($this->QQuad($s,"interpro_vocabulary:parent", "interpro:$id"));
196+
parent::addRDF(
197+
parent::triplify($s,parent::getVoc()."parent", "interpro:$id")
198+
);
172199
}
173200
}
174201
if(isset($o->child->rel_ref)) {
175202
foreach($o->child->rel_ref AS $child) {
176203
$id = (string) $child->attributes()->ipr_ref;
177-
$this->AddRDF($this->QQuad($s,"interpro_vocabulary:child", "interpro:$id"));
204+
parent::addRDF(
205+
parent::triplify($s,parent::getVoc()."child", "interpro:$id")
206+
);
178207
}
179208
}
180209
if(isset($o->contains->rel_ref)) {
181210
foreach($o->contains->rel_ref AS $contains) {
182211
$id = (string) $contains->attributes()->ipr_ref;
183-
$this->AddRDF($this->QQuad($s,"interpro_vocabulary:contains", "interpro:$id"));
212+
parent::addRDF(
213+
parent::triplify($s,parent::getVoc()."contains", "interpro:$id")
214+
);
184215
}
185216
}
186217
if(isset($o->found_in->rel_ref)) {
187218
foreach($o->found_in->rel_ref AS $f) {
188219
$id = (string) $f->attributes()->ipr_ref;
189-
$this->AddRDF($this->QQuad($s,"interpro_vocabulary:found-in", "interpro:$id"));
220+
parent::addRDF(
221+
parent::triplify($s,parent::getVoc()."found-in", "interpro:$id")
222+
);
190223
}
191224
}
192225
if(isset($o->sec_list->sec_ac)) {
193226
foreach($o->sec_ac AS $s) {
194227
$id = (string) $s->attributes()->acc;
195-
$this->AddRDF($this->QQuad($s,"interpro_vocabulary:secondary-accession", "interpro:$id"));
228+
parent::addRDF(
229+
parent::triplify($s,parent::getVoc()."secondary-accession", "interpro:$id")
230+
);
196231
}
197232
}
198233

@@ -202,45 +237,39 @@ function Parse($xml)
202237
foreach($o->member_list->db_xref AS $dbxref) {
203238
$db = (string) $dbxref->attributes()->db;
204239
$id = (string) $dbxref->attributes()->dbkey;
205-
$this->AddRDF($this->QQuad($s,"interpro_vocabulary:x-".strtolower($db), "$db:$id"));
240+
parent::addRDF(
241+
parent::triplify($s,parent::getVoc()."x-".strtolower($db), "$db:$id")
242+
);
206243
}
207244
}
208245
if(isset($o->external_doc_list)) {
209246
foreach($o->external_doc_list->db_xref AS $dbxref) {
210247
$db = (string) $dbxref->attributes()->db;
211248
$id = (string) $dbxref->attributes()->dbkey;
212-
$this->AddRDF($this->QQuad($s,"interpro_vocabulary:x-".strtolower($db), "$db:$id"));
249+
parent::addRDF(
250+
parent::triplify($s,parent::getVoc()."x-".strtolower($db), "$db:$id")
251+
);
213252
}
214253
}
215254
if(isset($o->structure_db_links->db_xref)) {
216255
foreach($o->structure_db_links->db_xref AS $dbxref) {
217256
$db = (string) $dbxref->attributes()->db;
218257
$id = (string) $dbxref->attributes()->dbkey;
219-
$this->AddRDF($this->QQuad($s,"interpro_vocabulary:x-".strtolower($db), "$db:$id"));
258+
parent::addRDF(
259+
parent::triplify($s,parent::getVoc()."x-".strtolower($db), "$db:$id")
260+
);
220261
}
221262
}
222263

223264
// taxon distribution
224265
foreach($o->taxonomy_distribution->taxon_data AS $t) {
225266
$organism = (string) $t->attributes()->name;
226267
$number = (string) $t->attributes()->proteins_count;
227-
$this->AddRDF($this->QQuadL($s,"interpro_vocabulary:taxon-distribution", "$organism ($number)"));
268+
parent::addRDF(
269+
parent::triplifyString($s,parent::getVoc()."taxon-distribution", "$organism ($number)")
270+
);
228271
}
229272
}
230273
}
231-
232274
}
233-
$start = microtime(true);
234-
235-
set_error_handler('error_handler');
236-
$parser = new AffymetrixParser($argv);
237-
$parser->Run();
238-
239-
$end = microtime(true);
240-
$time_taken = $end - $start;
241-
print "Started: ".date("l jS F \@ g:i:s a", $start)."\n";
242-
print "Finished: ".date("l jS F \@ g:i:s a", $end)."\n";
243-
print "Took: ".$time_taken." seconds\n"
244275
?>
245-
246-

0 commit comments

Comments
 (0)