1
1
<?php
2
2
/**
3
- Copyright (C) 2012 Michel Dumontier
3
+ Copyright (C) 2012-2013 Michel Dumontier
4
4
5
5
Permission is hereby granted, free of charge, to any person obtaining a copy of
6
6
this software and associated documentation files (the "Software"), to deal in
20
20
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
21
SOFTWARE.
22
22
*/
23
- require ( ' ../../php-lib/rdfapi .php ' );
24
- require ( ' ../../php-lib/xmlapi.php ' );
23
+ require_once ( __DIR__ . ' / ../../php-lib/bio2rdfapi .php ' );
24
+ require_once ( __DIR__ . ' / ../../php-lib/xmlapi.php ' );
25
25
/**
26
26
* InterPro RDFizer
27
- * @version 1 .0
27
+ * @version 2 .0
28
28
* @author Michel Dumontier
29
29
* @description http://www.ebi.ac.uk/interpro/
30
30
*/
31
- class AffymetrixParser extends RDFFactory
31
+ class InterproParser extends Bio2RDFizer
32
32
{
33
33
private $ version = null ;
34
-
35
- function __construct ($ argv ) {
36
- parent ::__construct ();
37
- $ this ->SetDefaultNamespace ("interpro " );
38
-
39
- // set and print application parameters
40
- $ this ->AddParameter ('files ' ,true ,'all ' ,'all ' ,'' );
41
- $ this ->AddParameter ('indir ' ,false ,null ,'/data/download/ ' .$ this ->GetNamespace ().'/ ' ,'directory to download into and parse from ' );
42
- $ this ->AddParameter ('outdir ' ,false ,null ,'/data/rdf/ ' .$ this ->GetNamespace ().'/ ' ,'directory to place rdfized files ' );
43
- $ this ->AddParameter ('graph_uri ' ,false ,null ,null ,'provide the graph uri to generate n-quads instead of n-triples ' );
44
- $ this ->AddParameter ('gzip ' ,false ,'true|false ' ,'true ' ,'gzip the output ' );
45
- $ this ->AddParameter ('download ' ,false ,'true|false ' ,'false ' ,'set true to download files ' );
46
- $ this ->AddParameter ('download_url ' ,false ,null ,'ftp://ftp.ebi.ac.uk/pub/databases/interpro/interpro.xml.gz ' ,'' );
47
34
48
- if ($ this ->SetParameters ($ argv ) == FALSE ) {
49
- $ this ->PrintParameters ($ argv );
50
- exit ;
51
- }
52
- if ($ this ->CreateDirectory ($ this ->GetParameterValue ('indir ' )) === FALSE ) exit ;
53
- if ($ this ->CreateDirectory ($ this ->GetParameterValue ('outdir ' )) === FALSE ) exit ;
54
- if ($ this ->GetParameterValue ('graph_uri ' )) $ this ->SetGraphURI ($ this ->GetParameterValue ('graph_uri ' ));
55
-
56
- return TRUE ;
35
+ function __construct ($ argv ) {
36
+ parent ::__construct ($ argv ,"interpro " );
37
+ parent ::addParameter ('files ' ,true ,'all ' ,'all ' ,'' );
38
+ parent ::addParameter ('download_url ' ,false ,null ,'ftp://ftp.ebi.ac.uk/pub/databases/interpro/interpro.xml.gz ' ,'' );
39
+ parent ::initialize ();
57
40
}
58
41
59
42
function Run ()
60
43
{
61
44
// directory shortcuts
62
- $ ldir = $ this ->GetParameterValue ('indir ' );
63
- $ odir = $ this ->GetParameterValue ('outdir ' );
64
-
45
+ $ ldir = parent ::getParameterValue ('indir ' );
46
+ $ odir = parent ::getParameterValue ('outdir ' );
65
47
66
48
// get the listings page
67
- $ rfile = trim ($ this -> GetParameterValue ('download_url ' ));
49
+ $ rfile = trim (parent :: getParameterValue ('download_url ' ));
68
50
$ file = "interpro.xml.gz " ;
69
51
$ lfile = $ ldir .$ file ;
70
- if (!file_exists ($ lfile ) || $ this -> GetParameterValue ("download " ) == "true " ) {
52
+ if (!file_exists ($ lfile ) || parent :: getParameterValue ("download " ) == "true " ) {
71
53
echo "Downloading $ lfile " .PHP_EOL ;
72
54
$ ret = file_get_contents ($ rfile );
73
55
if ($ ret === FALSE ) {
@@ -76,41 +58,67 @@ function Run()
76
58
}
77
59
file_put_contents ($ lfile ,$ ret );
78
60
}
61
+ echo "Loading XML file... " ;
79
62
$ cxml = new CXML ($ ldir ,$ file );
80
63
$ cxml ->Parse ();
81
- $ xml = $ cxml ->GetXMLRoot ();
82
-
64
+ $ xml = $ cxml ->GetXMLRoot ();
65
+ echo " Done " . PHP_EOL ;
83
66
84
67
// set the write file
85
- $ outfile = 'interpro.nt ' ; $ gz =false ;
86
- if ($ this ->GetParameterValue ('graph_uri ' )) {$ outfile = 'interpro.nq ' ;}
87
- if ($ this ->GetParameterValue ('gzip ' )) {
88
- $ outfile .= '.gz ' ;
89
- $ gz = true ;
90
- }
91
- $ this ->SetWriteFile ($ odir .$ outfile , $ gz );
68
+ $ gz = (strstr (parent ::getParameterValue ('output_format ' ),".gz " ) === FALSE )?false :true ;
69
+ $ outfile = "interpro. " .parent ::getParameterValue ('output_format ' );
70
+ parent ::setWriteFile ($ odir .$ outfile , $ gz );
92
71
93
72
echo "Parsing interpro xml file " .PHP_EOL ;
94
- $ this ->Parse ($ xml );
95
- $ this -> WriteRDFBufferToWriteFile ();
96
- $ this -> GetWriteFile ()->Close ();
73
+ $ this ->parse ($ xml );
74
+ parent :: writeRDFBufferToWriteFile ();
75
+ parent :: getWriteFile ()->close ();
97
76
echo "Done! " .PHP_EOL ;
77
+
78
+
79
+ // let's make an nq file
80
+ parent ::setGraphURI (parent ::getDatasetURI ());
81
+
82
+ // dataset description
83
+ $ source_version = parent ::getDatasetVersion ();
84
+ $ source_file = (new DataResource ($ this ))
85
+ ->setURI ($ rfile )
86
+ ->setTitle ("InterPro v $ source_version " )
87
+ ->setRetrievedDate ( date ("Y-m-d\TG:i:s\Z " , filemtime ($ lfile )))
88
+ ->setFormat ("application/xml " )
89
+ ->setFormat ("application/g-zip " )
90
+ ->setPublisher ("http://www.ebi.ac.uk/ " )
91
+ ->setHomepage ("http://www.ebi.ac.uk/interpro/ " )
92
+ ->setRights ("InterPro - Integrated Resource Of Protein Domains And Functional Sites. Copyright (C) 2001 The InterPro Consortium " )
93
+ ->setLicense ("http://www.ebi.ac.uk/interpro/faqs.html " )
94
+ ->setDataset ("http://identifiers.org/interpro/ " );
98
95
99
- // generate the release file
100
- $ this ->DeleteBio2RDFReleaseFiles ($ odir );
101
- $ desc = $ this ->GetBio2RDFDatasetDescription (
102
- $ this ->GetNamespace (),
103
- "https://github.com/bio2rdf/bio2rdf-scripts/blob/master/interpro/intepro.php " ,
104
- $ this ->GetBio2RDFDownloadURL ($ this ->GetNamespace ()).$ outfile ,
105
- "http://www.ebi.ac.uk/interpro/ " ,
106
- array ("use-share-modify " ),
107
- null , // license
108
- $ this ->GetParameterValue ('download_url ' ),
109
- $ this ->version
110
- );
111
- $ this ->SetWriteFile ($ odir .$ this ->GetBio2RDFReleaseFile ($ this ->GetNamespace ()));
112
- $ this ->GetWriteFile ()->Write ($ desc );
113
- $ this ->GetWriteFile ()->Close ();
96
+ $ prefix = parent ::getPrefix ();
97
+ $ bVersion = parent ::getParameterValue ('bio2rdf_release ' );
98
+ $ date = date ("Y-m-d\TG:i:s\Z " );
99
+ $ output_file = (new DataResource ($ this ))
100
+ ->setURI ("http://download.bio2df.org/release/ $ bVersion/ $ prefix/ $ outfile " )
101
+ ->setTitle ("Bio2RDF v $ bVersion RDF version of $ prefix v $ source_version " )
102
+ ->setSource ($ source_file ->getURI ())
103
+ ->setCreator ("https://github.com/bio2rdf/bio2rdf-scripts/blob/master/interpro/interpro.php " )
104
+ ->setCreateDate ($ date )
105
+ ->setHomepage ("http://download.bio2rdf.org/release/ $ bVersion/ $ prefix/ $ prefix.html " )
106
+ ->setPublisher ("http://bio2rdf.org " )
107
+ ->setRights ("use-share-modify " )
108
+ ->setRights ("by-attribution " )
109
+ ->setRights ("restricted-by-source-license " )
110
+ ->setLicense ("http://creativecommons.org/licenses/by/3.0/ " )
111
+ ->setDataset (parent ::getDatasetURI ());
112
+
113
+ if ($ gz ) $ output_file ->setFormat ("application/gzip " );
114
+ if (strstr (parent ::getParameterValue ('output_format ' ),"nt " )) $ output_file ->setFormat ("application/n-triples " );
115
+ else $ output_file ->setFormat ("application/n-quads " );
116
+
117
+ $ dataset_description = $ source_file ->toRDF ().$ output_file ->toRDF ();
118
+
119
+ parent ::setWriteFile ($ odir .parent ::getBio2RDFReleaseFile ());
120
+ parent ::getWriteFile ()->write ($ dataset_description );
121
+ parent ::getWriteFile ()->close ();
114
122
115
123
return true ;
116
124
}
@@ -120,24 +128,35 @@ function Parse($xml)
120
128
// state the dataset info
121
129
foreach ($ xml ->release ->dbinfo AS $ o ) {
122
130
$ db = $ o ->attributes ()->dbname ." v " .$ o ->attributes ()->version ." ( " .$ o ->attributes ()->entry_count ." entries) [ " .$ o ->attributes ()->file_date ."] " ;
123
- $ this ->AddRDF ($ this ->QQuadL ($ this ->GetDatasetURI (), "interpro_vocabulary:contains " , $ db ));
131
+ parent ::addRDF (
132
+ parent ::triplifyString (parent ::getDatasetURI (), parent ::getVoc ()."contains " , $ db )
133
+ );
134
+ if (((string )$ o ->attributes ()->dbname ) === "INTERPRO " ) {
135
+ parent ::setDatasetVersion ($ o ->attributes ()->version );
136
+ }
124
137
}
138
+ // get a potential id list
139
+ $ id_list = explode (", " ,parent ::getParameterValue ("id_list " ));
140
+
125
141
// now interate over the entries
126
142
foreach ($ xml ->interpro AS $ o ) {
127
- $ this -> WriteRDFBufferToWriteFile ();
143
+ parent :: writeRDFBufferToWriteFile ();
128
144
129
145
$ interpro_id = $ o ->attributes ()->id ;
130
- echo "Processing id... $ interpro_id " .PHP_EOL ;
146
+ if (isset ($ id_list ) && !in_array ($ interpro_id ,$ id_list )) {
147
+ continue ;
148
+ }
149
+ echo "Processing $ interpro_id " .PHP_EOL ;
131
150
132
151
$ name = $ o ->name ;
133
152
$ short_name = $ o ->attributes ()->short_name ;
134
153
$ type = $ o ->attributes ()->type ;
135
- $ s = " interpro: $ interpro_id" ;
154
+ $ s = parent :: getNamespace (). $ interpro_id ;
136
155
137
- echo "Adding... $ s rdfs:label $ name ( $ short_name) $ type [ $ s] " .PHP_EOL ;
138
- $ this -> AddRDF ( $ this -> QQuadL ( $ s , " rdfs:label " , " $ name ( $ short_name ) $ type [ $ s ] " ));
139
- $ this -> AddRDF ( $ this -> QQuad ( $ s ,"rdf: type " ," interpro_vocabulary: $ type" ));
140
- $ this -> AddRDF ( $ this -> QQuad ( $ s , " void:inDataset " , $ this -> GetDatasetURI ()) );
156
+ // echo "Adding... $s rdfs:label $name ($short_name) $type [$s]".PHP_EOL;
157
+ parent :: addRDF (
158
+ parent :: describeIndividual ( $ s ,"$ name ( $ short_name ) $ type " , parent :: getVoc (). $ type)
159
+ );
141
160
142
161
// get the pubs
143
162
unset($ pubs );
@@ -148,7 +167,9 @@ function Parse($xml)
148
167
$ pmid = (string ) $ p ->db_xref ->attributes ()->dbkey ;
149
168
$ pubs ['pid ' ][] = '<cite idref=" ' .$ pid .'"/> ' ;
150
169
$ pubs ['pmid ' ][] = '<a href="http://www.ncbi.nlm.nih.gov/pubmed/ ' .$ pmid .'">pubmed: ' .$ pmid .'</a> ' ;
151
- $ this ->AddRDF ($ this ->QQuad ($ s ,"interpro_vocabulary:x-pubmed " ,"pubmed: $ pmid " ));
170
+ parent ::addRDF (
171
+ parent ::triplify ($ s ,parent ::getVoc ()."x-pubmed " ,"pubmed: $ pmid " )
172
+ );
152
173
}
153
174
}
154
175
}
@@ -157,42 +178,56 @@ function Parse($xml)
157
178
$ abstract = str_replace ($ pubs ['pid ' ],$ pubs ['pmid ' ],$ abstract );
158
179
}
159
180
160
- $ this ->AddRDF ($ this ->QQuadL ($ s ,"dc:description " ,$ this ->SafeLiteral ($ abstract )));
181
+ parent ::addRDF (
182
+ parent ::triplifyString ($ s ,"dc:description " ,$ abstract )
183
+ );
161
184
162
185
foreach ($ o ->example_list ->example AS $ example ) {
163
186
$ db = (string ) $ example ->db_xref ->attributes ()->db ;
164
187
$ id = (string ) $ example ->db_xref ->attributes ()->dbkey ;
165
- $ this ->AddRDF ($ this ->QQuad ($ s ,"interpro_vocabulary:example-entry " , $ this ->GetNS ()->MapQName ("$ db: $ id " )));
188
+ parent ::addRDF (
189
+ parent ::triplify ($ s ,parent ::getVoc ()."example-entry " , "$ db: $ id " )
190
+ );
166
191
}
167
192
168
193
if (isset ($ o ->parent_list ->rel_ref )) {
169
194
foreach ($ o ->parent_list ->rel_ref AS $ parent ) {
170
195
$ id = (string ) $ parent ->attributes ()->ipr_ref ;
171
- $ this ->AddRDF ($ this ->QQuad ($ s ,"interpro_vocabulary:parent " , "interpro: $ id " ));
196
+ parent ::addRDF (
197
+ parent ::triplify ($ s ,parent ::getVoc ()."parent " , "interpro: $ id " )
198
+ );
172
199
}
173
200
}
174
201
if (isset ($ o ->child ->rel_ref )) {
175
202
foreach ($ o ->child ->rel_ref AS $ child ) {
176
203
$ id = (string ) $ child ->attributes ()->ipr_ref ;
177
- $ this ->AddRDF ($ this ->QQuad ($ s ,"interpro_vocabulary:child " , "interpro: $ id " ));
204
+ parent ::addRDF (
205
+ parent ::triplify ($ s ,parent ::getVoc ()."child " , "interpro: $ id " )
206
+ );
178
207
}
179
208
}
180
209
if (isset ($ o ->contains ->rel_ref )) {
181
210
foreach ($ o ->contains ->rel_ref AS $ contains ) {
182
211
$ id = (string ) $ contains ->attributes ()->ipr_ref ;
183
- $ this ->AddRDF ($ this ->QQuad ($ s ,"interpro_vocabulary:contains " , "interpro: $ id " ));
212
+ parent ::addRDF (
213
+ parent ::triplify ($ s ,parent ::getVoc ()."contains " , "interpro: $ id " )
214
+ );
184
215
}
185
216
}
186
217
if (isset ($ o ->found_in ->rel_ref )) {
187
218
foreach ($ o ->found_in ->rel_ref AS $ f ) {
188
219
$ id = (string ) $ f ->attributes ()->ipr_ref ;
189
- $ this ->AddRDF ($ this ->QQuad ($ s ,"interpro_vocabulary:found-in " , "interpro: $ id " ));
220
+ parent ::addRDF (
221
+ parent ::triplify ($ s ,parent ::getVoc ()."found-in " , "interpro: $ id " )
222
+ );
190
223
}
191
224
}
192
225
if (isset ($ o ->sec_list ->sec_ac )) {
193
226
foreach ($ o ->sec_ac AS $ s ) {
194
227
$ id = (string ) $ s ->attributes ()->acc ;
195
- $ this ->AddRDF ($ this ->QQuad ($ s ,"interpro_vocabulary:secondary-accession " , "interpro: $ id " ));
228
+ parent ::addRDF (
229
+ parent ::triplify ($ s ,parent ::getVoc ()."secondary-accession " , "interpro: $ id " )
230
+ );
196
231
}
197
232
}
198
233
@@ -202,45 +237,39 @@ function Parse($xml)
202
237
foreach ($ o ->member_list ->db_xref AS $ dbxref ) {
203
238
$ db = (string ) $ dbxref ->attributes ()->db ;
204
239
$ id = (string ) $ dbxref ->attributes ()->dbkey ;
205
- $ this ->AddRDF ($ this ->QQuad ($ s ,"interpro_vocabulary:x- " .strtolower ($ db ), "$ db: $ id " ));
240
+ parent ::addRDF (
241
+ parent ::triplify ($ s ,parent ::getVoc ()."x- " .strtolower ($ db ), "$ db: $ id " )
242
+ );
206
243
}
207
244
}
208
245
if (isset ($ o ->external_doc_list )) {
209
246
foreach ($ o ->external_doc_list ->db_xref AS $ dbxref ) {
210
247
$ db = (string ) $ dbxref ->attributes ()->db ;
211
248
$ id = (string ) $ dbxref ->attributes ()->dbkey ;
212
- $ this ->AddRDF ($ this ->QQuad ($ s ,"interpro_vocabulary:x- " .strtolower ($ db ), "$ db: $ id " ));
249
+ parent ::addRDF (
250
+ parent ::triplify ($ s ,parent ::getVoc ()."x- " .strtolower ($ db ), "$ db: $ id " )
251
+ );
213
252
}
214
253
}
215
254
if (isset ($ o ->structure_db_links ->db_xref )) {
216
255
foreach ($ o ->structure_db_links ->db_xref AS $ dbxref ) {
217
256
$ db = (string ) $ dbxref ->attributes ()->db ;
218
257
$ id = (string ) $ dbxref ->attributes ()->dbkey ;
219
- $ this ->AddRDF ($ this ->QQuad ($ s ,"interpro_vocabulary:x- " .strtolower ($ db ), "$ db: $ id " ));
258
+ parent ::addRDF (
259
+ parent ::triplify ($ s ,parent ::getVoc ()."x- " .strtolower ($ db ), "$ db: $ id " )
260
+ );
220
261
}
221
262
}
222
263
223
264
// taxon distribution
224
265
foreach ($ o ->taxonomy_distribution ->taxon_data AS $ t ) {
225
266
$ organism = (string ) $ t ->attributes ()->name ;
226
267
$ number = (string ) $ t ->attributes ()->proteins_count ;
227
- $ this ->AddRDF ($ this ->QQuadL ($ s ,"interpro_vocabulary:taxon-distribution " , "$ organism ( $ number) " ));
268
+ parent ::addRDF (
269
+ parent ::triplifyString ($ s ,parent ::getVoc ()."taxon-distribution " , "$ organism ( $ number) " )
270
+ );
228
271
}
229
272
}
230
273
}
231
-
232
274
}
233
- $ start = microtime (true );
234
-
235
- set_error_handler ('error_handler ' );
236
- $ parser = new AffymetrixParser ($ argv );
237
- $ parser ->Run ();
238
-
239
- $ end = microtime (true );
240
- $ time_taken = $ end - $ start ;
241
- print "Started: " .date ("l jS F \@ g:i:s a " , $ start )."\n" ;
242
- print "Finished: " .date ("l jS F \@ g:i:s a " , $ end )."\n" ;
243
- print "Took: " .$ time_taken ." seconds \n"
244
275
?>
245
-
246
-
0 commit comments