1
1
<?php
2
2
/**
3
- Copyright (C) 2012 Michel Dumontier
3
+ Copyright (C) 2012-2013 Michel Dumontier
4
4
5
5
Permission is hereby granted, free of charge, to any person obtaining a copy of
6
6
this software and associated documentation files (the "Software"), to deal in
20
20
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
21
SOFTWARE.
22
22
*/
23
- require ( ' ../../php-lib/rdfapi .php ' );
24
- require ( ' ../../php-lib/xmlapi.php ' );
23
+ require_once ( __DIR__ . ' / ../../php-lib/bio2rdfapi .php ' );
24
+ require_once ( __DIR__ . ' / ../../php-lib/xmlapi.php ' );
25
25
/**
26
26
* InterPro RDFizer
27
- * @version 1 .0
27
+ * @version 2 .0
28
28
* @author Michel Dumontier
29
29
* @description http://www.ebi.ac.uk/interpro/
30
30
*/
31
- class AffymetrixParser extends RDFFactory
31
+ class InterproParser extends Bio2RDFizer
32
32
{
33
33
private $ version = null ;
34
-
35
- function __construct ($ argv ) {
36
- parent ::__construct ();
37
- $ this ->SetDefaultNamespace ("interpro " );
38
-
39
- // set and print application parameters
40
- $ this ->AddParameter ('files ' ,true ,'all ' ,'all ' ,'' );
41
- $ this ->AddParameter ('indir ' ,false ,null ,'/data/download/ ' .$ this ->GetNamespace ().'/ ' ,'directory to download into and parse from ' );
42
- $ this ->AddParameter ('outdir ' ,false ,null ,'/data/rdf/ ' .$ this ->GetNamespace ().'/ ' ,'directory to place rdfized files ' );
43
- $ this ->AddParameter ('graph_uri ' ,false ,null ,null ,'provide the graph uri to generate n-quads instead of n-triples ' );
44
- $ this ->AddParameter ('gzip ' ,false ,'true|false ' ,'true ' ,'gzip the output ' );
45
- $ this ->AddParameter ('download ' ,false ,'true|false ' ,'false ' ,'set true to download files ' );
46
- $ this ->AddParameter ('download_url ' ,false ,null ,'ftp://ftp.ebi.ac.uk/pub/databases/interpro/interpro.xml.gz ' ,'' );
47
34
48
- if ($ this ->SetParameters ($ argv ) == FALSE ) {
49
- $ this ->PrintParameters ($ argv );
50
- exit ;
51
- }
52
- if ($ this ->CreateDirectory ($ this ->GetParameterValue ('indir ' )) === FALSE ) exit ;
53
- if ($ this ->CreateDirectory ($ this ->GetParameterValue ('outdir ' )) === FALSE ) exit ;
54
- if ($ this ->GetParameterValue ('graph_uri ' )) $ this ->SetGraphURI ($ this ->GetParameterValue ('graph_uri ' ));
55
-
56
- return TRUE ;
35
/**
 * Set up the InterPro parser.
 *
 * Hands $argv and the interpro identifier string to the parent constructor,
 * registers the two parameters specific to this script ('files' and
 * 'download_url', the latter defaulting to the EBI FTP location of
 * interpro.xml.gz), and then calls parent::initialize().
 *
 * @param array $argv presumably the command-line argument vector -- TODO confirm against the caller
 */
+ function __construct ($ argv ) {
36
+ parent ::__construct ($ argv ,"interpro " );
37
+ parent ::addParameter ('files ' ,true ,'all ' ,'all ' ,'' );
38
+ parent ::addParameter ('download_url ' ,false ,null ,'ftp://ftp.ebi.ac.uk/pub/databases/interpro/interpro.xml.gz ' ,'' );
39
+ parent ::initialize ();
57
40
}
58
41
59
42
function Run ()
60
43
{
61
44
// directory shortcuts
62
- $ ldir = $ this ->GetParameterValue ('indir ' );
63
- $ odir = $ this ->GetParameterValue ('outdir ' );
64
-
45
+ $ ldir = parent ::getParameterValue ('indir ' );
46
+ $ odir = parent ::getParameterValue ('outdir ' );
65
47
66
48
// get the listings page
67
- $ rfile = trim ($ this -> GetParameterValue ('download_url ' ));
49
+ $ rfile = trim (parent :: getParameterValue ('download_url ' ));
68
50
$ file = "interpro.xml.gz " ;
69
51
$ lfile = $ ldir .$ file ;
70
- if (!file_exists ($ lfile ) || $ this -> GetParameterValue ("download " ) == "true " ) {
52
+ if (!file_exists ($ lfile ) || parent :: getParameterValue ("download " ) == "true " ) {
71
53
echo "Downloading $ lfile " .PHP_EOL ;
72
54
$ ret = file_get_contents ($ rfile );
73
55
if ($ ret === FALSE ) {
@@ -76,41 +58,37 @@ function Run()
76
58
}
77
59
file_put_contents ($ lfile ,$ ret );
78
60
}
61
+ echo "Loading XML file... " ;
79
62
$ cxml = new CXML ($ ldir ,$ file );
80
63
$ cxml ->Parse ();
81
- $ xml = $ cxml ->GetXMLRoot ();
82
-
64
+ $ xml = $ cxml ->GetXMLRoot ();
65
+ echo " Done " . PHP_EOL ;
83
66
84
67
// set the write file
85
- $ outfile = 'interpro.nt ' ; $ gz =false ;
86
- if ($ this ->GetParameterValue ('graph_uri ' )) {$ outfile = 'interpro.nq ' ;}
87
- if ($ this ->GetParameterValue ('gzip ' )) {
88
- $ outfile .= '.gz ' ;
89
- $ gz = true ;
90
- }
91
- $ this ->SetWriteFile ($ odir .$ outfile , $ gz );
68
+ $ gz = (strstr (parent ::getParameterValue ('output_format ' ),".gz " ) === FALSE )?false :true ;
69
+ $ outfile = "interpro. " .parent ::getParameterValue ('output_format ' );
70
+ parent ::setWriteFile ($ odir .$ outfile , $ gz );
92
71
93
72
echo "Parsing interpro xml file " .PHP_EOL ;
94
- $ this ->Parse ($ xml );
95
- $ this -> WriteRDFBufferToWriteFile ();
96
- $ this -> GetWriteFile ()->Close ();
73
+ $ this ->parse ($ xml );
74
+ parent :: writeRDFBufferToWriteFile ();
75
+ parent :: getWriteFile ()->close ();
97
76
echo "Done! " .PHP_EOL ;
98
77
99
78
// generate the release file
100
- $ this ->DeleteBio2RDFReleaseFiles ($ odir );
101
- $ desc = $ this ->GetBio2RDFDatasetDescription (
102
- $ this ->GetNamespace (),
79
// Build the dataset description that is written to the release file below.
// The second argument is the URL of this script; the file name was misspelled
// "intepro.php", which produced a dead provenance link.
+ $ desc = parent ::getBio2RDFDatasetDescription (
80
+ parent ::getPrefix (),
103
81
"https://github.com/bio2rdf/bio2rdf-scripts/blob/master/interpro/interpro.php " ,
104
- $ this -> GetBio2RDFDownloadURL ( $ this -> GetNamespace ()).$ outfile ,
82
+ parent :: getBio2RDFDownloadURL ( parent :: getPrefix ()).$ outfile ,
105
83
"http://www.ebi.ac.uk/interpro/ " ,
106
84
array ("use-share-modify " ),
107
85
null , // license
108
- $ this -> GetParameterValue ('download_url ' ),
86
+ parent :: getParameterValue ('download_url ' ),
109
87
$ this ->version
110
88
);
111
- $ this -> SetWriteFile ($ odir .$ this -> GetBio2RDFReleaseFile ( $ this -> GetNamespace ()));
112
- $ this -> GetWriteFile ()->Write ($ desc );
113
- $ this -> GetWriteFile ()->Close ();
89
+ parent :: setWriteFile ($ odir .parent :: getBio2RDFReleaseFile ( parent :: getPrefix ()));
90
+ parent :: getWriteFile ()->write ($ desc );
91
+ parent :: getWriteFile ()->close ();
114
92
115
93
return true ;
116
94
}
@@ -120,24 +98,29 @@ function Parse($xml)
120
98
// state the dataset info
121
99
foreach ($ xml ->release ->dbinfo AS $ o ) {
122
100
$ db = $ o ->attributes ()->dbname ." v " .$ o ->attributes ()->version ." ( " .$ o ->attributes ()->entry_count ." entries) [ " .$ o ->attributes ()->file_date ."] " ;
123
- $ this ->AddRDF ($ this ->QQuadL ($ this ->GetDatasetURI (), "interpro_vocabulary:contains " , $ db ));
101
+ parent ::addRDF (
102
+ parent ::triplifyString (parent ::getDatasetURI (), parent ::getVoc ()."contains " , $ db )
103
+ );
104
+ if (((string )$ o ->attributes ()->dbname ) === "INTERPRO " ) {
105
+ parent ::setDatasetVersion ($ o ->attributes ()->version );
106
+ }
124
107
}
125
108
// now iterate over the entries
126
109
foreach ($ xml ->interpro AS $ o ) {
127
- $ this -> WriteRDFBufferToWriteFile ();
110
+ parent :: writeRDFBufferToWriteFile ();
128
111
129
112
$ interpro_id = $ o ->attributes ()->id ;
130
- echo "Processing id... $ interpro_id " .PHP_EOL ;
113
+ echo "Processing $ interpro_id " .PHP_EOL ;
131
114
132
115
$ name = $ o ->name ;
133
116
$ short_name = $ o ->attributes ()->short_name ;
134
117
$ type = $ o ->attributes ()->type ;
135
- $ s = " interpro: $ interpro_id" ;
118
+ $ s = parent :: getNamespace (). $ interpro_id ;
136
119
137
- echo "Adding... $ s rdfs:label $ name ( $ short_name) $ type [ $ s] " .PHP_EOL ;
138
- $ this -> AddRDF ( $ this -> QQuadL ( $ s , " rdfs:label " , " $ name ( $ short_name ) $ type [ $ s ] " ));
139
- $ this -> AddRDF ( $ this -> QQuad ( $ s ,"rdf: type " ," interpro_vocabulary: $ type" ));
140
- $ this -> AddRDF ( $ this -> QQuad ( $ s , " void:inDataset " , $ this -> GetDatasetURI ()) );
120
+ // echo "Adding... $s rdfs:label $name ($short_name) $type [$s]".PHP_EOL;
121
+ parent :: addRDF (
122
+ parent :: describeIndividual ( $ s ,"$ name ( $ short_name ) $ type " , parent :: getVoc (). $ type)
123
+ );
141
124
142
125
// get the pubs
143
126
unset($ pubs );
@@ -148,7 +131,9 @@ function Parse($xml)
148
131
$ pmid = (string ) $ p ->db_xref ->attributes ()->dbkey ;
149
132
$ pubs ['pid ' ][] = '<cite idref=" ' .$ pid .'"/> ' ;
150
133
$ pubs ['pmid ' ][] = '<a href="http://www.ncbi.nlm.nih.gov/pubmed/ ' .$ pmid .'">pubmed: ' .$ pmid .'</a> ' ;
151
- $ this ->AddRDF ($ this ->QQuad ($ s ,"interpro_vocabulary:x-pubmed " ,"pubmed: $ pmid " ));
134
+ parent ::addRDF (
135
+ parent ::triplify ($ s ,parent ::getVoc ()."x-pubmed " ,"pubmed: $ pmid " )
136
+ );
152
137
}
153
138
}
154
139
}
@@ -157,42 +142,56 @@ function Parse($xml)
157
142
$ abstract = str_replace ($ pubs ['pid ' ],$ pubs ['pmid ' ],$ abstract );
158
143
}
159
144
160
- $ this ->AddRDF ($ this ->QQuadL ($ s ,"dc:description " ,$ this ->SafeLiteral ($ abstract )));
145
+ parent ::addRDF (
146
+ parent ::triplifyString ($ s ,"dc:description " ,$ abstract )
147
+ );
161
148
162
149
foreach ($ o ->example_list ->example AS $ example ) {
163
150
$ db = (string ) $ example ->db_xref ->attributes ()->db ;
164
151
$ id = (string ) $ example ->db_xref ->attributes ()->dbkey ;
165
- $ this ->AddRDF ($ this ->QQuad ($ s ,"interpro_vocabulary:example-entry " , $ this ->GetNS ()->MapQName ("$ db: $ id " )));
152
+ parent ::addRDF (
153
+ parent ::triplify ($ s ,parent ::getVoc ()."example-entry " , "$ db: $ id " )
154
+ );
166
155
}
167
156
168
157
if (isset ($ o ->parent_list ->rel_ref )) {
169
158
foreach ($ o ->parent_list ->rel_ref AS $ parent ) {
170
159
$ id = (string ) $ parent ->attributes ()->ipr_ref ;
171
- $ this ->AddRDF ($ this ->QQuad ($ s ,"interpro_vocabulary:parent " , "interpro: $ id " ));
160
+ parent ::addRDF (
161
+ parent ::triplify ($ s ,parent ::getVoc ()."parent " , "interpro: $ id " )
162
+ );
172
163
}
173
164
}
174
165
if (isset ($ o ->child ->rel_ref )) {
175
166
foreach ($ o ->child ->rel_ref AS $ child ) {
176
167
$ id = (string ) $ child ->attributes ()->ipr_ref ;
177
- $ this ->AddRDF ($ this ->QQuad ($ s ,"interpro_vocabulary:child " , "interpro: $ id " ));
168
+ parent ::addRDF (
169
+ parent ::triplify ($ s ,parent ::getVoc ()."child " , "interpro: $ id " )
170
+ );
178
171
}
179
172
}
180
173
if (isset ($ o ->contains ->rel_ref )) {
181
174
foreach ($ o ->contains ->rel_ref AS $ contains ) {
182
175
$ id = (string ) $ contains ->attributes ()->ipr_ref ;
183
- $ this ->AddRDF ($ this ->QQuad ($ s ,"interpro_vocabulary:contains " , "interpro: $ id " ));
176
+ parent ::addRDF (
177
+ parent ::triplify ($ s ,parent ::getVoc ()."contains " , "interpro: $ id " )
178
+ );
184
179
}
185
180
}
186
181
if (isset ($ o ->found_in ->rel_ref )) {
187
182
foreach ($ o ->found_in ->rel_ref AS $ f ) {
188
183
$ id = (string ) $ f ->attributes ()->ipr_ref ;
189
- $ this ->AddRDF ($ this ->QQuad ($ s ,"interpro_vocabulary:found-in " , "interpro: $ id " ));
184
+ parent ::addRDF (
185
+ parent ::triplify ($ s ,parent ::getVoc ()."found-in " , "interpro: $ id " )
186
+ );
190
187
}
191
188
}
192
189
// Secondary accessions: iterate the same node the isset() guard tests
// ($o->sec_list->sec_ac). The loop uses its own variable ($sec) so that the
// entry subject $s, which the triples below and after this block rely on,
// is never overwritten.
if (isset ($ o ->sec_list ->sec_ac )) {
193
190
foreach ($ o ->sec_list ->sec_ac AS $ sec ) {
194
191
$ id = (string ) $ sec ->attributes ()->acc ;
195
- $ this ->AddRDF ($ this ->QQuad ($ s ,"interpro_vocabulary:secondary-accession " , "interpro: $ id " ));
192
+ parent ::addRDF (
193
+ parent ::triplify ($ s ,parent ::getVoc ()."secondary-accession " , "interpro: $ id " )
194
+ );
196
195
}
197
196
}
198
197
@@ -202,45 +201,39 @@ function Parse($xml)
202
201
foreach ($ o ->member_list ->db_xref AS $ dbxref ) {
203
202
$ db = (string ) $ dbxref ->attributes ()->db ;
204
203
$ id = (string ) $ dbxref ->attributes ()->dbkey ;
205
- $ this ->AddRDF ($ this ->QQuad ($ s ,"interpro_vocabulary:x- " .strtolower ($ db ), "$ db: $ id " ));
204
+ parent ::addRDF (
205
+ parent ::triplify ($ s ,parent ::getVoc ()."x- " .strtolower ($ db ), "$ db: $ id " )
206
+ );
206
207
}
207
208
}
208
209
if (isset ($ o ->external_doc_list )) {
209
210
foreach ($ o ->external_doc_list ->db_xref AS $ dbxref ) {
210
211
$ db = (string ) $ dbxref ->attributes ()->db ;
211
212
$ id = (string ) $ dbxref ->attributes ()->dbkey ;
212
- $ this ->AddRDF ($ this ->QQuad ($ s ,"interpro_vocabulary:x- " .strtolower ($ db ), "$ db: $ id " ));
213
+ parent ::addRDF (
214
+ parent ::triplify ($ s ,parent ::getVoc ()."x- " .strtolower ($ db ), "$ db: $ id " )
215
+ );
213
216
}
214
217
}
215
218
if (isset ($ o ->structure_db_links ->db_xref )) {
216
219
foreach ($ o ->structure_db_links ->db_xref AS $ dbxref ) {
217
220
$ db = (string ) $ dbxref ->attributes ()->db ;
218
221
$ id = (string ) $ dbxref ->attributes ()->dbkey ;
219
- $ this ->AddRDF ($ this ->QQuad ($ s ,"interpro_vocabulary:x- " .strtolower ($ db ), "$ db: $ id " ));
222
+ parent ::addRDF (
223
+ parent ::triplify ($ s ,parent ::getVoc ()."x- " .strtolower ($ db ), "$ db: $ id " )
224
+ );
220
225
}
221
226
}
222
227
223
228
// taxon distribution
224
229
foreach ($ o ->taxonomy_distribution ->taxon_data AS $ t ) {
225
230
$ organism = (string ) $ t ->attributes ()->name ;
226
231
$ number = (string ) $ t ->attributes ()->proteins_count ;
227
- $ this ->AddRDF ($ this ->QQuadL ($ s ,"interpro_vocabulary:taxon-distribution " , "$ organism ( $ number) " ));
232
+ parent ::addRDF (
233
+ parent ::triplifyString ($ s ,parent ::getVoc ()."taxon-distribution " , "$ organism ( $ number) " )
234
+ );
228
235
}
229
236
}
230
237
}
231
-
232
238
}
233
- $ start = microtime (true );
234
-
235
- set_error_handler ('error_handler ' );
236
- $ parser = new AffymetrixParser ($ argv );
237
- $ parser ->Run ();
238
-
239
- $ end = microtime (true );
240
- $ time_taken = $ end - $ start ;
241
- print "Started: " .date ("l jS F \@ g:i:s a " , $ start )."\n" ;
242
- print "Finished: " .date ("l jS F \@ g:i:s a " , $ end )."\n" ;
243
- print "Took: " .$ time_taken ." seconds \n"
244
239
?>
245
-
246
-
0 commit comments