31
31
32
32
class WormbaseParser extends Bio2RDFizer {
33
33
34
/**
 * Build the WormBase parser and register its command-line parameters.
 *
 * Delegates to the Bio2RDFizer constructor with the dataset prefix
 * "wormbase", registers the parser-specific parameters, then calls
 * parent::initialize().
 *
 * NOTE(review): the argument order of addParameter() is presumed to be
 * (name, mandatory flag, allowed values, default, description), inferred
 * from the calls below -- confirm against Bio2RDFizer.
 *
 * @param array $argv command-line arguments forwarded to the parent constructor
 */
function __construct($argv) {
	parent::__construct($argv, "wormbase");

	// The names in the 'files' list must match both the array keys used for
	// remote/local file lookup and the method names dispatched per file
	// (hence the plural forms).
	parent::addParameter('files', true, 'all|geneIDs|functional_descriptions|gene_associations|gene_interactions|phenotype_associations', 'all', 'files to process');
	parent::addParameter('release', true, null, 'WS235');
	parent::addParameter('download_url', false, null, 'ftp://ftp.wormbase.org/pub/wormbase/');

	parent::initialize();
}//constructor
41
41
42
- public function Run (){
42
+ public function run (){
43
43
44
44
if (parent ::getParameterValue ('download ' ) === true )
45
45
{
@@ -62,19 +62,19 @@ function download(){
62
62
}
63
63
64
64
$ remote_files = array (
65
- "geneIDs " => "species/c_elegans/annotation/geneIDs/c_elegans. " .parent ::parameterValue ('release ' ).".geneIDs.txt.gz " ,
66
- "functional_description " => "species/c_elegans/annotation/functional_descriptions/c_elegans. " .parent ::getParameterValue ('release ' ).".functional_descriptions.txt.gz " ,
67
- "gene_association " => "releases/ " .parent ::getParameterValue ('release ' )."/ONTOLOGY/gene_association. " .parent ::getParameterValue ('release ' ).".wb.ce " ,
68
- "gene_interactions " => "species/c_elegans/annotation/gene_interactions/c_elegans. " .parent ::parameterValue ('release ' ).".gene_interactions.txt.gz " ,
69
- "phenotype_association " => "releases/ " .parent ::getParameterValue ('release ' )."/ONTOLOGY/phenotype_association. " .parent ::getParameterValue ('release ' ).".wb "
65
+ "geneIDs " => "species/c_elegans/annotation/geneIDs/c_elegans. " .parent ::getParameterValue ('release ' ).".geneIDs.txt.gz " ,
66
+ "functional_descriptions " => "species/c_elegans/annotation/functional_descriptions/c_elegans. " .parent ::getParameterValue ('release ' ).".functional_descriptions.txt.gz " ,
67
+ "gene_associations " => "releases/ " .parent ::getParameterValue ('release ' )."/ONTOLOGY/gene_association. " .parent ::getParameterValue ('release ' ).".wb.ce " ,
68
+ "gene_interactions " => "species/c_elegans/annotation/gene_interactions/c_elegans. " .parent ::getParameterValue ('release ' ).".gene_interactions.txt.gz " ,
69
+ "phenotype_associations " => "releases/ " .parent ::getParameterValue ('release ' )."/ONTOLOGY/phenotype_association. " .parent ::getParameterValue ('release ' ).".wb "
70
70
);
71
71
72
72
$ local_files = array (
73
- "geneIDs " => "c_elegans. " .parent ::parameterValue ('release ' ).".geneIDs.txt.gz " ,
74
- "functional_description " => parent ::getParameterValue ('release ' ).".functional_descriptions.txt.gz " ,
75
- "gene_association " => "gene_association. " .parent ::getParameterValue ('release ' ).".wb.ce " ,
76
- "gene_interactions " => "c_elegans. " .parent ::parameterValue ('release ' ).".gene_interactions.txt.gz " ,
77
- "phenotype_association " => "phenotype_association. " .parent ::getParameterValue ('release ' ).".wb "
73
+ "geneIDs " => "c_elegans. " .parent ::getParameterValue ('release ' ).".geneIDs.txt.gz " ,
74
+ "functional_descriptions " => parent ::getParameterValue ('release ' ).".functional_descriptions.txt.gz " ,
75
+ "gene_associations " => "gene_association. " .parent ::getParameterValue ('release ' ).".wb.ce " ,
76
+ "gene_interactions " => "c_elegans. " .parent ::getParameterValue ('release ' ).".gene_interactions.txt.gz " ,
77
+ "phenotype_associations " => "phenotype_association. " .parent ::getParameterValue ('release ' ).".wb "
78
78
);
79
79
80
80
//set directory values
@@ -84,7 +84,9 @@ function download(){
84
84
foreach ($ files as $ file ){
85
85
$ rfile = $ rdir .$ remote_files [$ file ];
86
86
$ lfile = $ ldir .$ local_files [$ file ];
87
- parent ::downloadSingle ($ rfile , $ lfile );
87
+ echo "Downloading " .$ rfile ."... " ;
88
+ Utils::DownloadSingle ($ rfile , $ lfile );
89
+ echo "done! " .PHP_EOL ;
88
90
}
89
91
90
92
}
@@ -97,47 +99,114 @@ function process(){
97
99
$ files = explode (", " ,parent ::getParameterValue ('files ' ));
98
100
}
99
101
102
+ $ remote_files = array (
103
+ "geneIDs " => "species/c_elegans/annotation/geneIDs/c_elegans. " .parent ::getParameterValue ('release ' ).".geneIDs.txt.gz " ,
104
+ "functional_descriptions " => "species/c_elegans/annotation/functional_descriptions/c_elegans. " .parent ::getParameterValue ('release ' ).".functional_descriptions.txt.gz " ,
105
+ "gene_associations " => "releases/ " .parent ::getParameterValue ('release ' )."/ONTOLOGY/gene_association. " .parent ::getParameterValue ('release ' ).".wb.ce " ,
106
+ "gene_interactions " => "species/c_elegans/annotation/gene_interactions/c_elegans. " .parent ::getParameterValue ('release ' ).".gene_interactions.txt.gz " ,
107
+ "phenotype_associations " => "releases/ " .parent ::getParameterValue ('release ' )."/ONTOLOGY/phenotype_association. " .parent ::getParameterValue ('release ' ).".wb "
108
+ );
109
+
100
110
$ local_files = array (
101
- "geneIDs " => "c_elegans. " .parent ::parameterValue ('release ' ).".geneIDs.txt.gz " ,
102
- "functional_description " => parent ::getParameterValue ('release ' ).".functional_descriptions.txt.gz " ,
103
- "gene_association " => "gene_association. " .parent ::getParameterValue ('release ' ).".wb.ce " ,
104
- "gene_interactions " => "c_elegans. " .parent ::parameterValue ('release ' ).".gene_interactions.txt.gz " ,
105
- "phenotype_association " => "phenotype_association. " .parent ::getParameterValue ('release ' ).".wb "
111
+ "geneIDs " => "c_elegans. " .parent ::getParameterValue ('release ' ).".geneIDs.txt.gz " ,
112
+ "functional_descriptions " => parent ::getParameterValue ('release ' ).".functional_descriptions.txt.gz " ,
113
+ "gene_associations " => "gene_association. " .parent ::getParameterValue ('release ' ).".wb.ce " ,
114
+ "gene_interactions " => "c_elegans. " .parent ::getParameterValue ('release ' ).".gene_interactions.txt.gz " ,
115
+ "phenotype_associations " => "phenotype_association. " .parent ::getParameterValue ('release ' ).".wb "
106
116
);
107
117
108
118
$ idir = parent ::getParameterValue ('indir ' );
109
119
$ odir = parent ::getParameterValue ('outdir ' );
120
+ $ rdir = parent ::getParameterValue ('download_url ' );
121
+
122
+ $ dataset_description = '' ;
123
+
124
+ $ graph_uri = parent ::getGraphURI ();
125
+ if (parent ::getParameterValue ('dataset_graph ' ) == true ) parent ::setGraphURI (parent ::getDatasetURI ());
110
126
111
127
foreach ($ files as $ file ){
112
128
$ lfile = $ idir .$ local_files [$ file ];
129
+ $ rfile = $ rdir .$ remote_files [$ file ];
130
+
131
+ if (!file_exists ($ lfile )) {
132
+ trigger_error ($ lfile ." not found. Will attempt to download. " .PHP_EOL , E_USER_WARNING );
133
+ echo "Downloading $ rfile... " ;
134
+ Utils::DownloadSingle ($ rfile , $ lfile );
135
+ echo "done! " .PHP_EOL ;
136
+ }
137
+
113
138
if (strstr ($ lfile , "gz " )){
114
139
parent ::setReadFile ($ lfile , TRUE );
115
140
} else {
116
141
parent ::setReadFile ($ lfile , FALSE );
117
142
}
118
143
119
144
$ suffix = parent ::getParameterValue ('output_format ' );
120
- $ ofile = $ file .". " .$ suffix ;
145
+ $ ofile = " wormbase_celegans_ " . parent :: getParameterValue ( ' release ' ). " _ " . $ file .". " .$ suffix ;
121
146
122
147
if (strstr (parent ::getParameterValue ('output_format ' ), "gz " )) {
123
148
$ gz = true ;
124
149
}
125
150
126
- $ this -> SetWriteFile ($ odir .$ file , $ gz );
151
+ parent :: setWriteFile ($ odir .$ ofile , $ gz );
127
152
128
- echo "Processing $ file... "
153
+ echo "Processing $ file... " ;
129
154
$ fnx = $ file ;
130
155
$ this -> $ fnx ();
131
- echo " done!";
156
+ echo "done! " .PHP_EOL ;
157
+
158
+ parent ::getWriteFile ()->close ();
159
+
160
+ // generate the dataset release file
161
+ echo "Generating dataset description for $ ofile... " ;
162
+ // dataset description
163
+ $ source_file = (new DataResource ($ this ))
164
+ ->setURI ($ rfile )
165
+ ->setTitle ("WormBase C. elegans Release " .parent ::getParameterValue ('release ' )." subset ( $ file) " )
166
+ ->setRetrievedDate ( date ("Y-m-d\TG:i:s\Z " , filemtime ($ lfile )))
167
+ ->setFormat ("text/tab-separated-value " )
168
+ ->setFormat ("application/gzip " )
169
+ ->setPublisher ("http://wormbase.org/ " )
170
+ ->setHomepage ("http://wormbase.org/ " )
171
+ ->setRights ("use " )
172
+ ->setRights ("restricted-by-source-license " )
173
+ ->setLicense ("http://www.wormbase.org/about/policies " )
174
+ ->setDataset ("http://identifiers.org/wormbase/ " );
175
+
176
+ $ prefix = parent ::getPrefix ();
177
+ $ bVersion = parent ::getParameterValue ('bio2rdf_release ' );
178
+ $ date = date ("Y-m-d\TG:i:s\Z " );
179
+ $ output_file = (new DataResource ($ this ))
180
+ ->setURI ("http://download.bio2rdf.org/release/ $ bVersion/ $ prefix/ $ ofile " )
181
+ ->setTitle ("Bio2RDF v $ bVersion RDF version of $ prefix (generated at $ date) " )
182
+ ->setSource ($ source_file ->getURI ())
183
+ ->setCreator ("https://github.com/bio2rdf/bio2rdf-scripts/blob/master/wormbase/wormbase.php " )
184
+ ->setCreateDate ($ date )
185
+ ->setHomepage ("http://download.bio2rdf.org/release/ $ bVersion/ $ prefix/ $ prefix.html " )
186
+ ->setPublisher ("http://bio2rdf.org " )
187
+ ->setRights ("use-share-modify " )
188
+ ->setRights ("by-attribution " )
189
+ ->setRights ("restricted-by-source-license " )
190
+ ->setLicense ("http://creativecommons.org/licenses/by/3.0/ " )
191
+ ->setDataset (parent ::getDatasetURI ());
192
+ if ($ gz ) $ output_file ->setFormat ("application/gzip " );
193
+ if (strstr (parent ::getParameterValue ('output_format ' ),"nt " )) $ output_file ->setFormat ("application/n-triples " );
194
+ else $ output_file ->setFormat ("application/n-quads " );
195
+ $ dataset_description .= $ source_file ->toRDF ().$ output_file ->toRDF ();
196
+ echo "done! " .PHP_EOL ;
132
197
}
198
+ parent ::setGraphURI ($ graph_uri );
199
+ parent ::setWriteFile ($ odir .parent ::getBio2RDFReleaseFile ());
200
+ parent ::getWriteFile ()->write ($ dataset_description );
201
+ parent ::getWriteFile ()->close ();
133
202
}
134
203
135
204
function geneIDs (){
136
205
$ first = true ;
137
206
while ($ l = $ this ->GetReadFile ()->Read ()){
138
207
if ($ l [0 ] == '# ' ) continue ;
139
208
140
- $ data = explode ("\t " ,trim ($ l ));
209
+ $ data = explode (", " ,trim ($ l ));
141
210
142
211
if ($ first ) {
143
212
if (($ c = count ($ data ) != 3 )) {
@@ -147,32 +216,32 @@ function geneIDs(){
147
216
}
148
217
//add the rdf:type
149
218
150
- $ id = parent ::getNamespace ().$ gene_IDs [0 ];
151
- $ gene_label = "WormBase gene " .$ gene_IDs [1 ]." with cosmid name " .$ gene_IDs [2 ];
219
+ $ id = parent ::getNamespace ().$ data [0 ];
220
+ $ gene_label = "WormBase gene " .$ data [1 ]." with cosmid name " .$ data [2 ];
152
221
153
222
parent ::addRDF (
154
223
parent ::describeIndividual ($ id , $ gene_label , parent ::getVoc ()."Gene " )
155
224
);
156
225
157
226
//add gene approved name
158
- if ($ gene_IDs [1 ] != '' ) {
227
+ if ($ data [1 ] != '' ) {
159
228
parent ::addRDF (
160
- parent ::triplifyString ($ id , parent ::getVoc ()."has_approved_gene_name " , $ gene_IDs [1 ])
229
+ parent ::triplifyString ($ id , parent ::getVoc ()."has_approved_gene_name " , $ data [1 ])
161
230
);
162
231
}
163
232
#Add cosmid name
164
- if ($ gene_IDs [2 ] != '' ) {
165
- $ cosmid_id = parent ::getRes ().$ gene_IDs [2 ];
233
+ if ($ data [2 ] != '' ) {
234
+ $ cosmid_id = parent ::getRes ().$ data [2 ];
166
235
parent ::addRDF (
167
- parent ::describeIndividual ($ cosmid_id , "Gene/cosmid name for " .$ gene_IDs [0 ], parent ::getVoc ()."Cosmid_gene " )
236
+ parent ::describeIndividual ($ cosmid_id , "Gene/cosmid name for " .$ data [0 ], parent ::getVoc ()."Cosmid_gene " ).
168
237
parent ::triplify ($ id , parent ::getVoc ()."has_sequence/cosmid_name " , $ cosmid_id )
169
238
);
170
239
}
171
240
parent ::WriteRDFBufferToWriteFile ();
172
241
}//while
173
242
}# Funcion Gene_IDs
174
243
175
- function functional_description (){
244
+ function functional_descriptions (){
176
245
177
246
$ start = '/(^WBGene[0-9]+)\s/ ' ;
178
247
$ end = '/^=\n/ ' ;
@@ -190,28 +259,29 @@ function functional_description(){
190
259
if (preg_match ($ end ,$ l )== 1 ){
191
260
$ collect = false ;
192
261
parent ::addRDF (
193
- parent ::triplifyString (parent ::getNamespace ().$ WBGene , parent ::getVoc ()."gene_description " , $ current_description )
262
+ parent ::triplifyString (parent ::getNamespace ().$ WBGene , parent ::getVoc ()."gene_description " , trim ( $ current_description) )
194
263
);
195
264
$ current_description ='' ;
196
265
}
197
266
198
267
if ($ collect == true ){
199
- $ current_description = $ current_description .rtrim ($ l );
268
+ $ current_description = $ current_description ." " . rtrim ($ l );
200
269
}
201
270
}
202
271
parent ::WriteRDFBufferToWriteFile ();
203
272
}#function functional_descri
204
273
205
- private function gene_association (){
274
+ private function gene_associations (){
206
275
207
- while ($ l = parent ::getReadFile->Read ()){
276
+ while ($ l = parent ::getReadFile ()->Read ()){
277
+ if ($ l [0 ] == '# ' ) continue ;
208
278
209
279
$ data = explode ("\t" , $ l );
210
280
$ gene = $ data [1 ];
211
- $ go = $ data [3 ];
212
- $ papers = $ data [4 ];
213
- $ evidence_type = $ data [5 ];
214
- $ taxon = $ data [9 ];
281
+ $ go = $ data [4 ];
282
+ $ papers = $ data [5 ];
283
+ $ evidence_type = $ data [6 ];
284
+ $ taxon = $ data [12 ];
215
285
216
286
$ go_evidence_type = array (
217
287
'IC ' =>'eco:0000001 ' ,
@@ -221,6 +291,8 @@ private function gene_association(){
221
291
'IGI ' =>'eco:0000316 ' ,
222
292
'IMP ' =>'eco:0000315 ' ,
223
293
'IPI ' =>'eco:0000021 ' ,
294
+ 'ISM ' => 'eco:0000202 ' ,
295
+ 'ISO ' => 'eco:0000201 ' ,
224
296
'ISS ' =>'eco:0000044 ' ,
225
297
'NAS ' =>'eco:0000034 ' ,
226
298
'ND ' =>'eco:0000035 ' ,
@@ -232,21 +304,26 @@ private function gene_association(){
232
304
$ association_label = $ gene ." " .$ go ." association " ;
233
305
parent ::addRDF (
234
306
parent ::describeIndividual ($ association_id , $ association_label , parent ::getVoc ()."Gene-GO-Association " ).
235
- parent ::triplify ($ association_id , parent ::getVoc ()."evidence_type " , $ go_evidence_type [$ evidence_type ]).
236
307
parent ::triplify ($ association_id , parent ::getVoc ()."gene " , parent ::getNamespace ().$ gene ).
237
308
parent ::triplify ($ association_id , parent ::getVoc ()."go_term " , $ go ).
238
309
parent ::triplify ($ association_id , parent ::getVoc ()."taxon " , $ taxon )
239
310
);
240
311
312
+ if ($ evidence_type !== '' ){
313
+ parent ::addRDF (
314
+ parent ::triplify ($ association_id , parent ::getVoc ()."evidence_type " , $ go_evidence_type [$ evidence_type ])
315
+ );
316
+ }
317
+
241
318
$ split_papers = explode ("| " , $ papers );
242
319
foreach ($ split_papers as $ paper ){
243
320
$ paper_id = null ;
244
321
$ split_paper = explode (": " , $ paper );
245
- if ($ paper [0 ] == "PMID " ){
246
- $ paper_id = "pubmed: " .$ paper [1 ];
247
- } elseif ($ paper [0 ] == "WB_REF " ){
248
- $ paper_id = parent ::getNamespace ().$ paper [1 ];
249
- $ paper_label = "Wormbase paper " .$ paper [1 ];
322
+ if ($ split_paper [0 ] == "PMID " ){
323
+ $ paper_id = "pubmed: " .$ split_paper [1 ];
324
+ } elseif ($ split_paper [0 ] == "WB_REF " ){
325
+ $ paper_id = parent ::getNamespace ().$ split_paper [1 ];
326
+ $ paper_label = "Wormbase paper " .$ split_paper [1 ];
250
327
parent ::addRDF (
251
328
parent ::describeIndividual ($ paper_id , $ paper_label , parent ::getVoc ()."Publication " )
252
329
);
@@ -260,9 +337,11 @@ private function gene_association(){
260
337
}
261
338
262
339
//phenotype association
263
- function phenotype_association (){
340
+ function phenotype_associations (){
264
341
265
342
while ($ l = parent ::getReadFile ()->Read ()){
343
+ if ($ l [0 ] == '# ' ) continue ;
344
+
266
345
$ data = explode ("\t" , $ l );
267
346
268
347
$ gene = $ data [1 ];
@@ -273,10 +352,10 @@ function phenotype_association(){
273
352
274
353
if ($ not == "NOT " ){
275
354
276
- $ pa_id = parent ::getRes ().md5 ($ gene .$ not .$ phenotype .$ paper .$ variant );
355
+ $ pa_id = parent ::getRes ().md5 ($ gene .$ not .$ phenotype .$ paper .$ var_rnai );
277
356
$ pa_label = "Gene-phenotype non-association between " .$ gene ." and " .$ phenotype ." under condition " .$ var_rnai ;
278
357
279
- $ npa_id = parent ::getRes ().md5 ($ gene .$ not .$ phenotype .$ paper .$ variant ."negative property assertion " );
358
+ $ npa_id = parent ::getRes ().md5 ($ gene .$ not .$ phenotype .$ paper .$ var_rnai ."negative property assertion " );
280
359
$ npa_label = "Negative property assertion stating that gene " .$ gene . "is not associated with phenotype " .$ phenotype ;
281
360
282
361
parent ::addRDF (
@@ -298,7 +377,7 @@ function phenotype_association(){
298
377
}
299
378
300
379
parent ::addRDF (
301
- parent ::describeIndividual ($ npa_id , $ npa_label , "owl:NegativeObjectPropertyAssertion " )
380
+ parent ::describeIndividual ($ npa_id , $ npa_label , "owl:NegativeObjectPropertyAssertion " ).
302
381
parent ::triplify ($ npa_id , "owl:sourceIndividual " , parent ::getNamespace ().$ gene ).
303
382
parent ::triplify ($ npa_id , "owl:assertionProperty " , parent ::getVoc ()."has-associated-phenotype " ).
304
383
parent ::triplify ($ npa_id , "owl:targetIndividual " , parent ::getNamespace ().$ phenotype )
@@ -307,7 +386,7 @@ function phenotype_association(){
307
386
308
387
309
388
} else {
310
- $ pa_id = parent ::getRes ().md5 ($ gene .$ phenotype .$ paper .$ variant );
389
+ $ pa_id = parent ::getRes ().md5 ($ gene .$ phenotype .$ paper .$ var_rnai );
311
390
$ pa_label = "Gene-phenotype association between " .$ gene ." and " .$ phenotype ." under condition " .$ var_rnai ;
312
391
parent ::addRDF (
313
392
parent ::describeIndividual ($ pa_id , $ pa_label , parent ::getVoc ()."Gene-Phenotype-Association " ).
@@ -328,15 +407,14 @@ function phenotype_association(){
328
407
);
329
408
}
330
409
}
331
-
332
-
410
+ parent ::WriteRDFBufferToWriteFile ();
333
411
}//while
334
- parent ::WriteRDFBufferToWriteFile ();
335
412
} ##phenotype_association
336
413
337
414
private function gene_interactions (){
338
415
#1 Regular expression to cath the data
339
416
while ($ l = parent ::getReadFile ()->Read ()){
417
+ if ($ l [0 ] == '# ' ) continue ;
340
418
341
419
$ data = explode ("\t" , $ l );
342
420
$ interaction = $ data [0 ];
@@ -367,7 +445,7 @@ private function gene_interactions(){
367
445
);
368
446
369
447
$ npa_id = parent ::getRes ().md5 ($ interaction_id ."negative property assertion " );
370
- $ npa_label = "Negative property assertion stating that " .$ gene1 ." and " .$ gene2 ."do not have a " .$ interaction_type ." interaction " ;
448
+ $ npa_label = "Negative property assertion stating that " .$ gene1 ." and " .$ gene2 ." do not have a " .$ interaction_type ." interaction " ;
371
449
372
450
parent ::addRDF (
373
451
parent ::describeIndividual ($ npa_id , $ npa_label , "owl:NegativeObjectPropertyAssertion " ).
@@ -385,7 +463,7 @@ private function gene_interactions(){
385
463
parent ::triplify (parent ::getNamespace ().$ gene1 , $ int_pred , parent ::getNamespace ().$ gene2 )
386
464
);
387
465
} else {
388
- $ interaction_label = $ int_additional_info ." " .strtolower ($ interaction_type ). "interaction between " .$ gene1 ." and " .$ gene2 ;
466
+ $ interaction_label = $ int_additional_info ." " .strtolower ($ interaction_type ). " interaction between " .$ gene1 ." and " .$ gene2 ;
389
467
parent ::addRDF (
390
468
parent ::describeIndividual ($ interaction_id , $ interaction_label , parent ::getVoc ().$ int_additional_info ."- " .$ interaction_type ."-Interaction " ).
391
469
parent ::describeClass (parent ::getVoc ().$ int_additional_info ."- " .$ interaction_type ."-Interaction " , $ int_additional_info ." " .$ interaction_type ." Interaction " , parent ::getVoc ().$ interaction_type ."-Interaction " ).
0 commit comments