Skip to content

Commit 837932b

Browse files
Finished updating wormbase parser to release 3
1 parent 3edbfda commit 837932b

File tree

1 file changed

+136
-58
lines changed

1 file changed

+136
-58
lines changed

wormbase/wormbase.php

Lines changed: 136 additions & 58 deletions
Original file line number | Diff line number | Diff line change
@@ -31,15 +31,15 @@
3131

3232
class WormbaseParser extends Bio2RDFizer {
3333

34-
function __construct($argv, $path) {
34+
function __construct($argv) {
3535
parent::__construct($argv, "wormbase");
36-
parent::addParameter('files', true, null, 'all|geneIDs|functional_description|gene_association|gene_interactions|phenotype_association','all','files to process');
37-
parent::addParameter('release', true, null, 'WS235')
38-
parent::addParameter('download_url', false, null 'ftp://ftp.wormbase.org/pub/wormbase/')
36+
parent::addParameter('files', true, 'all|geneIDs|functional_descriptions|gene_associations|gene_interactions|phenotype_associations','all','files to process');
37+
parent::addParameter('release', true, null, 'WS235');
38+
parent::addParameter('download_url', false, null,'ftp://ftp.wormbase.org/pub/wormbase/');
3939
parent::initialize();
4040
}//constructor
4141

42-
public function Run(){
42+
public function run(){
4343

4444
if(parent::getParameterValue('download') === true)
4545
{
@@ -62,19 +62,19 @@ function download(){
6262
}
6363

6464
$remote_files = array(
65-
"geneIDs" => "species/c_elegans/annotation/geneIDs/c_elegans.".parent::parameterValue('release').".geneIDs.txt.gz",
66-
"functional_description" => "species/c_elegans/annotation/functional_descriptions/c_elegans.".parent::getParameterValue('release').".functional_descriptions.txt.gz",
67-
"gene_association" => "releases/".parent::getParameterValue('release')."/ONTOLOGY/gene_association.".parent::getParameterValue('release').".wb.ce",
68-
"gene_interactions" => "species/c_elegans/annotation/gene_interactions/c_elegans.".parent::parameterValue('release').".gene_interactions.txt.gz",
69-
"phenotype_association" => "releases/".parent::getParameterValue('release')."/ONTOLOGY/phenotype_association.".parent::getParameterValue('release').".wb"
65+
"geneIDs" => "species/c_elegans/annotation/geneIDs/c_elegans.".parent::getParameterValue('release').".geneIDs.txt.gz",
66+
"functional_descriptions" => "species/c_elegans/annotation/functional_descriptions/c_elegans.".parent::getParameterValue('release').".functional_descriptions.txt.gz",
67+
"gene_associations" => "releases/".parent::getParameterValue('release')."/ONTOLOGY/gene_association.".parent::getParameterValue('release').".wb.ce",
68+
"gene_interactions" => "species/c_elegans/annotation/gene_interactions/c_elegans.".parent::getParameterValue('release').".gene_interactions.txt.gz",
69+
"phenotype_associations" => "releases/".parent::getParameterValue('release')."/ONTOLOGY/phenotype_association.".parent::getParameterValue('release').".wb"
7070
);
7171

7272
$local_files = array(
73-
"geneIDs" => "c_elegans.".parent::parameterValue('release').".geneIDs.txt.gz",
74-
"functional_description" => parent::getParameterValue('release').".functional_descriptions.txt.gz",
75-
"gene_association" => "gene_association.".parent::getParameterValue('release').".wb.ce",
76-
"gene_interactions" => "c_elegans.".parent::parameterValue('release').".gene_interactions.txt.gz",
77-
"phenotype_association" => "phenotype_association.".parent::getParameterValue('release').".wb"
73+
"geneIDs" => "c_elegans.".parent::getParameterValue('release').".geneIDs.txt.gz",
74+
"functional_descriptions" => parent::getParameterValue('release').".functional_descriptions.txt.gz",
75+
"gene_associations" => "gene_association.".parent::getParameterValue('release').".wb.ce",
76+
"gene_interactions" => "c_elegans.".parent::getParameterValue('release').".gene_interactions.txt.gz",
77+
"phenotype_associations" => "phenotype_association.".parent::getParameterValue('release').".wb"
7878
);
7979

8080
//set directory values
@@ -84,7 +84,9 @@ function download(){
8484
foreach($files as $file){
8585
$rfile = $rdir.$remote_files[$file];
8686
$lfile = $ldir.$local_files[$file];
87-
parent::downloadSingle($rfile, $lfile);
87+
echo "Downloading ".$rfile."... ";
88+
Utils::DownloadSingle($rfile, $lfile);
89+
echo "done!".PHP_EOL;
8890
}
8991

9092
}
@@ -97,47 +99,114 @@ function process(){
9799
$files = explode(",",parent::getParameterValue('files'));
98100
}
99101

102+
$remote_files = array(
103+
"geneIDs" => "species/c_elegans/annotation/geneIDs/c_elegans.".parent::getParameterValue('release').".geneIDs.txt.gz",
104+
"functional_descriptions" => "species/c_elegans/annotation/functional_descriptions/c_elegans.".parent::getParameterValue('release').".functional_descriptions.txt.gz",
105+
"gene_associations" => "releases/".parent::getParameterValue('release')."/ONTOLOGY/gene_association.".parent::getParameterValue('release').".wb.ce",
106+
"gene_interactions" => "species/c_elegans/annotation/gene_interactions/c_elegans.".parent::getParameterValue('release').".gene_interactions.txt.gz",
107+
"phenotype_associations" => "releases/".parent::getParameterValue('release')."/ONTOLOGY/phenotype_association.".parent::getParameterValue('release').".wb"
108+
);
109+
100110
$local_files = array(
101-
"geneIDs" => "c_elegans.".parent::parameterValue('release').".geneIDs.txt.gz",
102-
"functional_description" => parent::getParameterValue('release').".functional_descriptions.txt.gz",
103-
"gene_association" => "gene_association.".parent::getParameterValue('release').".wb.ce",
104-
"gene_interactions" => "c_elegans.".parent::parameterValue('release').".gene_interactions.txt.gz",
105-
"phenotype_association" => "phenotype_association.".parent::getParameterValue('release').".wb"
111+
"geneIDs" => "c_elegans.".parent::getParameterValue('release').".geneIDs.txt.gz",
112+
"functional_descriptions" => parent::getParameterValue('release').".functional_descriptions.txt.gz",
113+
"gene_associations" => "gene_association.".parent::getParameterValue('release').".wb.ce",
114+
"gene_interactions" => "c_elegans.".parent::getParameterValue('release').".gene_interactions.txt.gz",
115+
"phenotype_associations" => "phenotype_association.".parent::getParameterValue('release').".wb"
106116
);
107117

108118
$idir = parent::getParameterValue('indir');
109119
$odir = parent::getParameterValue('outdir');
120+
$rdir = parent::getParameterValue('download_url');
121+
122+
$dataset_description = '';
123+
124+
$graph_uri = parent::getGraphURI();
125+
if(parent::getParameterValue('dataset_graph') == true) parent::setGraphURI(parent::getDatasetURI());
110126

111127
foreach($files as $file){
112128
$lfile = $idir.$local_files[$file];
129+
$rfile = $rdir.$remote_files[$file];
130+
131+
if(!file_exists($lfile)) {
132+
trigger_error($lfile." not found. Will attempt to download.".PHP_EOL, E_USER_WARNING);
133+
echo "Downloading $rfile... ";
134+
Utils::DownloadSingle($rfile, $lfile);
135+
echo "done!".PHP_EOL;
136+
}
137+
113138
if(strstr($lfile, "gz")){
114139
parent::setReadFile($lfile, TRUE);
115140
} else {
116141
parent::setReadFile($lfile, FALSE);
117142
}
118143

119144
$suffix = parent::getParameterValue('output_format');
120-
$ofile = $file.".".$suffix;
145+
$ofile = "wormbase_celegans_".parent::getParameterValue('release')."_".$file.".".$suffix;
121146

122147
if(strstr(parent::getParameterValue('output_format'), "gz")) {
123148
$gz = true;
124149
}
125150

126-
$this->SetWriteFile($odir.$file, $gz);
151+
parent::setWriteFile($odir.$ofile, $gz);
127152

128-
echo "Processing $file... "
153+
echo "Processing $file... ";
129154
$fnx = $file;
130155
$this-> $fnx();
131-
echo "done!";
156+
echo "done!".PHP_EOL;
157+
158+
parent::getWriteFile()->close();
159+
160+
// generate the dataset release file
161+
echo "Generating dataset description for $ofile... ";
162+
// dataset description
163+
$source_file = (new DataResource($this))
164+
->setURI($rfile)
165+
->setTitle("WormBase C. elegans Release ".parent::getParameterValue('release')." subset ($file)")
166+
->setRetrievedDate( date ("Y-m-d\TG:i:s\Z", filemtime($lfile)))
167+
->setFormat("text/tab-separated-value")
168+
->setFormat("application/gzip")
169+
->setPublisher("http://wormbase.org/")
170+
->setHomepage("http://wormbase.org/")
171+
->setRights("use")
172+
->setRights("restricted-by-source-license")
173+
->setLicense("http://www.wormbase.org/about/policies")
174+
->setDataset("http://identifiers.org/wormbase/");
175+
176+
$prefix = parent::getPrefix();
177+
$bVersion = parent::getParameterValue('bio2rdf_release');
178+
$date = date ("Y-m-d\TG:i:s\Z");
179+
$output_file = (new DataResource($this))
180+
->setURI("http://download.bio2rdf.org/release/$bVersion/$prefix/$ofile")
181+
->setTitle("Bio2RDF v$bVersion RDF version of $prefix (generated at $date)")
182+
->setSource($source_file->getURI())
183+
->setCreator("https://github.com/bio2rdf/bio2rdf-scripts/blob/master/wormbase/wormbase.php")
184+
->setCreateDate($date)
185+
->setHomepage("http://download.bio2rdf.org/release/$bVersion/$prefix/$prefix.html")
186+
->setPublisher("http://bio2rdf.org")
187+
->setRights("use-share-modify")
188+
->setRights("by-attribution")
189+
->setRights("restricted-by-source-license")
190+
->setLicense("http://creativecommons.org/licenses/by/3.0/")
191+
->setDataset(parent::getDatasetURI());
192+
if($gz) $output_file->setFormat("application/gzip");
193+
if(strstr(parent::getParameterValue('output_format'),"nt")) $output_file->setFormat("application/n-triples");
194+
else $output_file->setFormat("application/n-quads");
195+
$dataset_description .= $source_file->toRDF().$output_file->toRDF();
196+
echo "done!".PHP_EOL;
132197
}
198+
parent::setGraphURI($graph_uri);
199+
parent::setWriteFile($odir.parent::getBio2RDFReleaseFile());
200+
parent::getWriteFile()->write($dataset_description);
201+
parent::getWriteFile()->close();
133202
}
134203

135204
function geneIDs(){
136205
$first = true;
137206
while($l = $this->GetReadFile()->Read()){
138207
if($l[0] == '#') continue;
139208

140-
$data = explode("\t",trim($l));
209+
$data = explode(",",trim($l));
141210

142211
if($first) {
143212
if(($c = count($data) != 3)) {
@@ -147,32 +216,32 @@ function geneIDs(){
147216
}
148217
//add the rdf:type
149218

150-
$id = parent::getNamespace().$gene_IDs[0];
151-
$gene_label = "WormBase gene ".$gene_IDs[1]." with cosmid name ".$gene_IDs[2];
219+
$id = parent::getNamespace().$data[0];
220+
$gene_label = "WormBase gene ".$data[1]." with cosmid name ".$data[2];
152221

153222
parent::addRDF(
154223
parent::describeIndividual($id, $gene_label, parent::getVoc()."Gene")
155224
);
156225

157226
//add gene approved name
158-
if ($gene_IDs[1] != '') {
227+
if ($data[1] != '') {
159228
parent::addRDF(
160-
parent::triplifyString($id, parent::getVoc()."has_approved_gene_name", $gene_IDs[1])
229+
parent::triplifyString($id, parent::getVoc()."has_approved_gene_name", $data[1])
161230
);
162231
}
163232
#Add cosmid name
164-
if ($gene_IDs[2] != '') {
165-
$cosmid_id = parent::getRes().$gene_IDs[2];
233+
if ($data[2] != '') {
234+
$cosmid_id = parent::getRes().$data[2];
166235
parent::addRDF(
167-
parent::describeIndividual($cosmid_id, "Gene/cosmid name for ".$gene_IDs[0], parent::getVoc()."Cosmid_gene")
236+
parent::describeIndividual($cosmid_id, "Gene/cosmid name for ".$data[0], parent::getVoc()."Cosmid_gene").
168237
parent::triplify($id, parent::getVoc()."has_sequence/cosmid_name", $cosmid_id)
169238
);
170239
}
171240
parent::WriteRDFBufferToWriteFile();
172241
}//while
173242
}# Funcion Gene_IDs
174243

175-
function functional_description(){
244+
function functional_descriptions(){
176245

177246
$start = '/(^WBGene[0-9]+)\s/';
178247
$end = '/^=\n/';
@@ -190,28 +259,29 @@ function functional_description(){
190259
if (preg_match($end,$l)== 1 ){
191260
$collect = false;
192261
parent::addRDF(
193-
parent::triplifyString(parent::getNamespace().$WBGene, parent::getVoc()."gene_description", $current_description)
262+
parent::triplifyString(parent::getNamespace().$WBGene, parent::getVoc()."gene_description", trim($current_description))
194263
);
195264
$current_description='';
196265
}
197266

198267
if ($collect == true){
199-
$current_description = $current_description.rtrim($l);
268+
$current_description = $current_description." ".rtrim($l);
200269
}
201270
}
202271
parent::WriteRDFBufferToWriteFile();
203272
}#function functional_descri
204273

205-
private function gene_association(){
274+
private function gene_associations(){
206275

207-
while($l = parent::getReadFile->Read()){
276+
while($l = parent::getReadFile()->Read()){
277+
if($l[0] == '#') continue;
208278

209279
$data = explode("\t", $l);
210280
$gene = $data[1];
211-
$go = $data[3];
212-
$papers = $data[4];
213-
$evidence_type = $data[5];
214-
$taxon = $data[9];
281+
$go = $data[4];
282+
$papers = $data[5];
283+
$evidence_type = $data[6];
284+
$taxon = $data[12];
215285

216286
$go_evidence_type = array(
217287
'IC'=>'eco:0000001',
@@ -221,6 +291,8 @@ private function gene_association(){
221291
'IGI'=>'eco:0000316',
222292
'IMP'=>'eco:0000315',
223293
'IPI'=>'eco:0000021',
294+
'ISM' => 'eco:0000202',
295+
'ISO' => 'eco:0000201',
224296
'ISS'=>'eco:0000044',
225297
'NAS'=>'eco:0000034',
226298
'ND'=>'eco:0000035',
@@ -232,21 +304,26 @@ private function gene_association(){
232304
$association_label = $gene." ".$go." association";
233305
parent::addRDF(
234306
parent::describeIndividual($association_id, $association_label, parent::getVoc()."Gene-GO-Association").
235-
parent::triplify($association_id, parent::getVoc()."evidence_type", $go_evidence_type[$evidence_type]).
236307
parent::triplify($association_id, parent::getVoc()."gene", parent::getNamespace().$gene).
237308
parent::triplify($association_id, parent::getVoc()."go_term", $go).
238309
parent::triplify($association_id, parent::getVoc()."taxon", $taxon)
239310
);
240311

312+
if($evidence_type !== ''){
313+
parent::addRDF(
314+
parent::triplify($association_id, parent::getVoc()."evidence_type", $go_evidence_type[$evidence_type])
315+
);
316+
}
317+
241318
$split_papers = explode("|", $papers);
242319
foreach($split_papers as $paper){
243320
$paper_id = null;
244321
$split_paper = explode(":", $paper);
245-
if($paper[0] == "PMID"){
246-
$paper_id = "pubmed:".$paper[1];
247-
} elseif($paper[0] == "WB_REF"){
248-
$paper_id = parent::getNamespace().$paper[1];
249-
$paper_label = "Wormbase paper ".$paper[1];
322+
if($split_paper[0] == "PMID"){
323+
$paper_id = "pubmed:".$split_paper[1];
324+
} elseif($split_paper[0] == "WB_REF"){
325+
$paper_id = parent::getNamespace().$split_paper[1];
326+
$paper_label = "Wormbase paper ".$split_paper[1];
250327
parent::addRDF(
251328
parent::describeIndividual($paper_id, $paper_label, parent::getVoc()."Publication")
252329
);
@@ -260,9 +337,11 @@ private function gene_association(){
260337
}
261338

262339
//phenotype association
263-
function phenotype_association(){
340+
function phenotype_associations(){
264341

265342
while($l = parent::getReadFile()->Read()){
343+
if($l[0] == '#') continue;
344+
266345
$data = explode("\t", $l);
267346

268347
$gene = $data[1];
@@ -273,10 +352,10 @@ function phenotype_association(){
273352

274353
if($not == "NOT"){
275354

276-
$pa_id = parent::getRes().md5($gene.$not.$phenotype.$paper.$variant);
355+
$pa_id = parent::getRes().md5($gene.$not.$phenotype.$paper.$var_rnai);
277356
$pa_label = "Gene-phenotype non-association between ".$gene." and ".$phenotype." under condition ".$var_rnai;
278357

279-
$npa_id = parent::getRes().md5($gene.$not.$phenotype.$paper.$variant."negative property assertion");
358+
$npa_id = parent::getRes().md5($gene.$not.$phenotype.$paper.$var_rnai."negative property assertion");
280359
$npa_label = "Negative property assertion stating that gene ".$gene. "is not associated with phenotype ".$phenotype;
281360

282361
parent::addRDF(
@@ -298,7 +377,7 @@ function phenotype_association(){
298377
}
299378

300379
parent::addRDF(
301-
parent::describeIndividual($npa_id, $npa_label, "owl:NegativeObjectPropertyAssertion")
380+
parent::describeIndividual($npa_id, $npa_label, "owl:NegativeObjectPropertyAssertion").
302381
parent::triplify($npa_id, "owl:sourceIndividual", parent::getNamespace().$gene).
303382
parent::triplify($npa_id, "owl:assertionProperty", parent::getVoc()."has-associated-phenotype").
304383
parent::triplify($npa_id, "owl:targetIndividual", parent::getNamespace().$phenotype)
@@ -307,7 +386,7 @@ function phenotype_association(){
307386

308387

309388
} else {
310-
$pa_id = parent::getRes().md5($gene.$phenotype.$paper.$variant);
389+
$pa_id = parent::getRes().md5($gene.$phenotype.$paper.$var_rnai);
311390
$pa_label = "Gene-phenotype association between ".$gene." and ".$phenotype." under condition ".$var_rnai;
312391
parent::addRDF(
313392
parent::describeIndividual($pa_id, $pa_label, parent::getVoc()."Gene-Phenotype-Association").
@@ -328,15 +407,14 @@ function phenotype_association(){
328407
);
329408
}
330409
}
331-
332-
410+
parent::WriteRDFBufferToWriteFile();
333411
}//while
334-
parent::WriteRDFBufferToWriteFile();
335412
} ##phenotype_association
336413

337414
private function gene_interactions(){
338415
#1 Regular expression to cath the data
339416
while($l = parent::getReadFile()->Read()){
417+
if($l[0] == '#') continue;
340418

341419
$data = explode("\t", $l);
342420
$interaction = $data[0];
@@ -367,7 +445,7 @@ private function gene_interactions(){
367445
);
368446

369447
$npa_id = parent::getRes().md5($interaction_id."negative property assertion");
370-
$npa_label = "Negative property assertion stating that ".$gene1." and ".$gene2."do not have a ".$interaction_type." interaction";
448+
$npa_label = "Negative property assertion stating that ".$gene1." and ".$gene2." do not have a ".$interaction_type." interaction";
371449

372450
parent::addRDF(
373451
parent::describeIndividual($npa_id, $npa_label, "owl:NegativeObjectPropertyAssertion").
@@ -385,7 +463,7 @@ private function gene_interactions(){
385463
parent::triplify(parent::getNamespace().$gene1, $int_pred, parent::getNamespace().$gene2)
386464
);
387465
} else {
388-
$interaction_label = $int_additional_info." ".strtolower($interaction_type). "interaction between ".$gene1." and ".$gene2;
466+
$interaction_label = $int_additional_info." ".strtolower($interaction_type). " interaction between ".$gene1." and ".$gene2;
389467
parent::addRDF(
390468
parent::describeIndividual($interaction_id, $interaction_label, parent::getVoc().$int_additional_info."-".$interaction_type."-Interaction").
391469
parent::describeClass(parent::getVoc().$int_additional_info."-".$interaction_type."-Interaction", $int_additional_info." ".$interaction_type." Interaction", parent::getVoc().$interaction_type."-Interaction").

0 commit comments

Comments
 (0)