@@ -58,22 +58,62 @@ function __construct($argv) {
58
58
}//constructor
59
59
60
60
/**
 * Entry point: runs the download stage and then the process stage.
 *
 * A stage is executed only when the parameter of the same name
 * ('download' / 'process') is strictly boolean true. The 'process'
 * parameter is read after the download stage has finished.
 */
function Run(){
	foreach(array('download', 'process') as $stage){
		$enabled = parent::getParameterValue($stage);
		if($enabled !== true){
			continue;
		}
		// dispatch to $this->download() / $this->process()
		$this->$stage();
	}
}//run
70
+
71
/**
 * Download the selected source files into the local input directory.
 *
 * Parameters read:
 *   - 'indir'        : local directory prefix; files are saved as <indir><id>.gz
 *   - 'download_url' : remote prefix that the package-map file name is appended to
 *   - 'files'        : 'all', or a comma-separated list of package ids
 *
 * Ids that are not keys of getPackageMap() are silently ignored.
 * NOTE(review): 'indir' is concatenated as-is; this method assumes it already
 * ends with a directory separator — confirm where that is enforced.
 */
function download(){
	$ldir = parent::getParameterValue('indir');
	$rdir = parent::getParameterValue('download_url');

	// which files are to be downloaded?
	$pm = $this->getPackageMap();
	// trim for consistency with process(), which trims the same parameter
	$selection = trim(parent::getParameterValue('files'));
	if($selection == 'all') {
		$files = $pm;
	} else {
		// Split the parameter that was actually read; the previous code
		// exploded an undefined $selectedPackage and so selected nothing.
		$files = array();
		foreach(explode(",", $selection) as $a){
			$a = trim($a); // tolerate "a, b" style lists
			if(array_key_exists($a, $pm)){
				$files[$a] = $pm[$a];
			}
		}
	}

	// now iterate over the files array
	foreach($files as $id => $file){
		echo "Processing $id ... ";

		$lfile = $ldir.$id.".gz";

		// gene2sts and gene2unigene are fetched through PHP's compress.zlib://
		// stream wrapper; every other package uses the plain remote URL.
		if($id == "gene2sts" || $id == "gene2unigene") {
			$rfile = "compress.zlib://".$rdir.$file;
		} else {
			$rfile = $rdir.$file;
		}
		Utils::DownloadSingle($rfile, $lfile);
	}

}
107
+
108
+ function process (){
109
+
110
+ $ ldir = parent ::getParameterValue ('indir ' );
111
+ $ odir = parent ::getParameterValue ('outdir ' );
112
+ $ rdir = parent ::getParameterValue ('download_url ' );
113
+
74
114
//which files are to be converted?
75
- $ selectedPackage = trim ($ this ->GetParameterValue ('files ' ));
76
- if ($ selectedPackage == 'all ' ) {
115
+ $ files = trim ($ this ->GetParameterValue ('files ' ));
116
+ if ($ files == 'all ' ) {
77
117
$ files = $ this ->getPackageMap ();
78
118
} else {
79
119
$ sel_arr = explode (", " ,$ selectedPackage );
@@ -85,14 +125,22 @@ function Run(){
85
125
}
86
126
}
87
127
}
128
+
129
+ //set dataset graph to be dataset URI
130
+ $ graph_uri = parent ::getGraphURI ();
131
+ if (parent ::getParameterValue ('dataset_graph ' ) == true ) parent ::setGraphURI (parent ::getDatasetURI ());
132
+
133
+ $ dataset_description = '' ;
134
+
88
135
//now iterate over the files array
89
136
foreach ($ files as $ id => $ file ){
90
137
echo "Processing $ id ... " ;
91
138
92
139
$ lfile = $ ldir .$ id .".gz " ;
93
140
94
141
// download
95
- if (!file_exists ($ lfile ) || parent ::getParameterValue ('download ' ) == true ) {
142
+ if (!file_exists ($ lfile )) {
143
+ trigger_error ($ lfile ." not found. Will attempt to download. " , E_USER_NOTICE );
96
144
//don't use subdirectory GENE_INFO for saving local version of All_data.gene_info.gz
97
145
if ($ id == "gene2sts " || $ id == "gene2unigene " ) {
98
146
$ rfile = "compress.zlib:// " .$ rdir .$ file ;
@@ -102,7 +150,7 @@ function Run(){
102
150
Utils::DownloadSingle ($ rfile , $ lfile );
103
151
}
104
152
105
- $ ofile = $ odir . $ id .".nt " ;
153
+ $ ofile = $ id .".nt " ;
106
154
$ gz = false ;
107
155
if (parent ::getParameterValue ('graph_uri ' )) {
108
156
$ ofile .= ".nq " ;
@@ -113,34 +161,63 @@ function Run(){
113
161
}
114
162
115
163
parent ::setReadFile ($ lfile , true );
116
- parent ::setWriteFile ($ ofile , $ gz );
164
+ parent ::setWriteFile ($ odir . $ ofile , $ gz );
117
165
$ fnx = $ id ;
118
- echo ' parsing ... ' ;
166
+ echo ' Processing $id ... ' . PHP_EOL ;
119
167
$ this ->$ fnx ();
120
168
echo 'done! ' .PHP_EOL ;
121
169
parent ::getReadFile ()->Close ();
122
170
parent ::getWriteFile ()->Close ();
171
+
172
+ // generate the dataset release file
173
+ echo "Generating dataset description... " ;
174
+ // dataset description
175
+ $ source_file = (new DataResource ($ this ))
176
+ ->setURI ($ rfile )
177
+ ->setTitle ("NCBI Gene ( $ id) " )
178
+ ->setRetrievedDate ( date ("Y-m-d\TG:i:s\Z " , filemtime ($ lfile )))
179
+ ->setFormat ("text/tab-separated-value " )
180
+ ->setFormat ("application/gzip " )
181
+ ->setPublisher ("http://www.ncbi.nlm.nih.gov " )
182
+ ->setHomepage ("http://www.ncbi.nlm.nih.gov/gene " )
183
+ ->setRights ("use-share-modify " )
184
+ ->setLicense ("http://www.ncbi.nlm.nih.gov/About/disclaimer.html " )
185
+ ->setDataset ("http://identifiers.org/ncbigene/ " );
186
+
187
+ $ prefix = parent ::getPrefix ();
188
+ $ bVersion = parent ::getParameterValue ('bio2rdf_release ' );
189
+ $ date = date ("Y-m-d\TG:i:s\Z " );
190
+ $ output_file = (new DataResource ($ this ))
191
+ ->setURI ("http://download.bio2df.org/release/ $ bVersion/ $ prefix/ $ ofile " )
192
+ ->setTitle ("Bio2RDF v $ bVersion RDF version of $ prefix (generated at $ date) " )
193
+ ->setSource ($ source_file ->getURI ())
194
+ ->setCreator ("https://github.com/bio2rdf/bio2rdf-scripts/blob/master/gene/entrez_gene.php " )
195
+ ->setCreateDate ($ date )
196
+ ->setHomepage ("http://download.bio2rdf.org/release/ $ bVersion/ $ prefix/ $ prefix.html " )
197
+ ->setPublisher ("http://bio2rdf.org " )
198
+ ->setRights ("use-share-modify " )
199
+ ->setRights ("by-attribution " )
200
+ ->setRights ("restricted-by-source-license " )
201
+ ->setLicense ("http://creativecommons.org/licenses/by/3.0/ " )
202
+ ->setDataset (parent ::getDatasetURI ());
203
+
204
+ if ($ gz ) $ output_file ->setFormat ("application/gzip " );
205
+ if (strstr (parent ::getParameterValue ('output_format ' ),"nt " )) $ output_file ->setFormat ("application/n-triples " );
206
+ else $ output_file ->setFormat ("application/n-quads " );
207
+
208
+ $ dataset_description .= $ source_file ->toRDF ().$ output_file ->toRDF ();
123
209
124
210
}//foreach
125
211
126
- // generate the dataset release file
127
- echo "generating dataset release file... " ;
128
- $ desc = parent ::getBio2RDFDatasetDescription (
129
- $ this ->getPrefix (),
130
- "https://github.com/bio2rdf/bio2rdf-scripts/blob/master/gene/entrez_gene.php " ,
131
- $ this ->getBio2RDFDownloadURL ($ this ->getNamespace ()),
132
- "http://yeastgenome.org " ,
133
- array ("use-share-modify " ),
134
- "http://www.ncbi.nlm.nih.gov/About/disclaimer.html " ,
135
- parent ::getParameterValue ('download_url ' ),
136
- parent ::getDatasetVersion ()
137
- );
138
- $ this ->setWriteFile ($ odir .$ this ->getBio2RDFReleaseFile ($ this ->getNamespace ()));
139
- $ this ->getWriteFile ()->write ($ desc );
140
- $ this ->getWriteFile ()->close ();
212
+ //set graph URI back to default value
213
+ parent ::setGraphURI ($ graph_uri );
214
+ //write dataset description to file
215
+ parent ::setWriteFile ($ odir .parent ::getBio2RDFReleaseFile ());
216
+ parent ::getWriteFile ()->write ($ dataset_description );
217
+ parent ::getWriteFile ()->close ();
141
218
echo "done! " .PHP_EOL ;
142
219
143
- }//run
220
+ }
144
221
145
222
#see: ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/README
146
223
private function gene2vega (){
@@ -673,16 +750,4 @@ public function getPackageMap(){
673
750
return self ::$ packageMap ;
674
751
}
675
752
}
676
-
677
- $ start = microtime (true );
678
-
679
- set_error_handler ('error_handler ' );
680
- $ parser = new EntrezGeneParser ($ argv );
681
- $ parser -> Run ();
682
-
683
- $ end = microtime (true );
684
- $ time_taken = $ end - $ start ;
685
- print "Started: " .date ("l jS F \@ g:i:s a " , $ start )."\n" ;
686
- print "Finished: " .date ("l jS F \@ g:i:s a " , $ end )."\n" ;
687
- print "Took: " .$ time_taken ." seconds \n"
688
753
?>
0 commit comments