|
| 1 | +<?php |
| 2 | +/** |
| 3 | +Copyright (C) 2013 Michel Dumontier |
| 4 | +
|
| 5 | +Permission is hereby granted, free of charge, to any person obtaining a copy of |
| 6 | +this software and associated documentation files (the "Software"), to deal in |
| 7 | +the Software without restriction, including without limitation the rights to |
| 8 | +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies |
| 9 | +of the Software, and to permit persons to whom the Software is furnished to do |
| 10 | +so, subject to the following conditions: |
| 11 | +
|
| 12 | +The above copyright notice and this permission notice shall be included in all |
| 13 | +copies or substantial portions of the Software. |
| 14 | +
|
| 15 | +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
| 16 | +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
| 17 | +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
| 18 | +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
| 19 | +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
| 20 | +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
| 21 | +SOFTWARE. |
| 22 | +*/ |
| 23 | + |
| 24 | + |
| 25 | +/** |
| 26 | + * An RDF generator for DailyMed |
| 27 | + * documentation: https://dailymed.nlm.nih.gov/ |
| 28 | + * @version 1.0 |
| 29 | + * @author Michel Dumontier |
| 30 | +*/ |
| 31 | + |
| 32 | +require_once(__DIR__.'/../../php-lib/bio2rdfapi.php'); |
| 33 | + |
/**
 * Bio2RDF converter for DailyMed structured product labels.
 *
 * Downloads the DailyMed release archives, unpacks the per-label zip files
 * nested inside them, and emits RDF for the indication sections of each label
 * document, followed by a dataset description for every archive processed.
 */
class DailymedParser extends Bio2RDFizer
{
    /**
     * Release archives to fetch, keyed by the short name accepted by the
     * "files" parameter. The short name is also the name of the method that
     * run() invokes for every label document found in that group.
     */
    public $filemap = [
        "prescription" => [
            "dm_spl_release_human_rx_part1.zip",
            "dm_spl_release_human_rx_part2.zip",
            "dm_spl_release_human_rx_part3.zip",
            "dm_spl_release_human_rx_part4.zip",
        ],
        "otc" => [
            "dm_spl_release_human_otc_part1.zip",
            "dm_spl_release_human_otc_part2.zip",
            "dm_spl_release_human_otc_part3.zip",
            "dm_spl_release_human_otc_part4.zip",
            "dm_spl_release_human_otc_part5.zip",
            "dm_spl_release_human_otc_part6.zip",
            "dm_spl_release_human_otc_part7.zip",
        ],
    ];

    /**
     * Registers the parser-specific parameters and initialises the base class.
     *
     * @param array $argv command-line arguments, forwarded to the parent constructor
     */
    public function __construct($argv)
    {
        parent::__construct($argv, "dailymed");
        parent::addParameter('files', true, 'all|prescription|otc', 'all', 'all or comma-separated list of short names to process');
        parent::addParameter('download_url', false, null, 'ftp://public.nlm.nih.gov/nlmdata/.dailymed/');
        parent::initialize();
    }

    /**
     * Entry point: downloads and converts every requested archive group, then
     * writes the accumulated dataset descriptions to the release file.
     */
    public function run()
    {
        $ldir = parent::getParameterValue('indir');
        $odir = parent::getParameterValue('outdir');
        $tdir = $ldir."tmp/";
        if (!is_dir($tdir)) {
            mkdir($tdir, 0777, true);
        }

        $files = parent::getParameterValue('files');
        if ($files === 'all') {
            // The parameter list starts with the "all" keyword itself; drop it
            // so that only the real group names remain.
            $files = explode('|', parent::getParameterList('files'));
            array_shift($files);
        } else {
            $files = explode(',', $files);
        }

        $dd = '';
        foreach ($files as $filetype) {
            $filetype = trim($filetype);
            echo "processing $filetype ...";

            if (!isset($this->filemap[$filetype])) {
                echo "unknown file type $filetype ... skipping".PHP_EOL;
                continue;
            }

            // Download first, then convert only what is actually available.
            $available = $this->downloadArchives($this->filemap[$filetype], $ldir);
            foreach ($available as $file) {
                $ofile = $this->processArchive($filetype, $file, $ldir, $odir, $tdir);
                $dd .= $this->describeArchive($file, $ldir.$file, $odir, $ofile);
            }
        }
        parent::writeToReleaseFile($dd);
    }

    /**
     * Fetches the given archives into the input directory when they are
     * missing locally or when the "download" parameter forces a refresh.
     *
     * @param array  $archives archive file names to fetch
     * @param string $ldir     local input directory (with trailing separator)
     * @return array names of the archives that may be processed; archives
     *               whose download failed are left out
     */
    private function downloadArchives($archives, $ldir)
    {
        $base = parent::getParameterValue('download_url');
        $force = parent::getParameterValue('download') == 'true';

        $available = [];
        foreach ($archives as $file) {
            $lfile = $ldir.$file;
            if (!file_exists($lfile) || $force) {
                if (utils::downloadSingle($base.$file, $lfile) === false) {
                    echo "unable to download $file ... skipping".PHP_EOL;
                    continue;
                }
            }
            $available[] = $file;
        }
        return $available;
    }

    /**
     * Converts one release archive: every entry of the archive is itself a
     * zip file holding a label document in XML, which is extracted to the
     * temporary directory and handed to the method named after $filetype.
     *
     * @param string $filetype group short name; also the processing method name
     * @param string $file     archive file name
     * @param string $ldir     local input directory
     * @param string $odir     output directory
     * @param string $tdir     temporary working directory
     * @return string name of the output file written for this archive
     */
    private function processArchive($filetype, $file, $ldir, $odir, $tdir)
    {
        $lfile = $ldir.$file;
        $zin1 = new ZipArchive();
        // open() returns true on success and an error code otherwise, so only
        // an identity check against true is reliable.
        if ($zin1->open($lfile) !== true) {
            trigger_error("Unable to open $lfile", E_USER_ERROR);
            exit(1);
        }

        $format = parent::getParameterValue('output_format');
        $ofile = "dailymed-".substr($file, 0, -4).'.'.$format;
        $gz = strstr($format, "gz") !== false;
        parent::setWriteFile($odir.$ofile, $gz);

        for ($i = 0; $i < $zin1->count(); $i++) {
            $entry = $zin1->getNameIndex($i);
            if (substr($entry, -1) === '/') {
                continue; // directory entry, nothing to extract
            }
            echo "processing $entry".PHP_EOL;

            // extract the dailymed entry (a zip file) as a temporary file,
            // reusing a copy left behind by an earlier run
            $tfile = $tdir.basename($entry);
            if (!file_exists($tfile)) {
                copy("zip://".$lfile."#".$entry, $tfile);
            }

            // read the dailymed entry zip file
            $zin2 = new ZipArchive();
            if ($zin2->open($tfile) !== true) {
                trigger_error("Unable to open $tfile", E_USER_ERROR);
                exit(1);
            }

            // now find, extract, and process the xml file
            for ($j = 0; $j < $zin2->count(); $j++) {
                $f = $zin2->getNameIndex($j);
                if (!strstr($f, ".xml")) {
                    continue;
                }

                // the xml is stored gzip-compressed in the temporary directory
                $gzxml = $tdir.basename($f).".gz";
                if (!file_exists($gzxml)) {
                    copy("zip://".$tfile."#".$f, "compress.zlib://".$gzxml);
                }

                $this->setReadFile($gzxml);
                $this->$filetype($gzxml);
                $this->getReadFile()->close();
                $this->clear();
            }
            $zin2->close();
        }
        $zin1->close();
        parent::getWriteFile()->close();

        return $ofile;
    }

    /**
     * Builds the dataset description for one source archive and the output
     * file generated from it.
     *
     * @param string $file  archive file name
     * @param string $lfile path of the local copy of the archive
     * @param string $odir  output directory
     * @param string $ofile name of the generated output file
     * @return string RDF describing the source and output resources
     */
    private function describeArchive($file, $lfile, $odir, $ofile)
    {
        $rfile = parent::getParameterValue('download_url').$file;

        // NOTE(review): "dailmed" in the dataset URI below looks like a typo
        // for "dailymed" -- confirm the intended identifier before changing it.
        $source_file = (new DataResource($this))
            ->setURI($rfile)
            ->setTitle("Dailymed: $file")
            ->setRetrievedDate(parent::getDate(filemtime($lfile)))
            ->setFormat("application/xml")
            ->setPublisher("https://dailymed.nlm.nih.gov")
            ->setHomepage("https://dailymed.nlm.nih.gov")
            ->setRights("use")
            ->setLicense("http://creativecommons.org/licenses/by-nd/3.0/")
            ->setDataset("http://identifiers.org/dailmed/");

        $prefix = parent::getPrefix();
        $bVersion = parent::getParameterValue('bio2rdf_release');
        $date = parent::getDate(filemtime($odir.$ofile));

        $output_file = (new DataResource($this))
            ->setURI("http://download.bio2rdf.org/release/$bVersion/$prefix/$ofile")
            ->setTitle("Bio2RDF v$bVersion RDF version of $prefix")
            ->setSource($source_file->getURI())
            ->setCreator("https://github.com/bio2rdf/bio2rdf-scripts/blob/master/dailymed/dailymed.php")
            ->setCreateDate($date)
            ->setHomepage("http://download.bio2rdf.org/release/$bVersion/$prefix/$prefix.html")
            ->setPublisher("http://bio2rdf.org")
            ->setRights("use-share-modify")
            ->setRights("by-attribution")
            ->setRights("restricted-by-source-license")
            ->setLicense("http://creativecommons.org/licenses/by/3.0/")
            ->setDataset(parent::getDatasetURI());

        $format = parent::getParameterValue('output_format');
        if (strstr($format, ".gz") !== false) {
            $output_file->setFormat("application/gzip");
        }
        if (strstr($format, "nt")) {
            $output_file->setFormat("application/n-triples");
        } else {
            $output_file->setFormat("application/n-quads");
        }

        return $source_file->toRDF().$output_file->toRDF();
    }

    /**
     * Processes an over-the-counter label document; handled exactly like a
     * prescription label.
     *
     * @param string $file path of the label document to read
     */
    public function otc($file)
    {
        return $this->prescription($file);
    }

    /**
     * Reads a label document and emits RDF for the label itself and for each
     * of its indication sections, then flushes the RDF buffer to the write file.
     *
     * @param string $file path of the label document to read
     */
    public function prescription($file)
    {
        $xml = new CXML($file);
        parent::addRDF(
            parent::describeClass(parent::getVoc()."Indication-Section", "FDA product label indication section")
        );

        while ($xml->parse("document") == true) {
            $doc = $xml->getXMLRoot();
            $setid = $doc->setId->attributes()->root;
            $id = parent::getNamespace().$setid;
            $title = addslashes(str_replace(["\t", "\r", "\n", '"'], [" ", "", "", ""], (string) $doc->title));
            #@todo look elsewhere if empty

            $doc_type_id = "loinc:".$doc->code->attributes()->code;
            $doc_type_label = $doc->code->attributes()->displayName;

            parent::addRDF(
                parent::describeIndividual($id, $title, $doc_type_id).
                parent::describeClass($doc_type_id, $doc_type_label)
            );

            foreach ($doc->component->structuredBody->component as $c) {
                $section = $c->section;
                $code = isset($section->code) ? (string) $section->code->attributes()->code : '';
                // only the two section codes below (indications) are exported
                if ($code != "34067-9" and $code != "42229-5") {
                    continue;
                }
                if (!isset($section->text)) {
                    continue;
                }

                $section_type_id = "loinc:$code";
                $markup = (string) $section->text->asXML();
                // the section identifier is derived from the raw markup only,
                // so identical text yields the same resource
                $sid = parent::getRes().md5($markup);
                $text = $this->cleanSectionText($markup);

                parent::addRDF(
                    parent::describeIndividual($sid, "indication section", $section_type_id).
                    parent::triplifyString($sid, "rdf:value", $text).
                    parent::triplifyString($sid, parent::getVoc()."strlen", strlen($text)).
                    parent::triplify($id, parent::getVoc()."indicationSection", $sid)
                );
            }
        }
        unset($xml);
        parent::writeRDFBufferToWriteFile();
    }

    /**
     * Reduces section markup to a single line of plain ASCII text: tags are
     * replaced by spaces, non-ASCII characters, quotes and backslashes are
     * removed, and runs of whitespace are collapsed.
     *
     * @param string $markup XML markup of the section text
     * @return string cleaned text, passed through addslashes()
     */
    private function cleanSectionText($markup)
    {
        $text = preg_replace('/(?i)<[^>]*>/', ' ', $markup);

        setlocale(LC_ALL, 'en_GB');
        $ascii = @iconv('UTF-8', 'ASCII//IGNORE', $text);
        if ($ascii === false) {
            // iconv can fail outright on malformed input even with //IGNORE;
            // strip the non-ASCII bytes directly instead of losing the text.
            $ascii = preg_replace('/[^\x00-\x7F]/', '', $text);
        }

        // The text is pure ASCII at this point, so only the quote and
        // backslash characters still need to be removed.
        $ascii = str_replace(['"', "'", '\\'], '', $ascii);
        $ascii = trim(preg_replace("/\s+/", ' ', $ascii));

        return addslashes($ascii);
    }
}
| 297 | +?> |
0 commit comments