diff --git a/composer.json b/composer.json index e52bd00..c299292 100644 --- a/composer.json +++ b/composer.json @@ -22,6 +22,7 @@ "wittiws/htmlawed":"dev-master", "wittiws/phpquery":"dev-master", "wittiws/quipxml":"dev-master", + "wittiws/splash":"dev-master", "zendframework/zend-mail":"2.7.*" }, "autoload":{ diff --git a/src/Configuration/ConfigurationDefaults.php b/src/Configuration/ConfigurationDefaults.php index b6b5964..a5828d7 100644 --- a/src/Configuration/ConfigurationDefaults.php +++ b/src/Configuration/ConfigurationDefaults.php @@ -146,6 +146,11 @@ public function __construct(&$settings) { '#engine' => 'Convert\\Unoconv', ), ), + 'pptx->json' => array( + 'nativemeta:default' => array( + '#engine' => 'Convert\\NativeMeta', + ), + ), 'rtf->pdf' => array( 'unoconv:default' => array( '#engine' => 'Convert\\Unoconv', diff --git a/src/Engine/Convert/NativeMeta.php b/src/Engine/Convert/NativeMeta.php new file mode 100644 index 0000000..80866b9 --- /dev/null +++ b/src/Engine/Convert/NativeMeta.php @@ -0,0 +1,119 @@ +conversion[0]) { + case 'pptx': + // Open the pptx file + $pptx = new \ZipArchive; + if (TRUE !== $pptx->open($source)) { + throw new \ErrorException("Unable to open the PPTX file"); + } + + // Build the list of files. + $files = array(); + for ($i = 0; $i < $pptx->numFiles; $i++) { + $files[] = $pptx->getNameIndex($i); + } + $files = Splash::fromArray($files); + + // Build the slides. + $meta['slides'] = array(); + foreach ($files->regex("@ppt/slides/slide\d+.xml$@") as $file) { + $slide = array(); + $number = preg_replace('@^ppt/slides/slide(\d+)\.xml$@s', '\1', $file); + $slide['number'] = $number; + + // Detect the title. 
+ // ppt/slides/slide1.xml + $xml_slide = Quip::load($pptx->getFromName($file)); + $title = $xml_slide->xpath("//p:sp//p:ph[@type='title' or @type='ctrTitle']")->xpath('../../..')->html(); + // http://www.datypic.com/sc/ooxml/e-a_br-1.html + $title = preg_replace('@]*>@s', "\n", $title); + $title = trim(strip_tags($title)); + $slide['title'] = $title; + + // Detect any notes. + // ppt/slides/_rels/slide1.xml.rels + // The 'Relationship' tag name does not work in this xpath for unknown reasons. + $xml_rels = Quip::load($pptx->getFromName("ppt/slides/_rels/slide$number.xml.rels")); + $note_id = $xml_rels->xpath("//*[@Type='http://schemas.openxmlformats.org/officeDocument/2006/relationships/notesSlide']")->eq(0)['Target']; + + // Load the notes from the connected XML. + // ../notesSlides/notesSlide1.xml + // becomes ppt/notesSlides/notesSlide1.xml + if (substr($note_id, 0, 3) === '../') { + $note_id = preg_replace('@^\.\./@', 'ppt/', $note_id); + $xml_note = Quip::load($pptx->getFromName($note_id)); + $note = $xml_note->html(); + // The a:p tag appears preferred within notes. + $note = preg_replace('@]*>@s', "\n", $note); + $note = preg_replace('@]*)?>@s', "\n", $note); + // The slide number appears in notes within a:fld. 
+ $note = preg_replace('@@s', "", $note); + $note = trim(strip_tags($note)); + $slide['notes'] = $note; + } + + $meta['slides'][$number - 1] = $slide; + } + ksort($meta['slides']); + + break; + + default: + throw new \InvalidArgumentException("Unsupported conversion source type requested"); + } + + switch ($this->conversion[1]) { + case 'json': + $output = json_encode($meta, JSON_PRETTY_PRINT + | JSON_PARTIAL_OUTPUT_ON_ERROR); + file_put_contents($destination, $output); + return $this; + + default: + throw new \InvalidArgumentException("Unsupported conversion destination type requested"); + } + } + + /** Builds the installation-help array for this engine. A title entry is always prepared; when $os is Ubuntu the os and notes entries are added and that array is returned immediately. For any other $os the locally built array is discarded and the result of parent::getHelpInstallation($os, $os_version) is returned instead. $os_version is not inspected here, only forwarded to the parent. */ protected function getHelpInstallation($os, $os_version) { + $help = array( + 'title' => 'Native Meta Data Extractor', + ); + switch ($os) { + case 'Ubuntu': + $help['os'] = 'confirmed on Ubuntu 16.04'; + $help['notes'] = array( + 'composer update', + ); + return $help; + } + + return parent::getHelpInstallation($os, $os_version); + } + + /** Returns TRUE unconditionally; no runtime check is performed. NOTE(review): the pptx branch of the conversion code instantiates ZipArchive, so confirm whether the presence of that class should be tested here. */ public function isAvailable() { + return TRUE; + } + +} \ No newline at end of file