Skip to content

Commit

Permalink
#90 - Extract title and notes from PPTX files into a json meta data file
Browse files Browse the repository at this point in the history
  • Loading branch information
stackpr committed Feb 24, 2017
1 parent 01e13a5 commit 6d0667d
Show file tree
Hide file tree
Showing 3 changed files with 125 additions and 0 deletions.
1 change: 1 addition & 0 deletions composer.json
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
"wittiws/htmlawed":"dev-master",
"wittiws/phpquery":"dev-master",
"wittiws/quipxml":"dev-master",
"wittiws/splash":"dev-master",
"zendframework/zend-mail":"2.7.*"
},
"autoload":{
Expand Down
5 changes: 5 additions & 0 deletions src/Configuration/ConfigurationDefaults.php
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,11 @@ public function __construct(&$settings) {
'#engine' => 'Convert\\Unoconv',
),
),
'pptx->json' => array(
'nativemeta:default' => array(
'#engine' => 'Convert\\NativeMeta',
),
),
'rtf->pdf' => array(
'unoconv:default' => array(
'#engine' => 'Convert\\Unoconv',
Expand Down
119 changes: 119 additions & 0 deletions src/Engine/Convert/NativeMeta.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
<?php
/*
* This file is part of the FileConverter package.
*
* (c) Greg Payne
*
* For the full copyright and license information, please view the LICENSE
* file that was distributed with this source code.
*/

namespace FileConverter\Engine\Convert;
use FileConverter\Engine\EngineBase;
use Splash\Splash;
use QuipXml\Quip;

class NativeMeta extends EngineBase {
  /**
   * Extract meta data from the source file and write it to the destination.
   *
   * The source and destination types are taken from $this->conversion[0]
   * and $this->conversion[1]. Currently the only supported source is 'pptx'
   * (slide number, title and notes) and the only supported output is 'json'.
   *
   * @param string $source
   *   Path of the file to read.
   * @param string $destination
   *   Path of the file to write.
   *
   * @return $this
   *
   * @throws \ErrorException
   *   If the PPTX archive cannot be opened or the output cannot be written.
   * @throws \InvalidArgumentException
   *   If the source or destination type is not supported.
   */
  public function convertFile($source, $destination) {
    $meta = array();

    // Extract meta data based on the file type.
    switch ($this->conversion[0]) {
      case 'pptx':
        $meta['slides'] = $this->extractPptxSlides($source);
        break;

      default:
        throw new \InvalidArgumentException("Unsupported conversion source type requested");
    }

    switch ($this->conversion[1]) {
      case 'json':
        $output = json_encode($meta, JSON_PRETTY_PRINT
          | JSON_PARTIAL_OUTPUT_ON_ERROR);
        // A failed write must not be reported as a successful conversion.
        if (FALSE === file_put_contents($destination, $output)) {
          throw new \ErrorException("Unable to write the meta data file");
        }
        return $this;

      default:
        throw new \InvalidArgumentException("Unsupported conversion destination type requested");
    }
  }

  /**
   * Build the per-slide meta data for a PPTX file.
   *
   * @param string $source
   *   Path of the PPTX file.
   *
   * @return array
   *   Slides keyed by (slide number - 1) and sorted by key. Each slide has
   *   'number', 'title' and, when a notes part is linked, 'notes'.
   *
   * @throws \ErrorException
   *   If the archive cannot be opened.
   */
  private function extractPptxSlides($source) {
    // Open the pptx file
    $pptx = new \ZipArchive;
    if (TRUE !== $pptx->open($source)) {
      throw new \ErrorException("Unable to open the PPTX file");
    }

    try {
      $slides = array();
      for ($i = 0; $i < $pptx->numFiles; $i++) {
        $file = $pptx->getNameIndex($i);
        // Fully anchored so that only real slide parts are accepted and the
        // captured number is always numeric (it is used as an array key).
        if (!is_string($file) || !preg_match('@^ppt/slides/slide(\d+)\.xml\z@', $file, $match)) {
          continue;
        }
        $number = $match[1];

        $slide = array();
        $slide['number'] = $number;
        $slide['title'] = $this->extractPptxTitle($pptx, $file);
        $notes = $this->extractPptxNotes($pptx, $number);
        if (NULL !== $notes) {
          $slide['notes'] = $notes;
        }

        $slides[$number - 1] = $slide;
      }
      // Archive order is not guaranteed to be slide order.
      ksort($slides);
      return $slides;
    }
    finally {
      // Release the archive handle on success and on any exception.
      $pptx->close();
    }
  }

  /**
   * Read the title text of one slide part (e.g. ppt/slides/slide1.xml).
   *
   * @param \ZipArchive $pptx
   *   The opened PPTX archive.
   * @param string $file
   *   Name of the slide part inside the archive.
   *
   * @return string
   *   The trimmed title text, or '' if the part cannot be read.
   */
  private function extractPptxTitle(\ZipArchive $pptx, $file) {
    $raw = $pptx->getFromName($file);
    if (FALSE === $raw) {
      return '';
    }

    $xml_slide = Quip::load($raw);
    // Locate the title placeholder, then step up three levels to the
    // enclosing shape so its text body is included in the markup.
    $title = $xml_slide->xpath("//p:sp//p:ph[@type='title' or @type='ctrTitle']")->xpath('../../..')->html();
    // http://www.datypic.com/sc/ooxml/e-a_br-1.html
    $title = preg_replace('@<a:br[^>]*>@s', "\n", $title);
    return trim(strip_tags($title));
  }

  /**
   * Read the notes text linked to a slide, if any.
   *
   * @param \ZipArchive $pptx
   *   The opened PPTX archive.
   * @param string $number
   *   The slide number as it appears in the slide part name.
   *
   * @return string|null
   *   The trimmed notes text, or NULL when the slide has no readable
   *   notes part referenced by a '../' relative target.
   */
  private function extractPptxNotes(\ZipArchive $pptx, $number) {
    // ppt/slides/_rels/slide1.xml.rels
    $raw_rels = $pptx->getFromName("ppt/slides/_rels/slide$number.xml.rels");
    if (FALSE === $raw_rels) {
      return NULL;
    }

    // The 'Relationship' tag name does not work in this xpath, likely
    // because the rels document declares a default namespace; match on the
    // Type attribute instead.
    $xml_rels = Quip::load($raw_rels);
    $note_id = (string) $xml_rels->xpath("//*[@Type='http://schemas.openxmlformats.org/officeDocument/2006/relationships/notesSlide']")->eq(0)['Target'];

    // Load the notes from the connected XML.
    // ../notesSlides/notesSlide1.xml
    // becomes ppt/notesSlides/notesSlide1.xml
    if (strncmp($note_id, '../', 3) !== 0) {
      return NULL;
    }
    $note_id = preg_replace('@^\.\./@', 'ppt/', $note_id);
    $raw_note = $pptx->getFromName($note_id);
    if (FALSE === $raw_note) {
      // Dangling relationship: the target part is not in the archive.
      return NULL;
    }

    $note = Quip::load($raw_note)->html();
    // The a:p tag appears preferred within notes.
    $note = preg_replace('@<a:br[^>]*>@s', "\n", $note);
    $note = preg_replace('@<a:p(?:\s[^>]*)?>@s', "\n", $note);
    // The slide number appears in notes within a:fld.
    $note = preg_replace('@<a:fld.*?</a:fld>@s', "", $note);
    return trim(strip_tags($note));
  }

  /**
   * Describe how to install this engine on the given operating system.
   *
   * @param string $os
   *   Operating system name, e.g. 'Ubuntu'.
   * @param string $os_version
   *   Operating system version; only forwarded to the parent implementation.
   *
   * @return mixed
   *   For Ubuntu, an array with 'title', 'os' and 'notes'; otherwise
   *   whatever the parent implementation returns.
   */
  protected function getHelpInstallation($os, $os_version) {
    $help = array(
      'title' => 'Native Meta Data Extractor',
    );
    switch ($os) {
      case 'Ubuntu':
        $help['os'] = 'confirmed on Ubuntu 16.04';
        $help['notes'] = array(
          'composer update',
        );
        return $help;
    }

    return parent::getHelpInstallation($os, $os_version);
  }

  /**
   * Report whether this engine can be used.
   *
   * @return bool
   *   Always TRUE.
   */
  public function isAvailable() {
    return TRUE;
  }

}

0 comments on commit 6d0667d

Please sign in to comment.