Skip to content

Commit

Permalink
Merge branch 'release/1.0.0'
Browse files Browse the repository at this point in the history
  • Loading branch information
bsorrentino committed Mar 21, 2022
2 parents d05fa64 + e29d8b6 commit bec516f
Show file tree
Hide file tree
Showing 11 changed files with 2,369 additions and 6,570 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,10 @@

All notable changes to this project will be documented in this file. See [standard-version](https://github.com/conventional-changelog/standard-version) for commit guidelines.

## [1.0.0](https://github.com/bsorrentino/pdf-tools/compare/v0.5.2...v1.0.0) (2022-03-21)

* refactor: upgrade pdfjs-dist version, build on Node.js 16 and also test on macOS M1 ([91952e2](https://github.com/bsorrentino/pdf-tools/commit/91952e262b189185068c2fe8e3bf46de942ca811))

### [0.5.2](https://github.com/bsorrentino/pdf-tools/compare/v0.5.1...v0.5.2) (2022-03-16)


Expand Down
8,700 changes: 2,185 additions & 6,515 deletions package-lock.json

Large diffs are not rendered by default.

38 changes: 25 additions & 13 deletions package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "@bsorrentino/pdf-tools",
"version": "0.5.2",
"version": "1.0.0",
"description": "",
"main": "index.js",
"bin": {
Expand All @@ -17,25 +17,24 @@
"author": "bsorrentino <[email protected]> (http://soulsoftware-bsc.blogspot.it/)",
"license": "MIT",
"dependencies": {
"canvas": "^2.6.1",
"commander": "^6.2.0",
"canvas": "^2.9.1",
"commander": "^9.1.0",
"enumify": "^2.0.0",
"jimp": "^0.16.1",
"pdfjs-dist": "^2.6.347"
"pdfjs-dist": "^2.13.216"
},
"devDependencies": {
"@types/jest": "^26.0.22",
"@types/node": "^10.17.44",
"@types/pdfjs-dist": "^2.7.1",
"@types/jest": "^27.4.1",
"@types/node": "^16.11.26",
"cz-conventional-changelog": "^3.3.0",
"jest": "^26.6.3",
"jest": "^27.5.1",
"standard-version": "^9.3.2",
"ts-jest": "^26.5.5",
"typescript": "^4.0.5",
"zx": "^6.0.1"
"ts-jest": "^27.1.3",
"typescript": "^4.6.2",
"zx": "^6.0.6"
},
"engines": {
"node": ">=14"
"node": ">=16"
},
"config": {
"commitizen": {
Expand All @@ -45,6 +44,19 @@
"standard-version": {
"skip": {
"tag": true
},
"types": [
{"type": "feat", "section": "Features"},
{"type": "fix", "section": "Bug Fixes"},
{"type": "chore", "hidden": true},
{"type": "docs", "section": "Documentation"},
{"type": "style", "hidden": true},
{"type": "refactor", "section": "Refactoring"},
{"type": "perf", "hidden": true},
{"type": "test", "hidden": true},
{"type": "build", "section": "Build"}
],
"commitUrlFormat": "https://github.com/bsorrentino/pdf-tools/commit/{{hash}}",
"compareUrlFormat": "https://github.com/bsorrentino/pdf-tools/compare/{{previousTag}}...{{currentTag}}"
}
}
}
82 changes: 82 additions & 0 deletions samples/getinfo.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
/*
* Any copyright is dedicated to the Public Domain.
* http://creativecommons.org/publicdomain/zero/1.0/
*
* @ref https://github.com/mozilla/pdf.js/blob/master/examples/node/getinfo.js
*/

//
// Node sample that prints a PDF's page count, its metadata and the text of
// every page, one page after another.
// PDF.js is loaded from the legacy build shipped in the 'pdfjs-dist' package.
//

const pdfjsLib = require("pdfjs-dist/legacy/build/pdf.js");

// PDF to inspect: first command-line argument, otherwise the sample article.
// The path is handed to getDocument() as-is.
const pdfPath = process.argv[2] || "samples/article-with-links.pdf";

// Prints the document's info dictionary and, when present, its metadata.
async function printMetadata(doc) {
  const data = await doc.getMetadata();
  console.log("# Metadata Is Loaded");
  console.log("## Info");
  console.log(JSON.stringify(data.info, null, 2));
  console.log();
  if (data.metadata) {
    console.log("## Metadata");
    console.log(JSON.stringify(data.metadata.getAll(), null, 2));
    console.log();
  }
}

// Prints the size (at scale 1.0) and the text content of a single page.
async function printPage(doc, pageNum) {
  const page = await doc.getPage(pageNum);
  console.log("# Page " + pageNum);
  const viewport = page.getViewport({ scale: 1.0 });
  console.log("Size: " + viewport.width + "x" + viewport.height);
  console.log();

  const content = await page.getTextContent();
  // The content carries layout and style details as well; only the raw
  // strings are printed here.
  const strings = content.items.map((item) => item.str);
  console.log("## Text Content");
  console.log(strings.join(" "));
  // Release page resources.
  page.cleanup();
  console.log();
}

// Loads the document, then reports metadata followed by each page in order.
async function printDocument() {
  const doc = await pdfjsLib.getDocument(pdfPath).promise;
  const numPages = doc.numPages;
  console.log("# Document Loaded");
  console.log("Number of Pages: " + numPages);
  console.log();

  // Metadata is reported first; each page waits for the previous step.
  await printMetadata(doc);
  for (let pageNum = 1; pageNum <= numPages; pageNum++) {
    await printPage(doc, pageNum);
  }
}

// The rejection handler is passed alongside the success handler (rather than
// chained after it) so it only reports failures from loading and printing.
printDocument().then(
  () => {
    console.log("# End of Document");
  },
  (err) => {
    console.error("Error: " + err);
  }
);
76 changes: 47 additions & 29 deletions src/__tests__/link.test.ts
Original file line number Diff line number Diff line change
@@ -1,21 +1,35 @@
import 'pdfjs-dist/legacy/build/pdf.js';
/**
* @link: https://github.com/mozilla/pdf.js/blob/master/examples/node/pdf2png/pdf2png.js
*/
/// <reference path="../pdfjs.d.ts" />

import path from 'path'
import { getDocument, Util } from 'pdfjs-dist';
import type { PDFPageProxy } from 'pdfjs-dist/types/display/api';
import { getDocument, Util, PDFPageProxy } from 'pdfjs-dist/legacy/build/pdf.js';
import { getLinks, matchLink } from '../pdf2md.link';
import { Word } from '../pdf2md.model';
import { TextItem } from 'pdfjs-dist/types/src/display/api';
// import { DH_CHECK_P_NOT_PRIME } from 'constants';

// Some PDFs need external cmaps.
const CMAP_URL = "../../../node_modules/pdfjs-dist/cmaps/";
const CMAP_PACKED = true;

// Where the standard fonts are located.
const STANDARD_FONT_DATA_URL =
"../../../node_modules/pdfjs-dist/standard_fonts/";

async function getText(page:PDFPageProxy):Promise<Word[]> {
const scale = 1.0;

const viewport = page.getViewport({ scale: scale });

const textContent = await page.getTextContent()

const words = textContent.items.map(item => {
const words = textContent.items
.filter( (item:any) => item.transform!==undefined )
.map( (item:any) => {

item = item as TextItem

const tx = Util.transform(viewport.transform, item.transform)

Expand All @@ -37,38 +51,42 @@ async function getText(page:PDFPageProxy):Promise<Word[]> {
return words
}

test( 'parse link', () => {
type WorkWithLinkTuple = [Word,PDFLink|undefined]

return getDocument({
test( 'parse link', async () => {

const doc = await getDocument({
url: path.join( 'samples', 'article-with-links.pdf'),
cMapUrl: CMAP_URL,
cMapPacked: CMAP_PACKED,
}).promise
.then( doc => {
expect(doc).not.toBeNull()
expect(doc.numPages).toEqual(9)
return doc.getPage(1)
.then( page => {
expect(page).not.toBeUndefined()
expect(page).not.toBeNull()
standardFontDataUrl: STANDARD_FONT_DATA_URL
}).promise


expect(doc).not.toBeNull()
expect(doc.numPages).toEqual(9)

const page = await doc.getPage(1)

expect(page).not.toBeUndefined()
expect(page).not.toBeNull()

return Promise.all( [getText(page), getLinks(page)] )
}).then( ( [words, links] ) => {
// console.log( 'words', words )
expect(links?.length).toEqual(2)
const [words, links] = await Promise.all( [getText(page), getLinks(page)] )

const dataverseWord = words.find( word => word.text.localeCompare('Dataverse')==0 )
expect(dataverseWord).not.toBeNull()
expect(links?.length).toEqual(2)

const workdwithlink = words.map( word => {
const dataverseWord = words.find( word => word.text.localeCompare('Dataverse')==0 )
expect(dataverseWord).not.toBeNull()

const workdwithlink =
words.map<WorkWithLinkTuple>( word => {
const result = links.find( link => matchLink( word, link ))
return [word, result ]
}).filter( ([ word, link ] ) => link!=null )
return [word, result]
})
.filter( ([ _, link ] ) => link!=null )

// console.log( workdwithlink )
expect(workdwithlink?.length).toEqual(4)
workdwithlink.forEach( lnk => console.log( 'workdwithlink', lnk) )
// console.log( workdwithlink )
expect(workdwithlink.length).toEqual(7)

})

})
})
})
13 changes: 8 additions & 5 deletions src/index.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
// import 'pdfjs-dist/es5/build/pdf.js';
import 'pdfjs-dist/legacy/build/pdf.js';
import fs from 'fs'
import { promisify } from 'util'
import path from 'path';
Expand All @@ -10,12 +8,16 @@ import { globals } from './pdf2md.global';
import { program } from 'commander'
import { assert } from 'console';
import { pdfToMarkdown } from './pdf2md.main';
import { getDocument, OPS } from 'pdfjs-dist';
import { getDocument, OPS } from 'pdfjs-dist/legacy/build/pdf.js'

// Some PDFs need external cmaps.
const CMAP_URL = "../../../node_modules/pdfjs-dist/cmaps/";
const CMAP_PACKED = true;

// Where the standard fonts are located.
const STANDARD_FONT_DATA_URL =
"../../../node_modules/pdfjs-dist/standard_fonts/";

const readFile = promisify(fs.readFile)
const checkFileExistsAsync = promisify(fs.access)
const mkdirAsync = promisify(fs.mkdir)
Expand Down Expand Up @@ -105,6 +107,7 @@ async function savePagesAsImages(pdfPath: string) {
data: data,
cMapUrl: CMAP_URL,
cMapPacked: CMAP_PACKED,
standardFontDataUrl: STANDARD_FONT_DATA_URL
}).promise

// const metadata = await pdfDocument.getMetadata()
Expand All @@ -129,15 +132,15 @@ async function savePagesAsImages(pdfPath: string) {
export async function run() {

const choosePath = ( pdfPath:any, cmdobj:any ) =>
( cmdobj.parent.outdir ) ?
( cmdobj.parent?.outdir ) ?
cmdobj.parent.outdir :
path.basename(pdfPath, '.pdf')

const {version} = require('../package.json')

program.version( version )
.name('pdftools')
.option('-o, --outdir [folder]', 'output folder')
.option('-o, --outdir [folder]', 'output folder' )

program.command('pdfximages <pdf>')
.description('extract images (as png) from pdf and save it to the given folder')
Expand Down
2 changes: 1 addition & 1 deletion src/pdf2md.image.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ import path from 'path'
import { promisify } from 'util'
import Jimp from 'jimp'
import { globals } from "./pdf2md.global"
import type { PDFPageProxy } from "pdfjs-dist/types/display/api"
import { PDFPageProxy } from 'pdfjs-dist/legacy/build/pdf.js'

enum PDFImageKind {
GRAYSCALE_1BPP = 1,
Expand Down
3 changes: 2 additions & 1 deletion src/pdf2md.link.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import type { PDFPageProxy } from 'pdfjs-dist/types/display/api';

import { PDFPageProxy } from 'pdfjs-dist/legacy/build/pdf.js'
import { Rect } from "./pdf2md.model"

/**
Expand Down
9 changes: 7 additions & 2 deletions src/pdf2md.main.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,16 @@ import { promisify } from 'util'
import { processPage, Page } from './pdf2md.page';
import { toMarkdown } from './pdf2md.markdown';
import { globals } from './pdf2md.global';
import { getDocument } from 'pdfjs-dist';
import { getDocument } from 'pdfjs-dist/legacy/build/pdf.js'

// Some PDFs need external cmaps.
const CMAP_URL = "../../../node_modules/pdfjs-dist/cmaps/";
const CMAP_PACKED = true;

// Where the standard fonts are located.
const STANDARD_FONT_DATA_URL =
"../../../node_modules/pdfjs-dist/standard_fonts/";

const readFile = promisify(fs.readFile)
const writeFile = promisify(fs.writeFile)

Expand All @@ -37,7 +41,8 @@ export async function pdfToMarkdown(pdfPath: string) {
const pdfDocument = await getDocument({
data: data,
cMapUrl: CMAP_URL,
cMapPacked: CMAP_PACKED
cMapPacked: CMAP_PACKED,
standardFontDataUrl: STANDARD_FONT_DATA_URL
}).promise

const numPages = pdfDocument.numPages
Expand Down
10 changes: 7 additions & 3 deletions src/pdf2md.page.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,10 @@ import { globals } from "./pdf2md.global";
import { writePageImageOrReuseOneFromCache } from "./pdf2md.image";
import { EnhancedWord, Rect, Word, Image, Font } from "./pdf2md.model";

import { OPS, Util } from 'pdfjs-dist';
import { OPS, PDFPageProxy, Util } from 'pdfjs-dist/legacy/build/pdf.js'
// doesn't work with parcel
import type { PDFPageProxy } from 'pdfjs-dist/types/display/api';
import { getLinks, matchLink } from "./pdf2md.link";
import { TextItem } from "pdfjs-dist/types/src/display/api";


type TransformationMatrix = [
Expand Down Expand Up @@ -340,7 +340,11 @@ export async function processPage(page: PDFPageProxy) {

const textContent = await page.getTextContent()

const words = textContent.items.map(item => {
const words = textContent.items
.filter( (item:any) => item.transform!==undefined )
.map( (item:any) => {

item = item as TextItem

const tx = Util.transform(viewport.transform, item.transform)

Expand Down
2 changes: 1 addition & 1 deletion src/pdfjs.d.ts
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@

}

type PDFLink = {
declare type PDFLink = {
x1:number
y1:number
x2:number
Expand Down

0 comments on commit bec516f

Please sign in to comment.