-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathget-pdfs-of-issues.py
executable file
·160 lines (130 loc) · 5.25 KB
/
get-pdfs-of-issues.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Copyright 2014, 2015 Mark Longair
# This script is distributed under the terms of the GNU General Public License.
# See the COPYING file in this repository for the complete text of the license.
from distutils import spawn
import doctest
import errno
import hashlib
from optparse import OptionParser
import os
from os.path import dirname, exists, isdir, join, realpath, relpath, splitext
import re
import requests
import subprocess
import sys
import tempfile
from github import standard_headers, get_issues
# Fail fast: every PDF is produced by shelling out to pandoc, so there
# is no point in talking to GitHub if it isn't installed.
if not spawn.find_executable('pandoc'):
    # sys.stderr.write() behaves the same on Python 2 and 3, unlike the
    # Python 2-only "print >> sys.stderr" statement.
    sys.stderr.write("pandoc couldn't be found on PATH\n")
    sys.exit(1)
# This is a script that downloads all the GitHub issues in a
# particular repository and generates a PDF for each one; the idea is
# to produce easily printable versions of all the issues for a
# repository.
# To get pandoc to produce PDFs, I also needed to install:
#
# texlive-fonts-recommended
# texlive-latex-base
# texlive-latex-extra
# texlive-latex-recommended
# texlive-xetex
# Output locations: 'images' and 'pdfs' live next to this script, but
# are expressed relative to the directory the script was started from.
cwd = os.getcwd()
repo_directory = realpath(dirname(__file__))
images_directory, pdfs_directory = (
    relpath(join(repo_directory, leaf), cwd) for leaf in ('images', 'pdfs')
)
def mkdir_p(d):
    """Create directory 'd' and any missing parents, like "mkdir -p"

    It is not an error for 'd' to exist already as a directory.  Any
    other failure (including 'd' existing as something that is not a
    directory) is re-raised as the original OSError.
    """
    try:
        os.makedirs(d)
    except OSError as e:
        # Only swallow "already exists" when what exists really is a
        # directory; everything else is a genuine failure.
        if not (e.errno == errno.EEXIST and isdir(d)):
            raise
# Make sure both output directories exist before any issue is processed.
for _output_directory in (images_directory, pdfs_directory):
    mkdir_p(_output_directory)
def replace_image(match, download=True):
    r"""Rewrite an re match object that matched an image tag

    Download the image and return a version of the tag rewritten
    to refer to the local version.  The local version is named
    after the MD5sum of the URL, keeping the URL's extension, and is
    stored in images_directory.

    'match' must have the image caption as group 1 and the image URL
    as group 2.  If 'download' is false, or the local file already
    exists, nothing is fetched.

    Raises Exception if the URL has no file extension, and whatever
    requests raises if the download fails.

    >>> tag = '![caption](http://example.org/a/foo.png)'
    >>> m = re.search(r'\!\[(.*?)\]\((.*?)\)', 'an image coming up ' + tag)
    >>> rewritten = replace_image(m, download=False)
    >>> rewritten.startswith('![caption](') and rewritten.endswith('.png)')
    True
    """
    caption = match.group(1)
    url = match.group(2)
    # hashlib works on bytes; URLs taken from decoded JSON are text.
    url_bytes = url if isinstance(url, bytes) else url.encode('utf-8')
    hashed_url = hashlib.md5(url_bytes).hexdigest()
    extension = splitext(url)[1]
    if not extension:
        raise Exception(u"No extension at the end of {0}".format(url))
    image_filename = join(images_directory, hashed_url) + extension
    if download and not exists(image_filename):
        r = requests.get(url)
        # Don't cache an HTTP error page on disk as if it were the image.
        r.raise_for_status()
        # r.content is raw bytes, so the file must be opened in binary mode.
        with open(image_filename, 'wb') as f:
            f.write(r.content)
    return u"![{0}]({1})".format(caption, image_filename)
def replace_images(md):
    """Return the Markdown string 'md' with its image tags rewritten

    Every ![caption](url) tag found in 'md' is handed to replace_image,
    and the tag is replaced by whatever that function returns; all
    other text is left as it was.
    """
    image_tag = re.compile(r'\!\[(.*?)\]\((.*?)\)')
    return image_tag.sub(replace_image, md)
def _demote_headings(markdown, extra_hashes):
    """Prefix each line-initial run of '#' in 'markdown' with 'extra_hashes'

    None (e.g. an issue with no body) is treated as the empty string.
    """
    # re.MULTILINE makes '^' match at the start of every line; without
    # it only a heading on the very first line would be demoted.
    # NOTE(review): this also touches lines starting with '#' inside
    # code blocks - confirm that is acceptable for the printed output.
    return re.sub(r'^(#+)', extra_hashes + r'\1', markdown or u'',
                  flags=re.MULTILINE)


def main(repo):
    """Generate a PDF in pdfs_directory for each issue of 'repo'

    'repo' is passed straight to get_issues, which is expected to
    yield (number, title, body, issue) tuples.  Issues that already
    have a PDF, and issues that are really pull requests, are skipped.
    For each remaining issue a temporary Markdown file is written
    (title, reporter, body and then each comment, with images replaced
    by local copies) and converted to '<number>.pdf' with pandoc.

    Raises subprocess.CalledProcessError if pandoc fails, and whatever
    requests raises if fetching the comments fails.
    """
    for number, title, body, issue in get_issues(repo):
        pdf_filename = join(pdfs_directory, '{0}.pdf'.format(number))
        if exists(pdf_filename):
            continue
        # Skip pull requests before creating any temporary file, so
        # that skipping one leaves nothing behind.
        if 'pull_request' in issue and issue['pull_request']['html_url']:
            continue
        # A single parenthesised argument prints identically whether
        # print is a statement (Python 2) or a function (Python 3).
        print(u"Doing issue {0} {1}".format(number, title))

        parts = [
            u"# {0} {1}\n\n".format(number, title),
            u"### Reported by {0}\n\n".format(issue['user']['login']),
            # Increase the indent level of any Markdown heading
            replace_images(_demote_headings(body, u'#')),
            u"\n\n",
        ]
        if issue['comments'] > 0:
            # NOTE(review): only one request is made here; if the API
            # paginates comments, later pages are not fetched - confirm.
            comments_request = requests.get(issue['comments_url'],
                                            headers=standard_headers)
            comments_request.raise_for_status()
            for comment in comments_request.json():
                parts.append(u"### Comment from {0}\n\n".format(
                    comment['user']['login']))
                # Make sure any headings in comments are less
                # prominent than those we create around the comment.
                parts.append(replace_images(
                    _demote_headings(comment['body'], u'###')))
                parts.append(u"\n\n")

        # delete=False because pandoc has to reopen the file by name
        # after we have closed it; it is removed explicitly below.
        ntf = tempfile.NamedTemporaryFile(suffix='.md', delete=False)
        md_filename = ntf.name
        try:
            # The temporary file is opened in binary mode, so encode once.
            with ntf:
                ntf.write(u"".join(parts).encode('utf-8'))
            subprocess.check_call(['pandoc',
                                   '--latex-engine=xelatex',
                                   '-f',
                                   'markdown_github',
                                   md_filename,
                                   '-o',
                                   pdf_filename])
        finally:
            # Clean up even when pandoc fails.
            os.remove(md_filename)
# Command-line entry point: either run the doctests (-t/--test) or
# generate PDFs for the single REPOSITORY argument.
usage = """Usage: %prog [options] REPOSITORY
Repository should be username/repository from GitHub, e.g. mysociety/pombola"""
parser = OptionParser(usage=usage)
parser.add_option("-t", "--test",
                  action="store_true", dest="test", default=False,
                  help="Run doctests")
(options, args) = parser.parse_args()
if options.test:
    # testmod() reports failures through its return value rather than
    # by raising, so turn them into a non-zero exit status.
    if doctest.testmod().failed:
        sys.exit(1)
elif len(args) != 1:
    # Wrong number of arguments: show the help as before, but let the
    # caller see from the exit status that nothing was done.
    parser.print_help()
    sys.exit(2)
else:
    main(args[0])