-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgenerate_lib_report.py
executable file
·236 lines (189 loc) · 7.73 KB
/
generate_lib_report.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
#!/usr/local/bin/python3
"""Generate reports about a given library."""
import argparse
import collections
import lib_types
import report_formats
def gen_year_report(lib, format_module):
    """Build the per-year publication report.

    The table has one row per year plus a column with the number of papers
    published that year.  The years are taken from ``lib.get_years()`` and
    the layout is delegated to ``format_module.gen_year_report``; the pieces
    it returns are concatenated here.

    Return the report as a single text string.
    """
    render = format_module.gen_year_report
    years = lib.get_years()
    return "".join(render(lib, years))
def gen_journal_report(lib, format_module):
    """Build the per-journal publication report.

    The table has one row per journal and a column with the number of
    publications printed in that journal.  Layout is delegated to
    ``format_module.gen_journal_report``; the pieces it returns are
    concatenated here.

    Return the report as a single text string.
    """
    pieces = format_module.gen_journal_report(lib)
    return "".join(pieces)
def gen_tag_year_report(lib, format_module):
    """Build the tag-by-year report.

    The table has one row per year and one column per tag; each cell shows
    the number of papers a tag was attached to that year.  This function
    only prepares the per-tag totals and the tag ordering, then hands off
    to ``format_module.gen_tag_year_report`` for the layout.

    Return the report as a single text string.
    """
    # Total number of papers carrying each tag.
    pubs_per_tag = {tag: len(lib.get_pubs(tag=tag)) for tag in lib.get_tags()}

    # Most-used tags first.  sorted() stays stable with reverse=True, so
    # tags with equal counts keep their original relative order.
    ranked_tags = sorted(pubs_per_tag, key=pubs_per_tag.get, reverse=True)

    pieces = format_module.gen_tag_year_report(
        lib, ranked_tags, pubs_per_tag, lib.get_years())
    return "".join(pieces)
def gen_tag_count_date_range_report(
        lib, format_module,
        num_tag_column_groups,
        entry_start_date, entry_end_date):
    """Build the tag-count report for a range of entry dates.

    Each entry of the table shows a tag name and the number of papers
    tagged with that tag during the given date range.

    Arguments:
      lib: publication library; must provide ``get_tags()`` and
        ``get_pubs(tag=..., start_entry_date=..., end_entry_date=...)``.
      format_module: report format module; its
        ``gen_tag_count_date_range_report`` produces the report pieces.
      num_tag_column_groups: passed through to the format module unchanged.
      entry_start_date, entry_end_date: bounds of the entry-date range,
        passed through to ``lib.get_pubs`` and to the format module.

    Return the report as a single text string.
    """
    # Count the papers carrying each tag within the date range.
    n_papers_w_tag = {
        tag: len(lib.get_pubs(
            tag=tag,
            start_entry_date=entry_start_date,
            end_entry_date=entry_end_date))
        for tag in lib.get_tags()}

    # Sort tags by paper count (largest first), then case-insensitively by
    # name.  A tuple key keeps the count numeric, so the ordering is correct
    # for any count rather than only for counts that fit a fixed-width,
    # zero-padded string.
    tags_in_count_order = sorted(
        n_papers_w_tag,
        key=lambda tag: (-n_papers_w_tag[tag], tag.lower()))

    # The format module receives the counts already in display order.
    tags_ord_dict = collections.OrderedDict(
        (tag, n_papers_w_tag[tag]) for tag in tags_in_count_order)

    # Total number of papers in the date range, regardless of tag.
    n_total_papers = len(lib.get_pubs(
        start_entry_date=entry_start_date, end_entry_date=entry_end_date))

    report = format_module.gen_tag_count_date_range_report(
        tags_ord_dict, n_total_papers, lib,
        num_tag_column_groups,
        entry_start_date, entry_end_date)
    return "".join(report)
def gen_pubs_date_range_report(
        lib, format_module,
        entry_start_date, entry_end_date):
    """Build the list-of-publications report.

    The library and both entry-date bounds are handed to
    ``format_module.gen_pubs_date_range_report``; the pieces it returns are
    concatenated here.

    Return the report as a single text string.
    """
    render = format_module.gen_pubs_date_range_report
    return "".join(render(lib, entry_start_date, entry_end_date))
def get_args():
    """Parse command line arguments.

    Returns the ``argparse.Namespace`` produced by parsing ``sys.argv``.
    ``--libtype``, ``--inputlibpath``, ``--onlineliburl`` and
    ``--reportformat`` are mandatory; every report selector is an optional
    boolean flag that defaults to False.
    """
    arg_parser = argparse.ArgumentParser(
        description="Generate reports for a publication library.")

    # What to read, and how to render it.
    arg_parser.add_argument(
        "--libtype", required=True,
        help=(
            "What type of library are we reading in, and generating "
            + "the report for. Options are "
            + lib_types.get_lib_types_as_text_list()
            + "."))
    arg_parser.add_argument(
        "--inputlibpath", required=True,
        help="path to the library")
    arg_parser.add_argument(
        "--onlineliburl", required=True,
        help=(
            "Base URL of the online version of the library. Used to "
            + "generate links in reports."))
    arg_parser.add_argument(
        "--reportformat", required=True,
        help=(
            "What format generate the report in. Options are "
            + report_formats.get_formats_as_text_list()
            + "."))

    # Which reports to produce.
    arg_parser.add_argument(
        "--journal", required=False, action="store_true",
        help="Produce table showing number of papers in different journals.")
    arg_parser.add_argument(
        "--year", required=False, action="store_true",
        help="Produce table showing number of papers published each year.")
    arg_parser.add_argument(
        "--tagyear", required=False, action="store_true",
        help=(
            "Produce table showing number of papers with each tag, "
            + "each year."))
    arg_parser.add_argument(
        "--yeartag", required=False, action="store_true",
        help=(
            "Produce table showing number of papers with each year, "
            + "each tag."))
    arg_parser.add_argument(
        "--tagcountdaterange", required=False, action="store_true",
        help=(
            "Produce table showing number of papers that were tagged with "
            + "each tag during a given time perioud. --entrystartdate and "
            + "--entryenddate parameters are required if --tagcountdaterange "
            + "is specified."))
    # generate_lib_report() reads args.pubsdaterange, so the flag has to be
    # declared here; without it every run ends in an AttributeError.
    arg_parser.add_argument(
        "--pubsdaterange", required=False, action="store_true",
        help=(
            "Produce list of publications with entry dates in a given "
            + "date range. Uses the --entrystartdate and --entryenddate "
            + "parameters."))

    # Options that refine the reports above.
    arg_parser.add_argument(
        "--entrystartdate", required=False,
        help=(
            "--tagcountdaterange will report on papers with entry dates "
            + "greater than or equal to this date. Example: 2016-12-29"))
    arg_parser.add_argument(
        "--entryenddate", required=False,
        help=(
            "--tagcountdaterange will report on papers with entry dates "
            + "less than or equal to this date. Example: 2017-01-29"))
    arg_parser.add_argument(
        "--onlythesetags", required=False,
        help=(
            "Can either generate a report about all tags in the library, "
            + "or, only about a subset of tags. If this parameter is given "
            + "then only the tags listed in this file will be reported on. "
            + "List one tag per line."))
    arg_parser.add_argument(
        "--numtagcolumngroups", required=False, type=int, default=4,
        help=(
            "Specifies how many tags (and their counts) should be listed "
            + "in each row of a tag report. Default is 4."))

    return arg_parser.parse_args()
def generate_lib_report(args):
    """Load the library and print every report requested in ``args``.

    ``args`` is the namespace of parsed command line options.  The library
    module is selected by ``args.libtype`` and the format module by
    ``args.reportformat``.  Each requested report is written to standard
    output with ``print``, in the fixed order journal, year, tag/year,
    tag count by date range, publications by date range.

    NOTE(review): the ``--yeartag`` option is accepted on the command line
    but no report is generated for it here -- confirm whether that is
    intentional.

    Returns None.
    """
    lib_module = lib_types.get_lib_module(args.libtype)
    input_lib = lib_module.PubLibrary(args.inputlibpath, args.onlineliburl)

    # Setup fast access to lib on anything we might report on.
    input_lib.prep_for_reports(args.onlythesetags)

    # What format should the report be in?
    format_module = report_formats.get_format_module(args.reportformat)

    # Generate each report that was requested.
    if args.journal:
        print(gen_journal_report(input_lib, format_module))

    if args.year:
        print(gen_year_report(input_lib, format_module))

    if args.tagyear:
        print(gen_tag_year_report(input_lib, format_module))

    if args.tagcountdaterange:
        print(gen_tag_count_date_range_report(
            input_lib, format_module,
            args.numtagcolumngroups,
            args.entrystartdate, args.entryenddate))

    # Read defensively: a namespace that does not define "pubsdaterange"
    # must not abort the run with an AttributeError after the other reports
    # have already been printed.
    if getattr(args, "pubsdaterange", False):
        print(gen_pubs_date_range_report(
            input_lib, format_module,
            args.entrystartdate, args.entryenddate))

    return None
# Script entry point: parse the command line, then produce the reports.
if __name__ == "__main__":
    generate_lib_report(get_args())