-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdocs_to_json.py
172 lines (126 loc) · 5.27 KB
/
docs_to_json.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
#!/usr/bin/env python3
"""
This is a utility for converting a directory tree of text files into JSON
format suitable for use by Clarity. It will recurse through the directory
tree rooted at the specified directory, load the text files, and write
the JSON result to stdout.
Run as follows to generate the file 'input.json' using files in the
directory tree rooted at "data". The files will be entered with the "Nursing"
report type, a source of "Columbia", and an index that starts at 3000000.
python3 ./docs_to_json.py -d "data" -i 3000000 -t "Nursing" -s "Columbia" > input.json
Upload input.json to Clarity's Amazon AWS instance with this command:
curl 'http://18.220.133.76:8983/solr/sample/update?commit=true' \
--data-binary @input.json -H 'Content-type:application/json'
"""
import os
import sys
import json
import optparse
import datetime
VERSION_MAJOR = 0
VERSION_MINOR = 1
MODULE_NAME = 'docs_to_json.py'
###############################################################################
def to_json(doc_list, index_start, report_type, source):
    """
    Generate JSON output using the strings in 'doc_list' for the
    'report_text' field.

    Each string in 'doc_list' becomes one JSON object. The 'id' and
    'report_id' fields hold consecutive integers (as strings) starting at
    'index_start', 'report_type' and 'source' are copied verbatim,
    'subject' is always "-1", and every object shares one UTC timestamp
    taken when this function is called.

    Returns the JSON array as a string indented by four spaces.
    Raises ValueError if 'index_start' cannot be converted to an int.
    """

    first_index = int(index_start)

    # The current datetime is used as the timestamp for all docs.
    # datetime.utcnow() is deprecated, so take an aware UTC time and drop
    # the tzinfo: isoformat() then emits no '+00:00' offset ahead of the
    # literal 'Z' suffix appended below.
    now = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)
    report_date = now.isoformat() + 'Z'

    dict_list = [
        {
            'report_type': report_type,
            'id': str(index),
            'report_id': str(index),
            'source': source,
            'report_date': report_date,
            'subject': "-1",
            'report_text': doc,
        }
        for index, doc in enumerate(doc_list, start=first_index)
    ]

    return json.dumps(dict_list, indent=4)
###############################################################################
def get_version():
    """Return the version banner: the module name, a space, then 'major.minor'."""
    version_number = '{0}.{1}'.format(VERSION_MAJOR, VERSION_MINOR)
    return '{0} {1}'.format(MODULE_NAME, version_number)
###############################################################################
def show_help():
    """
    Print the version banner followed by the command-line usage text.

    The usage synopsis lists -d, -i, -t and -s together because the script
    exits with an error if any one of them is missing.
    """

    print(get_version())
    print("""
    USAGE: python3 ./{0} -d <dirname> -i <integer> -t <type> -s <source> [-hv]

    OPTIONS:

        -d, --dir    <quoted string>  Path to directory containing docs to ingest.
        -i, --index  <integer>        Starting value for Solr document id
        -t, --type   <quoted string>  JSON report type field
        -s, --source <quoted string>  JSON source field

    FLAGS:

        -h, --help                    Print this information and exit.
        -v, --version                 Print version information and exit.

    """.format(MODULE_NAME))
###############################################################################
if __name__ == '__main__':

    # Script entry point: parse the command line, read every readable,
    # non-empty text file below the given directory, and write the
    # resulting JSON to stdout. Diagnostics go to stderr, because stdout is
    # normally redirected into the JSON file that gets uploaded.

    optparser = optparse.OptionParser(add_help_option=False)
    optparser.add_option('-d', '--dir', action='store', dest='directory')
    optparser.add_option('-i', '--index', action='store', dest='index')
    optparser.add_option('-t', '--type', action='store', dest='report_type')
    optparser.add_option('-s', '--source', action='store', dest='source')
    optparser.add_option('-v', '--version', action='store_true',
                         dest='get_version')
    optparser.add_option('-h', '--help', action='store_true',
                         dest='show_help', default=False)

    # positional leftovers (including the script name) are not used
    opts, _ = optparser.parse_args(sys.argv)

    # show help if no command line arguments
    if opts.show_help or len(sys.argv) == 1:
        show_help()
        sys.exit(0)

    if opts.get_version:
        print(get_version())
        sys.exit(0)

    directory = opts.directory
    if directory is None:
        print('error: a directory must be specified on the command line',
              file=sys.stderr)
        sys.exit(-1)

    if not os.path.isdir(directory):
        print('error: directory {0} does not exist'.format(directory),
              file=sys.stderr)
        sys.exit(-1)

    index = opts.index
    if index is None:
        print('error: a starting index must be specified on the command line',
              file=sys.stderr)
        sys.exit(-1)

    # report a malformed index as a usage error instead of a traceback
    try:
        index = int(index)
    except ValueError:
        print('error: starting index {0!r} is not an integer'.format(index),
              file=sys.stderr)
        sys.exit(-1)

    report_type = opts.report_type
    if report_type is None:
        print('error: a report type must be specified on the command line',
              file=sys.stderr)
        sys.exit(-1)

    source = opts.source
    if source is None:
        print('error: a source must be specified on the command line',
              file=sys.stderr)
        sys.exit(-1)

    docs = []

    # recurse through the file tree rooted at 'directory'
    for root, subdirs, files in os.walk(directory):
        for filename in files:
            filepath = os.path.join(root, filename)

            # Read text files; skip binaries and anything else causing an
            # error. The broad catch is a deliberate best-effort: one
            # unreadable or undecodable file must not abort the whole run.
            try:
                with open(filepath, 'r') as infile:
                    doc = infile.read()
            except Exception:
                continue

            # empty files contribute nothing and do not consume an index
            if not doc:
                continue

            # successfully read document, so add text to list
            docs.append(doc)

    # convert to JSON for import into Clarity Solr
    json_string = to_json(docs, index, report_type, source)
    print(json_string)