-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsplit-documents.py
59 lines (44 loc) · 1.41 KB
/
split-documents.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
#!/usr/bin/python
import sys
import os
import json
# Split files of line-delimited JSON documents into numbered output files.
#
# Usage: split-documents.py <input_directory> <output_directory>
#
# Every file in <input_directory> whose name contains neither ".log" nor
# ".hydra" is read line by line.  Each line is parsed as a JSON object and
# reduced to its "url" and "content" fields.  The reduced documents are
# written, newline-separated, to files named 1, 2, 3, ... in
# <output_directory>.  Batches never span two input files: whatever is left
# over at the end of an input file is written to its own (shorter) output file.

# Number of documents in a full output file.  The original script counted
# from 1 and flushed when the counter reached 1000, i.e. after 999 documents;
# that size is kept so existing outputs stay reproducible.
DOCUMENTS_PER_FILE = 999


def _write_batch(output_directory, file_number, records):
    """Write one batch of serialized documents and return the path written.

    records is a list of JSON strings; they are joined with newlines (no
    trailing newline) and written to <output_directory>/<file_number>.
    """
    output_filename = os.path.join(output_directory, str(file_number))
    print(output_filename)
    # The context manager closes the handle even if the write fails.
    with open(output_filename, "w") as out_file:
        out_file.write('\n'.join(records))
    return output_filename


def split_documents(input_directory, output_directory,
                    batch_size=DOCUMENTS_PER_FILE):
    """Split every eligible file in input_directory into batch files.

    Args:
        input_directory: directory holding the line-delimited JSON inputs.
            Entries whose name contains ".log" or ".hydra" are ignored.
        output_directory: existing directory that receives the numbered
            output files.
        batch_size: maximum number of documents per output file.

    Returns:
        The number of output files written.

    Raises:
        ValueError: if batch_size is smaller than 1, or if a non-blank input
            line is not valid JSON (raised by json.loads).
        KeyError: if a document lacks a "url" or "content" field.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    # os.listdir returns entries in arbitrary order; sorting makes the
    # numbering of the output files reproducible from run to run.
    filenames = sorted(
        name for name in os.listdir(input_directory)
        if ".log" not in name and ".hydra" not in name
    )

    next_file_number = 1
    for filename in filenames:
        full_filename = os.path.join(input_directory, filename)
        print(full_filename)
        batch = []
        # "with" guarantees the input handle is released; the original
        # opened one handle per input file and never closed any of them.
        with open(full_filename) as source:
            for line in source:
                if not line.strip():
                    # Blank lines carry no document; skip them instead of
                    # aborting the whole run on a JSON decode error.
                    continue
                document = json.loads(line)
                output_page = {"url": document['url'],
                               "content": document['content']}
                batch.append(json.dumps(output_page))
                if len(batch) == batch_size:
                    _write_batch(output_directory, next_file_number, batch)
                    next_file_number = next_file_number + 1
                    batch = []
        if batch:
            # The number printed is the leftover count plus one, which is the
            # value the original script's 1-based counter held at this point.
            print("Warning: some files remaining" + str(len(batch) + 1))
            _write_batch(output_directory, next_file_number, batch)
            next_file_number = next_file_number + 1
    return next_file_number - 1


def main(argv=None):
    """Command-line entry point; returns the process exit status."""
    if argv is None:
        argv = sys.argv
    if len(argv) < 3:
        sys.stderr.write(
            "usage: %s <input_directory> <output_directory>\n" % argv[0])
        return 2
    split_documents(argv[1], argv[2])
    return 0


if __name__ == "__main__":
    sys.exit(main())