-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathparser.py
245 lines (224 loc) · 10 KB
/
parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
import re, json, os, datetime, codecs
from copy import deepcopy
from lxml import html, etree
from tkinter import filedialog
from sys import exit
from time import strptime
# TO DO: DEFENSE ROLLS
def log_html(tag):
    """Debugging aid: dump ``tag`` to ``log.txt`` and terminate the program.

    The element is serialised with ``html.tostring(..., pretty_print=True)``
    and written in binary mode. ``exit()`` is called afterwards, so this
    function never returns to its caller.
    """
    print('Exporting tag as log.txt')
    with open('log.txt', 'wb') as dump:
        serialised = html.tostring(tag, pretty_print=True)
        dump.write(serialised)
    exit()
def get_webpage_from_file():
    """Ask the user for an HTML file and return it parsed by ``lxml.html``.

    A file-open dialog titled "HTML file to parse" is shown; the chosen file
    is read as UTF-8 text and handed to ``html.fromstring``.

    Returns:
        The root element produced by ``html.fromstring``.

    Raises:
        OSError: if the selected path cannot be opened.
    """
    filename = filedialog.askopenfilename(title='HTML file to parse')
    # NOTE(review): if the dialog is dismissed, `filename` is presumably empty
    # and `open` will raise -- confirm whether that should be handled here.
    # Read inside a context manager so the handle is closed deterministically
    # instead of being left to the garbage collector.
    with open(filename, 'r', encoding='utf-8') as page:
        markup = page.read()
    return html.fromstring(markup)
def remove_whitespace(text_list):
    """Join text fragments and collapse every whitespace run to one space.

    The fragments are concatenated first and normalised afterwards, so a run
    of whitespace that straddles two fragments (e.g. ``'foo '`` followed by
    ``' bar'``) is reduced to a single space as well. Leading and trailing
    whitespace is stripped from the result.

    Args:
        text_list: iterable of strings (for instance ``element.itertext()``).

    Returns:
        The normalised string; ``''`` when there are no fragments.
    """
    return re.sub(r'\s+', ' ', ''.join(text_list)).strip()
def get_roll_details(sheet_roll):
    """Return the text of the first inline roll's ``title`` tooltip.

    The first descendant with class ``inlinerollresult`` is located, its
    ``title`` attribute is parsed as an HTML fragment, and the fragment's
    text is returned with whitespace normalised by ``remove_whitespace``.

    Raises:
        IndexError: if ``sheet_roll`` contains no ``inlinerollresult`` element.
    """
    first_roll = sheet_roll.find_class('inlinerollresult')[0]
    tooltip = html.fromstring(first_roll.get('title'))
    details = remove_whitespace(tooltip.itertext())
    return details.strip()
def get_text(message):
    """Return the normalised text of the first ``sheet-left`` element.

    Raises:
        IndexError: if ``message`` has no element with class ``sheet-left``.
    """
    left_cell = message.find_class('sheet-left')[0]
    fragments = left_cell.itertext()
    return remove_whitespace(fragments).strip()
def parse_sheet_rolls(sheet_rolls, parsed_message):
    """Fill ``parsed_message`` (in place) from generic skill-roll templates.

    Sets ``type`` to ``'skill roll'`` and, for each template, the keys
    ``text``, ``roll_detail``, ``result`` and ``notes``. When several
    templates are given, later ones overwrite the values of earlier ones.
    """
    parsed_message['type'] = 'skill roll'
    for template in sheet_rolls:
        parsed_message['text'] = get_text(template)
        parsed_message['roll_detail'] = get_roll_details(template)
        # Cell 0 is the left-hand "check" label; cell 1 carries the result.
        result_cell = template.find_class('sheet-roll-cell')[1]
        parsed_message['result'] = result_cell.text_content().strip()
        # Notes are optional on skill checks.
        note_nodes = template.find_class('sheet-roll-description')
        parsed_message['notes'] = (
            note_nodes[0].text_content().strip() if note_nodes else ''
        )
def parse_sheet_attacks(attacks, parsed_message):
    """Fill ``parsed_message`` (in place) from attack roll templates.

    Sets ``type`` to ``'attack'`` and ``attacks`` to a list with one dict per
    roll row (keys ``name``, ``roll_detail``, ``result``), accumulated over
    all templates. Per template it also sets ``text`` and ``notes``, and
    ``roll_detail`` when a roll tooltip could be read. Extraction is best
    effort: pieces that cannot be read are stored as ``''``.

    Args:
        attacks: iterable of attack template elements.
        parsed_message: dict that receives the parsed fields.
    """
    attack_rolls = []
    parsed_message['type'] = 'attack'
    for attack in attacks:
        # The raw text carries a lot of whitespace; get_text normalises it.
        # Catch Exception rather than using a bare except so that
        # KeyboardInterrupt and SystemExit are no longer swallowed.
        try:
            parsed_message['text'] = get_text(attack)
        except Exception:
            parsed_message['text'] = ''
        rows = attack.find_class('sheet-roll-row')
        # The last two rows are damage type and notes, handled separately.
        for row in rows[:-2]:
            children = row.getchildren()
            attack_roll = {'name': children[0].text_content().strip()}
            try:
                # NOTE(review): the tooltip is looked up on the whole attack,
                # so every row receives the first inline roll's details --
                # confirm whether `row` was intended here.
                attack_roll['roll_detail'] = parsed_message['roll_detail'] = get_roll_details(attack)
                attack_roll['result'] = children[1].text_content().strip()
            except Exception:
                attack_roll['roll_detail'] = ''
                attack_roll['result'] = ''
            attack_rolls.append(attack_roll)
        # The final row holds the notes; a template without rows has none.
        parsed_message['notes'] = rows[-1].text_content().strip() if rows else ''
    # Assigned after the loop so the key exists even when `attacks` is empty.
    parsed_message['attacks'] = attack_rolls
def parse_defences(defences, parsed_message):
    """Fill ``parsed_message`` (in place) from defence roll templates.

    Sets ``type`` to ``'defence'`` and, for each template, ``roll_detail``,
    ``result`` and ``text``; later templates overwrite earlier ones.
    """
    parsed_message['type'] = 'defence'
    for template in defences:
        # The roll breakdown lives in the inline roll's title tooltip.
        parsed_message['roll_detail'] = get_roll_details(template)
        result_cell = template.find_class('sheet-roll-cell')[1]
        parsed_message['result'] = result_cell.text_content().strip()
        # The descriptive text sits in the child with class sheet-left.
        left_cell = template.find_class('sheet-left')[0]
        parsed_message['text'] = remove_whitespace([left_cell.text_content()])
def parse_spells(spells, parsed_message):
    """Fill ``parsed_message`` (in place) from spell templates.

    Sets ``type`` to ``'spell'`` and ``text`` to the left-hand text of the
    last template it could be read from. If no template yields text and the
    dict has no ``text`` yet, ``text`` is set to ``''`` so the key is always
    present.
    """
    parsed_message['type'] = 'spell'
    for spell in spells:
        # TODO: handle the spell template properly instead of only grabbing
        # its left-hand text.
        try:
            parsed_message['text'] = get_text(spell)
        except Exception:
            # Best effort: skip templates whose text cannot be read, without
            # swallowing KeyboardInterrupt/SystemExit as a bare except would.
            continue
    parsed_message.setdefault('text', '')
def parse_abilities(abilities, parsed_message):
    """Fill ``parsed_message`` (in place) from ability templates.

    Sets ``type`` to ``'ability'`` and, for each template, ``text`` and
    ``result``. ``result`` is ``''`` when the template has fewer than two
    ``sheet-roll-cell`` elements.

    Raises:
        IndexError: propagated from ``get_text`` when a template has no
            ``sheet-left`` element.
    """
    parsed_message['type'] = 'ability'
    for ability in abilities:
        parsed_message['text'] = get_text(ability)
        # An explicit length check replaces the former bare except, which
        # also hid unrelated errors and KeyboardInterrupt.
        cells = ability.find_class('sheet-roll-cell')
        parsed_message['result'] = (
            cells[1].text_content().strip() if len(cells) > 1 else ''
        )
def parse_log():
    """Parse a saved chat-log HTML page into a JSON string.

    The user picks the file via ``get_webpage_from_file``. Every element with
    class ``message`` becomes a dict with at least ``owner``, ``timestamp``,
    ``type`` and ``text`` (the spell parser may omit ``text``); further keys
    depend on the message type. Messages that are neither ``general``,
    ``emote``, ``desc`` nor ``rollresult`` are skipped.

    The resulting list is written to ``output.txt`` and also returned.

    Returns:
        str: the chat log serialised with ``json.dumps``.
    """
    chatlog = []
    print('Importing file...')
    messages_html = get_webpage_from_file().find_class('message')
    print('Parsing chatlog...')
    # Owner and timestamp are only present on some messages; the most recent
    # values are carried forward. cur_owner starts as '' so a log whose first
    # message has no owner no longer fails with UnboundLocalError.
    cur_owner = ''
    cur_time = None
    for msg in messages_html:
        parsed_message = {}
        temp = deepcopy(msg)
        msg_classes = list(temp.classes)
        # add message owner
        owner = temp.find_class('by')
        if owner:
            # last character will always be ":" which we don't want
            cur_owner = owner[0].text_content()[:-1]
        parsed_message['owner'] = cur_owner
        # add message timestamp
        timestamp = temp.find_class('tstamp')
        if timestamp:
            cur_time = timestamp[0].text_content()  # '%B %d, %Y %I:%M%p'
        parsed_message['timestamp'] = cur_time
        if 'general' in msg_classes:
            sheet_rolls = temp.find_class('sheet-rolltemplate-pf_generic')
            attacks = temp.find_class('sheet-rolltemplate-pf_attack')
            spells = temp.find_class('sheet-rolltemplate-pf_spell')
            abilities = temp.find_class('sheet-rolltemplate-pf_ability')
            defences = temp.find_class('sheet-rolltemplate-pf_defense')
            if sheet_rolls:
                parse_sheet_rolls(sheet_rolls, parsed_message)
            elif attacks:
                parse_sheet_attacks(attacks, parsed_message)
            elif spells:
                parse_spells(spells, parsed_message)
            elif abilities:
                parse_abilities(abilities, parsed_message)
            elif defences:
                parse_defences(defences, parsed_message)
            else:
                parsed_message['type'] = 'message'
                # A spacer child marks the first message from a new user.
                if temp.find_class('spacer'):
                    # Drop the leading header fragments so only the message
                    # body remains.
                    # NOTE(review): the original comment speaks of four
                    # leading children (spacer, avatar, timestamp, owner)
                    # while the slice skips three text fragments -- confirm.
                    parsed_message['text'] = ''.join(list(temp.itertext())[3:]).strip()
                else:
                    # Same user, new message: no header children, the tag
                    # has just text.
                    parsed_message['text'] = temp.text_content().strip()
        else:
            # Non-general messages are recorded without an owner.
            parsed_message['owner'] = ''
            if 'emote' in msg_classes:
                parsed_message['type'] = 'action'
                parsed_message['text'] = temp.text_content().strip()
            elif 'desc' in msg_classes:
                parsed_message['type'] = 'description'
                parsed_message['text'] = temp.text_content().strip()
            elif 'rollresult' in msg_classes:
                parsed_message['type'] = 'roll'
                parsed_message['formula'] = temp.find_class('formula')[0].text_content().strip()
                parsed_message['rolls'] = ','.join(
                    roll.find_class('didroll')[0].text_content()
                    for roll in temp.find_class('diceroll')
                )
                parsed_message['result'] = temp.find_class('rolled')[0].text_content().strip()
            else:
                continue
        chatlog.append(parsed_message)
    # Serialise once and reuse the string for both the file and the caller.
    serialised = json.dumps(chatlog)
    with open('output.txt', 'w', encoding='utf8') as output:
        output.write(serialised)
    return serialised
def export_dialogue_lines(chatlog):
    """Return the spoken and narrated lines of a parsed chat log.

    Entries of type ``'message'`` become ``'<owner>: <text>'``; entries of
    type ``'description'`` or ``'action'`` contribute their text unchanged.
    All other entries are ignored. Missing or null ``owner``/``text`` values
    are treated as ``''`` so one incomplete entry does not abort the export.

    Args:
        chatlog: iterable of dicts as produced by the log parser.

    Returns:
        list[str]: one string per exported line, in input order.
    """
    lines = []
    for entry in chatlog:
        kind = entry.get('type')
        text = entry.get('text') or ''
        if kind == 'message':
            owner = entry.get('owner') or ''
            lines.append(owner + ': ' + text)
        elif kind in ('description', 'action'):
            lines.append(text)
    return lines
def export_dialogue(chatlog_json):
    """Write the dialogue lines of a JSON chat log to ``dialogue.txt``.

    ``chatlog_json`` is decoded with ``json.loads``, reduced to its dialogue
    lines by ``export_dialogue_lines`` and written UTF-8 encoded, one line
    per entry, into ``dialogue.txt`` in the current working directory.
    """
    print('Exporting dialogue...')
    dialogue = export_dialogue_lines(json.loads(chatlog_json))
    with open('dialogue.txt', 'w', encoding='utf-8') as out:
        for entry in dialogue:
            out.write(entry + '\n')
def _write_session(session_date, entries):
    """Dump one session's entries as JSON to ``YYYY-MM-DD.txt`` in the cwd.

    When ``session_date`` is ``None`` (no timestamp could be parsed) the file
    is named ``undated.txt`` instead.
    """
    if session_date is None:
        filename = 'undated'
    else:
        filename = '-'.join([
            str(session_date.year),
            str(session_date.month).zfill(2),
            str(session_date.day).zfill(2),
        ])
    print('Writing ' + filename + '.txt...')
    with open(filename + '.txt', 'w', encoding='utf-8') as output_file:
        output_file.write(json.dumps(entries))


def split_sessions(chatlog_json):
    """Split a JSON chat log into one file per calendar day.

    The ``./sessions`` directory is created, or emptied if it already exists,
    and the process changes into it. Consecutive entries whose timestamp falls
    on the same date are grouped and written as JSON to ``YYYY-MM-DD.txt``.
    Entries whose timestamp is missing or does not match
    ``'%B %d, %Y %I:%M%p'`` are kept in the session currently being collected.

    An empty chat log writes no files. If no entry has a parsable timestamp,
    everything is written to ``undated.txt``.

    Args:
        chatlog_json: JSON string holding a list of message dicts.
    """
    chatlog = json.loads(chatlog_json)
    if os.path.exists('./sessions'):
        print('Emptying folder...')
        for entry in os.scandir('./sessions'):
            os.unlink(entry.path)
    else:
        os.mkdir('./sessions')
    # NOTE(review): the working directory is changed and never restored, so
    # relative paths opened after this call resolve inside ./sessions --
    # confirm that this is intended.
    os.chdir('./sessions')
    cur_date = None
    session_log = []
    print('Exporting chatlog...')
    for line in chatlog:
        try:
            timestamp = datetime.datetime.strptime(
                line['timestamp'], '%B %d, %Y %I:%M%p'
            ).replace(hour=0, minute=0)
        except (KeyError, TypeError, ValueError):
            # Sometimes the timestamp is missing the date part (or missing
            # altogether); such a line belongs to the current session.
            timestamp = cur_date
        if cur_date is None:
            cur_date = timestamp
        if timestamp != cur_date:
            # The date changed: flush the finished session and start a new one.
            _write_session(cur_date, session_log)
            cur_date = timestamp
            session_log = []
        session_log.append(line)
    # Export the last session, which the loop never flushes.
    if session_log:
        _write_session(cur_date, session_log)
# Run the pipeline only when executed as a script, so importing this module
# (e.g. to reuse the parsers) does not pop up a file dialog or write files.
# split_sessions changes the working directory to ./sessions, so the
# dialogue.txt written by export_dialogue ends up in that folder.
if __name__ == '__main__':
    chatlog = parse_log()
    split_sessions(chatlog)
    export_dialogue(chatlog)