forked from fishioon/douyu
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdata_utils.py
30 lines (26 loc) · 1022 Bytes
/
data_utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
# -*- coding:utf-8 -*-
import pandas as pd
import re
from collections import OrderedDict
def load_data(dpath):
    """Parse a '#'-delimited text file into a gift table and a message table.

    Every line is split on '#'. Lines that do not yield exactly three fields
    are skipped; this includes lines whose third field itself contains a '#'.

    A line is stored as a gift record when its third field starts with '@'
    and contains at least one ASCII digit; the leading '@' is removed before
    the record is stored. Every other three-field line is stored as a
    message record with its third field untouched.

    Args:
        dpath: Path of the UTF-8 encoded file to read.

    Returns:
        A ``(gifts, messages)`` tuple of ``pandas.DataFrame`` objects with
        columns ``['name', 'level', 'gift_id']`` and
        ``['name', 'level', 'content']`` respectively. All fields are kept
        as text; nothing is converted to a number.

    Raises:
        IOError / OSError: If the file cannot be opened.
        UnicodeDecodeError: If a line is not valid UTF-8.
    """
    gift_rows = []
    msg_rows = []
    # Same predicate as matching ``.*[0-9]+.*`` from the start of the string:
    # a line never contains an interior '\n', so "contains a digit" is enough.
    # Compiled once instead of being looked up on every line.
    has_digit = re.compile(r'[0-9]').search

    # Read bytes and decode explicitly. This runs unchanged on Python 2 and 3
    # (no ``xreadlines``/``unicode``/``ur''``) and keeps the original order of
    # operations: strip ASCII whitespace first, then decode as UTF-8.
    with open(dpath, 'rb') as f:
        for raw in f:
            fields = raw.strip().decode('utf-8').split(u'#')
            if len(fields) != 3:
                continue
            payload = fields[2]
            if payload.startswith(u'@') and has_digit(payload):
                fields[2] = payload[1:]  # drop the '@' marker
                gift_rows.append(fields)
            else:
                msg_rows.append(fields)

    gifts = pd.DataFrame(gift_rows, columns=['name', 'level', 'gift_id'])
    messages = pd.DataFrame(msg_rows, columns=['name', 'level', 'content'])
    return gifts, messages
def get_gift_dist(dgift):
    """Count how many rows carry each distinct ``gift_id`` value.

    Args:
        dgift: Table exposing a ``'gift_id'`` column.

    Returns:
        The result of ``value_counts()`` on that column: one entry per
        distinct gift id, holding the number of rows with that id.
    """
    gift_ids = dgift['gift_id']
    return gift_ids.value_counts()
def get_level_dist(dgift, gift_id):
    """Return the share of each ``level`` value among rows of one gift.

    Args:
        dgift: Table exposing ``'gift_id'`` and ``'level'`` columns.
        gift_id: Value compared (with ``==``) against the ``'gift_id'``
            column to select rows.

    Returns:
        A Series indexed by the distinct ``level`` values of the selected
        rows. Each entry is that level's row count divided by the total
        number of selected rows, sorted from largest share to smallest.
    """
    matches_gift = dgift['gift_id'] == gift_id
    levels = dgift[matches_gift]['level']
    counts = levels.value_counts()
    shares = counts / counts.sum()
    return shares.sort_values(ascending=False)