#!/usr/bin/env python
"""
authorship.py: this file provides a template for practicing authorship attribution (classification) in Python.
The steps in this template can be used for any simple classification task. It contains the following steps:
- loading the data
- splitting it into a train, validation and test set
- extracting the features (exercise)
- classification (an SVM here, but easily replaced with other classifiers)
- evaluation (recall, precision, F1)
Good luck and have fun!
"""
__author__ = "Simon Brugman, Christoph Aurnhammer"
__license__ = "MIT"
from nltk.corpus import stopwords
from nltk.tokenize import wordpunct_tokenize, sent_tokenize
import nltk
import sklearn.datasets
import sklearn.metrics
import sklearn.model_selection
from sklearn.svm import SVC

# Download 'stopwords' and 'punkt' from the Natural Language Toolkit;
# you can comment out the next two lines if they are already present.
nltk.download('stopwords')
nltk.download('punkt')

stop_words = set(stopwords.words('english'))
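# For intuition: stop_words now holds high-frequency English function words
# ("the", "and", "of", ...) that extract_features below can filter out.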
# Load the dataset into memory from the filesystem
def load_data(dir_name):
    return sklearn.datasets.load_files('data/%s' % dir_name, encoding='utf-8')


def load_train_data():
    return load_data('train')


def load_test_data():
    return load_data('test')
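# For reference: load_files returns a scikit-learn Bunch whose .data holds the
# raw texts, .target the numeric labels, .target_names the class (folder)
# names, and .filenames the file paths; main() below uses .data, .target and
# .filenames. This assumes a data/train/<author>/<document> directory layout.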
# Extract features from a given text
def extract_features(text):
    bag_of_words = wordpunct_tokenize(text)
    features = []
    # Example feature 1: count the number of words
    features.append(len(bag_of_words))
    # Example feature 2: count the number of words, excluding stopwords
    features.append(len([x for x in bag_of_words if x.lower() not in stop_words]))
    # TODO: Follow the instructions in the assignment and add your own features.
    return features
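# A possible extension of extract_features, sketched purely as an illustration
# of the TODO above; these particular features (average token length, sentence
# count) are assumptions, not the ones the assignment asks for. Note that
# sent_tokenize is already imported at the top of this file.
def extract_extra_features(text):
    bag_of_words = wordpunct_tokenize(text)
    features = []
    # Average token length; guard against empty texts to avoid division by zero
    features.append(sum(len(x) for x in bag_of_words) / max(len(bag_of_words), 1))
    # Number of sentences, a simple proxy for sentence-level style
    features.append(len(sent_tokenize(text)))
    return features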
# Classify using the features
def classify(train_features, train_labels, test_features):
    # TODO: (Optional) If you would like to test how different classifiers
    # TODO: perform, you can swap out the classifier here.
    clf = SVC(kernel='linear')
    clf.fit(train_features, train_labels)
    return clf.predict(test_features)
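# A sketch of the optional TODO above: any scikit-learn estimator with the
# same fit/predict interface can be dropped in. The random forest chosen here
# is an assumption for illustration, not the classifier the assignment
# prescribes.
def classify_forest(train_features, train_labels, test_features):
    from sklearn.ensemble import RandomForestClassifier
    clf = RandomForestClassifier(n_estimators=100, random_state=0)
    clf.fit(train_features, train_labels)
    return clf.predict(test_features)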
# Evaluate predictions (y_pred) given the ground truth (y_true)
def evaluate(y_true, y_pred):
    # TODO: What is being evaluated here and what does it say about the performance?
    # TODO: Include or change the evaluation if necessary.
    recall = sklearn.metrics.recall_score(y_true, y_pred, average='macro')
    print("Recall: %f" % recall)
    precision = sklearn.metrics.precision_score(y_true, y_pred, average='macro')
    print("Precision: %f" % precision)
    f1_score = sklearn.metrics.f1_score(y_true, y_pred, average='macro')
    print("F1-score: %f" % f1_score)
    return recall, precision, f1_score
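# Worked example of the macro averaging above (toy labels, for illustration
# only): with y_true = [0, 0, 1, 1] and y_pred = [0, 1, 1, 1], per-class
# recall is 1/2 for class 0 and 2/2 for class 1, so the macro recall is
# (0.5 + 1.0) / 2 = 0.75; each class counts equally, regardless of its size.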
# The main program
def main():
    train_data = load_train_data()
    # Extract the features
    features = list(map(extract_features, train_data.data))
    # Classify and evaluate
    skf = sklearn.model_selection.StratifiedKFold(n_splits=10)
    scores = []
    for fold_id, (train_indexes, validation_indexes) in enumerate(skf.split(train_data.filenames, train_data.target)):
        # Print the fold number
        print("Fold %d" % (fold_id + 1))
        # Collect the data for this train/validation split
        train_features = [features[x] for x in train_indexes]
        train_labels = [train_data.target[x] for x in train_indexes]
        validation_features = [features[x] for x in validation_indexes]
        validation_labels = [train_data.target[x] for x in validation_indexes]
        # Classify and collect the scores so they can be averaged later
        y_pred = classify(train_features, train_labels, validation_features)
        scores.append(evaluate(validation_labels, y_pred))
        # Print a newline
        print("")
    # Print the averaged scores
    recall = sum([x[0] for x in scores]) / len(scores)
    print("Averaged total recall", recall)
    precision = sum([x[1] for x in scores]) / len(scores)
    print("Averaged total precision", precision)
    f_score = sum([x[2] for x in scores]) / len(scores)
    print("Averaged total f-score", f_score)
    print("")

    # TODO: Once you are done crafting your features and tuning your model, also test on the test set and report your
    # TODO: findings. How does the score differ from the validation score? And why do you think this is?
    # test_data = load_test_data()
    # test_features = list(map(extract_features, test_data.data))
    #
    # y_pred = classify(features, train_data.target, test_features)
    # evaluate(test_data.target, y_pred)
# This pattern is common practice in Python: roughly, "if this file is being
# run as the main program, execute the code below". The advantage is that
# main() will not run when functions from this file are imported into another
# file, which is useful in larger projects.
if __name__ == '__main__':
    main()