-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdetector.py
93 lines (68 loc) · 3.13 KB
/
detector.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
#!/usr/bin/env python
"""Random baseline for the PAN19 hyperpartisan news detection task"""
# Version: 2018-09-24
# Parameters:
# --inputDataset=<directory>
# Directory that contains the articles XML file with the articles for which a prediction should be made.
# --outputDir=<directory>
# Directory to which the predictions will be written. Will be created if it does not exist.
from __future__ import division
import os
import getopt
import sys
import xml.sax
import random
# Name of the file, inside the output directory, that receives the predictions.
runOutputFileName = "prediction.txt"

# Fixed seed so that repeated runs draw the same pseudo-random sequence.
random.seed(42)
def parse_options():
    """Parse the command line options and validate the two directories.

    Options are read from ``sys.argv[1:]``:
      -d / --inputDataset  directory that contains the articles XML file(s)
      -o / --outputDir     directory to which the predictions are written;
                           it is created, including missing parent
                           directories, when it does not exist yet

    Returns:
        A tuple ``(inputDataset, outputDir)``.

    Exits via ``SystemExit`` when the options cannot be parsed (status 2),
    when one of the two options is missing, or when the input dataset is
    not an existing directory.
    """
    try:
        long_options = ["inputDataset=", "outputDir="]
        opts, _ = getopt.getopt(sys.argv[1:], "d:o:", long_options)
    except getopt.GetoptError as err:
        print(str(err))
        sys.exit(2)

    # None marks "option not given"; a magic string would clash with a
    # directory that really carries that name.
    inputDataset = None
    outputDir = None

    for opt, arg in opts:
        if opt in ("-d", "--inputDataset"):
            inputDataset = arg
        elif opt in ("-o", "--outputDir"):
            outputDir = arg
        else:
            # Explicit raise instead of `assert False` so the check is not
            # stripped when Python runs with -O.
            raise AssertionError("Unknown option.")

    if inputDataset is None:
        sys.exit("Input dataset, the directory that contains the articles XML file, is undefined. Use option -d or --inputDataset.")
    elif not os.path.isdir(inputDataset):
        # isdir (not exists): the caller lists the directory, so a plain
        # file is rejected here as well.
        sys.exit("The input dataset folder does not exist (%s)." % inputDataset)

    if outputDir is None:
        sys.exit("Output path, the directory into which the predictions should be written, is undefined. Use option -o or --outputDir.")
    elif not os.path.isdir(outputDir):
        # makedirs also creates missing parents, which mkdir would not.
        os.makedirs(outputDir)

    return (inputDataset, outputDir)
########## SAX ##########
class HyperpartisanNewsRandomPredictor(xml.sax.ContentHandler):
    """SAX content handler that writes one random prediction per article.

    For every ``<article>`` start tag a line is written to ``outFile`` in
    the format ``<article id> <prediction> <confidence>``:
      - prediction is either "true" (hyperpartisan) or "false"
        (not hyperpartisan), chosen at random
      - confidence is a random float from ``random.random()``; the higher,
        the more confident the predictor claims to be
    All other elements are ignored.
    """

    def __init__(self, outFile):
        """Store the writable text stream that receives the predictions."""
        xml.sax.ContentHandler.__init__(self)
        self.outFile = outFile

    def startElement(self, name, attrs):
        """Write a random prediction line when an ``<article>`` tag opens.

        Raises ``KeyError`` (from ``attrs.getValue``) when the article
        element has no ``id`` attribute.
        """
        if name != "article":
            return

        # id of the article for which hyperpartisanship should be predicted
        articleId = attrs.getValue("id")
        # Draw the prediction before the confidence: the order of the two
        # random calls determines the output for a given seed.
        prediction = random.choice(["true", "false"])
        confidence = random.random()
        self.outFile.write(articleId + " " + prediction + " " + str(confidence) + "\n")
########## MAIN ##########
def main(inputDataset, outputDir):
    """Write a random prediction for every article in the input dataset.

    Every file in ``inputDataset`` whose name ends with ".xml" is parsed
    with SAX, and the predictions are written to the file named by
    ``runOutputFileName`` inside ``outputDir``.

    Parameters:
        inputDataset: directory that contains the articles XML file(s).
        outputDir: existing directory that receives the prediction file.
    """
    outputPath = os.path.join(outputDir, runOutputFileName)
    with open(outputPath, 'w') as outFile:
        handler = HyperpartisanNewsRandomPredictor(outFile)
        # os.listdir returns entries in arbitrary order; sorting keeps the
        # sequence of random draws, and so the output, stable across runs.
        for fileName in sorted(os.listdir(inputDataset)):
            if not fileName.endswith(".xml"):
                continue
            # Binary mode lets the XML parser apply the encoding declared
            # in the document instead of the locale's default text encoding.
            with open(os.path.join(inputDataset, fileName), 'rb') as inputRunFile:
                xml.sax.parse(inputRunFile, handler)

    print("The predictions have been written to the output folder.")
# Script entry point: parse the command line, then run the baseline.
if __name__ == "__main__":
    options = parse_options()
    main(*options)