Commands used to carry out the experiments
####
Prerequisites:
Python 2.7, Gensim, SPARQLWrapper
Word2vec: https://code.google.com/archive/p/word2vec/
CoNLL NER dataset creation scripts: http://www.cnts.ua.ac.be/conll2003/ner/ / http://www.cnts.ua.ac.be/conll2003/ner.tgz
Data:
Pretrained GoogleNews Word2Vec model: https://code.google.com/archive/p/word2vec/ / https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit?usp=sharing
Pretrained Wikinews Word2Vec model: https://github.com/idio/wiki2vec / https://github.com/idio/wiki2vec/raw/master/torrents/enwiki-gensim-word2vec-1000-nostem-10cbow.torrent
Reuters RCV1: http://trec.nist.gov/data/reuters/reuters.html
Benchmark datasets: https://github.com/dbpedia-spotlight/evaluation-datasets/tree/master/data
Additional topic information for AIDA-YAGO2 dataset:
rcv1.topics.hier.orig and rcv1-v2.topics.qrels.gz: http://www.jmlr.org/papers/volume5/lewis04a/lyrl2004_rcv1v2_README.htm
DBpedia ontology 2015-10: http://downloads.dbpedia.org/2015-10/dbpedia_2015-10.owl
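As a quick sanity check of the setup, the two pretrained models can be loaded with Gensim before running anything else. This is a minimal sketch, not one of the repository scripts; the wiki2vec filename and its DBPEDIA_ID/ key convention are assumptions:
# sanity check (illustrative only): load the pretrained models with Gensim
from gensim.models import Word2Vec

# GoogleNews vectors, word2vec binary format
# (newer Gensim versions use KeyedVectors.load_word2vec_format instead)
google = Word2Vec.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)
print(google.similarity('king', 'queen'))

# wiki2vec model, distributed as a pickled Gensim model; entity keys are assumed
# to follow the wiki2vec DBPEDIA_ID/ convention, and the path is a placeholder
wiki = Word2Vec.load('en.model')
print(wiki.similarity('DBPEDIA_ID/Barack_Obama', 'DBPEDIA_ID/United_States'))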
###############
# Training a Word2Vec model from the RCV1 data
###############
# Extract text from Reuters articles
# cd to directory with Reuters articles
for x in *xml ; do python getReutersText.py $x >> reuters.txt ; done
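# getReutersText.py is part of this repo; as a rough illustration, extracting the text of one
# RCV1 newsitem XML file looks roughly like the sketch below (element names such as <headline>
# and <text>/<p> are assumptions about the RCV1 markup, not taken from the actual script):
# --- sketch (Python), not the original getReutersText.py ---
import sys
import xml.etree.ElementTree as ET

root = ET.parse(sys.argv[1]).getroot()
parts = []
headline = root.find('headline')              # assumed RCV1 headline element
if headline is not None and headline.text:
    parts.append(headline.text)
for p in root.findall('.//text/p'):           # assumed body paragraphs
    if p.text:
        parts.append(p.text)
sys.stdout.write(' '.join(parts).encode('utf-8') + '\n')   # Python 2; drop encode() on Python 3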
# Create the model (same settings as the GoogleNews model, see https://groups.google.com/forum/#!msg/word2vec-toolkit/lxbl_MB29Ic/g4uEz5rNV08J, minus -read-vocab voc, which isn't available here)
./word2vec -train reuters.txt -output rcv1vectors.bin -cbow 1 -size 300 -window 5 -negative 3 -hs 0 -sample 1e-5 -threads 12 -binary 1 -min-count 10
###############
# Add topics to AIDA-YAGO2 dataset
###############
# Slightly modified CoNLL 2003 NER creation script that also prints out document titles
# Add the modified make.eng (found in scripts/ ) to the CoNLL 2003 NER creation scripts directory
# Update the paths in lines 7, 30 and 63 so the script can find the data.
bash make.eng
# Grab the filenames of the files used for the NER training set from the resulting files:
grep "-DOCSTART-" < eng.train > index_docnames.txt
grep "-DOCSTART-" < eng.testa >> index_docnames.txt
grep "-DOCSTART-" < eng.testb >> index_docnames.txt
# Reformat the Reuters topic file:
tr -s " " < rcv1.topics.hier.orig.txt | tr " " "\t" > rcv1.topics.hier.orig.tsv
# Then gather the topics and write each topic description and document ID to a file
# (run twice, once for fine-grained and once for coarse-grained topics; the switch is in line 29)
python getTopicDescriptions.py > docIds_topics_lowlevel.tsv
python getTopicDescriptions.py > docIds_topics_toplevel.tsv
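# Roughly, getTopicDescriptions.py has to join the topic hierarchy (code -> description) with the
# qrels file (document id -> topic codes). A hedged sketch of that join; the column layouts assumed
# below are based on the LYRL2004 distribution and may differ from what the actual script expects:
# --- sketch (Python), not the original getTopicDescriptions.py ---
import gzip

# after the tr reformatting, hierarchy lines are assumed to look like:
# parent:<TAB>CCAT<TAB>child:<TAB>C11<TAB>child-description:<TAB>STRATEGY/PLANS
descriptions = {}
for line in open('rcv1.topics.hier.orig.tsv'):
    fields = line.strip().split('\t')
    if len(fields) >= 6 and fields[2] == 'child:':
        descriptions[fields[3]] = ' '.join(fields[5:])

# qrels lines are assumed to look like: "C11 2286 1" (topic code, document id, relevance)
with gzip.open('rcv1-v2.topics.qrels.gz') as qrels:
    for line in qrels:
        topic, docid, _ = line.split()
        print('%s\t%s' % (docid, descriptions.get(topic, topic)))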
# Loop through AIDA-YAGO2-dataset.tsv and add the topic to each article
# (run twice, once for fine-grained and once for coarse-grained topics; switch by commenting out line 15 or 16)
python addTopicsToAIDAYAGOfile.py > AIDA-YAGO2-dataset_topicsLowlevel.tsv
python addTopicsToAIDAYAGOfile.py > AIDA-YAGO2-dataset_topicsToplevel.tsv
# Separate the AIDA-YAGO2 dataset into one file of entity mentions and entity links per topic
# (run twice, once for fine-grained and once for coarse-grained topics; switch by commenting out line 10 or 11)
python separateAIDA-YAGObyTopic_singleTopicPerEntityWithNIL.py
# The script currently writes its output files to the working directory, so before running it again for the top-level topics, move the output files into a new directory (e.g. AIDA-lowlevel).
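# As an illustration of the separation step: assuming the topic was appended as the last field of
# each -DOCSTART- line (the real script may use a different layout), the split is essentially:
# --- sketch (Python), not the original separation script ---
import collections

docs_by_topic = collections.defaultdict(list)
topic, doc = None, []
for line in open('AIDA-YAGO2-dataset_topicsToplevel.tsv'):
    if line.startswith('-DOCSTART-'):
        if doc:
            docs_by_topic[topic].append(''.join(doc))
        topic = line.strip().split('\t')[-1]   # assumed position of the added topic field
        doc = [line]
    else:
        doc.append(line)
if doc:
    docs_by_topic[topic].append(''.join(doc))

for topic, docs in docs_by_topic.items():
    with open('AIDA_%s.tsv' % topic.replace('/', '_'), 'w') as out:
        out.write(''.join(docs))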
#################
# Gather DBpedia type information for every entity in the datasets
#################
# Generate list of types
# cd to the directory with the entity datasets. This only needs to be done once per dataset, so it doesn't have to be repeated separately for the topic-annotated AIDA and Wikinews datasets.
for x in entity_mentions/dataset-based_experiments/*.tsv ; do output=${x#entity_mentions/dataset-based_experiments/} ; python getTypes.py $x > ../entity_types/${output%.tsv}_types.txt ; done
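# getTypes.py queries DBpedia for the types of each gold entity (SPARQLWrapper is listed in the
# prerequisites). A minimal sketch of such a lookup; the public endpoint, the example entity URI and
# the restriction to the DBpedia ontology namespace are illustrative assumptions:
# --- sketch (Python), not the original getTypes.py ---
from SPARQLWrapper import SPARQLWrapper, JSON

sparql = SPARQLWrapper('http://dbpedia.org/sparql')
sparql.setQuery("""
    PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
    SELECT ?type WHERE {
        <http://dbpedia.org/resource/Barack_Obama> rdf:type ?type .
        FILTER(STRSTARTS(STR(?type), "http://dbpedia.org/ontology/"))
    }
""")
sparql.setReturnFormat(JSON)
for binding in sparql.query().convert()['results']['bindings']:
    print(binding['type']['value'])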
#####
# Generate list of DBpedia type hierarchy for evaluation
# With help from: http://stackoverflow.com/questions/17750421/retrieving-all-paths-in-an-owl-class-hierarchy-with-sparql-and-jena?answertab=votes#tab-top
#####
# cd to the directory with the DBpedia ontology
python analyseDBpediaOntology.py | sort > DBpediaHierarchy.tsv
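# analyseDBpediaOntology.py works on the downloaded OWL file; a hedged sketch of the idea, printing
# each ontology class with its superclass chain. It uses rdflib, which is not in the prerequisites
# list, so treat this purely as an illustration of the hierarchy extraction:
# --- sketch (Python), not the original analyseDBpediaOntology.py ---
import rdflib
from rdflib.namespace import RDFS

g = rdflib.Graph()
g.parse('dbpedia_2015-10.owl', format='xml')

# keep one superclass per class (classes with several superclasses are simplified here)
parents = {}
for child, parent in g.subject_objects(RDFS.subClassOf):
    parents[child] = parent

def local_name(uri):
    return uri.split('/')[-1]

for cls in parents:
    chain = [local_name(cls)]
    node = cls
    while node in parents:          # climb towards owl:Thing
        node = parents[node]
        chain.append(local_name(node))
    print('\t'.join(chain))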
#################
# Run Similarity Experiments
#################
# Compute similarity for each entity pair
# in case these dirs don't exist yet:
# mkdir entity_mentions/dataset-based_experiments/Google
# mkdir entity_mentions/dataset-based_experiments/Reuters
# mkdir entity_mentions/dataset-based_experiments/Wikipedia
# cd to directory with data models
for x in entity_mentions/dataset-based_experiments/*.tsv ; do out=${x#entity_mentions/dataset-based_experiments/} ; python getSimilarity.py $x > entity_mentions/dataset-based_experiments/Google/${out%.tsv}_google.tsv ; done
for x in entity_mentions/dataset-based_experiments/*.tsv ; do out=${x#entity_mentions/dataset-based_experiments/} ; python getSimilarity_rcv1.py $x > entity_mentions/dataset-based_experiments/Reuters/${out%.tsv}_rcv1.tsv ; done
for x in entity_mentions/dataset-based_experiments/*.tsv ; do out=${x#entity_mentions/dataset-based_experiments/} ; python getSimilarity_Wikipedia.py $x > entity_mentions/dataset-based_experiments/Wikipedia/${out%.tsv}_wikipedia.tsv ; done
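# The getSimilarity*.py scripts differ mainly in which model they load; the shared idea is cosine
# similarity between the vectors of each entity pair. A hedged sketch (the two-column input layout
# and the vocabulary keys are assumptions, not taken from the actual scripts):
# --- sketch (Python), not one of the original getSimilarity scripts ---
import sys
from gensim.models import Word2Vec

model = Word2Vec.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

for line in open(sys.argv[1]):
    fields = line.rstrip('\n').split('\t')
    if len(fields) < 2:
        continue
    e1, e2 = fields[0], fields[1]
    try:
        sim = model.similarity(e1, e2)    # cosine similarity of the two word2vec vectors
    except KeyError:                      # out-of-vocabulary entity
        sim = 'NA'
    print('%s\t%s\t%s' % (e1, e2, sim))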
# Also for the topic-based experiments
# in case these dirs don't exist yet:
# mkdir entity_mentions/topic-based_experiments/AIDA-Coarse_Topics/Google
# mkdir entity_mentions/topic-based_experiments/AIDA-Coarse_Topics/Reuters
# mkdir entity_mentions/topic-based_experiments/AIDA-Coarse_Topics/Wikipedia
# mkdir entity_mentions/topic-based_experiments/AIDA-Fine_Topics/Google
# mkdir entity_mentions/topic-based_experiments/AIDA-Fine_Topics/Reuters
# mkdir entity_mentions/topic-based_experiments/AIDA-Fine_Topics/Wikipedia
# mkdir entity_mentions/topic-based_experiments/Wikinews-Topics/Google
# mkdir entity_mentions/topic-based_experiments/Wikinews-Topics/Reuters
# mkdir entity_mentions/topic-based_experiments/Wikinews-Topics/Wikipedia
for x in entity_mentions/topic-based_experiments/AIDA-Coarse_Topics/*.tsv ; do out=${x#entity_mentions/topic-based_experiments/AIDA-Coarse_Topics/} ; python getSimilarity.py $x > entity_mentions/topic-based_experiments/AIDA-Coarse_Topics/Google/${out%.tsv}_google.tsv ; done
for x in entity_mentions/topic-based_experiments/AIDA-Coarse_Topics/*.tsv ; do out=${x#entity_mentions/topic-based_experiments/AIDA-Coarse_Topics/} ; python getSimilarity_rcv1.py $x > entity_mentions/topic-based_experiments/AIDA-Coarse_Topics/Reuters/${out%.tsv}_rcv1.tsv ; done
for x in entity_mentions/topic-based_experiments/AIDA-Coarse_Topics/*.tsv ; do out=${x#entity_mentions/topic-based_experiments/AIDA-Coarse_Topics/} ; python getSimilarity_Wikipedia.py $x > entity_mentions/topic-based_experiments/AIDA-Coarse_Topics/Wikipedia/${out%.tsv}_wikipedia.tsv ; done
for x in entity_mentions/topic-based_experiments/AIDA-Fine_Topics/*.tsv ; do out=${x#entity_mentions/topic-based_experiments/AIDA-Fine_Topics/} ; python getSimilarity.py $x > entity_mentions/topic-based_experiments/AIDA-Fine_Topics/Google/${out%.tsv}_google.tsv ; done
for x in entity_mentions/topic-based_experiments/AIDA-Fine_Topics/*.tsv ; do out=${x#entity_mentions/topic-based_experiments/AIDA-Fine_Topics/} ; python getSimilarity_rcv1.py $x > entity_mentions/topic-based_experiments/AIDA-Fine_Topics/Reuters/${out%.tsv}_rcv1.tsv ; done
for x in entity_mentions/topic-based_experiments/AIDA-Fine_Topics/*.tsv ; do out=${x#entity_mentions/topic-based_experiments/AIDA-Fine_Topics/} ; python getSimilarity_Wikipedia.py $x > entity_mentions/topic-based_experiments/AIDA-Fine_Topics/Wikipedia/${out%.tsv}_wikipedia.tsv ; done
for x in entity_mentions/topic-based_experiments/Wikinews-Topics/*.tsv ; do out=${x#entity_mentions/topic-based_experiments/Wikinews-Topics/} ; python getSimilarity.py $x > entity_mentions/topic-based_experiments/Wikinews-Topics/Google/${out%.tsv}_google.tsv ; done
for x in entity_mentions/topic-based_experiments/Wikinews-Topics/*.tsv ; do out=${x#entity_mentions/topic-based_experiments/Wikinews-Topics/} ; python getSimilarity_rcv1.py $x > entity_mentions/topic-based_experiments/Wikinews-Topics/Reuters/${out%.tsv}_rcv1.tsv ; done
for x in entity_mentions/topic-based_experiments/Wikinews-Topics/*.tsv ; do out=${x#entity_mentions/topic-based_experiments/Wikinews-Topics/} ; python getSimilarity_Wikipedia.py $x > entity_mentions/topic-based_experiments/Wikinews-Topics/Wikipedia/${out%.tsv}_wikipedia.tsv ; done
# Gather results for each entity per class
for x in entity_mentions/dataset-based_experiments/Google/*tsv ; do types=${x#entity_mentions/dataset-based_experiments/Google/} ; python gatherResultsTypeClassification.py entity_types/${types%_google.tsv}_types.txt $x > ${x%.tsv}_results.tsv ; done
for x in entity_mentions/dataset-based_experiments/Reuters/*tsv ; do types=${x#entity_mentions/dataset-based_experiments/Reuters/} ; python gatherResultsTypeClassification.py entity_types/${types%_rcv1.tsv}_types.txt $x > ${x%.tsv}_results.tsv ; done
for x in entity_mentions/dataset-based_experiments/Wikipedia/*tsv ; do types=${x#entity_mentions/dataset-based_experiments/Wikipedia/} ; python gatherResultsTypeClassification.py entity_types/${types%_wikipedia.tsv}_types.txt $x > ${x%.tsv}_results.tsv ; done
# Also for the topic-based experiments:
for x in entity_mentions/topic-based_experiments/AIDA-Coarse_Topics/Google/*tsv ; do types=${x#entity_mentions/topic-based_experiments/AIDA-Coarse_Topics/Google/} ; python gatherResultsTypeClassification.py entity_types/${types%_google.tsv}_types.txt $x > ${x%.tsv}_results.tsv ; done
for x in entity_mentions/topic-based_experiments/AIDA-Coarse_Topics/Reuters/*tsv ; do types=${x#entity_mentions/topic-based_experiments/AIDA-Coarse_Topics/Reuters/} ; python gatherResultsTypeClassification.py entity_types/${types%_rcv1.tsv}_types.txt $x > ${x%.tsv}_results.tsv ; done
for x in entity_mentions/topic-based_experiments/AIDA-Coarse_Topics/Wikipedia/*tsv ; do types=${x#entity_mentions/topic-based_experiments/AIDA-Coarse_Topics/Wikipedia/} ; python gatherResultsTypeClassification.py entity_types/${types%_wikipedia.tsv}_types.txt $x > ${x%.tsv}_results.tsv ; done
# (repeat analogously for AIDA-Fine_Topics and Wikinews-Topics)
# To measure the score on either coarse- or fine-grained types, the gold standard types need to be ordered
# via the DBpedia hierarchy
for x in *results.tsv ; do python orderGoldStandardTypes.py $x > ${x%results.tsv}ordered.tsv ; done
# Compute scores for the Tables in the paper
python computeScores.py results_ordered
# Compute scores for the R Figures
# Coarse-grained
python computeScores_coarse.py
# Fine-grained
python computeScores_fine.py
## See generateRplots.txt for code to generate the Figures in the paper