-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfill_database.py
130 lines (106 loc) · 5.98 KB
/
fill_database.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
import secret # DB URL
import endings # generate forms table
import log # set up logging
import psycopg # connect to database
import json # open nominatives, stems, etc.
from tqdm import tqdm # progress bar
# Parse the database URI held in the secret module into a dict of connection
# parameters; run() expands this dict as keyword arguments to psycopg.connect().
connection_dict = psycopg.conninfo.conninfo_to_dict(secret.DB_URI)
# Reuse the shared logger object exposed by the log module.
logger = log.logger
def _fetch_single_value(cur, description):
    """Return the first column of the next row of the query last run on *cur*.

    Raises:
        LookupError: if the query produced no row. *description* is included
            in the message so the offending input can be identified.
    """
    row = cur.fetchone()
    if row is None:
        raise LookupError(f"No matching row found for {description}")
    return row[0]


def _fetch_count(cur, count_query):
    """Execute *count_query* (a ``select count(*)`` statement) and return the count."""
    cur.execute(count_query)
    return cur.fetchone()[0]


def _fill_dict_entries(cur, lexicon_dict):
    """Insert one dict_entry row for every word record in *lexicon_dict*."""
    for word in tqdm(lexicon_dict.values()):
        # put all on one line if a list of possible stems
        stems = word.get("stems")
        if stems is not None:
            stems = ",".join(stems)
        cur.execute(
            "INSERT INTO dict_entry (word, entrydefinition, category, stem) VALUES (%s, %s, %s, %s)",
            (word["transcription"], word["definition"], word.get("category"), stems),
        )


def _fill_forms(cur, decl):
    """Insert one form row per ending found in the nested *decl* mapping.

    *decl* is walked as declension -> gender -> number -> case -> ending,
    where an ending is either a single [ending, pronunciation] pair or a list
    of such pairs.
    """
    for declension, decl_set in tqdm(decl.items()):
        for gender, gender_set in decl_set.items():
            for number, ending_set in gender_set.items():
                for case, ending in ending_set.items():
                    # Normalise a single pair to a one-element list so both
                    # shapes share the same INSERT.
                    variants = ending if isinstance(ending[0], list) else [ending]
                    for end in variants:
                        cur.execute(
                            "INSERT INTO form (formdeclension, formcase, formgender, formnumber, formending, formpronunciation) VALUES (%s, %s, %s, %s, %s, %s)",
                            (declension, case, gender, number, end[0], end[1]),
                        )


def _fill_inflections(cur, inflections_dict):
    """Insert one inflection row per generated form in *inflections_dict*.

    Each row links the inflected spelling to the matching form row and to the
    dict_entry row of its root word.

    Raises:
        LookupError: if a generated form has no matching row in the form
            table, or a root word has no matching row in dict_entry.
    """
    for word, inflection_set in tqdm(inflections_dict.items()):
        # The dict_entry id depends only on the root word, so look it up at
        # most once per word (lazily, on the first form that needs it).
        entryid = None
        for inflection, possible_forms in inflection_set.items():
            for form in possible_forms:
                # convert from neuter* --> neuter, true
                gender = form["gender"]
                gender_uncertain = "*" in gender
                if gender_uncertain:
                    gender = gender.replace("*", "")

                # get formid by matching inflection's form to form table
                form_key = (form["declension"], form["case"], gender, form["number"], form["ending"])
                cur.execute(
                    "select formid from form where formdeclension = %s and formcase = %s and formgender = %s and formnumber = %s and formending = %s",
                    form_key,
                )
                formid = _fetch_single_value(
                    cur, f"form {form_key!r} (inflection {inflection!r} of {word!r})"
                )

                # get entryid by matching inflection's root form to dict_entry table
                if entryid is None:
                    cur.execute(
                        "select entryid from dict_entry where word = %s::text",
                        (word,),
                    )
                    entryid = _fetch_single_value(cur, f"dict_entry word {word!r}")

                # finally insert into table
                cur.execute(
                    "INSERT INTO inflection (inflection, form, dict_entry, uncertaingender) VALUES (%s, %s, %s, %s)",
                    (inflection, formid, entryid, gender_uncertain),
                )


def run():
    """Fill the dict_entry, form and inflection tables from the JSON data files.

    Reads "lexicon.json" and "generated-inflections.json" from the current
    working directory, takes the noun endings from ``endings.endings``, and
    inserts everything inside a single database transaction.

    Raises:
        SystemExit: with status 1 if dict_entry already contains rows.
        LookupError: if a generated inflection refers to a form or a word that
            is not present in the form / dict_entry tables.
    """
    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open("lexicon.json", "r", encoding="utf-8") as lexicon_file, \
            open("generated-inflections.json", "r", encoding="utf-8") as inflections_file:
        lexicon_dict = json.load(lexicon_file)
        inflections_dict = json.load(inflections_file)

    # remove 3rd declension as not supported yet; build a filtered copy so the
    # shared endings.endings mapping is not mutated (run() stays repeatable).
    decl = {
        declension: decl_set
        for declension, decl_set in endings.endings["nouns"].items()
        if declension != "3rd declension"
    }

    with psycopg.connect(**connection_dict) as conn:
        with conn.cursor() as cur:
            # check tables are empty
            if _fetch_count(cur, "select count(*) from dict_entry") != 0:
                logger.error("Database has already been filled! Run 'psql postgres < setup_database.sql' before running this.")
                # Non-zero status so calling scripts can detect the failure.
                raise SystemExit(1)

            # fill table dict_entry
            logger.info("Filling table dict_entry with words from lexicon...")
            _fill_dict_entries(cur, lexicon_dict)

            # fill table form
            logger.info("Filling table form with endings...")
            _fill_forms(cur, decl)

            # fill table inflection
            logger.info("Filling table inflection with generated inflections...")
            _fill_inflections(cur, inflections_dict)

            dict_entry_count = _fetch_count(cur, "select count(*) from dict_entry")
            form_count = _fetch_count(cur, "select count(*) from form")
            inflection_count = _fetch_count(cur, "select count(*) from inflection")

        # psycopg 3 already commits when the connection block exits without an
        # exception, so this explicit commit is redundant but harmless.
        conn.commit()
        logger.info(f"Filled the database with {dict_entry_count} dictionary entries, {form_count} forms, and {inflection_count} inflections.")
# Fill the database when this module is executed directly as a script.
if __name__ == "__main__":
    run()