Merge pull request #26 from ontologyportal/Thompson_2025_01_10
Added coca_statistics.py, WordPairFrequency.java, and sqlite-jdbc-3.47.2.0.jar to the repository. Updated .gitignore to ignore large files.
apease authored Jan 11, 2025
2 parents 3064cde + 6685ced commit 4384b67
Showing 4 changed files with 343 additions and 0 deletions.
14 changes: 14 additions & 0 deletions .gitignore
@@ -42,3 +42,17 @@ nbproject/private
.Trashes

/MANIFEST.MF

# Utility files
word_pairs.db
allAxioms-eng.txt
allAxioms-log.txt
combined-eng.txt
combined-log.txt
groundRelations-eng.txt
groundRelations-log.txt
out-eng.txt
out-log.txt

# COCA
/corpora/COCA/*
Binary file added lib/sqlite-jdbc-3.47.2.0.jar
Binary file not shown.
109 changes: 109 additions & 0 deletions src/main/java/com/articulate/nlp/WordPairFrequency.java
@@ -0,0 +1,109 @@
package com.articulate.nlp;

/** ***************************************************************
 * Utilities for looking up word pair co-occurrence frequencies.
 * The statistics were generated from COCA.
 *
 * Given a word and two parts of speech, getWordPairFrequencies()
 * returns the words of the second part of speech that co-occur
 * with the given word in sentences, along with the frequency with
 * which each pair occurs.
 *
 * To generate the underlying word_pairs.db database, run
 * src/main/python/coca_statistics.py over the COCA corpus files.
 */

import java.util.HashMap;
import java.util.Map;
import java.sql.*;

public class WordPairFrequency {

    private static final String db_location = System.getenv("CORPORA") + "/COCA/word_pairs.db";

public static Map<String, String> getWordPairFrequencies(String word, String word_type_1, String word_type_2) {
Map<String, String> frequencyMap = new HashMap<>();
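        // The query finds the id of the given word with POS word_type_1, then unions
        // the WordPair rows where that id appears as either Word1_id or Word2_id,
        // keeps only partners whose POS is word_type_2, and returns the ten pairs
        // with the highest counts.  Note that the SQL is built by string concatenation,
        // so only trusted word/POS values should be passed in; a PreparedStatement
        // would be a safer alternative.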

String query = "WITH VerbId AS (SELECT Id FROM Word WHERE Root='"+word+"' AND pos='"+word_type_1+"') " +
"SELECT word, count " +
"FROM (" +
" SELECT w.root as word, count" +
" FROM WordPair wp, Word w, Word w1" +
" WHERE wp.Word1_id = (SELECT Id FROM VerbId)" +
" AND wp.Word2_id = w.Id" +
" AND w.pos = '" + word_type_2 + "'" +
" AND wp.Word1_id = w1.id " +
" UNION ALL" +
" SELECT w.root as word_id , count " +
" FROM WordPair wp, Word w, Word w1 " +
" WHERE wp.Word2_id = (SELECT Id FROM VerbId) " +
" AND wp.Word1_id = w.Id " +
" AND w.pos = '" + word_type_2 + "'" +
" AND wp.Word2_id = w1.id " +
" ) AS union_sums " +
" ORDER BY count DESC " +
" LIMIT 10;";

Connection c = null;
Statement stmt = null;
try {
Class.forName("org.sqlite.JDBC");
c = DriverManager.getConnection("jdbc:sqlite:"+WordPairFrequency.db_location);
stmt = c.createStatement();

ResultSet rs = stmt.executeQuery(query);

System.out.println("Word ID | Count");
System.out.println("------------------");
while ( rs.next() ) {
String wordPair = rs.getString("word");
String frequency = rs.getString("count");
frequencyMap.put(wordPair, frequency);
System.out.println(wordPair + " | " + frequency);
}
rs.close();
stmt.close();
c.close();
} catch ( Exception e ) {
System.err.println( e.getClass().getName() + ": " + e.getMessage() );
            System.exit(1);
}
return frequencyMap;
}

/*
main for testing purposes
cmd line example:
java -Xmx40g -classpath $ONTOLOGYPORTAL_GIT/sigmanlp/build/sigmanlp.jar:$ONTOLOGYPORTAL_GIT/sigmanlp/build/lib/* \
com.articulate.nlp.WordPairFrequency love noun verb
Should return:
    Word | Count
------------------
be | 83519
fall | 9520
say | 9338
make | 8574
know | 5803
go | 4461
get | 4227
think | 3715
see | 3474
come | 3365
*/
    public static void main(String[] args) {
System.out.println("Testing WordPairFrequency");
String[][] testSet = {{"love", "noun", "verb"}};
if (args.length > 2) {
testSet[0][0] = args[0];
testSet[0][1] = args[1];
testSet[0][2] = args[2];
}
for (String[] testWord : testSet) {
System.out.println("Testing with word: " + testWord[0]);
Map<String, String> pairFrequencies = getWordPairFrequencies(testWord[0], testWord[1], testWord[2]);
}
}


}
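
A minimal usage sketch, not part of this commit, showing how another class might call getWordPairFrequencies(). It assumes the CORPORA environment variable points at a directory containing COCA/word_pairs.db (built by coca_statistics.py below); the class name WordPairFrequencyDemo is hypothetical.

import com.articulate.nlp.WordPairFrequency;
import java.util.Map;

public class WordPairFrequencyDemo {

    public static void main(String[] args) {
        // Verbs that co-occur in COCA sentences with the noun "love", with their counts
        Map<String, String> verbCounts =
                WordPairFrequency.getWordPairFrequencies("love", "noun", "verb");
        for (Map.Entry<String, String> entry : verbCounts.entrySet())
            System.out.println(entry.getKey() + " -> " + entry.getValue());
    }
}

The map values are the raw co-occurrence counts as strings, exactly as read from the SQLite result set.
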
220 changes: 220 additions & 0 deletions src/main/python/coca_statistics.py
@@ -0,0 +1,220 @@
import sqlite3
import uuid
import os


'''
In this second version of COCA_STATISTICS, the same word can appear in the dictionary
more than once as long as it has a different part of speech. For example, we can have
both ("love", "noun") and ("love", "verb"), which was not possible in the first version,
since it keyed only on the root value and not on the word's part of speech.
'''
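
# Illustrative note, not part of the original script: keying the dictionary by
# (root, pos) gives each part of speech of the same root its own id, e.g.
#     dictionary[('love', 'noun')]  ->  one UUID
#     dictionary[('love', 'verb')]  ->  a different UUID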


filepath_base = os.environ.get("ONTOLOGYPORTAL_GIT")
COCA_filepath = os.path.join(filepath_base, "sigmanlp/corpora/COCA")

# DB connection
conn = sqlite3.connect(os.path.join(COCA_filepath, 'word_pairs.db'))

verb_nouns_pairs = {}
nouns_pairs = {}
verbs_pairs = {}

dictionary = {}

def split_sentences(filename):

sentences = []
current_sentence = []

# Open and read the file
with open(filename, 'r', encoding='latin1') as file:
for line in file:
# Split the line into parts
parts = line.strip().split('\t')

            # The corpus contains two kinds of files, with either 3 or 5 columns per line
if len(parts) != 3 and len(parts) != 5:
continue
elif len(parts) == 3:
word_info = [parts[0], parts[1], parts[2]]
current_sentence.append(word_info)
elif len(parts) == 5:
# Extract word, root, and POS tag
word_info = [parts[2], parts[3], parts[4]]
current_sentence.append(word_info)

# Check if this word ends the sentence
if word_info[0] in {'.', '!', '?', '#', '<p>', '...', '....'} :
sentences.append(current_sentence)
current_sentence = []

# Add the last sentence if it doesn't end with punctuation
if current_sentence:
sentences.append(current_sentence)
return sentences


def create_relations(sentences):

for sentence in sentences:
verbs = []
nouns = []

for word_info in sentence:
word, root, pos = word_info
if root == '':
continue
if pos.lower().startswith('vb') or pos.lower().startswith('vv'):
pos = 'verb'
if (root, pos) not in dictionary:
dictionary[(root, pos)] = str(uuid.uuid4())
verbs.append(dictionary[(root,pos)])
elif pos.lower().startswith('nn'):
pos = 'noun'
if (root, pos) not in dictionary:
dictionary[(root, pos)] = str(uuid.uuid4())
nouns.append(dictionary[(root,pos)])
elif pos.lower().startswith('np'): # names
pos = 'noun-phrase'
if (root, pos) not in dictionary:
dictionary[(root, pos)] = str(uuid.uuid4())

# Create pairs of verbs
for i in range(len(verbs)):
for j in range(i + 1, len(verbs)):
                # Sort the verb pair to ensure consistency in storage
                verb_tuple = tuple(sorted([verbs[i], verbs[j]]))
                verbs_pairs[verb_tuple] = verbs_pairs.get(verb_tuple, 0) + 1  # Verb-Verb pair

for i in range(len(verbs)):
for j in range(len(nouns)):
                # Sort the verb-noun pair to ensure consistency in storage
verb_noun_tuple = tuple(sorted([verbs[i], nouns[j]]))
verb_nouns_pairs[verb_noun_tuple] = verb_nouns_pairs.get(verb_noun_tuple, 0) + 1 # Verb-Noun pair

for i in range(len(nouns)):
for j in range(i + 1, len(nouns)):
# Sort the noun pair to ensure consistency in storage
nouns_pair_tuple = tuple(sorted([nouns[i], nouns[j]]))
nouns_pairs[nouns_pair_tuple] = nouns_pairs.get(nouns_pair_tuple, 0) + 1 # Noun-Noun pair


def insert_dictionary():

print('Process of inserting ' + str(len(dictionary)) + ' Dictionary values to DB started:')

for key, value in dictionary.items():
try:
cursor.execute('''
INSERT INTO Word (id, root, pos)
VALUES (?, ?, ?)
''', (value, key[0], key[1]))
except Exception as e:
print(f"An error occurred at insert_dictionary: {e}: the word {key[0]}, {key[1], {value}}")
conn.commit()
print('Insert in dictionary completed')


def insert_relations():

print('Process of inserting ' + str(len(verbs_pairs)) + ' verbs_pairs values to DB started:')
for key, value in verbs_pairs.items():
try:
cursor.execute('''SELECT count FROM WordPair WHERE word1_id = ? AND word2_id = ?''', (key[0], key[1]))
result = cursor.fetchone()

if result:
                cursor.execute('''UPDATE WordPair SET count = count + ? WHERE word1_id = ? AND word2_id = ?''', (value, key[0], key[1]))
else:
cursor.execute('''INSERT INTO WordPair (word1_id, word2_id, count) VALUES (?, ?, ?)''', (key[0], key[1], value))
except sqlite3.IntegrityError as e:
print(f"IntegrityError - verbs_pairs: {e}")
except Exception as e:
print(f"An error occurred at verbs_pairs: {e}")
conn.commit()
print('Process of inserting the verbs_pairs values to DB completed:')

print('Process of inserting the verb_nouns_pairs values to DB started:')
for key, value in verb_nouns_pairs.items():
try:
cursor.execute('''SELECT count FROM WordPair WHERE word1_id = ? AND word2_id = ?''', (key[0], key[1]))
result = cursor.fetchone()

if result:
                cursor.execute('''UPDATE WordPair SET count = count + ? WHERE word1_id = ? AND word2_id = ?''', (value, key[0], key[1]))
else:
cursor.execute('''INSERT INTO WordPair (word1_id, word2_id, count) VALUES (?, ?, ?)''', (key[0], key[1], value))
except sqlite3.IntegrityError as e:
print(f"IntegrityError - verb_nouns_pairs: {e}")
except Exception as e:
print(f"An error occurred at verb_nouns_pairs: {e}")
conn.commit()
print('Process of inserting the verb_nouns_pairs values to DB completed:')

print('Process of inserting the nouns_pairs values to DB started:')
for key, value in nouns_pairs.items():
try:
cursor.execute('''SELECT count FROM WordPair WHERE word1_id = ? AND word2_id = ?''', (key[0], key[1]))
result = cursor.fetchone()
if result:
                cursor.execute('''UPDATE WordPair SET count = count + ? WHERE word1_id = ? AND word2_id = ?''', (value, key[0], key[1]))
else:
cursor.execute('''INSERT INTO WordPair (word1_id, word2_id, count) VALUES (?, ?, ?)''', (key[0], key[1], value))
except sqlite3.IntegrityError as e:
print(f"IntegrityError - nouns_pairs: {e}")
except Exception as e:
print(f"An error occurred at nouns_pairs: {e}")

conn.commit()
print('Process of inserting the nouns_pairs values to DB completed:')



# Entry point: build the schema, process the COCA files, then insert the results
if __name__ == "__main__":

cursor = conn.cursor()
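
    # The Word table holds one row per (root, pos) combination, keyed by a generated
    # UUID; WordPair holds each unordered pair of word ids once (the ids are sorted
    # in create_relations) together with its co-occurrence count.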

cursor.execute('''
CREATE TABLE IF NOT EXISTS Word (
id TEXT PRIMARY KEY, -- Use TEXT to store UUIDs
root TEXT NOT NULL,
pos TEXT NOT NULL)
''')

cursor.execute('''
CREATE TABLE IF NOT EXISTS WordPair (
word1_id TEXT,
word2_id TEXT,
count INTEGER DEFAULT 0,
PRIMARY KEY (word1_id, word2_id),
FOREIGN KEY (word1_id) REFERENCES Word(id),
FOREIGN KEY (word2_id) REFERENCES Word(id))
''')

for root, dirs, files in os.walk(COCA_filepath):
counter = 0
for file in files:
if file.endswith('.txt'):
if not (file == "nouns.txt" or file == "verbs.txt"):
filename = os.path.join(root, file)
counter += 1
print(f'Processing file {counter}/{len(files)} | name: {filename}')
sentences = split_sentences(filename)
create_relations(sentences)
else:
print("Skipped: " + file + ". This file is generated from, but not part of COCA.")
        for file in files:
            if file.endswith('.zip'):
                print("Found " + file + ". File not processed. Are you sure you unzipped it?")

insert_dictionary()

insert_relations()




