-
Notifications
You must be signed in to change notification settings - Fork 6
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #26 from ontologyportal/Thompson_2025_01_10
Added coca_statistics.py, WordPairFrequency.java, and sqlite-jdbc-3.47.2.0.jar to the repository. Updated .gitignore to ignore large files.
- Loading branch information
Showing
4 changed files
with
343 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Binary file not shown.
109 changes: 109 additions & 0 deletions
109
src/main/java/com/articulate/nlp/WordPairFrequency.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,109 @@ | ||
package com.articulate.nlp; | ||
|
||
/** ***************************************************************
 * This code deals with word pair frequency. Statistics were
 * generated from COCA (the Corpus of Contemporary American English).
 *
 * Given a verb, the getWordPairFrequencies function returns
 * a list of nouns that co-occur with that verb in sentences,
 * along with the frequency with which those words occur.
 *
 * To generate the statistics database (word_pairs.db), run the
 * accompanying coca_statistics.py script over the COCA corpus.
 *
 */
|
||
import java.util.HashMap; | ||
import java.util.Map; | ||
import java.sql.*; | ||
|
||
public class WordPairFrequency {

    /** Path of the SQLite word-pair database built from COCA by coca_statistics.py. */
    private static String db_location = System.getenv("CORPORA") + "/COCA/word_pairs.db";

    /** ***************************************************************
     * Look up the words of part of speech word_type_2 that most often
     * co-occur in a sentence with the given word of part of speech
     * word_type_1, according to the COCA word-pair database.
     *
     * @param word        root form of the word to look up
     * @param word_type_1 part of speech of word (e.g. "verb")
     * @param word_type_2 part of speech of the co-occurring words (e.g. "noun")
     * @return map from co-occurring root to its count (as a String),
     *         at most 10 entries; empty on any database error
     */
    public static Map<String, String> getWordPairFrequencies(String word, String word_type_1, String word_type_2) {

        Map<String, String> frequencyMap = new HashMap<>();

        // WordPair is symmetric, so look for the target word on either side
        // of the pair and UNION ALL the two directions.  Parameterized with
        // '?' placeholders instead of string concatenation, which was both a
        // SQL-injection hole and broke on words containing a quote.
        String query =
            "WITH VerbId AS (SELECT Id FROM Word WHERE Root=? AND pos=?) " +
            "SELECT word, count " +
            "FROM (" +
            "  SELECT w.root AS word, count" +
            "  FROM WordPair wp, Word w" +
            "  WHERE wp.Word1_id = (SELECT Id FROM VerbId)" +
            "    AND wp.Word2_id = w.Id" +
            "    AND w.pos = ?" +
            "  UNION ALL" +
            "  SELECT w.root AS word, count " +
            "  FROM WordPair wp, Word w " +
            "  WHERE wp.Word2_id = (SELECT Id FROM VerbId) " +
            "    AND wp.Word1_id = w.Id " +
            "    AND w.pos = ?" +
            ") AS union_sums " +
            "ORDER BY count DESC " +
            "LIMIT 10;";

        try {
            Class.forName("org.sqlite.JDBC");
            // try-with-resources closes connection/statement/result set even
            // if an exception occurs part-way through.
            try (Connection c = DriverManager.getConnection("jdbc:sqlite:" + WordPairFrequency.db_location);
                 PreparedStatement stmt = c.prepareStatement(query)) {
                stmt.setString(1, word);
                stmt.setString(2, word_type_1);
                stmt.setString(3, word_type_2);
                stmt.setString(4, word_type_2);
                try (ResultSet rs = stmt.executeQuery()) {
                    System.out.println("Word ID | Count");
                    System.out.println("------------------");
                    while (rs.next()) {
                        String wordPair = rs.getString("word");
                        String frequency = rs.getString("count");
                        frequencyMap.put(wordPair, frequency);
                        System.out.println(wordPair + " | " + frequency);
                    }
                }
            }
        }
        catch (Exception e) {
            // Report and return an empty map; the original called
            // System.exit(0) here, killing the whole JVM (with a success
            // status) on any database problem.
            System.err.println(e.getClass().getName() + ": " + e.getMessage());
        }
        return frequencyMap;
    }

    /*
     main for testing purposes
     cmd line example:
     java -Xmx40g -classpath $ONTOLOGYPORTAL_GIT/sigmanlp/build/sigmanlp.jar:$ONTOLOGYPORTAL_GIT/sigmanlp/build/lib/* \
        com.articulate.nlp.WordPairFrequency love noun verb
     Should return:
        Word ID | Count
        ------------------
        be | 83519
        fall | 9520
        say | 9338
        make | 8574
        know | 5803
        go | 4461
        get | 4227
        think | 3715
        see | 3474
        come | 3365
     */
    public static void main(String args[]) {

        System.out.println("Testing WordPairFrequency");
        // Default test case, overridden by the first three command-line args.
        String[][] testSet = {{"love", "noun", "verb"}};
        if (args.length > 2) {
            testSet[0][0] = args[0];
            testSet[0][1] = args[1];
            testSet[0][2] = args[2];
        }
        for (String[] testWord : testSet) {
            System.out.println("Testing with word: " + testWord[0]);
            getWordPairFrequencies(testWord[0], testWord[1], testWord[2]);
        }
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,220 @@ | ||
import sqlite3
import uuid
import os


'''
In the second version of COCA_STATISTICS, we can have the same word in the
dictionary as long as it has a different part of speech.
For example we can have BOTH Love, noun and Love, verb -- something that
wasn't possible in the first version, since it only checked the
root value and not the type of the word.
'''


# Root of the checkout.  NOTE(review): os.environ.get returns None when
# ONTOLOGYPORTAL_GIT is unset, which makes os.path.join below raise --
# assumes the variable is always set.
filepath_base = os.environ.get("ONTOLOGYPORTAL_GIT")
COCA_filepath = os.path.join(filepath_base, "sigmanlp/corpora/COCA")

# DB connection -- opened at import time; word_pairs.db is created if absent
conn = sqlite3.connect(COCA_filepath + '/word_pairs.db')

# In-memory co-occurrence counters, keyed by sorted (uuid, uuid) tuples.
verb_nouns_pairs = {}   # verb/noun pairs
nouns_pairs = {}        # noun/noun pairs
verbs_pairs = {}        # verb/verb pairs

# Maps (root, pos) -> uuid string; mirrors the Word table in the database.
dictionary = {}
||
def split_sentences(filename):
    """Read a COCA token file and group its rows into sentences.

    Each line is tab-separated with either three columns (word, root, pos)
    or five columns whose last three are (word, root, pos); rows with any
    other column count are skipped.  A sentence ends at a terminator token.

    Returns a list of sentences, each a list of [word, root, pos] lists.
    """
    terminators = {'.', '!', '?', '#', '<p>', '...', '....'}
    sentences = []
    current = []

    # COCA files use latin1, not utf-8.
    with open(filename, 'r', encoding='latin1') as fh:
        for raw_line in fh:
            cols = raw_line.strip().split('\t')
            # We have 2 kinds of files, with 3 or 5 columns per row.
            if len(cols) == 3:
                token = [cols[0], cols[1], cols[2]]
            elif len(cols) == 5:
                # Word, root and POS tag live in the last three columns.
                token = [cols[2], cols[3], cols[4]]
            else:
                continue
            current.append(token)

            # A terminator token closes the current sentence.
            if token[0] in terminators:
                sentences.append(current)
                current = []

    # Keep a trailing sentence that lacks terminating punctuation.
    if current:
        sentences.append(current)
    return sentences
|
||
|
||
def create_relations(sentences):
    """Accumulate co-occurrence counts for each sentence.

    Collects the uuids of the verbs and nouns in every sentence
    (registering any new (root, pos) entry in the global `dictionary`),
    then bumps the global verb-verb, verb-noun and noun-noun counters.
    Pair keys are sorted uuid tuples so (a, b) and (b, a) collide.
    """
    for sentence in sentences:
        verb_ids = []
        noun_ids = []

        for word, root, pos in sentence:
            if root == '':
                continue
            tag = pos.lower()
            if tag.startswith('vb') or tag.startswith('vv'):
                key = (root, 'verb')
                if key not in dictionary:
                    dictionary[key] = str(uuid.uuid4())
                verb_ids.append(dictionary[key])
            elif tag.startswith('nn'):
                key = (root, 'noun')
                if key not in dictionary:
                    dictionary[key] = str(uuid.uuid4())
                noun_ids.append(dictionary[key])
            elif tag.startswith('np'):
                # Names: registered in the dictionary but not paired.
                key = (root, 'noun-phrase')
                if key not in dictionary:
                    dictionary[key] = str(uuid.uuid4())

        # Verb-verb pairs: each unordered pair within the sentence.
        for i in range(len(verb_ids)):
            for j in range(i + 1, len(verb_ids)):
                pair = tuple(sorted([verb_ids[i], verb_ids[j]]))
                verbs_pairs[pair] = verbs_pairs.get(pair, 0) + 1

        # Verb-noun pairs: full cross product of verbs and nouns.
        for v_id in verb_ids:
            for n_id in noun_ids:
                pair = tuple(sorted([v_id, n_id]))
                verb_nouns_pairs[pair] = verb_nouns_pairs.get(pair, 0) + 1

        # Noun-noun pairs: each unordered pair within the sentence.
        for i in range(len(noun_ids)):
            for j in range(i + 1, len(noun_ids)):
                pair = tuple(sorted([noun_ids[i], noun_ids[j]]))
                nouns_pairs[pair] = nouns_pairs.get(pair, 0) + 1
|
||
|
||
def insert_dictionary():
    """Insert every (root, pos) -> uuid entry of the global `dictionary`
    into the Word table, committing once after the loop.

    Relies on the module-level `conn` and on the `cursor` created in the
    __main__ block; failures on individual rows are printed and skipped.
    """

    print('Process of inserting ' + str(len(dictionary)) + ' Dictionary values to DB started:')

    for key, value in dictionary.items():
        try:
            # key is (root, pos); value is the uuid primary key.
            cursor.execute('''
            INSERT INTO Word (id, root, pos)
            VALUES (?, ?, ?)
            ''', (value, key[0], key[1]))
        except Exception as e:
            print(f"An error occurred at insert_dictionary: {e}: the word {key[0]}, {key[1], {value}}")
    conn.commit()
    print('Insert in dictionary completed')
|
||
|
||
def insert_relations():
    """Flush the three in-memory pair-count dictionaries to WordPair.

    Each pair is inserted with its accumulated count.  If the row already
    exists (e.g. the script is re-run over an existing database), the
    stored count is incremented by the full in-memory count; the original
    code incremented by only 1 here, silently losing counts.  Commits once
    per dictionary.  Relies on the module-level `conn` and the `cursor`
    created in the __main__ block.
    """

    def _flush_pairs(label, pairs):
        # One pass over a pairs dict; `label` identifies it in log output.
        print('Process of inserting ' + str(len(pairs)) + ' ' + label + ' values to DB started:')
        for (id1, id2), count in pairs.items():
            try:
                cursor.execute('''SELECT count FROM WordPair WHERE word1_id = ? AND word2_id = ?''', (id1, id2))
                if cursor.fetchone():
                    # Add the accumulated count, not just 1 (bug fix).
                    cursor.execute('''UPDATE WordPair SET count = count + ? WHERE word1_id = ? AND word2_id = ?''',
                                   (count, id1, id2))
                else:
                    cursor.execute('''INSERT INTO WordPair (word1_id, word2_id, count) VALUES (?, ?, ?)''',
                                   (id1, id2, count))
            except sqlite3.IntegrityError as e:
                print(f"IntegrityError - {label}: {e}")
            except Exception as e:
                print(f"An error occurred at {label}: {e}")
        conn.commit()
        print('Process of inserting the ' + label + ' values to DB completed:')

    _flush_pairs('verbs_pairs', verbs_pairs)
    _flush_pairs('verb_nouns_pairs', verb_nouns_pairs)
    _flush_pairs('nouns_pairs', nouns_pairs)
|
||
|
||
|
||
# Using the special variable
# __name__
if __name__=="__main__":

    cursor = conn.cursor()

    # One row per distinct (root, pos); ids are UUID strings.
    cursor.execute('''
    CREATE TABLE IF NOT EXISTS Word (
        id TEXT PRIMARY KEY, -- Use TEXT to store UUIDs
        root TEXT NOT NULL,
        pos TEXT NOT NULL)
    ''')

    # Co-occurrence counts; the (word1_id, word2_id) key is always the
    # sorted uuid pair, so each unordered pair is stored once.
    cursor.execute('''
    CREATE TABLE IF NOT EXISTS WordPair (
        word1_id TEXT,
        word2_id TEXT,
        count INTEGER DEFAULT 0,
        PRIMARY KEY (word1_id, word2_id),
        FOREIGN KEY (word1_id) REFERENCES Word(id),
        FOREIGN KEY (word2_id) REFERENCES Word(id))
    ''')

    # Walk the corpus and accumulate pair counts in memory.
    for root, dirs, files in os.walk(COCA_filepath):
        counter = 0
        for file in files:
            if file.endswith('.txt'):
                if not (file == "nouns.txt" or file == "verbs.txt"):
                    filename = os.path.join(root, file)
                    counter += 1
                    print(f'Processing file {counter}/{len(files)} | name: {filename}')
                    sentences = split_sentences(filename)
                    create_relations(sentences)
                else:
                    print("Skipped: " + file + ". This file is generated from, but not part of COCA.")
            elif file.endswith('.zip'):
                # Archives appear in `files`, not `dirs`; the original
                # checked `dirs`, so this warning could never fire.
                print("Found " + file + ". File not processed. Are you sure you unzipped it?")

    insert_dictionary()

    insert_relations()

    conn.close()