1
+ package com .articulate .nlp ;
2
+
3
+ import io .github .ollama4j .OllamaAPI ;
4
+ import io .github .ollama4j .models .response .OllamaResult ;
5
+ import io .github .ollama4j .types .OllamaModelType ;
6
+ import io .github .ollama4j .utils .OptionsBuilder ;
7
+ import io .github .ollama4j .utils .Options ;
8
+
9
+ import java .io .FileWriter ;
10
+ import java .util .Collection ;
11
+ import java .util .Set ;
12
+ import java .util .Arrays ;
13
+ import java .util .List ;
14
+ import java .util .stream .Collectors ;
15
+ import java .io .IOException ;
16
+ import java .nio .ByteBuffer ;
17
+ import java .nio .channels .FileChannel ;
18
+ import java .nio .channels .FileLock ;
19
+ import java .nio .file .Path ;
20
+ import java .nio .file .Paths ;
21
+ import java .nio .file .StandardOpenOption ;
22
+ import java .nio .file .Files ;
23
+ import java .util .Random ;
24
+
25
+ import com .articulate .sigma .*;
26
+ import java .util .*;
27
+ import com .articulate .sigma .wordNet .WordNet ;
28
+
29
+ public class GenCausesTestData {
30
+
31
+ public static boolean debug = false ;
32
+ public static KB kb ;
33
+ public static String outputFileEnglish = "causes-eng.txt" ;
34
+ public static String outputFileLogic = "causes-log.txt" ;
35
+ public static boolean EQUIVALENCE_MAPPINGS = false ;
36
+
37
+ public static String [] phrasesCauses = {
38
+ " causes " ,
39
+ " leads to " ,
40
+ " results in " ,
41
+ " brings about " ,
42
+ " triggers " ,
43
+ " provokes " ,
44
+ " induces " ,
45
+ " produces " ,
46
+ " prompts " ,
47
+ " gives rise to " ,
48
+ " is responsible for "
49
+ };
50
+
51
+ public static String [] phrasesCausedBy = {
52
+ " is caused by " ,
53
+ " is due to " ,
54
+ " is a result of " ,
55
+ " is because of " ,
56
+ " is brought about by " ,
57
+ " is triggered by " ,
58
+ " is provoked by " ,
59
+ " is induced by " ,
60
+ " is produced by " ,
61
+ " is prompted by " ,
62
+ " stems from " ,
63
+ " arises from " ,
64
+ " originates from " ,
65
+ " is driven by " ,
66
+ " is attributable to " ,
67
+ " can be traced back to "
68
+ };
69
+
70
+ /** ***************************************************************
71
+ * Creates a file if one doesn't exist already.
72
+ */
73
+ public static void createFileIfDoesNotExists (String fileName ) {
74
+ Path filePath = Paths .get (fileName );
75
+ if (Files .exists (filePath )) {
76
+ return ;
77
+ } else {
78
+ try {
79
+ Files .createFile (filePath );
80
+ } catch (IOException e ) {
81
+ e .printStackTrace ();
82
+ }
83
+ }
84
+ }
85
+
86
+ /** ***********************************************************************
87
+ * Takes an enlish phrase and its logic equivalent.
88
+ * Files are locked, so that if multiple processes are building
89
+ * the dataset, the correspondence of english/logic pairs is preserved.
90
+ */
91
+ public static void writeEnglishLogicPairToFile (String english , String logic ) {
92
+ FileChannel fileChannel1 = null ;
93
+ FileChannel fileChannel2 = null ;
94
+ FileLock lock1 = null ;
95
+ FileLock lock2 = null ;
96
+
97
+ try {
98
+ fileChannel1 = FileChannel .open (Paths .get (outputFileEnglish ), StandardOpenOption .WRITE , StandardOpenOption .APPEND );
99
+ fileChannel2 = FileChannel .open (Paths .get (outputFileLogic ), StandardOpenOption .WRITE , StandardOpenOption .APPEND );
100
+
101
+ lock1 = fileChannel1 .lock ();
102
+ lock2 = fileChannel2 .lock ();
103
+
104
+ ByteBuffer buffer1 = ByteBuffer .wrap (english .getBytes ());
105
+ ByteBuffer buffer2 = ByteBuffer .wrap (logic .getBytes ());
106
+
107
+ fileChannel1 .write (buffer1 );
108
+ fileChannel2 .write (buffer2 );
109
+
110
+ } catch (IOException e ) {
111
+ e .printStackTrace ();
112
+ } finally {
113
+ try {
114
+ if (lock1 != null ) lock1 .release ();
115
+ if (fileChannel1 != null ) fileChannel1 .close ();
116
+ if (lock2 != null ) lock2 .release ();
117
+ if (fileChannel2 != null ) fileChannel2 .close ();
118
+ } catch (IOException e ) {
119
+ e .printStackTrace ();
120
+ }
121
+ }
122
+ }
123
+
124
+ /** ***************************************************************
125
+ * Takes a string which is a response to an Ollama query.
126
+ * Removes all punctuation and splits camel case answers.
127
+ * @return a clean ollama answer.
128
+ */
129
+ private static String cleanOllamaResponse (String str ) {
130
+ StringBuilder result = new StringBuilder ();
131
+ for (char c : str .toCharArray ()) {
132
+ if (Character .isUpperCase (c ) && result .length () > 0 ) {
133
+ result .append (" " );
134
+ }
135
+ result .append (c );
136
+ }
137
+ String sentence = result .toString ();
138
+ sentence = Arrays .stream (sentence .split ("\\ s+" ))
139
+ .map (word -> word .substring (0 , 1 ).toUpperCase () + word .substring (1 ).toLowerCase ())
140
+ .collect (Collectors .joining (" " ));
141
+ List <Character > punctuation = Arrays .asList ('.' , ',' , '!' , '?' , ';' , ':' , '-' , '(' , ')' , '[' , ']' , '{' , '}' , '"' , '\'' , ' ' );
142
+ sentence = sentence .chars ()
143
+ .mapToObj (c -> (char ) c )
144
+ .filter (c -> !punctuation .contains (c ))
145
+ .map (String ::valueOf )
146
+ .collect (Collectors .joining ());
147
+ return sentence ;
148
+ }
149
+
150
+ /** ***************************************************************
151
+ * Takes a string, such as "ThisIsAStringWithCapitalLetters"
152
+ * and breaks it up into "This Is A String With Capital Letters"
153
+ * @return a string broken up into words with spaces.
154
+ */
155
+ public static String addSpaceBeforeCapitals (String input ) {
156
+ if (input == null || input .isEmpty ()) {
157
+ return input ;
158
+ }
159
+ StringBuilder result = new StringBuilder ();
160
+ result .append (input .charAt (0 )); // Append the first character as is
161
+ for (int i = 1 ; i < input .length (); i ++) {
162
+ char currentChar = input .charAt (i );
163
+ if (Character .isUpperCase (currentChar )) {
164
+ result .append (' ' );
165
+ }
166
+ result .append (currentChar );
167
+ }
168
+ return result .toString ();
169
+ }
170
+
171
+ /** ***************************************************************
172
+ * Given a term, looks up all the equivalent mappings of that therm
173
+ * in WordNet, and returns a random mapping.
174
+ * @return a random equivalent SUMO Mapping
175
+ */
176
+ private static String getEquivalentSUMOMapping (String term ) {
177
+ Set <String > synsetOfTerm = WordNet .wn .getSynsetsFromWord (term .toLowerCase ());
178
+ ArrayList <String > equivalentTerms = new ArrayList ();
179
+ int counter = 0 ;
180
+ for (String synset :synsetOfTerm ) {
181
+ if (debug ) System .out .println ("Synset of " + term + ": " + synset );
182
+ String sumoMapping = WordNet .wn .getSUMOMapping (synset );
183
+ if (sumoMapping != null ) {
184
+ sumoMapping = sumoMapping .substring (2 );
185
+ if (sumoMapping .charAt (sumoMapping .length () - 1 ) == '=' || EQUIVALENCE_MAPPINGS == false ) {
186
+ String sumoTerm = sumoMapping .substring (0 , sumoMapping .length () - 1 );
187
+ if (debug ) System .out .println ("Equivalent mapping to: " + sumoTerm );
188
+ if (kb .kbCache .subclassOf (sumoTerm , "Process" )) {
189
+ if (debug ) System .out .println (sumoTerm + " is a process. Added." );
190
+ equivalentTerms .add (sumoMapping .substring (0 , sumoMapping .length () - 1 ));
191
+ }
192
+ }
193
+ }
194
+ }
195
+ if (!equivalentTerms .isEmpty ()) {
196
+ Random rand = new Random ();
197
+ return equivalentTerms .get (rand .nextInt (equivalentTerms .size ()));
198
+ }
199
+ return null ;
200
+ }
201
+
202
+ /** *********************************************************************
203
+ * Main method. Builds a test set of the form "<term> causes <term>"
204
+ * and its logical equivalent.
205
+ * First, selects a random process from SUMO.
206
+ * Then asks ollama what is caused by that process.
207
+ */
208
+ public static void main (String [] args ) throws Exception {
209
+ if (args == null || args .length < 2 || args .length > 3 || args [0 ].equals ("-h" ))
210
+ System .out .println ("Usage: GenCausesTestData <file prefix> <num to generate> <optional: -e (for equivalence mappings only)" );
211
+ outputFileEnglish = args [0 ] + "-eng.txt" ;
212
+ outputFileLogic = args [0 ] + "-log.txt" ;
213
+ int numToGenerate = Integer .parseInt (args [1 ]);
214
+ if (args .length == 3 && args [2 ].equals ("-e" )) {
215
+ EQUIVALENCE_MAPPINGS = true ;
216
+ System .out .println ("Using ONLY equivalence mappings" );
217
+ }
218
+ else {
219
+ System .out .println ("Drawing from equivalence and subsuming mappings." );
220
+ }
221
+
222
+ OllamaAPI ollamaAPI = new OllamaAPI ();
223
+ ollamaAPI .setVerbose (false );
224
+ boolean RAW_PROMPT = false ;
225
+ Options options = new OptionsBuilder ().setTemperature (1.0f ).build ();
226
+
227
+ KBmanager .getMgr ().initializeOnce ();
228
+ kb = KBmanager .getMgr ().getKB (KBmanager .getMgr ().getPref ("sumokbname" ));
229
+ System .out .println ("Finished loading KBs" );
230
+ Set <String > allSUMOTermsSet = kb .kbCache .getChildClasses ("Process" );
231
+ RandSet allSUMOTermsRandSet = RandSet .listToEqualPairs (allSUMOTermsSet );
232
+
233
+ createFileIfDoesNotExists (outputFileEnglish );
234
+ createFileIfDoesNotExists (outputFileLogic );
235
+ Random random = new Random ();
236
+ String englishSentence ;
237
+
238
+ int sentenceGeneratedCounter = 0 ;
239
+ while (sentenceGeneratedCounter < numToGenerate ) {
240
+ if (debug ) System .out .println ("\n " );
241
+ String randomSumoProcess = allSUMOTermsRandSet .getNext ();
242
+ String randomSumoProcessEnglish = kb .getTermFormat ("EnglishLanguage" , randomSumoProcess );
243
+ if (randomSumoProcessEnglish == null ) {
244
+ randomSumoProcessEnglish = addSpaceBeforeCapitals ((randomSumoProcess ));
245
+ }
246
+ if (debug ) System .out .println ("Random SUMO Process: " + randomSumoProcess );
247
+ String prompt = "Just the response. In a single word, what does '" + randomSumoProcessEnglish + "' cause?" ;
248
+
249
+ OllamaResult result =
250
+ ollamaAPI .generate ("llama3.2" , prompt , RAW_PROMPT , options );
251
+
252
+ if (debug ) System .out .println ("Ollama returns: " + result .getResponse ());
253
+ String responseOllamaEnglish = cleanOllamaResponse (result .getResponse ());
254
+ String responseInSumo = getEquivalentSUMOMapping (responseOllamaEnglish );
255
+
256
+ if (responseInSumo != null ) {
257
+ if (random .nextBoolean ()) {
258
+ int randomIndex = random .nextInt (phrasesCauses .length );
259
+ englishSentence = randomSumoProcessEnglish + phrasesCauses [randomIndex ] + responseOllamaEnglish .toLowerCase () + ".\n " ;
260
+ }
261
+ else {
262
+ int randomIndex = random .nextInt (phrasesCausedBy .length );
263
+ englishSentence = responseOllamaEnglish .toLowerCase () + phrasesCausedBy [randomIndex ] + randomSumoProcessEnglish + ".\n " ;
264
+ }
265
+ char firstChar = Character .toUpperCase (englishSentence .charAt (0 ));
266
+ String remainingChars = englishSentence .substring (1 ).toLowerCase ();
267
+ englishSentence = firstChar + remainingChars ;
268
+ String logicPhrase = "( exists ( ?V1 ?V2) (and (instance ?V1 " + randomSumoProcess + " ) "
269
+ + " (instance ?V2 " + responseInSumo + " ) "
270
+ + " (causesSubclass ?V1 ?V2) ) )\n " ;
271
+ if (debug ) System .out .println ("Resulting English sentence: '" + englishSentence + "'" );
272
+ if (debug ) System .out .println ("Resulting logic: '" + logicPhrase + "'" );
273
+ writeEnglishLogicPairToFile (englishSentence , logicPhrase );
274
+ sentenceGeneratedCounter ++;
275
+ if (sentenceGeneratedCounter % 100 == 0 ) {
276
+ System .out .print ("...." + sentenceGeneratedCounter );
277
+ }
278
+ }
279
+ else {
280
+ if (debug ) System .out .println ("No related process for: " + result .getResponse ());
281
+ }
282
+ }
283
+ }
284
+ }
0 commit comments