1+ package  com .articulate .nlp ;
2+ 
import com.articulate.sigma.*;
import com.articulate.sigma.wordNet.WordNet;

import io.github.ollama4j.OllamaAPI;
import io.github.ollama4j.models.response.OllamaResult;
import io.github.ollama4j.types.OllamaModelType;
import io.github.ollama4j.utils.Options;
import io.github.ollama4j.utils.OptionsBuilder;

import java.io.FileWriter;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;
import java.nio.channels.FileLock;
import java.nio.charset.StandardCharsets;
import java.nio.file.FileAlreadyExistsException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.StandardOpenOption;
import java.util.*;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.List;
import java.util.Locale;
import java.util.Random;
import java.util.Set;
import java.util.stream.Collectors;
28+ 
29+ public  class  GenCausesTestData  {
30+ 
31+     public  static  boolean  debug  = false ;
32+     public  static  KB  kb ;
33+     public  static  String  outputFileEnglish  = "causes-eng.txt" ;
34+     public  static  String  outputFileLogic  = "causes-log.txt" ;
35+     public  static  boolean  EQUIVALENCE_MAPPINGS  = false ;
36+ 
37+     public  static  String [] phrasesCauses  = {
38+             " causes " ,
39+             " leads to " ,
40+             " results in " ,
41+             " brings about " ,
42+             " triggers " ,
43+             " provokes " ,
44+             " induces " ,
45+             " produces " ,
46+             " prompts " ,
47+             " gives rise to " ,
48+             " is responsible for " 
49+     };
50+ 
51+     public  static  String [] phrasesCausedBy  = {
52+             " is caused by " ,
53+             " is due to " ,
54+             " is a result of " ,
55+             " is because of " ,
56+             " is brought about by " ,
57+             " is triggered by " ,
58+             " is provoked by " ,
59+             " is induced by " ,
60+             " is produced by " ,
61+             " is prompted by " ,
62+             " stems from " ,
63+             " arises from " ,
64+             " originates from " ,
65+             " is driven by " ,
66+             " is attributable to " ,
67+             " can be traced back to " 
68+     };
69+ 
70+     /** *************************************************************** 
71+      *   Creates a file if one doesn't exist already. 
72+      */ 
73+     public  static  void  createFileIfDoesNotExists (String  fileName ) {
74+         Path  filePath  = Paths .get (fileName );
75+         if  (Files .exists (filePath )) {
76+             return ;
77+         } else  {
78+             try  {
79+                 Files .createFile (filePath );
80+             } catch  (IOException  e ) {
81+                 e .printStackTrace ();
82+             }
83+         }
84+     }
85+ 
86+     /** *********************************************************************** 
87+      *   Takes an enlish phrase and its logic equivalent. 
88+      *   Files are locked, so that if multiple processes are building 
89+      *   the dataset, the correspondence of english/logic pairs is preserved. 
90+      */ 
91+     public  static  void  writeEnglishLogicPairToFile (String  english , String  logic ) {
92+         FileChannel  fileChannel1  = null ;
93+         FileChannel  fileChannel2  = null ;
94+         FileLock  lock1  = null ;
95+         FileLock  lock2  = null ;
96+ 
97+         try  {
98+             fileChannel1  = FileChannel .open (Paths .get (outputFileEnglish ), StandardOpenOption .WRITE , StandardOpenOption .APPEND );
99+             fileChannel2  = FileChannel .open (Paths .get (outputFileLogic ), StandardOpenOption .WRITE , StandardOpenOption .APPEND );
100+ 
101+             lock1  = fileChannel1 .lock ();
102+             lock2  = fileChannel2 .lock ();
103+ 
104+             ByteBuffer  buffer1  = ByteBuffer .wrap (english .getBytes ());
105+             ByteBuffer  buffer2  = ByteBuffer .wrap (logic .getBytes ());
106+ 
107+             fileChannel1 .write (buffer1 );
108+             fileChannel2 .write (buffer2 );
109+ 
110+         } catch  (IOException  e ) {
111+             e .printStackTrace ();
112+         } finally  {
113+             try  {
114+                 if  (lock1  != null ) lock1 .release ();
115+                 if  (fileChannel1  != null ) fileChannel1 .close ();
116+                 if  (lock2  != null ) lock2 .release ();
117+                 if  (fileChannel2  != null ) fileChannel2 .close ();
118+             } catch  (IOException  e ) {
119+                 e .printStackTrace ();
120+             }
121+         }
122+     }
123+ 
124+     /** *************************************************************** 
125+      *   Takes a string which is a response to an Ollama query. 
126+      *   Removes all punctuation and splits camel case answers. 
127+      *   @return a clean ollama answer. 
128+      */ 
129+     private  static  String  cleanOllamaResponse (String  str ) {
130+         StringBuilder  result  = new  StringBuilder ();
131+         for  (char  c  : str .toCharArray ()) {
132+             if  (Character .isUpperCase (c ) && result .length () > 0 ) {
133+                 result .append (" " );
134+             }
135+             result .append (c );
136+         }
137+         String  sentence  = result .toString ();
138+         sentence  = Arrays .stream (sentence .split ("\\ s+" ))
139+                     .map (word  -> word .substring (0 , 1 ).toUpperCase () + word .substring (1 ).toLowerCase ())
140+                     .collect (Collectors .joining (" " ));
141+         List <Character > punctuation  = Arrays .asList ('.' , ',' , '!' , '?' , ';' , ':' , '-' , '(' , ')' , '[' , ']' , '{' , '}' , '"' , '\'' , ' ' );
142+         sentence  = sentence .chars ()
143+                     .mapToObj (c  -> (char ) c )
144+                     .filter (c  -> !punctuation .contains (c ))
145+                     .map (String ::valueOf )
146+                     .collect (Collectors .joining ());
147+         return  sentence ;
148+     }
149+ 
150+     /** *************************************************************** 
151+      *   Takes a string, such as "ThisIsAStringWithCapitalLetters" 
152+      *   and breaks it up into "This Is A String With Capital Letters" 
153+      *   @return a string broken up into words with spaces. 
154+      */ 
155+     public  static  String  addSpaceBeforeCapitals (String  input ) {
156+         if  (input  == null  || input .isEmpty ()) {
157+             return  input ;
158+         }
159+         StringBuilder  result  = new  StringBuilder ();
160+         result .append (input .charAt (0 )); // Append the first character as is 
161+         for  (int  i  = 1 ; i  < input .length (); i ++) {
162+             char  currentChar  = input .charAt (i );
163+             if  (Character .isUpperCase (currentChar )) {
164+                 result .append (' ' );
165+             }
166+             result .append (currentChar );
167+         }
168+         return  result .toString ();
169+     }
170+ 
171+     /** *************************************************************** 
172+      * Given a term, looks up all the equivalent mappings of that therm 
173+      * in WordNet, and returns a random mapping. 
174+      * @return a random equivalent SUMO Mapping 
175+      */ 
176+     private  static  String  getEquivalentSUMOMapping (String  term ) {
177+         Set <String > synsetOfTerm  = WordNet .wn .getSynsetsFromWord (term .toLowerCase ());
178+         ArrayList <String > equivalentTerms  = new  ArrayList ();
179+         int  counter  = 0 ;
180+         for  (String  synset :synsetOfTerm ) {
181+             if  (debug ) System .out .println ("Synset of "  + term  + ": "  + synset );
182+             String  sumoMapping  = WordNet .wn .getSUMOMapping (synset );
183+             if  (sumoMapping  != null ) {
184+                 sumoMapping  = sumoMapping .substring (2 );
185+                 if  (sumoMapping .charAt (sumoMapping .length () - 1 ) == '='  || EQUIVALENCE_MAPPINGS  == false ) {
186+                     String  sumoTerm  = sumoMapping .substring (0 , sumoMapping .length () - 1 );
187+                     if  (debug ) System .out .println ("Equivalent mapping to: "  + sumoTerm );
188+                     if (kb .kbCache .subclassOf (sumoTerm , "Process" )) {
189+                         if  (debug ) System .out .println (sumoTerm  + " is a process. Added." );
190+                         equivalentTerms .add (sumoMapping .substring (0 , sumoMapping .length () - 1 ));
191+                     }
192+                 }
193+             }
194+         }
195+         if  (!equivalentTerms .isEmpty ()) {
196+             Random  rand  = new  Random ();
197+             return  equivalentTerms .get (rand .nextInt (equivalentTerms .size ()));
198+         }
199+         return  null ;
200+     }
201+ 
202+     /** ********************************************************************* 
203+      * Main method. Builds a test set of the form "<term> causes <term>" 
204+      * and its logical equivalent. 
205+      * First, selects a random process from SUMO. 
206+      * Then asks ollama what is caused by that process. 
207+      */ 
208+     public  static  void  main (String [] args ) throws  Exception  {
209+         if  (args  == null  || args .length  < 2  || args .length  > 3  || args [0 ].equals ("-h" ))
210+             System .out .println ("Usage: GenCausesTestData <file prefix> <num to generate> <optional: -e (for equivalence mappings only)" );
211+         outputFileEnglish  = args [0 ] + "-eng.txt" ;
212+         outputFileLogic  = args [0 ] + "-log.txt" ;
213+         int  numToGenerate  = Integer .parseInt (args [1 ]);
214+         if  (args .length  == 3  && args [2 ].equals ("-e" )) {
215+             EQUIVALENCE_MAPPINGS  = true ;
216+             System .out .println ("Using ONLY equivalence mappings" );
217+         }
218+         else  {
219+             System .out .println ("Drawing from equivalence and subsuming mappings." );
220+         }
221+ 
222+         OllamaAPI  ollamaAPI  = new  OllamaAPI ();
223+         ollamaAPI .setVerbose (false );
224+         boolean  RAW_PROMPT  = false ;
225+         Options  options  = new  OptionsBuilder ().setTemperature (1.0f ).build ();
226+ 
227+         KBmanager .getMgr ().initializeOnce ();
228+         kb  = KBmanager .getMgr ().getKB (KBmanager .getMgr ().getPref ("sumokbname" ));
229+         System .out .println ("Finished loading KBs" );
230+         Set <String > allSUMOTermsSet  = kb .kbCache .getChildClasses ("Process" );
231+         RandSet  allSUMOTermsRandSet  = RandSet .listToEqualPairs (allSUMOTermsSet );
232+         
233+         createFileIfDoesNotExists (outputFileEnglish );
234+         createFileIfDoesNotExists (outputFileLogic );
235+         Random  random  = new  Random ();
236+         String  englishSentence ;
237+ 
238+         int  sentenceGeneratedCounter  = 0 ;
239+         while  (sentenceGeneratedCounter  < numToGenerate ) {
240+             if  (debug ) System .out .println ("\n " );
241+             String  randomSumoProcess  = allSUMOTermsRandSet .getNext ();
242+             String  randomSumoProcessEnglish  = kb .getTermFormat ("EnglishLanguage" , randomSumoProcess );
243+             if  (randomSumoProcessEnglish  == null ) {
244+                 randomSumoProcessEnglish  = addSpaceBeforeCapitals ((randomSumoProcess ));
245+             }
246+             if  (debug ) System .out .println ("Random SUMO Process: "  + randomSumoProcess );
247+             String  prompt  = "Just the response. In a single word, what does '"  + randomSumoProcessEnglish  + "' cause?" ;
248+ 
249+             OllamaResult  result  =
250+                     ollamaAPI .generate ("llama3.2" , prompt , RAW_PROMPT , options );
251+ 
252+             if  (debug ) System .out .println ("Ollama returns: "  + result .getResponse ());
253+             String  responseOllamaEnglish  = cleanOllamaResponse (result .getResponse ());
254+             String  responseInSumo  = getEquivalentSUMOMapping (responseOllamaEnglish );
255+             
256+             if  (responseInSumo  != null ) {
257+                 if  (random .nextBoolean ()) {
258+                     int  randomIndex  = random .nextInt (phrasesCauses .length );
259+                     englishSentence  = randomSumoProcessEnglish  + phrasesCauses [randomIndex ] + responseOllamaEnglish .toLowerCase () + ".\n " ;
260+                 }
261+                 else  {
262+                     int  randomIndex  = random .nextInt (phrasesCausedBy .length );
263+                     englishSentence  = responseOllamaEnglish .toLowerCase () + phrasesCausedBy [randomIndex ] + randomSumoProcessEnglish  + ".\n " ;
264+                 }
265+                 char  firstChar  = Character .toUpperCase (englishSentence .charAt (0 ));
266+                 String  remainingChars  = englishSentence .substring (1 ).toLowerCase ();
267+                 englishSentence  = firstChar  + remainingChars ;
268+                 String  logicPhrase  = "( exists ( ?V1 ?V2)  (and (instance ?V1 "  + randomSumoProcess  + " ) " 
269+                                         + " (instance ?V2 "  + responseInSumo  + " ) " 
270+                                         + " (causesSubclass ?V1 ?V2) ) )\n " ;
271+                 if  (debug ) System .out .println ("Resulting English sentence: '"  + englishSentence  + "'" );
272+                 if  (debug ) System .out .println ("Resulting logic: '"  + logicPhrase  + "'" );
273+                 writeEnglishLogicPairToFile (englishSentence , logicPhrase );
274+                 sentenceGeneratedCounter ++;
275+                 if  (sentenceGeneratedCounter  % 100  == 0 ) {
276+                     System .out .print ("...."  + sentenceGeneratedCounter );
277+                 }
278+             }
279+             else  {
280+                 if  (debug ) System .out .println ("No related process for: "  + result .getResponse ());
281+             }
282+         }
283+     }
284+ }
0 commit comments