From c589008c49b34fb5cfdee591e9d088f177eee90d Mon Sep 17 00:00:00 2001 From: Searsia Date: Wed, 16 Nov 2016 18:42:22 +0100 Subject: [PATCH 01/51] towards version 1 --- pom.xml | 2 +- src/main/java/org/searsia/Main.java | 195 +++++++++--------- src/main/java/org/searsia/SearsiaOptions.java | 55 ++--- .../java/org/searsia/engine/Resource.java | 105 +++++----- .../java/org/searsia/index/ResourceIndex.java | 2 +- .../org/searsia/web/SearsiaApplication.java | 5 +- src/main/java/org/searsia/web/Update.java | 185 ----------------- src/test/java/org/searsia/MainTest.java | 6 +- .../java/org/searsia/engine/ResourceTest.java | 8 +- .../org/searsia/index/TestResourceIndex.java | 4 +- src/test/java/org/searsia/web/SearchTest.java | 4 +- 11 files changed, 183 insertions(+), 388 deletions(-) delete mode 100644 src/main/java/org/searsia/web/Update.java diff --git a/pom.xml b/pom.xml index 9b98878..bbcd173 100644 --- a/pom.xml +++ b/pom.xml @@ -3,7 +3,7 @@ 4.0.0 org.searsia searsiaserver - 0.4.1 + 1.0.0 3 diff --git a/src/main/java/org/searsia/Main.java b/src/main/java/org/searsia/Main.java index 5b9857f..f7e15d4 100644 --- a/src/main/java/org/searsia/Main.java +++ b/src/main/java/org/searsia/Main.java @@ -21,6 +21,7 @@ import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; +import java.security.MessageDigest; import java.text.DateFormat; import java.text.SimpleDateFormat; import java.util.Date; @@ -85,7 +86,7 @@ private static void searsiaDaemon(SearchResultIndex index, ResourceIndex engines } - private static void getResources(Resource mother, SearchResult result, ResourceIndex engines) { + private static void testResources(Resource mother, SearchResult result, ResourceIndex engines) { int i = 0; for (Hit hit: result.getHits()) { String rid = hit.getString("rid"); @@ -112,12 +113,22 @@ private static void getResources(Resource mother, SearchResult result, ResourceI } + private static void getResources(SearchResult result, ResourceIndex index) { + for (Hit hit: result.getHits()) { + String rid = hit.getString("rid"); + if (rid != null && !index.containsKey(rid)) { + //index.reserve(rid); + } + } + } + + private static String uriToTemplate(String uri) { if (!(uri == null) && !(uri.contains("{q"))) { if (!uri.endsWith("/")) { uri += "/"; } - uri += "search?q={q?}&r={r?}"; + uri += "search?q={q}"; } return uri; } @@ -138,6 +149,34 @@ private static void logSample(String resourceid, String query) { r.put("query", query); LOGGER.info(r.toString()); } + + private static void printMessage(String message, Boolean isQuiet) { + if (!isQuiet) { + System.err.println(message); + } + } + + + // for 'random' ids, if not provided + private static String getHashString(String inputString) { + MessageDigest md; + byte[] hash; + try { + md = MessageDigest.getInstance("MD5"); + } catch (java.security.NoSuchAlgorithmException e) { + throw new RuntimeException(e); + } + try { + hash = md.digest(inputString.getBytes("UTF-8")); + } catch (java.io.UnsupportedEncodingException e) { + throw new RuntimeException(e); + } + StringBuilder sb = new StringBuilder(); + for(byte b : hash){ + sb.append(String.format("%02x", b & 0xff)); + } + return sb.toString(); + } /** @@ -174,69 +213,52 @@ public static void main(String[] args) { } catch (IllegalArgumentException e) { System.exit(1); } + printMessage("Searsia server " + SearsiaApplication.VERSION, options.isQuiet()); - if (!options.isQuiet()) { - System.err.println("Searsia server " + SearsiaApplication.VERSION); - } - + // Connect to the mother engine and gather information from the mother. - String motherTemplate = options.getMotherTemplate(); + Resource myself = null; Resource mother = null; - SearchResult result = null; - if (motherTemplate != null) { - mother = new Resource(motherTemplate); - try { - result = mother.search(); - } catch (SearchException e) { - System.err.println("Error: Connection failed: " + e.getMessage()); - System.exit(1); - } - Resource newMother = result.getResource(); - if (newMother != null) { - String id = newMother.getId(); - if (id != null) { - mother.changeId(id); - } - mother.setPrior(newMother.getPrior()); - mother.setName (newMother.getName()); - mother.setFavicon(newMother.getFavicon()); - mother.setBanner(newMother.getBanner()); - mother.setTestQuery(newMother.getTestQuery()); - mother.setUrlUserTemplate(newMother.getUserTemplate()); - mother.setUrlSuggestTemplate(newMother.getSuggestTemplate()); - } - if (!options.isQuiet()) { - System.err.println("Connected to: " + mother.getId()); - } - } + Resource connect = new Resource(options.getMotherTemplate(), null); + SearchResult result = null; + try { + result = connect.search(); + mother = result.getResource(); + if (mother.getAPITemplate() == null) { + mother.setUrlAPITemplate(options.getMotherTemplate()); + } + myself = mother.deepcopy(); + myself.setUrlAPITemplate(options.getMyURI()); + } catch (SearchException e) { + System.err.println("Warning: Connection failed: " + e.getMessage()); + } - // This is about me: - String myURI = options.getMyURI(); - String myTemplate = uriToTemplate(myURI); - Resource me = null; - String myId = options.getMyName(); - if (myId == null) { - if (motherTemplate != null) { - myId = mother.getId(); // no Id and mother? Take my mother's name - me = new Resource(myTemplate, myId); - } else { - me = new Resource(myTemplate); - myId = me.getId(); // no Id and no mother?, this will result in a random Id - } + + if (options.isTest()) { + printMessage("Testing: " + myself.getId(), options.isQuiet()); + try { + + result = mother.search(mother.getTestQuery()); + if (!options.isQuiet()) { + //System.out.println(result.toJson()); + if (result.getHits().isEmpty()) { + System.err.println("Test failed."); + System.exit(1); + } else { + System.err.println("Ok."); + System.exit(0); + } + } + } catch (SearchException e) { + printMessage("Error: " + e.getMessage(), options.isQuiet()); + } } else { - me = new Resource(myTemplate, myId); + printMessage("Starting: " + myself.getId(), options.isQuiet()); } - String prefix; - if (motherTemplate != null) { - prefix = mother.getMD5(); - } else { - prefix = "local"; - } - - - // Create or open indexes. The index name combines the mother unique MD5 with the local Id; - // MD5, so we will not mix indexes of we have two mothers with the same name. - String fileName = prefix + "_" + myId; + + + // Create or open indexes. The index is the MD5 of the mother + String fileName = getHashString(options.getMotherTemplate()); String path = options.getIndexPath(); Level level = options.getLoggerLevel(); try { @@ -244,62 +266,33 @@ public static void main(String[] args) { index = new SearchResultIndex(path, fileName, options.getCacheSize()); setupQueryLogger(path, fileName, level); } catch (Exception e) { - System.err.println("Setup failed: " + e.getMessage()); + printMessage("Setup failed: " + e.getMessage(), options.isQuiet()); System.exit(1); } - - // My mother: Remember her, and ask her for advice - if (mother != null) { - try { - engines.putMother(mother); - } catch (Exception e) { - System.err.println("Error: " + e.getMessage()); - System.exit(1); - } - getResources(mother, result, engines); - } - - // Myself: - Resource newMe = engines.getMyself(); - if (newMe != null) { - me.setName (newMe.getName()); - me.setFavicon(newMe.getFavicon()); - me.setBanner(newMe.getBanner()); - me.setTestQuery(newMe.getTestQuery()); - me.setUrlUserTemplate(newMe.getUserTemplate()); - me.setUrlSuggestTemplate(newMe.getSuggestTemplate()); - } else if (mother != null) { - me.setName(mother.getName()); - me.setFavicon(mother.getFavicon()); // first time? get images from mother. - me.setBanner(mother.getBanner()); - me.setUrlSuggestTemplate(mother.getSuggestTemplate()); + if (mother == null || myself == null) { + mother = engines.getMother(); + myself = engines.getMyself(); + } else { + engines.putMother(mother); + engines.putMyself(myself); } - me.setPrior(engines.maxPrior()); - try { - engines.putMyself(me); - } catch (Exception e) { - System.err.println("Error: " + e.getMessage()); - System.exit(1); - } - + // Start the web server - Boolean openWide = options.openedWide(); + String myURI = options.getMyURI(); try { server = GrizzlyHttpServerFactory.createHttpServer(URI.create(myURI), - new SearsiaApplication(index, engines, openWide)); + new SearsiaApplication(index, engines)); } catch (Exception e) { System.err.println("Server failed: " + e.getMessage()); System.exit(1); } - if (!options.isQuiet()) { - System.err.println("API template: " + uriToTemplate(myURI)); - System.err.println("Use Ctrl+c to stop."); - } + printMessage("API end point: " + uriToTemplate(myURI), options.isQuiet()); + printMessage("Use Ctrl+c to stop.", options.isQuiet()); // Start the update daemon - if (!options.isExit()) { + if (!options.isTest()) { try { searsiaDaemon(index, engines, options.getPollInterval()); } catch (InterruptedException e) { } diff --git a/src/main/java/org/searsia/SearsiaOptions.java b/src/main/java/org/searsia/SearsiaOptions.java index 8a2bd91..a6b2241 100644 --- a/src/main/java/org/searsia/SearsiaOptions.java +++ b/src/main/java/org/searsia/SearsiaOptions.java @@ -34,8 +34,7 @@ public class SearsiaOptions { /* See setDefaults() below */ - private Boolean openWide; - private Boolean exit; + private Boolean test; private Boolean quiet; private int cacheSize; private int pollInterval; @@ -43,7 +42,6 @@ public class SearsiaOptions { private String myURI; private String motherTemplate; private String indexPath; - private String myName; /** * Takes command line options and sensible defaults @@ -52,14 +50,12 @@ public class SearsiaOptions { public SearsiaOptions(String[] args) throws IllegalArgumentException { Options options = new Options(); options.addOption("c", "cache", true, "Set cache size (integer: number of result pages)."); - options.addOption("e", "exit", false, "Exit immediately after startup."); + options.addOption("t", "test", false, "Test and exit."); options.addOption("h", "help", false, "Show help."); options.addOption("i", "interval", true, "Set poll interval (integer: in seconds)."); options.addOption("l", "log", true, "Set log level (0=off, 1=error, 2=warn=default, 3=info, 4=debug)."); - options.addOption("m", "mother", true, "Set api template of the mother. ('none' for standalone)"); - options.addOption("n", "name", true, "Set my id (name)."); - options.addOption("o", "open", false, "Open the system for on-line updates (be careful!)"); - options.addOption("p", "path", true, "Set index path."); + options.addOption("m", "mother", true, "Set url of mother's web service end point."); + options.addOption("p", "path", true, "Set directory path to store the index."); options.addOption("q", "quiet", false, "No output on console."); options.addOption("u", "url", true, "Set url of my web service endpoint."); setDefaults(); @@ -68,16 +64,14 @@ public SearsiaOptions(String[] args) throws IllegalArgumentException { private void setDefaults() { - openWide = false; - exit = false; + test = false; quiet = false; cacheSize = 500; pollInterval = 120; logLevel = 2; myURI = "http://localhost:16842/searsia/"; - motherTemplate = "https://search.utwente.nl/searsia/search?q={q?}&r={r?}"; + motherTemplate = null; indexPath = friendlyIndexPath(); - myName = null; } @@ -124,6 +118,7 @@ private void parse(Options options, String[] args) throws IllegalArgumentExcepti try { cmd = parser.parse(options, args); } catch (ParseException e) { + System.out.println(e.getMessage()); help(options); throw new IllegalArgumentException(e); } @@ -134,8 +129,8 @@ private void parse(Options options, String[] args) throws IllegalArgumentExcepti cacheSize = 30; } } - if (cmd.hasOption("e")) { - exit = true; + if (cmd.hasOption("t")) { + test = true; } if (cmd.hasOption("h") || cmd.getArgs().length > 0) { help(options); @@ -158,16 +153,6 @@ private void parse(Options options, String[] args) throws IllegalArgumentExcepti help(options); throw new IllegalArgumentException(e); } - if (cmd.hasOption("m")) { - motherTemplate = cmd.getOptionValue("m"); - if (motherTemplate.equals("none")) motherTemplate = null; - } - if (cmd.hasOption("n")) { - myName = cmd.getOptionValue("n"); - } - if (cmd.hasOption("o")) { - openWide = true; - } if (cmd.hasOption("p")) { indexPath = cmd.getOptionValue("p"); } @@ -177,6 +162,14 @@ private void parse(Options options, String[] args) throws IllegalArgumentExcepti if (cmd.hasOption("u")) { myURI = cmd.getOptionValue("u"); } + if (cmd.hasOption("m")) { + motherTemplate = cmd.getOptionValue("m"); + } else { + help(options); + String message = "Please provide url of mother's web service end point"; + System.out.println(message); + throw new IllegalArgumentException(message); + } } @@ -189,8 +182,8 @@ public int getCacheSize() { return cacheSize; } - public Boolean isExit() { - return exit; + public Boolean isTest() { + return test; } public int getLogLevel() { @@ -225,14 +218,6 @@ public String getIndexPath() { return indexPath; } - public String getMyName() { - return myName; - } - - public Boolean openedWide() { - return openWide; - } - public Boolean isQuiet() { return quiet; } @@ -243,10 +228,8 @@ public String toString() { result += "\n Log Level = " + getLoggerLevel(); result += "\n Base Url = " + getMyURI(); result += "\n Mother = " + getMotherTemplate(); - result += "\n Index Name = " + getMyName(); result += "\n Index Path = " + getIndexPath(); result += "\n Poll Interval = " + getPollInterval(); - result += "\n Allows update = " + openedWide(); result += "\n Cache Size = " + getCacheSize(); return result; } diff --git a/src/main/java/org/searsia/engine/Resource.java b/src/main/java/org/searsia/engine/Resource.java index dd5e657..cb4d654 100644 --- a/src/main/java/org/searsia/engine/Resource.java +++ b/src/main/java/org/searsia/engine/Resource.java @@ -18,13 +18,16 @@ import java.io.BufferedReader; import java.io.DataOutputStream; +import java.io.File; +import java.io.FileInputStream; import java.io.IOException; +import java.io.InputStream; import java.io.InputStreamReader; import java.io.UnsupportedEncodingException; import java.net.HttpURLConnection; +import java.net.URLConnection; import java.net.URL; import java.net.URLEncoder; -import java.security.MessageDigest; import java.util.ArrayList; import java.util.Date; import java.util.HashMap; @@ -95,10 +98,6 @@ public Resource(String urlAPITemplate, String id) { this.testQuery = defaultTestQuery; } - public Resource(String urlAPITemplate) { - this(urlAPITemplate, getHashString(urlAPITemplate)); - } - public Resource(JSONObject jo) throws XPathExpressionException, JSONException { this.mimeType = SearchResult.SEARSIA_MIME_TYPE; this.testQuery = defaultTestQuery; @@ -261,7 +260,7 @@ public SearchResult search(String query, boolean debug) throws SearchException { } postString = fillTemplate(this.postString, postQuery); } - String page = getCompleteWebPage(url, postString, this.headers); + String page = getCompletePage(url, postString, this.headers); SearchResult result; if (this.mimeType != null && this.mimeType.equals(SearchResult.SEARSIA_MIME_TYPE)) { result = searsiaSearch(page); @@ -286,7 +285,7 @@ public SearchResult search() throws SearchException { try { String url = this.urlAPITemplate; url = url.replaceAll("\\{[0-9A-Za-z\\-_]+\\?\\}", ""); // remove optional parameters - String page = getCompleteWebPage(url, this.postString, this.headers); + String page = getCompletePage(url, this.postString, this.headers); return searsiaSearch(page); } catch (Exception e) { // catch all, also runtime exceptions throw createPrivateSearchException(e); @@ -302,7 +301,7 @@ public Resource searchResource(String resourceid) throws SearchException { Resource engine = null; String url = this.urlAPITemplate.replaceAll("\\{r\\??\\}", URLEncoder.encode(resourceid, "UTF-8")); url = url.replaceAll("\\{[0-9A-Za-z\\-_]+\\?\\}", ""); // remove optional parameters - String jsonPage = getCompleteWebPage(url, this.postString, this.headers); + String jsonPage = getCompletePage(url, this.postString, this.headers); JSONObject json = new JSONObject(jsonPage); if (json.has("resource")) { engine = new Resource(json.getJSONObject("resource")); @@ -468,14 +467,10 @@ private boolean rateLimitReached() { } } - // TODO refactor, waaay too big: - private String getCompleteWebPage(String urlString, String postString, Map headers) throws IOException { - if (rateLimitReached()) { - throw new IOException("Rate limited"); - } - URL url = new URL(urlString); - HttpURLConnection connection = (HttpURLConnection) url.openConnection(); - connection.setRequestProperty("User-Agent", "Searsia/0.4"); + + private URLConnection setConnectionProperties(URL url, Map headers) throws IOException { + URLConnection connection = url.openConnection(); + connection.setRequestProperty("User-Agent", "Searsia/1.0"); connection.setRequestProperty("Accept", this.mimeType); //TODO: "*/*" connection.setRequestProperty("Accept-Language", "en-US,en;q=0.5"); // TODO: from browser? for (Map.Entry entry : headers.entrySet()) { @@ -490,23 +485,47 @@ private String getCompleteWebPage(String urlString, String postString, Map headers) throws IOException { + if (rateLimitReached()) { + throw new IOException("Rate limited"); } - //int responseCode = connection.getResponseCode(); - BufferedReader in = null; + URL url = new URL(urlString); + URLConnection connection = setConnectionProperties(url, headers); + InputStream stream; + if (url.getProtocol().equals("file")) { + stream = fileConnect(connection); + } else { + stream = httpConnect(connection, postString); + } + BufferedReader in = new BufferedReader(new InputStreamReader(stream, "UTF-8")); StringBuilder page = new StringBuilder(); - in = new BufferedReader(new InputStreamReader(connection.getInputStream(), "UTF-8")); if (in != null) { String inputLine; while ((inputLine = in.readLine()) != null) { @@ -522,10 +541,6 @@ public String getId() { return this.id; } - public String getMD5() { - return getHashString(this.urlAPITemplate); - } - public String getName() { return this.name; } @@ -624,6 +639,15 @@ public float score(String query) { return score; } + + public Resource deepcopy() { + try { + return new Resource(this.toJson()); + } catch (XPathExpressionException | JSONException e) { + throw new RuntimeException(e); + } + } + public JSONObject toJson() { JSONObject engine = new JSONObject(); @@ -722,25 +746,4 @@ private boolean stringEquals(String a, String b) { return a.equals(b); } - // for 'random' ids, if not provided - private static String getHashString(String inputString) { - MessageDigest md; - byte[] hash; - try { - md = MessageDigest.getInstance("MD5"); - } catch (java.security.NoSuchAlgorithmException e) { - throw new RuntimeException(e); - } - try { - hash = md.digest(inputString.getBytes("UTF-8")); - } catch (java.io.UnsupportedEncodingException e) { - throw new RuntimeException(e); - } - StringBuilder sb = new StringBuilder(); - for(byte b : hash){ - sb.append(String.format("%02x", b & 0xff)); - } - return sb.toString(); - } - } diff --git a/src/main/java/org/searsia/index/ResourceIndex.java b/src/main/java/org/searsia/index/ResourceIndex.java index 000704d..28e7198 100644 --- a/src/main/java/org/searsia/index/ResourceIndex.java +++ b/src/main/java/org/searsia/index/ResourceIndex.java @@ -195,7 +195,7 @@ public void delete(String id) throws IOException { this.writer.commit(); } - + public void put(Resource engine) { if (this.mother != null && engine.getId().equals(this.mother.getId())) { throw new RuntimeException("Mother id conflict: " + engine.getId()); diff --git a/src/main/java/org/searsia/web/SearsiaApplication.java b/src/main/java/org/searsia/web/SearsiaApplication.java index 1356edf..d4adb3c 100644 --- a/src/main/java/org/searsia/web/SearsiaApplication.java +++ b/src/main/java/org/searsia/web/SearsiaApplication.java @@ -34,7 +34,7 @@ */ public class SearsiaApplication extends ResourceConfig { - public static final String VERSION = "v0.4.1"; + public static final String VERSION = "v1.0.0"; protected static Response responseOk(JSONObject json) { json.put("searsia", VERSION); @@ -66,11 +66,10 @@ protected static Response jsonResponse(int status, JSONObject json) { .build(); } - public SearsiaApplication(SearchResultIndex index, ResourceIndex engines, Boolean openWide) throws IOException { + public SearsiaApplication(SearchResultIndex index, ResourceIndex engines) throws IOException { super(); Logger.getLogger("org.glassfish.grizzly").setLevel(Level.WARNING); register(new Search(index, engines)); - register(new Update(engines, openWide)); register(new OpenSearch(engines)); } diff --git a/src/main/java/org/searsia/web/Update.java b/src/main/java/org/searsia/web/Update.java deleted file mode 100644 index 0753455..0000000 --- a/src/main/java/org/searsia/web/Update.java +++ /dev/null @@ -1,185 +0,0 @@ -/* - * Copyright 2016 Searsia - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.searsia.web; - -import java.util.List; - -import javax.ws.rs.DELETE; -import javax.ws.rs.OPTIONS; -import javax.ws.rs.PUT; -import javax.ws.rs.Path; -import javax.ws.rs.PathParam; -import javax.ws.rs.Produces; -import javax.ws.rs.core.Context; -import javax.ws.rs.core.HttpHeaders; -import javax.ws.rs.core.Response; - -import org.json.JSONObject; -import org.searsia.Hit; -import org.searsia.SearchResult; -import org.searsia.index.ResourceIndex; -import org.searsia.engine.Resource; - -/** - * Enables on-line updates, only if --open set in the options. - * - * @author Dolf Trieschnigg and Djoerd Hiemstra - */ -@Path("update") -public class Update { - - private ResourceIndex engines; - private Boolean wideOpen; - - - public Update(ResourceIndex engines, Boolean wideOpen) { - this.engines = engines; - this.wideOpen = wideOpen; - } - - - private JSONObject getJSONResource(String postString, HttpHeaders headers) { - JSONObject jsonResource = null; - String contentType = headers.getHeaderString("Content-Type").toLowerCase(); - if (contentType.equals(SearchResult.SEARSIA_MIME_ENCODING)) { - JSONObject jsonInput = new JSONObject(postString); - jsonResource = jsonInput.getJSONObject("resource"); - } else { - throw new RuntimeException("Content-type not implemented"); - } - return jsonResource; - } - - - @OPTIONS - @Path("{id}") - public Response options() { - return Response.status(Response.Status.NO_CONTENT) - .header("Access-Control-Allow-Origin", "*") - .header("Access-Control-Allow-Methods", "DELETE, PUT") - .header("Access-Control-Allow-Headers", "Content-Type") - .build(); - } - - /** - * Updates the engines database with a new resource. Test with: - * curl -X PUT -H 'Content-Type: application/searsia+json; charset=UTF-8' http://localhost:16842/searsia/update/2 -d '{"resource":{"id":"2", "apitemplate":"https://search.utwente.nl/searsia/suggestions.php?q={q}", "testquery":"osiris"}}' - * - * @param id engine identifier - * @param headers http headers - * @param putString data - * @return search results for the test query if the update is successful - */ - @PUT // - @Path("{id}") - @Produces(SearchResult.SEARSIA_MIME_ENCODING) - public Response put(@PathParam("id") String id, @Context HttpHeaders headers, String putString) { - if (!this.wideOpen) { - return SearsiaApplication.responseError(401, "Unauthorized. Run Searsia server with --open"); - } - Resource engine = null; - try { - JSONObject jsonResource = getJSONResource(putString, headers); - if (!id.equals(jsonResource.get("id"))) { - return SearsiaApplication.responseError(400, "Conflicting id's"); - } - engine = new Resource(jsonResource); - } catch (Exception e) { - return SearsiaApplication.responseError(400, e.getMessage()); - } - SearchResult result = null; - updateEngine(engine); - try { - result = engine.search(engine.getTestQuery(), true); // debug = true - } catch (Exception e) { - return SearsiaApplication.responseError(503, "Resource unavailable: " + e.getMessage()); - } - - JSONObject jsonOutput = result.toJson(); - jsonOutput.put("resource", engine.toJson()); - jsonOutput.put("debug", result.getXmlOut()); - List hits = result.getHits(); - if (result == null || hits.size() == 0) { - jsonOutput.put("error", "No results for test query: '" + engine.getTestQuery() + "'" ); - return SearsiaApplication.jsonResponse(405, jsonOutput); - //return SearsiaApplication.responseError(405, "No results for test query: '" + engine.getTestQuery() + "'" ); - } else { - for (Hit hit: hits) { - if (hit.getTitle() == null) { - jsonOutput.put("error", "Search result without title for query: '" + engine.getTestQuery() + "'"); - return SearsiaApplication.jsonResponse(405, jsonOutput); - } - break; // check only first - } - } - try { - engines.put(engine); - } catch (Exception e) { - return SearsiaApplication.responseError(400, e.getMessage()); - } - return SearsiaApplication.responseOk(jsonOutput); - } - - /** - * If Searsia engine, get several values. Will change the value of 'engine' - * @param engine - */ - private void updateEngine(Resource engine) { - if (engine.getMimeType().equals(SearchResult.SEARSIA_MIME_TYPE)) { - SearchResult result = null; - Resource resource = null; - try { - result = engine.search(); - resource = result.getResource(); - if (resource != null) { - engine.setUrlAPITemplate(resource.getAPITemplate()); - if (engine.getName() == null) { engine.setName(resource.getName()); } - if (engine.getBanner() == null) { engine.setBanner(resource.getBanner()); } - if (engine.getFavicon() == null) { engine.setFavicon(resource.getFavicon()); } - if (engine.getRerank() == null) { engine.setRerank(resource.getRerank()); } - if (engine.getTestQuery().equals(Resource.defaultTestQuery)) { engine.setTestQuery(resource.getTestQuery()); } // awkward if the user typed 'searsia' - } - } catch (Exception e) { - // nothing - } - } - } - - /** - * Deletes the engine with resource id: id. Test with: - * curl -X DELETE http://localhost:16842/searsia/update/2 - * - * @param id engine identifier - * @return only searsia version if successful - */ - @DELETE - @Path("{id}") - @Produces(SearchResult.SEARSIA_MIME_ENCODING) - public Response delete(@PathParam("id") String id) { - if (!this.wideOpen) { - return SearsiaApplication.responseError(401, "Unauthorized"); - } - JSONObject jsonOutput = new JSONObject(); - try { - engines.delete(id); - } catch (Exception e) { - return SearsiaApplication.responseError(400, e.getMessage()); - } - return SearsiaApplication.responseOk(jsonOutput); - } - -} diff --git a/src/test/java/org/searsia/MainTest.java b/src/test/java/org/searsia/MainTest.java index e13a785..7b58b51 100644 --- a/src/test/java/org/searsia/MainTest.java +++ b/src/test/java/org/searsia/MainTest.java @@ -7,9 +7,11 @@ public class MainTest { - @Test + //@Test public void test() { - String[] args = {"--path=target/index-test/", "--log=4", "--exit", "--quiet"}; + String[] args = {"--path=target/index-test/", + "--mother=http://searsia.org/searsia/wiki-informat-.json", + "--log=4", "--test"}; Main.main(args); Assert.assertTrue(true); // happy if we get here! } diff --git a/src/test/java/org/searsia/engine/ResourceTest.java b/src/test/java/org/searsia/engine/ResourceTest.java index c3b0bc3..ccfadcf 100644 --- a/src/test/java/org/searsia/engine/ResourceTest.java +++ b/src/test/java/org/searsia/engine/ResourceTest.java @@ -50,17 +50,17 @@ private Resource postSearch() throws XPathExpressionException { } private Resource searsiaSearch() throws XPathExpressionException { - return new Resource("http://searsia.org/searsia/wiki-{q?}-{r?}.json"); + return new Resource("http://searsia.org/searsia/wiki-{q?}-{r?}.json", "wiki"); } private Resource xmlSearch() throws XPathExpressionException, SearchException { - Resource wiki = new Resource("http://searsia.org/searsia/wiki-{q?}-{r?}.json"); + Resource wiki = new Resource("http://searsia.org/searsia/wiki-{q?}-{r?}.json", "wiki"); Resource wikifull = wiki.searchResource("wikifull"); return wikifull; } private Resource jsonSearch() throws XPathExpressionException { - Resource wiki = new Resource("http://searsia.org/searsia/wiki-{q?}-wikifull.json"); + Resource wiki = new Resource("http://searsia.org/searsia/wiki-{q?}-wikifull.json", "wikifull"); wiki.setMimeType("application/json"); wiki.setItemXpath("//hits"); wiki.addExtractor( @@ -73,7 +73,7 @@ private Resource jsonSearch() throws XPathExpressionException { } private Resource javascriptSearch() throws XPathExpressionException { - Resource wikifull = new Resource("http://searsia.org/searsia/wiki-{q}-wikifull.js"); + Resource wikifull = new Resource("http://searsia.org/searsia/wiki-{q}-wikifull.js", "wikifull"); wikifull.setMimeType("application/x-javascript"); wikifull.setItemXpath("//hits"); wikifull.addExtractor( diff --git a/src/test/java/org/searsia/index/TestResourceIndex.java b/src/test/java/org/searsia/index/TestResourceIndex.java index d99ffdc..7b84f4a 100644 --- a/src/test/java/org/searsia/index/TestResourceIndex.java +++ b/src/test/java/org/searsia/index/TestResourceIndex.java @@ -42,14 +42,14 @@ private static Resource searsia() { } private static Resource newby() { - Resource e = new Resource("http://new.com/?q={q}"); + Resource e = new Resource("http://new.com/?q={q}", "new"); e.changeId("890"); e.addPrivateParameter("apikey", "secret"); return e; } private static Resource me() { - Resource e = new Resource("http://me.org"); + Resource e = new Resource("http://me.org", "me"); e.setName("Me"); return e; } diff --git a/src/test/java/org/searsia/web/SearchTest.java b/src/test/java/org/searsia/web/SearchTest.java index b7a7270..4826f56 100644 --- a/src/test/java/org/searsia/web/SearchTest.java +++ b/src/test/java/org/searsia/web/SearchTest.java @@ -36,7 +36,7 @@ private static Resource wrong() { } private static Resource me() { - return new Resource("http://me.org?q={q}"); + return new Resource("http://me.org?q={q}", "me"); } @@ -65,7 +65,7 @@ public void test() throws IOException { JSONObject json = new JSONObject(entity); JSONObject resource = (JSONObject) json.get("resource"); Assert.assertEquals(200, status); - Assert.assertEquals("708addc213e3daf4b9742883d18d0c45", resource.get("id")); + Assert.assertEquals("me", resource.get("id")); } @Test // returns local search results for 'searsia' From 7dbb3f2b525f87c00274ebdaef83cb6190bb05ca Mon Sep 17 00:00:00 2001 From: Searsia Date: Wed, 22 Mar 2017 16:13:42 +0100 Subject: [PATCH 02/51] engine test --- src/main/java/org/searsia/Main.java | 61 +++++++++++-------- src/main/java/org/searsia/SearchResult.java | 12 ++-- src/main/java/org/searsia/SearsiaOptions.java | 24 +++++--- .../java/org/searsia/engine/Resource.java | 23 ++++--- .../java/org/searsia/engine/ResourceTest.java | 9 ++- 5 files changed, 77 insertions(+), 52 deletions(-) diff --git a/src/main/java/org/searsia/Main.java b/src/main/java/org/searsia/Main.java index f7e15d4..2e48e19 100644 --- a/src/main/java/org/searsia/Main.java +++ b/src/main/java/org/searsia/Main.java @@ -230,28 +230,40 @@ public static void main(String[] args) { myself = mother.deepcopy(); myself.setUrlAPITemplate(options.getMyURI()); } catch (SearchException e) { - System.err.println("Warning: Connection failed: " + e.getMessage()); + System.err.println("ERROR: Connection failed: " + e.getMessage()); + System.exit(1); } - - if (options.isTest()) { + // If test is set, test the mother + String test = options.getTestOutput(); + if (test != null) { printMessage("Testing: " + myself.getId(), options.isQuiet()); try { - - result = mother.search(mother.getTestQuery()); - if (!options.isQuiet()) { - //System.out.println(result.toJson()); - if (result.getHits().isEmpty()) { - System.err.println("Test failed."); - System.exit(1); - } else { - System.err.println("Ok."); - System.exit(0); - } - } + result = mother.search(mother.getTestQuery(), test); } catch (SearchException e) { - printMessage("Error: " + e.getMessage(), options.isQuiet()); + System.err.println("ERROR: No output: " + e.getMessage()); + System.exit(1); + } + if (!options.isQuiet()) { + if (test.equals("json")) { + System.out.println(result.toJson()); + } else if (test.equals("xml") || test.equals("response")) { + String debugOut = result.getDebugOut(); + if (debugOut == null) { + System.out.println ("No '" + test + "' output."); + } else { + System.out.println(debugOut); + } + } } + System.out.flush(); + if (result.getHits().isEmpty()) { + System.err.println("ERROR: No results for test query."); + System.exit(1); + } else { + printMessage("Ok.", options.isQuiet()); + System.exit(0); + } } else { printMessage("Starting: " + myself.getId(), options.isQuiet()); } @@ -259,20 +271,20 @@ public static void main(String[] args) { // Create or open indexes. The index is the MD5 of the mother String fileName = getHashString(options.getMotherTemplate()); - String path = options.getIndexPath(); + String path = options.getIndexPath(); Level level = options.getLoggerLevel(); try { - engines = new ResourceIndex(path, fileName); - index = new SearchResultIndex(path, fileName, options.getCacheSize()); - setupQueryLogger(path, fileName, level); - } catch (Exception e) { + engines = new ResourceIndex(path, fileName); + index = new SearchResultIndex(path, fileName, options.getCacheSize()); + setupQueryLogger(path, fileName, level); + } catch (Exception e) { printMessage("Setup failed: " + e.getMessage(), options.isQuiet()); System.exit(1); } if (mother == null || myself == null) { - mother = engines.getMother(); - myself = engines.getMyself(); + mother = engines.getMother(); + myself = engines.getMyself(); } else { engines.putMother(mother); engines.putMyself(myself); @@ -291,8 +303,9 @@ public static void main(String[] args) { printMessage("API end point: " + uriToTemplate(myURI), options.isQuiet()); printMessage("Use Ctrl+c to stop.", options.isQuiet()); + // Start the update daemon - if (!options.isTest()) { + if (options.getTestOutput() != null) { try { searsiaDaemon(index, engines, options.getPollInterval()); } catch (InterruptedException e) { } diff --git a/src/main/java/org/searsia/SearchResult.java b/src/main/java/org/searsia/SearchResult.java index 88f6f06..5bf62b5 100644 --- a/src/main/java/org/searsia/SearchResult.java +++ b/src/main/java/org/searsia/SearchResult.java @@ -39,7 +39,7 @@ public class SearchResult { private List hits; private Random random; private Resource resource; - private String xmlOut; + private String debugOut; private String query; public SearchResult() { @@ -51,7 +51,7 @@ public SearchResult(Hit hit) { this.random = new Random(); this.resource = null; this.query = null; - this.xmlOut = null; + this.debugOut = null; if (hit != null) { this.hits.add(hit); } @@ -73,12 +73,12 @@ public Resource getResource() { return this.resource; } - public void setXmlOut(String xmlOut) { - this.xmlOut = xmlOut; + public void setDebugOut(String debugOut) { + this.debugOut = debugOut; } - public String getXmlOut() { - return this.xmlOut; + public String getDebugOut() { + return this.debugOut; } public void setQuery(String query) { diff --git a/src/main/java/org/searsia/SearsiaOptions.java b/src/main/java/org/searsia/SearsiaOptions.java index a6b2241..cdeb356 100644 --- a/src/main/java/org/searsia/SearsiaOptions.java +++ b/src/main/java/org/searsia/SearsiaOptions.java @@ -34,7 +34,7 @@ public class SearsiaOptions { /* See setDefaults() below */ - private Boolean test; + private String test; private Boolean quiet; private int cacheSize; private int pollInterval; @@ -50,13 +50,13 @@ public class SearsiaOptions { public SearsiaOptions(String[] args) throws IllegalArgumentException { Options options = new Options(); options.addOption("c", "cache", true, "Set cache size (integer: number of result pages)."); - options.addOption("t", "test", false, "Test and exit."); + options.addOption("t", "test", true, "Print test output and exit (string: 'json', 'xml', 'response')."); options.addOption("h", "help", false, "Show help."); options.addOption("i", "interval", true, "Set poll interval (integer: in seconds)."); options.addOption("l", "log", true, "Set log level (0=off, 1=error, 2=warn=default, 3=info, 4=debug)."); options.addOption("m", "mother", true, "Set url of mother's web service end point."); options.addOption("p", "path", true, "Set directory path to store the index."); - options.addOption("q", "quiet", false, "No output on console."); + options.addOption("q", "quiet", false, "No output to console."); options.addOption("u", "url", true, "Set url of my web service endpoint."); setDefaults(); parse(options, args); @@ -64,7 +64,7 @@ public SearsiaOptions(String[] args) throws IllegalArgumentException { private void setDefaults() { - test = false; + test = null; // no test quiet = false; cacheSize = 500; pollInterval = 120; @@ -118,7 +118,7 @@ private void parse(Options options, String[] args) throws IllegalArgumentExcepti try { cmd = parser.parse(options, args); } catch (ParseException e) { - System.out.println(e.getMessage()); + System.err.println(e.getMessage()); help(options); throw new IllegalArgumentException(e); } @@ -130,7 +130,12 @@ private void parse(Options options, String[] args) throws IllegalArgumentExcepti } } if (cmd.hasOption("t")) { - test = true; + test = cmd.getOptionValue("t").toLowerCase(); + if (!(test.equals("json") || test.equals("xml") || test.equals("response"))) { + String message = "Test output must be one of 'json', 'xml', or 'response'."; + System.err.println(message); + throw new IllegalArgumentException(message); + } } if (cmd.hasOption("h") || cmd.getArgs().length > 0) { help(options); @@ -165,9 +170,9 @@ private void parse(Options options, String[] args) throws IllegalArgumentExcepti if (cmd.hasOption("m")) { motherTemplate = cmd.getOptionValue("m"); } else { - help(options); String message = "Please provide url of mother's web service end point"; - System.out.println(message); + System.err.println(message); + help(options); throw new IllegalArgumentException(message); } } @@ -182,7 +187,7 @@ public int getCacheSize() { return cacheSize; } - public Boolean isTest() { + public String getTestOutput() { return test; } @@ -231,6 +236,7 @@ public String toString() { result += "\n Index Path = " + getIndexPath(); result += "\n Poll Interval = " + getPollInterval(); result += "\n Cache Size = " + getCacheSize(); + result += "\n Test Output = " + getTestOutput(); return result; } diff --git a/src/main/java/org/searsia/engine/Resource.java b/src/main/java/org/searsia/engine/Resource.java index cb4d654..8fdc895 100644 --- a/src/main/java/org/searsia/engine/Resource.java +++ b/src/main/java/org/searsia/engine/Resource.java @@ -237,11 +237,11 @@ public SearchResult randomSearch() throws SearchException { public SearchResult search(String query) throws SearchException { - return search(query, false); + return search(query, null); } - public SearchResult search(String query, boolean debug) throws SearchException { + public SearchResult search(String query, String debug) throws SearchException { try { String url = fillTemplate(this.urlAPITemplate, URLEncoder.encode(query, "UTF-8")); String postString = ""; @@ -263,7 +263,7 @@ public SearchResult search(String query, boolean debug) throws SearchException { String page = getCompletePage(url, postString, this.headers); SearchResult result; if (this.mimeType != null && this.mimeType.equals(SearchResult.SEARSIA_MIME_TYPE)) { - result = searsiaSearch(page); + result = searsiaSearch(page, debug); } else { result = xpathSearch(url, page, debug); } @@ -286,7 +286,7 @@ public SearchResult search() throws SearchException { String url = this.urlAPITemplate; url = url.replaceAll("\\{[0-9A-Za-z\\-_]+\\?\\}", ""); // remove optional parameters String page = getCompletePage(url, this.postString, this.headers); - return searsiaSearch(page); + return searsiaSearch(page, null); } catch (Exception e) { // catch all, also runtime exceptions throw createPrivateSearchException(e); } @@ -313,8 +313,11 @@ public Resource searchResource(String resourceid) throws SearchException { } - private SearchResult searsiaSearch(String jsonPage) { + private SearchResult searsiaSearch(String jsonPage, String debug) { SearchResult result = new SearchResult(); + if (debug != null && debug.equals("response")) { + result.setDebugOut(jsonPage); + } JSONObject json = new JSONObject(jsonPage); JSONArray hits = json.getJSONArray("hits"); for (int i = 0; i < hits.length(); i += 1) { @@ -332,7 +335,7 @@ private SearchResult searsiaSearch(String jsonPage) { } - private SearchResult xpathSearch(String url, String page, boolean debug) + private SearchResult xpathSearch(String url, String page, String debug) throws IOException, XPathExpressionException { Document document; if (this.mimeType != null && this.mimeType.equals("application/json")) { @@ -348,8 +351,12 @@ private SearchResult xpathSearch(String url, String page, boolean debug) throw new IOException("Error parsing document. Wrong mimetype?"); } SearchResult result = new SearchResult(); - if (debug) { - result.setXmlOut(DOMBuilder.DOM2String(document)); + if (debug != null) { + if (debug.equals("xml")) { + result.setDebugOut(DOMBuilder.DOM2String(document)); + } else if (debug.equals("response")) { + result.setDebugOut(page); + } } XPathFactory xFactory = XPathFactory.newInstance(); XPath xpath = xFactory.newXPath(); diff --git a/src/test/java/org/searsia/engine/ResourceTest.java b/src/test/java/org/searsia/engine/ResourceTest.java index ccfadcf..9cf7470 100644 --- a/src/test/java/org/searsia/engine/ResourceTest.java +++ b/src/test/java/org/searsia/engine/ResourceTest.java @@ -101,7 +101,7 @@ public void testSearchSearsia() throws XPathExpressionException, SearchException @Test public void testSearchHtml() throws XPathExpressionException, SearchException { Resource se = htmlSearch(); - SearchResult result = se.search("dolf trieschnigg", true); + SearchResult result = se.search("dolf trieschnigg", "xml"); Assert.assertEquals("text/html", se.getMimeType()); Assert.assertEquals(10, result.getHits().size()); // TODO text nodes are glued together. @@ -137,9 +137,8 @@ public void testSearchXml2() throws XPathExpressionException, SearchException { @Test public void testSearchJson() throws XPathExpressionException, SearchException { Resource se = jsonSearch(); - Boolean debug = true; - SearchResult result = se.search("informat", debug); - Assert.assertNotNull(result.getXmlOut()); + SearchResult result = se.search("informat", "xml"); + Assert.assertNotNull(result.getDebugOut()); Assert.assertEquals("application/json", se.getMimeType()); Assert.assertEquals(10, result.getHits().size()); } @@ -162,7 +161,7 @@ public void testSearchJson3() throws XPathExpressionException, SearchException { @Test public void testSearchJavascript() throws XPathExpressionException, SearchException { Resource se = javascriptSearch(); - Boolean debug = true; + String debug = "xml"; SearchResult result = se.search("informat", debug); Assert.assertEquals("application/x-javascript", se.getMimeType()); Assert.assertEquals(10, result.getHits().size()); From 636cf69ddee964be683806c3e8c2cab3f9e4ad43 Mon Sep 17 00:00:00 2001 From: Searsia Date: Thu, 23 Mar 2017 18:11:13 +0100 Subject: [PATCH 03/51] v1 all tests green --- src/main/java/org/searsia/Main.java | 190 ++++++++---------- src/main/java/org/searsia/SearsiaOptions.java | 24 +-- .../java/org/searsia/engine/Resource.java | 17 +- .../java/org/searsia/index/ResourceIndex.java | 12 +- .../org/searsia/index/SearchResultIndex.java | 4 +- src/main/java/org/searsia/web/Search.java | 45 ++--- src/test/java/org/searsia/MainTest.java | 4 +- .../java/org/searsia/engine/ResourceTest.java | 39 +++- src/test/java/org/searsia/web/SearchTest.java | 8 +- 9 files changed, 170 insertions(+), 173 deletions(-) diff --git a/src/main/java/org/searsia/Main.java b/src/main/java/org/searsia/Main.java index 2e48e19..1a7626a 100644 --- a/src/main/java/org/searsia/Main.java +++ b/src/main/java/org/searsia/Main.java @@ -22,9 +22,6 @@ import java.nio.file.Path; import java.nio.file.Paths; import java.security.MessageDigest; -import java.text.DateFormat; -import java.text.SimpleDateFormat; -import java.util.Date; import java.util.Random; import org.apache.log4j.Appender; @@ -34,7 +31,6 @@ import org.apache.log4j.PatternLayout; import org.glassfish.grizzly.http.server.HttpServer; import org.glassfish.jersey.grizzly2.httpserver.GrizzlyHttpServerFactory; -import org.json.JSONObject; import org.searsia.index.SearchResultIndex; import org.searsia.index.ResourceIndex; import org.searsia.web.SearsiaApplication; @@ -54,8 +50,7 @@ public class Main { private static final Logger LOGGER = Logger.getLogger("org.searsia"); - private static final DateFormat df = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss.S"); - private static Random random = new Random(); + private static Random random = new Random(); private static void searsiaDaemon(SearchResultIndex index, ResourceIndex engines, @@ -65,7 +60,7 @@ private static void searsiaDaemon(SearchResultIndex index, ResourceIndex engines while(true) { Thread.sleep(pollInterval * 1000); try { - if (!index.check()) { + if (!index.checkFlush()) { SearchResult result = null; if (mother != null && random.nextBoolean()) { // sample mostly from mother engine = mother; @@ -77,16 +72,16 @@ private static void searsiaDaemon(SearchResultIndex index, ResourceIndex engines result.addQueryResourceRankDate(engine.getId()); } index.offer(result); - logSample(engine.getId(), result.getQuery()); + LOGGER.info("Sample " + engine.getId() + ": " + result.getQuery()); } } catch (Exception e) { - logWarning("Sampling " + engine.getId() + " failed: " + e.getMessage()); + LOGGER.warn("Sampling " + engine.getId() + " failed: " + e.getMessage()); } } } - private static void testResources(Resource mother, SearchResult result, ResourceIndex engines) { + private static void getResources(Resource mother, SearchResult result, ResourceIndex engines) { int i = 0; for (Hit hit: result.getHits()) { String rid = hit.getString("rid"); @@ -96,68 +91,56 @@ private static void testResources(Resource mother, SearchResult result, Resource try { engine = mother.searchResource(rid); } catch (SearchException e) { - System.err.println("Warning: Unable to get resources from " + mother.getId()); + System.err.println("Warning: " + e.getMessage()); break; } try { engines.put(engine); } catch(Exception e) { - System.err.println("Error: " + e.getMessage()); - System.exit(1); + fatalError(e.getMessage()); } } if (i > 10) { - break; // not more than the first 10. + break; // not more than the first 10. } } } + - - private static void getResources(SearchResult result, ResourceIndex index) { - for (Hit hit: result.getHits()) { - String rid = hit.getString("rid"); - if (rid != null && !index.containsKey(rid)) { - //index.reserve(rid); - } - } + private static String uriNormalize(String uri, String myId) { + if (uri != null) { + uri = uri.replaceAll("\\?.*$", ""); + uri = uri.replaceAll("\\/?search\\/?", ""); + if (uri.endsWith(myId)) { + uri = uri.replace(myId, ""); + } + } + return uri; } - - private static String uriToTemplate(String uri) { + private static String uriToTemplate(String uri, String myId) { if (!(uri == null) && !(uri.contains("{q"))) { if (!uri.endsWith("/")) { uri += "/"; } - uri += "search?q={q}"; + uri += myId + "/search?q={q}"; } return uri; } - private static void logWarning(String message) { - JSONObject r = new JSONObject(); - r.put("time", df.format(new Date())); - r.put("message", message); - LOGGER.warn(r.toString()); - } - - - private static void logSample(String resourceid, String query) { - JSONObject r = new JSONObject(); - r.put("time", df.format(new Date())); - r.put("sample", resourceid); - r.put("query", query); - LOGGER.info(r.toString()); - } - private static void printMessage(String message, Boolean isQuiet) { if (!isQuiet) { System.err.println(message); } } - - // for 'random' ids, if not provided + private static void fatalError(String message) { + System.err.println("ERROR: " + message); + System.exit(1); + } + + // for unique filename private static String getHashString(String inputString) { MessageDigest md; byte[] hash; @@ -178,6 +161,34 @@ private static String getHashString(String inputString) { return sb.toString(); } + + private static void testMother(Resource mother, String debugInfo, Boolean isQuiet) { + printMessage("Testing: " + mother.getId(), isQuiet); + SearchResult result = null; + try { + result = mother.search(mother.getTestQuery(), debugInfo); + } catch (SearchException e) { + fatalError("No output: " + e.getMessage()); + } + if (!isQuiet) { + if (debugInfo.equals("json")) { + System.out.println(result.toJson()); + } else if (debugInfo.equals("xml") || debugInfo.equals("response")) { + String debugOut = result.getDebugOut(); + if (debugOut == null) { + System.out.println ("No '" + debugInfo + "' output."); + } else { + System.out.println(debugOut); + } + } + } + System.out.flush(); + if (result.getHits().isEmpty()) { + fatalError("No results for test query."); + } else { + printMessage("Ok.", isQuiet); + } + } /** * Attaches a rolling file logger for search queries @@ -186,18 +197,18 @@ private static String getHashString(String inputString) { * @param filename * @throws IOException */ - private static void setupQueryLogger(String path, String filename, Level level) throws IOException { - Path querylogDir = Paths.get(path, filename + "_log"); - if (!Files.exists(querylogDir)) { - Files.createDirectories(querylogDir); + private static void setupLogger(String path, String filename, Level level) throws IOException { + Path logDir = Paths.get(path, filename + "_log"); + if (!Files.exists(logDir)) { + Files.createDirectories(logDir); } Appender appender = new DailyRollingFileAppender( - new PatternLayout("%m%n"), - querylogDir.resolve("queries.log").toString(), + new PatternLayout("%p %d{ISO8601} %m%n"), + logDir.resolve("searsia.log").toString(), "'.'yyyy-MM-dd"); LOGGER.addAppender(appender); LOGGER.setLevel(level); - logWarning("Searsia restart"); + LOGGER.warn("Searsia restart"); } @@ -211,8 +222,11 @@ public static void main(String[] args) { try { options = new SearsiaOptions(args); } catch (IllegalArgumentException e) { - System.exit(1); + fatalError(e.getMessage()); } + if (options.isHelp()) { + System.exit(0); + } printMessage("Searsia server " + SearsiaApplication.VERSION, options.isQuiet()); @@ -222,94 +236,64 @@ public static void main(String[] args) { Resource connect = new Resource(options.getMotherTemplate(), null); SearchResult result = null; try { - result = connect.search(); + result = connect.search(); mother = result.getResource(); + if (mother == null) { + fatalError("Initialization failed: JSONObject[\"resource\"] not found."); + } if (mother.getAPITemplate() == null) { mother.setUrlAPITemplate(options.getMotherTemplate()); } myself = mother.deepcopy(); myself.setUrlAPITemplate(options.getMyURI()); } catch (SearchException e) { - System.err.println("ERROR: Connection failed: " + e.getMessage()); - System.exit(1); + fatalError("Connection failed: " + e.getMessage()); } + // If test is set, test the mother - String test = options.getTestOutput(); - if (test != null) { - printMessage("Testing: " + myself.getId(), options.isQuiet()); - try { - result = mother.search(mother.getTestQuery(), test); - } catch (SearchException e) { - System.err.println("ERROR: No output: " + e.getMessage()); - System.exit(1); - } - if (!options.isQuiet()) { - if (test.equals("json")) { - System.out.println(result.toJson()); - } else if (test.equals("xml") || test.equals("response")) { - String debugOut = result.getDebugOut(); - if (debugOut == null) { - System.out.println ("No '" + test + "' output."); - } else { - System.out.println(debugOut); - } - } - } - System.out.flush(); - if (result.getHits().isEmpty()) { - System.err.println("ERROR: No results for test query."); - System.exit(1); - } else { - printMessage("Ok.", options.isQuiet()); - System.exit(0); - } + if (options.getTestOutput() != null) { + testMother(mother, options.getTestOutput(), options.isQuiet()); } else { printMessage("Starting: " + myself.getId(), options.isQuiet()); } - + // Create or open indexes. The index is the MD5 of the mother String fileName = getHashString(options.getMotherTemplate()); String path = options.getIndexPath(); Level level = options.getLoggerLevel(); try { + setupLogger(path, fileName, level); engines = new ResourceIndex(path, fileName); index = new SearchResultIndex(path, fileName, options.getCacheSize()); - setupQueryLogger(path, fileName, level); } catch (Exception e) { - printMessage("Setup failed: " + e.getMessage(), options.isQuiet()); - System.exit(1); + fatalError("Setup failed: " + e.getMessage()); } - - if (mother == null || myself == null) { - mother = engines.getMother(); - myself = engines.getMyself(); - } else { - engines.putMother(mother); - engines.putMyself(myself); - } + engines.putMother(mother); + engines.putMyself(myself); + getResources(mother, result, engines); + // Start the web server - String myURI = options.getMyURI(); + String myURI = uriNormalize(options.getMyURI(), myself.getId()); try { server = GrizzlyHttpServerFactory.createHttpServer(URI.create(myURI), new SearsiaApplication(index, engines)); } catch (Exception e) { - System.err.println("Server failed: " + e.getMessage()); - System.exit(1); + fatalError("Server failed: " + e.getMessage()); } - printMessage("API end point: " + uriToTemplate(myURI), options.isQuiet()); - printMessage("Use Ctrl+c to stop.", options.isQuiet()); - // Start the update daemon - if (options.getTestOutput() != null) { + // Start the update daemon if not testing + if (options.getTestOutput() == null) { + printMessage("API end point: " + uriToTemplate(myURI, myself.getId()), options.isQuiet()); + printMessage("Use Ctrl+c to stop.", options.isQuiet()); try { searsiaDaemon(index, engines, options.getPollInterval()); } catch (InterruptedException e) { } } server.shutdownNow(); } -} +} diff --git a/src/main/java/org/searsia/SearsiaOptions.java b/src/main/java/org/searsia/SearsiaOptions.java index cdeb356..64719de 100644 --- a/src/main/java/org/searsia/SearsiaOptions.java +++ b/src/main/java/org/searsia/SearsiaOptions.java @@ -36,6 +36,7 @@ public class SearsiaOptions { /* See setDefaults() below */ private String test; private Boolean quiet; + private Boolean help; private int cacheSize; private int pollInterval; private int logLevel; @@ -65,6 +66,7 @@ public SearsiaOptions(String[] args) throws IllegalArgumentException { private void setDefaults() { test = null; // no test + help = false; quiet = false; cacheSize = 500; pollInterval = 120; @@ -118,9 +120,7 @@ private void parse(Options options, String[] args) throws IllegalArgumentExcepti try { cmd = parser.parse(options, args); } catch (ParseException e) { - System.err.println(e.getMessage()); - help(options); - throw new IllegalArgumentException(e); + throw new IllegalArgumentException(e.getMessage() + " (use '-h' for help)"); } if (cmd.hasOption("c")) { @@ -132,14 +132,12 @@ private void parse(Options options, String[] args) throws IllegalArgumentExcepti if (cmd.hasOption("t")) { test = cmd.getOptionValue("t").toLowerCase(); if (!(test.equals("json") || test.equals("xml") || test.equals("response"))) { - String message = "Test output must be one of 'json', 'xml', or 'response'."; - System.err.println(message); - throw new IllegalArgumentException(message); + throw new IllegalArgumentException("Test output must be one of 'json', 'xml' or 'response'."); } } if (cmd.hasOption("h") || cmd.getArgs().length > 0) { help(options); - throw new IllegalArgumentException("Help!"); // misusing exceptions :-( + help = true; } try { if (cmd.hasOption("i")) { @@ -155,8 +153,7 @@ private void parse(Options options, String[] args) throws IllegalArgumentExcepti } } } catch (IllegalArgumentException e) { - help(options); - throw new IllegalArgumentException(e); + throw new IllegalArgumentException(e.getMessage()); } if (cmd.hasOption("p")) { indexPath = cmd.getOptionValue("p"); @@ -170,10 +167,7 @@ private void parse(Options options, String[] args) throws IllegalArgumentExcepti if (cmd.hasOption("m")) { motherTemplate = cmd.getOptionValue("m"); } else { - String message = "Please provide url of mother's web service end point"; - System.err.println(message); - help(options); - throw new IllegalArgumentException(message); + throw new IllegalArgumentException("Please provide mother's url template (use '-h' for help)"); } } @@ -227,6 +221,10 @@ public Boolean isQuiet() { return quiet; } + public Boolean isHelp() { + return help; + } + @Override public String toString() { String result = "SearsiaOptions:"; diff --git a/src/main/java/org/searsia/engine/Resource.java b/src/main/java/org/searsia/engine/Resource.java index 8fdc895..d0fd5ce 100644 --- a/src/main/java/org/searsia/engine/Resource.java +++ b/src/main/java/org/searsia/engine/Resource.java @@ -295,11 +295,18 @@ public SearchResult search() throws SearchException { public Resource searchResource(String resourceid) throws SearchException { if (!this.mimeType.equals(SearchResult.SEARSIA_MIME_TYPE)) { - throw new SearchException("Resource is not a searsia engine: " + resourceid); + throw new SearchException("Resource is not a searsia engine: " + this.getId()); } - try { - Resource engine = null; - String url = this.urlAPITemplate.replaceAll("\\{r\\??\\}", URLEncoder.encode(resourceid, "UTF-8")); + Resource engine = null; + String url = this.urlAPITemplate; + String rid = this.getId(); + int lastIndex = url.lastIndexOf(rid); // replace last occurrence of resourceId + if (lastIndex < 0) { + throw new SearchException("No resources available"); + } + try { + String newRid = URLEncoder.encode(resourceid, "UTF-8"); + url = url.substring(0, lastIndex) + url.substring(lastIndex).replaceFirst(rid, newRid); url = url.replaceAll("\\{[0-9A-Za-z\\-_]+\\?\\}", ""); // remove optional parameters String jsonPage = getCompletePage(url, this.postString, this.headers); JSONObject json = new JSONObject(jsonPage); @@ -328,7 +335,7 @@ private SearchResult searsiaSearch(String jsonPage, String debug) { Resource engine = new Resource(json.getJSONObject("resource")); result.setResource(engine); } catch (XPathExpressionException e) { - LOGGER.warn("Warning: " + e.getMessage()); + LOGGER.warn("Resource error: " + e.getMessage()); } } return result; diff --git a/src/main/java/org/searsia/index/ResourceIndex.java b/src/main/java/org/searsia/index/ResourceIndex.java index 28e7198..ca5fef4 100644 --- a/src/main/java/org/searsia/index/ResourceIndex.java +++ b/src/main/java/org/searsia/index/ResourceIndex.java @@ -23,8 +23,8 @@ import java.util.LinkedHashMap; import java.util.Map; import java.util.Random; -import java.util.logging.Logger; +import org.apache.log4j.Logger; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; @@ -55,6 +55,7 @@ public class ResourceIndex { private final static Logger LOGGER = Logger.getLogger(ResourceIndex.class.getName()); + private final static Version version = Version.LUCENE_4_10_4; private final static int MAX_SOURCE_CACHE = 10000; // TODO: breaks if we discover more than 10000 sources @@ -116,7 +117,7 @@ private void readResourceIndex() throws IOException { reader = DirectoryReader.open(dir); } catch (org.apache.lucene.index.IndexNotFoundException e) { - LOGGER.warning("No resources in index."); + LOGGER.warn("No resources in index."); return; } try { @@ -205,10 +206,9 @@ public void put(Resource engine) { } if (!exists(engine)) { try { - // TODO: keepPrivateParameters(engine); do not overwrite own parameters, ugh updateResourceIndex(engine.getId(), engine); } catch (IOException e) { - LOGGER.warning("Update of resource " + engine.getId() + " failed"); + LOGGER.warn("Update of resource " + engine.getId() + " failed"); // TODO Oh crap, what to do? } } @@ -229,7 +229,7 @@ public Resource getRandom() { int nr = random.nextInt(keys.length); return this.engines.get(keys[nr]); } else { - return null; + return getMother(); } } @@ -272,7 +272,7 @@ public void putMyself(Resource engine) { try { writeMyselfFile(engine); } catch (IOException e) { - LOGGER.warning("Could not write index file"); + LOGGER.error("Could not write index file"); } this.me = engine; } diff --git a/src/main/java/org/searsia/index/SearchResultIndex.java b/src/main/java/org/searsia/index/SearchResultIndex.java index b48f624..4b901c9 100644 --- a/src/main/java/org/searsia/index/SearchResultIndex.java +++ b/src/main/java/org/searsia/index/SearchResultIndex.java @@ -188,7 +188,7 @@ public void flush() throws IOException { } this.hitsWriter.commit(); closeReader(); - LOGGER.info("{\"message\":\"Flushed cache to index.\"}"); + LOGGER.info("Flushed cache to index."); } /** @@ -197,7 +197,7 @@ public void flush() throws IOException { * @return true is queue was flushed. * @throws IOException */ - public boolean check() throws IOException { + public boolean checkFlush() throws IOException { boolean full = queue.size() > limit; if (full) { flush(); diff --git a/src/main/java/org/searsia/web/Search.java b/src/main/java/org/searsia/web/Search.java index 8ce2487..fd062f5 100644 --- a/src/main/java/org/searsia/web/Search.java +++ b/src/main/java/org/searsia/web/Search.java @@ -17,14 +17,12 @@ package org.searsia.web; import java.io.IOException; -import java.text.DateFormat; -import java.text.SimpleDateFormat; -import java.util.Date; import javax.ws.rs.GET; import javax.ws.rs.OPTIONS; import javax.ws.rs.Path; import javax.ws.rs.Produces; +import javax.ws.rs.PathParam; import javax.ws.rs.QueryParam; import javax.ws.rs.core.Response; @@ -42,7 +40,7 @@ * * @author Dolf Trieschnigg and Djoerd Hiemstra */ -@Path("search") +@Path("{resourceid}/search") public class Search { private final static Logger LOGGER = Logger.getLogger(Search.class); @@ -54,24 +52,7 @@ public Search(SearchResultIndex index, ResourceIndex engines) throws IOException this.engines = engines; this.index = index; } - - private static final DateFormat df = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss.S"); - - private void logQuery(String resourceid, String query) { - JSONObject r = new JSONObject(); - r.put("time", df.format(new Date())); - if (resourceid != null) r.put("resourceid", resourceid); - r.put("query", query); - LOGGER.info(r.toString()); - } - - private void logWarning(String message) { - JSONObject r = new JSONObject(); - r.put("time", df.format(new Date())); - r.put("warning", message); - LOGGER.warn(r.toString()); - } - + @OPTIONS public Response options() { return Response.status(Response.Status.NO_CONTENT) @@ -79,19 +60,18 @@ public Response options() { .header("Access-Control-Allow-Methods", "GET") .build(); } - + @GET @Produces(SearchResult.SEARSIA_MIME_ENCODING) - public Response query(@QueryParam("r") String resourceid, @QueryParam("q") String query) { - // TODO: also log the outcome of the query - logQuery(resourceid, query); + public Response query(@PathParam("resourceid") String resourceid, @QueryParam("q") String query) { + LOGGER.info("Query " + resourceid + ": " + query); Resource me, engine, mother; SearchResult result; JSONObject json; me = engines.getMyself(); mother = engines.getMother(); - if (resourceid != null && resourceid.trim().length() > 0 && !resourceid.equals(me.getId())) { + if (!resourceid.equals(me.getId())) { engine = engines.get(resourceid); if (engine == null) { // unknown? ask your mother if (mother != null) { @@ -99,13 +79,13 @@ public Response query(@QueryParam("r") String resourceid, @QueryParam("q") Strin engine = mother.searchResource(resourceid); } catch (SearchException e) { String message = "Resource not found: @" + resourceid; - logWarning(message); + LOGGER.warn(message); return SearsiaApplication.responseError(404, message); } } if (engine == null) { String message = "Unknown resource identifier: @" + resourceid; - logWarning(message); + LOGGER.warn(message); return SearsiaApplication.responseError(404, message); } engines.put(engine); @@ -121,7 +101,7 @@ public Response query(@QueryParam("r") String resourceid, @QueryParam("q") Strin return SearsiaApplication.responseOk(json); } catch (Exception e) { String message = "Resource @" + resourceid + " unavailable: " + e.getMessage(); - logWarning(message); + LOGGER.warn(message); return SearsiaApplication.responseError(503, message); } } else { @@ -134,7 +114,7 @@ public Response query(@QueryParam("r") String resourceid, @QueryParam("q") Strin result = index.search(query); } catch (IOException e) { String message = "Service unavailable: " + e.getMessage(); - logWarning(message); + LOGGER.warn(message); return SearsiaApplication.responseError(503, message); } if (result.getHits().isEmpty() && mother != null) { // empty? ask mother! @@ -142,8 +122,7 @@ public Response query(@QueryParam("r") String resourceid, @QueryParam("q") Strin result = mother.search(query); index.offer(result); // really trust mother } catch (SearchException e) { - String message = "Mother not available"; - logWarning(message); + LOGGER.warn("Mother not available"); } } else { // own results? Do resource ranking. result.scoreResourceSelection(query, engines); diff --git a/src/test/java/org/searsia/MainTest.java b/src/test/java/org/searsia/MainTest.java index 7b58b51..2a49f3b 100644 --- a/src/test/java/org/searsia/MainTest.java +++ b/src/test/java/org/searsia/MainTest.java @@ -7,11 +7,11 @@ public class MainTest { - //@Test + @Test public void test() { String[] args = {"--path=target/index-test/", "--mother=http://searsia.org/searsia/wiki-informat-.json", - "--log=4", "--test"}; + "--log=4", "--test=json", "--quiet"}; Main.main(args); Assert.assertTrue(true); // happy if we get here! } diff --git a/src/test/java/org/searsia/engine/ResourceTest.java b/src/test/java/org/searsia/engine/ResourceTest.java index 9cf7470..fc32792 100644 --- a/src/test/java/org/searsia/engine/ResourceTest.java +++ b/src/test/java/org/searsia/engine/ResourceTest.java @@ -48,19 +48,23 @@ private Resource postSearch() throws XPathExpressionException { ); return hiemstra; } + + private Resource searsiaMimeOnlySearch() throws XPathExpressionException { + return new Resource("http://searsia.org/searsia/v1-wikididyoumean-{q?}.json", "randomid"); + } private Resource searsiaSearch() throws XPathExpressionException { - return new Resource("http://searsia.org/searsia/wiki-{q?}-{r?}.json", "wiki"); + return new Resource("http://searsia.org/searsia/v1-wiki-{q?}.json", "wiki"); } private Resource xmlSearch() throws XPathExpressionException, SearchException { - Resource wiki = new Resource("http://searsia.org/searsia/wiki-{q?}-{r?}.json", "wiki"); + Resource wiki = new Resource("http://searsia.org/searsia/v1-wiki-{q?}.json", "wiki"); Resource wikifull = wiki.searchResource("wikifull"); return wikifull; } private Resource jsonSearch() throws XPathExpressionException { - Resource wiki = new Resource("http://searsia.org/searsia/wiki-{q?}-wikifull.json", "wikifull"); + Resource wiki = new Resource("http://searsia.org/searsia/v1-wikifull-{q?}.json", "wikifull"); wiki.setMimeType("application/json"); wiki.setItemXpath("//hits"); wiki.addExtractor( @@ -73,7 +77,7 @@ private Resource jsonSearch() throws XPathExpressionException { } private Resource javascriptSearch() throws XPathExpressionException { - Resource wikifull = new Resource("http://searsia.org/searsia/wiki-{q}-wikifull.js", "wikifull"); + Resource wikifull = new Resource("http://searsia.org/searsia/v1-wikifull-{q}.js", "wikifull"); wikifull.setMimeType("application/x-javascript"); wikifull.setItemXpath("//hits"); wikifull.addExtractor( @@ -137,7 +141,8 @@ public void testSearchXml2() throws XPathExpressionException, SearchException { @Test public void testSearchJson() throws XPathExpressionException, SearchException { Resource se = jsonSearch(); - SearchResult result = se.search("informat", "xml"); + String debug = "xml"; + SearchResult result = se.search("informat", debug); Assert.assertNotNull(result.getDebugOut()); Assert.assertEquals("application/json", se.getMimeType()); Assert.assertEquals(10, result.getHits().size()); @@ -181,6 +186,30 @@ public void testSearchResource() throws XPathExpressionException, SearchExceptio Assert.assertTrue(engine != null); } + @Test + public void testSearchNoResource1() throws XPathExpressionException, SearchException { + Resource se = htmlSearch(); + Boolean exception = false; + try { + Resource engine = se.searchResource("wikifull"); + } catch (SearchException e) { + exception = true; + } + Assert.assertTrue("Non-Searsia engine throws exception", exception); + } + + @Test + public void testSearchNoResource2() throws XPathExpressionException, SearchException { + Resource se = searsiaMimeOnlySearch(); + Boolean exception = false; + try { + Resource engine = se.searchResource("wikifull"); + } catch (SearchException e) { + exception = true; + } + Assert.assertTrue("No resources exception", exception); + } + @Test public void testSearchError() throws XPathExpressionException { Resource se = htmlSearch(); diff --git a/src/test/java/org/searsia/web/SearchTest.java b/src/test/java/org/searsia/web/SearchTest.java index 4826f56..7f740d4 100644 --- a/src/test/java/org/searsia/web/SearchTest.java +++ b/src/test/java/org/searsia/web/SearchTest.java @@ -36,7 +36,7 @@ private static Resource wrong() { } private static Resource me() { - return new Resource("http://me.org?q={q}", "me"); + return new Resource("http://me.org?q={q}", "utwente"); } @@ -59,19 +59,19 @@ public static void lastThing() throws IOException { @Test // returns 'my' resource description public void test() throws IOException { Search search = new Search(index, engines); - Response response = search.query("", ""); + Response response = search.query("utwente", ""); int status = response.getStatus(); String entity = (String) response.getEntity(); JSONObject json = new JSONObject(entity); JSONObject resource = (JSONObject) json.get("resource"); Assert.assertEquals(200, status); - Assert.assertEquals("me", resource.get("id")); + Assert.assertEquals("utwente", resource.get("id")); } @Test // returns local search results for 'searsia' public void testQuery() throws IOException { Search search = new Search(index, engines); - Response response = search.query("", "searsia search for noobs"); + Response response = search.query("utwente", "searsia search for noobs"); int status = response.getStatus(); String entity = (String) response.getEntity(); JSONObject json = new JSONObject(entity); From c967a8c60ba149010f9513e3cecb7000de21c236 Mon Sep 17 00:00:00 2001 From: Searsia Date: Fri, 24 Mar 2017 12:56:51 +0100 Subject: [PATCH 04/51] checks and tests --- src/main/java/org/searsia/Main.java | 57 ++++++++++++------- src/main/java/org/searsia/SearchResult.java | 10 ++++ .../java/org/searsia/engine/DOMBuilder.java | 3 + .../java/org/searsia/engine/Resource.java | 38 ++++++++----- .../java/org/searsia/index/ResourceIndex.java | 4 +- src/test/java/org/searsia/MainTest.java | 2 +- .../java/org/searsia/engine/ResourceTest.java | 2 +- 7 files changed, 77 insertions(+), 39 deletions(-) diff --git a/src/main/java/org/searsia/Main.java b/src/main/java/org/searsia/Main.java index 1a7626a..29641da 100644 --- a/src/main/java/org/searsia/Main.java +++ b/src/main/java/org/searsia/Main.java @@ -91,7 +91,7 @@ private static void getResources(Resource mother, SearchResult result, ResourceI try { engine = mother.searchResource(rid); } catch (SearchException e) { - System.err.println("Warning: " + e.getMessage()); + System.err.println("Warning: Not found: " + rid + ": " + e.getMessage()); break; } try { @@ -101,11 +101,18 @@ private static void getResources(Resource mother, SearchResult result, ResourceI } } if (i > 10) { - break; // not more than the first 10. + break; // not more than the first 10. Rest will follow when needed } } } - + + private static boolean sameTemplates(String uri1, String uri2, String myId) { + if (uri1 == null) { + return (uri2 == null); + } else { + return uriNormalize(uri1, myId).equals(uriNormalize(uri2, myId)); + } + } private static String uriNormalize(String uri, String myId) { if (uri != null) { @@ -163,12 +170,11 @@ private static String getHashString(String inputString) { private static void testMother(Resource mother, String debugInfo, Boolean isQuiet) { - printMessage("Testing: " + mother.getId(), isQuiet); SearchResult result = null; try { result = mother.search(mother.getTestQuery(), debugInfo); } catch (SearchException e) { - fatalError("No output: " + e.getMessage()); + fatalError("Test failed: " + e.getMessage()); } if (!isQuiet) { if (debugInfo.equals("json")) { @@ -176,7 +182,7 @@ private static void testMother(Resource mother, String debugInfo, Boolean isQuie } else if (debugInfo.equals("xml") || debugInfo.equals("response")) { String debugOut = result.getDebugOut(); if (debugOut == null) { - System.out.println ("No '" + debugInfo + "' output."); + System.out.println ("Warning: No " + debugInfo + " output."); } else { System.out.println(debugOut); } @@ -184,9 +190,12 @@ private static void testMother(Resource mother, String debugInfo, Boolean isQuie } System.out.flush(); if (result.getHits().isEmpty()) { - fatalError("No results for test query."); + fatalError("Test failed: No results for test query."); } else { - printMessage("Ok.", isQuiet); + if (result.getHits().size() < 10) { + printMessage("Warning: less than 10 results; see \"testquery\" or \"rerank\".", isQuiet); + } + printMessage("Test succeeded.", isQuiet); } } @@ -233,34 +242,42 @@ public static void main(String[] args) { // Connect to the mother engine and gather information from the mother. Resource myself = null; Resource mother = null; - Resource connect = new Resource(options.getMotherTemplate(), null); + Resource connect = new Resource(options.getMotherTemplate(), null); + String version = null; SearchResult result = null; try { - result = connect.search(); + result = connect.searchWithoutQuery(); mother = result.getResource(); - if (mother == null) { - fatalError("Initialization failed: JSONObject[\"resource\"] not found."); - } - if (mother.getAPITemplate() == null) { - mother.setUrlAPITemplate(options.getMotherTemplate()); - } - myself = mother.deepcopy(); - myself.setUrlAPITemplate(options.getMyURI()); + version = result.getVersion(); } catch (SearchException e) { fatalError("Connection failed: " + e.getMessage()); } + if (mother == null) { + fatalError("Initialization failed: JSONObject[\"resource\"] not found."); + } + if (version != null && !version.startsWith("v1")) { + printMessage("Warning: Wrong major Searsia version " + version, options.isQuiet()); + } + myself = mother.deepcopy(); + myself.setUrlAPITemplate(options.getMyURI()); + if (mother.getAPITemplate() == null) { + mother.setUrlAPITemplate(options.getMotherTemplate()); + } else if (!sameTemplates(mother.getAPITemplate(), options.getMotherTemplate(), mother.getId())) { + printMessage("Warning: Mother changed to " + mother.getAPITemplate(), options.isQuiet()); + } // If test is set, test the mother if (options.getTestOutput() != null) { + printMessage("Testing: " + mother.getId(), options.isQuiet()); testMother(mother, options.getTestOutput(), options.isQuiet()); } else { printMessage("Starting: " + myself.getId(), options.isQuiet()); } - // Create or open indexes. The index is the MD5 of the mother - String fileName = getHashString(options.getMotherTemplate()); + // Create or open indexes. The filename appends the MD5 of the id so we don't confuse indexes + String fileName = myself.getId() + "_" + getHashString(options.getMotherTemplate()); String path = options.getIndexPath(); Level level = options.getLoggerLevel(); try { diff --git a/src/main/java/org/searsia/SearchResult.java b/src/main/java/org/searsia/SearchResult.java index 5bf62b5..6ebef60 100644 --- a/src/main/java/org/searsia/SearchResult.java +++ b/src/main/java/org/searsia/SearchResult.java @@ -41,6 +41,7 @@ public class SearchResult { private Resource resource; private String debugOut; private String query; + private String version; public SearchResult() { this(null); @@ -51,6 +52,7 @@ public SearchResult(Hit hit) { this.random = new Random(); this.resource = null; this.query = null; + this.version = null; this.debugOut = null; if (hit != null) { this.hits.add(hit); @@ -72,6 +74,14 @@ public void setResource(Resource resource) { public Resource getResource() { return this.resource; } + + public String getVersion() { + return this.version; + } + + public void setVersion(String version) { + this.version = version; + } public void setDebugOut(String debugOut) { this.debugOut = debugOut; diff --git a/src/main/java/org/searsia/engine/DOMBuilder.java b/src/main/java/org/searsia/engine/DOMBuilder.java index 6a738b8..f4cd18a 100644 --- a/src/main/java/org/searsia/engine/DOMBuilder.java +++ b/src/main/java/org/searsia/engine/DOMBuilder.java @@ -77,6 +77,9 @@ public static String DOM2String(Document document) { try { transformer = tf.newTransformer(); transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes"); + transformer.setOutputProperty(OutputKeys.ENCODING, "UTF-8"); + transformer.setOutputProperty(OutputKeys.METHOD, "xml"); + transformer.setOutputProperty(OutputKeys.INDENT, "yes"); StringWriter writer = new StringWriter(); transformer.transform(new DOMSource(document), new StreamResult(writer)); String output = writer.getBuffer().toString(); diff --git a/src/main/java/org/searsia/engine/Resource.java b/src/main/java/org/searsia/engine/Resource.java index d0fd5ce..62d1022 100644 --- a/src/main/java/org/searsia/engine/Resource.java +++ b/src/main/java/org/searsia/engine/Resource.java @@ -140,8 +140,8 @@ public Resource(JSONObject jo) throws XPathExpressionException, JSONException { addPrivateParameter((String) key, (String) json.get(key)); } } - if (this.urlAPITemplate == null) { - throw new IllegalArgumentException("Missing API Template"); + if (this.urlAPITemplate != null && this.urlAPITemplate.startsWith("file")) { + throw new IllegalArgumentException("Illegal 'file' API Template"); } if (this.id == null) { throw new IllegalArgumentException("Missing Identifier"); @@ -277,14 +277,12 @@ public SearchResult search(String query, String debug) throws SearchException { } } - - public SearchResult search() throws SearchException { + public SearchResult searchWithoutQuery() throws SearchException { if (!this.mimeType.equals(SearchResult.SEARSIA_MIME_TYPE)) { throw new SearchException("Engine is not a searsia engine: " + this.id); } try { - String url = this.urlAPITemplate; - url = url.replaceAll("\\{[0-9A-Za-z\\-_]+\\?\\}", ""); // remove optional parameters + String url = fillTemplate(this.urlAPITemplate, ""); String page = getCompletePage(url, this.postString, this.headers); return searsiaSearch(page, null); } catch (Exception e) { // catch all, also runtime exceptions @@ -307,7 +305,7 @@ public Resource searchResource(String resourceid) throws SearchException { try { String newRid = URLEncoder.encode(resourceid, "UTF-8"); url = url.substring(0, lastIndex) + url.substring(lastIndex).replaceFirst(rid, newRid); - url = url.replaceAll("\\{[0-9A-Za-z\\-_]+\\?\\}", ""); // remove optional parameters + url = url.replaceAll("\\{[0-9A-Za-z\\-_]+\\?\\}|\\{q\\}", ""); // remove optional parameters and query String jsonPage = getCompletePage(url, this.postString, this.headers); JSONObject json = new JSONObject(jsonPage); if (json.has("resource")) { @@ -326,7 +324,10 @@ private SearchResult searsiaSearch(String jsonPage, String debug) { result.setDebugOut(jsonPage); } JSONObject json = new JSONObject(jsonPage); - JSONArray hits = json.getJSONArray("hits"); + JSONArray hits = new JSONArray(); + try { + hits = json.getJSONArray("hits"); + } catch (JSONException e) { } for (int i = 0; i < hits.length(); i += 1) { result.addHit(new Hit((JSONObject) hits.get(i))); } @@ -338,6 +339,9 @@ private SearchResult searsiaSearch(String jsonPage, String debug) { LOGGER.warn("Resource error: " + e.getMessage()); } } + if (json.has("searsia")) { + result.setVersion(json.getString("searsia")); + } return result; } @@ -440,7 +444,7 @@ private Document parseDocumentXML(String xmlString) throws IOException { return DOMBuilder.string2DOM(xmlString); } - private String fillTemplate(String template, String query) throws UnsupportedEncodingException { + private String fillTemplate(String template, String query) throws IOException { String url = template; for (String param: this.privateParameters.keySet()) { url = url.replaceAll("\\{" + param + "\\??\\}", this.privateParameters.get(param)); @@ -448,7 +452,8 @@ private String fillTemplate(String template, String query) throws UnsupportedEnc url = url.replaceAll("\\{q\\??\\}", query); url = url.replaceAll("\\{[0-9A-Za-z\\-_]+\\?\\}", ""); // remove optional parameters if (url.matches(".*\\{[0-9A-Za-z\\-_]+\\}.*")) { - throw new UnsupportedEncodingException("Missing url parameter"); // TODO: better error + String param = url.substring(url.indexOf("{"), url.indexOf("}") + 1); + throw new IOException("Missing url parameter " + param); } return url; } @@ -489,11 +494,14 @@ private URLConnection setConnectionProperties(URL url, Map heade connection.setRequestProperty("Accept-Language", "en-US,en;q=0.5"); // TODO: from browser? for (Map.Entry entry : headers.entrySet()) { String value = entry.getValue(); - for (String param: this.privateParameters.keySet()) { - value = value.replace("{" + param + "}", this.privateParameters.get(param)); - } - if (value.contains("{")) { - throw new IOException("Missing header parameter"); // TODO: better error + if (value.contains("{")) { + for (String param: this.privateParameters.keySet()) { + value = value.replace("{" + param + "}", this.privateParameters.get(param)); + } + if (value.contains("{")) { + String param = value.substring(value.indexOf("{"), value.indexOf("}") + 1); + throw new IOException("Missing header parameter " + param); + } } connection.setRequestProperty(entry.getKey(), value); } diff --git a/src/main/java/org/searsia/index/ResourceIndex.java b/src/main/java/org/searsia/index/ResourceIndex.java index ca5fef4..6964e07 100644 --- a/src/main/java/org/searsia/index/ResourceIndex.java +++ b/src/main/java/org/searsia/index/ResourceIndex.java @@ -62,9 +62,9 @@ public class ResourceIndex { private Map engines = new LinkedHashMap(); private Random random = new Random(); private Resource mother = null; - private Resource me = null; + private Resource me = null; private Path meFile = null; - private Path indexDir = null; + private Path indexDir = null; private IndexWriter writer = null; /** diff --git a/src/test/java/org/searsia/MainTest.java b/src/test/java/org/searsia/MainTest.java index 2a49f3b..0cd99fa 100644 --- a/src/test/java/org/searsia/MainTest.java +++ b/src/test/java/org/searsia/MainTest.java @@ -10,7 +10,7 @@ public class MainTest { @Test public void test() { String[] args = {"--path=target/index-test/", - "--mother=http://searsia.org/searsia/wiki-informat-.json", + "--mother=http://searsia.org/searsia/v1-wiki-.json", "--log=4", "--test=json", "--quiet"}; Main.main(args); Assert.assertTrue(true); // happy if we get here! diff --git a/src/test/java/org/searsia/engine/ResourceTest.java b/src/test/java/org/searsia/engine/ResourceTest.java index fc32792..fd28458 100644 --- a/src/test/java/org/searsia/engine/ResourceTest.java +++ b/src/test/java/org/searsia/engine/ResourceTest.java @@ -175,7 +175,7 @@ public void testSearchJavascript() throws XPathExpressionException, SearchExcept @Test public void testSearchSearsiaEmpty() throws XPathExpressionException, SearchException { Resource se = searsiaSearch(); - SearchResult result = se.search(); + SearchResult result = se.searchWithoutQuery(); Assert.assertTrue(result.getHits().size() > 0); } From 184ab94f4bd8319406da40cefb4299e0af5b6a71 Mon Sep 17 00:00:00 2001 From: Searsia Date: Sat, 25 Mar 2017 16:25:24 +0100 Subject: [PATCH 05/51] better errors, and options --- src/main/java/org/searsia/Main.java | 4 +++- src/main/java/org/searsia/SearsiaOptions.java | 13 ++++++------ .../java/org/searsia/engine/Resource.java | 20 +++++-------------- .../org/searsia/engine/TextExtractor.java | 5 ++++- .../java/org/searsia/engine/ResourceTest.java | 2 +- 5 files changed, 20 insertions(+), 24 deletions(-) diff --git a/src/main/java/org/searsia/Main.java b/src/main/java/org/searsia/Main.java index 29641da..39460b5 100644 --- a/src/main/java/org/searsia/Main.java +++ b/src/main/java/org/searsia/Main.java @@ -237,7 +237,9 @@ public static void main(String[] args) { System.exit(0); } printMessage("Searsia server " + SearsiaApplication.VERSION, options.isQuiet()); - + + + // Connect to the mother engine and gather information from the mother. Resource myself = null; diff --git a/src/main/java/org/searsia/SearsiaOptions.java b/src/main/java/org/searsia/SearsiaOptions.java index 64719de..2b1a2df 100644 --- a/src/main/java/org/searsia/SearsiaOptions.java +++ b/src/main/java/org/searsia/SearsiaOptions.java @@ -135,10 +135,6 @@ private void parse(Options options, String[] args) throws IllegalArgumentExcepti throw new IllegalArgumentException("Test output must be one of 'json', 'xml' or 'response'."); } } - if (cmd.hasOption("h") || cmd.getArgs().length > 0) { - help(options); - help = true; - } try { if (cmd.hasOption("i")) { pollInterval = new Integer(cmd.getOptionValue("i")); @@ -166,8 +162,13 @@ private void parse(Options options, String[] args) throws IllegalArgumentExcepti } if (cmd.hasOption("m")) { motherTemplate = cmd.getOptionValue("m"); - } else { - throw new IllegalArgumentException("Please provide mother's url template (use '-h' for help)"); + } + if (cmd.hasOption("h") || cmd.getArgs().length < 0 || !cmd.hasOption("m")) { + if (!cmd.hasOption("m")) { + System.out.println("Please provide mother's url template (use '-m')."); + } + help(options); + help = true; } } diff --git a/src/main/java/org/searsia/engine/Resource.java b/src/main/java/org/searsia/engine/Resource.java index 62d1022..8f4f0c7 100644 --- a/src/main/java/org/searsia/engine/Resource.java +++ b/src/main/java/org/searsia/engine/Resource.java @@ -41,7 +41,6 @@ import javax.xml.xpath.XPathExpressionException; import javax.xml.xpath.XPathFactory; -import org.apache.log4j.Logger; import org.json.JSONArray; import org.json.JSONException; import org.json.JSONObject; @@ -56,7 +55,6 @@ public class Resource implements Comparable { public final static String defaultTestQuery = "searsia"; - private final static Logger LOGGER = Logger.getLogger(Resource.class.getName()); // For rate limiting: Default = 1000 queries per day private final static int defaultRATE = 1000; // unit: queries @@ -318,7 +316,7 @@ public Resource searchResource(String resourceid) throws SearchException { } - private SearchResult searsiaSearch(String jsonPage, String debug) { + private SearchResult searsiaSearch(String jsonPage, String debug) throws XPathExpressionException, JSONException { SearchResult result = new SearchResult(); if (debug != null && debug.equals("response")) { result.setDebugOut(jsonPage); @@ -332,12 +330,8 @@ private SearchResult searsiaSearch(String jsonPage, String debug) { result.addHit(new Hit((JSONObject) hits.get(i))); } if (json.has("resource")) { - try { - Resource engine = new Resource(json.getJSONObject("resource")); - result.setResource(engine); - } catch (XPathExpressionException e) { - LOGGER.warn("Resource error: " + e.getMessage()); - } + Resource engine = new Resource(json.getJSONObject("resource")); + result.setResource(engine); } if (json.has("searsia")) { result.setVersion(json.getString("searsia")); @@ -379,14 +373,10 @@ private SearchResult xpathSearch(String url, String page, String debug) return result; } - private Hit extractHit(Node item) { + private Hit extractHit(Node item) throws XPathExpressionException { Hit hit = new Hit(); for(TextExtractor extractor: this.extractors) { - try { - extractor.extract(item, hit); - } catch (XPathExpressionException e) { - LOGGER.warn(e.getMessage()); // TODO: handle this gracefully :-) - } + extractor.extract(item, hit); } return hit; } diff --git a/src/main/java/org/searsia/engine/TextExtractor.java b/src/main/java/org/searsia/engine/TextExtractor.java index 2bd1c94..c702208 100644 --- a/src/main/java/org/searsia/engine/TextExtractor.java +++ b/src/main/java/org/searsia/engine/TextExtractor.java @@ -46,12 +46,15 @@ public TextExtractor(String field, String xpath) throws XPathExpressionException public void extract(Node item, Hit hit) throws XPathExpressionException { - String resultString = ""; + String resultString = ""; // TODO: StringBuilder try { NodeList nodeList = (NodeList) this.compiledXpath.evaluate(item, XPathConstants.NODESET); if (nodeList != null) { for (int i=0; i < nodeList.getLength(); i++) { Node node = nodeList.item(i); + if (!resultString.equals("")) { + resultString += " "; + } resultString += node.getTextContent(); } } diff --git a/src/test/java/org/searsia/engine/ResourceTest.java b/src/test/java/org/searsia/engine/ResourceTest.java index fd28458..5b82c9d 100644 --- a/src/test/java/org/searsia/engine/ResourceTest.java +++ b/src/test/java/org/searsia/engine/ResourceTest.java @@ -28,7 +28,7 @@ private Resource htmlSearch() throws XPathExpressionException { hiemstra.setFavicon("http://wwwhome.cs.utwente.nl/~hiemstra/images/ut.ico"); hiemstra.setItemXpath("//div[@class='post']"); hiemstra.addExtractor( - new TextExtractor("title", "./h3"), + new TextExtractor("title", "./h3"), new TextExtractor("description", "./h3/following-sibling::text()"), new TextExtractor("url", "./h3/a/@href") ); From a3200cc9165a9a813a22cde3b1a2cc9ed2527a0e Mon Sep 17 00:00:00 2001 From: Searsia Date: Wed, 29 Mar 2017 16:07:06 +0200 Subject: [PATCH 06/51] better errors and text handling --- src/main/java/org/searsia/Main.java | 7 +++++-- src/main/java/org/searsia/engine/Resource.java | 17 +++++++++++------ .../java/org/searsia/engine/TextExtractor.java | 2 +- .../java/org/searsia/engine/ResourceTest.java | 5 ++--- 4 files changed, 19 insertions(+), 12 deletions(-) diff --git a/src/main/java/org/searsia/Main.java b/src/main/java/org/searsia/Main.java index 39460b5..fa01462 100644 --- a/src/main/java/org/searsia/Main.java +++ b/src/main/java/org/searsia/Main.java @@ -257,8 +257,11 @@ public static void main(String[] args) { if (mother == null) { fatalError("Initialization failed: JSONObject[\"resource\"] not found."); } + if (!options.getMotherTemplate().contains(mother.getId())) { + fatalError("API Template (" + options.getMotherTemplate() + ") does not contain the id (" + mother.getId() +")"); + } if (version != null && !version.startsWith("v1")) { - printMessage("Warning: Wrong major Searsia version " + version, options.isQuiet()); + fatalError("Wrong major Searsia version " + version + ": Must be v1.0.0 or higher."); } myself = mother.deepcopy(); myself.setUrlAPITemplate(options.getMyURI()); @@ -311,7 +314,7 @@ public static void main(String[] args) { printMessage("Use Ctrl+c to stop.", options.isQuiet()); try { searsiaDaemon(index, engines, options.getPollInterval()); - } catch (InterruptedException e) { } + } catch (InterruptedException e) { } } server.shutdownNow(); } diff --git a/src/main/java/org/searsia/engine/Resource.java b/src/main/java/org/searsia/engine/Resource.java index 8f4f0c7..033300e 100644 --- a/src/main/java/org/searsia/engine/Resource.java +++ b/src/main/java/org/searsia/engine/Resource.java @@ -23,7 +23,6 @@ import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; -import java.io.UnsupportedEncodingException; import java.net.HttpURLConnection; import java.net.URLConnection; import java.net.URL; @@ -342,15 +341,20 @@ private SearchResult searsiaSearch(String jsonPage, String debug) throws XPathEx private SearchResult xpathSearch(String url, String page, String debug) throws IOException, XPathExpressionException { - Document document; - if (this.mimeType != null && this.mimeType.equals("application/json")) { + Document document = null; + if (this.mimeType == null) { + throw new IOException("No MIME Type provided."); + } + if (this.mimeType.equals("application/json")) { document = parseDocumentJSON(page); - } else if (this.mimeType != null && this.mimeType.equals("application/x-javascript")) { + } else if (this.mimeType.equals("application/x-javascript")) { document = parseDocumentJavascript(page); - } else if (this.mimeType != null && this.mimeType.equals("application/xml")) { + } else if (this.mimeType.equals("application/xml")) { document = parseDocumentXML(page); - } else { + } else if (this.mimeType.equals("text/html")){ document = parseDocumentHTML(page, url); + } else { + throw new IOException("MIME Type not supported: " + this.mimeType); } if (document == null) { throw new IOException("Error parsing document. Wrong mimetype?"); @@ -542,6 +546,7 @@ private String getCompletePage(String urlString, String postString, Map]*>||||", ""); // No HTML, please: spans removed - s = s.replaceAll("<[^>]+>", " "); // all other tags replaced by a space + s = s.replaceAll("<[^>]+>|\ufffd", " "); // all other tags or unicode replace character replaced by a space if (trim) { s = s.trim(); } diff --git a/src/test/java/org/searsia/engine/ResourceTest.java b/src/test/java/org/searsia/engine/ResourceTest.java index 5b82c9d..37ed7bf 100644 --- a/src/test/java/org/searsia/engine/ResourceTest.java +++ b/src/test/java/org/searsia/engine/ResourceTest.java @@ -108,7 +108,6 @@ public void testSearchHtml() throws XPathExpressionException, SearchException { SearchResult result = se.search("dolf trieschnigg", "xml"); Assert.assertEquals("text/html", se.getMimeType()); Assert.assertEquals(10, result.getHits().size()); - // TODO text nodes are glued together. } @Test @@ -191,7 +190,7 @@ public void testSearchNoResource1() throws XPathExpressionException, SearchExcep Resource se = htmlSearch(); Boolean exception = false; try { - Resource engine = se.searchResource("wikifull"); + se.searchResource("wikifull"); } catch (SearchException e) { exception = true; } @@ -203,7 +202,7 @@ public void testSearchNoResource2() throws XPathExpressionException, SearchExcep Resource se = searsiaMimeOnlySearch(); Boolean exception = false; try { - Resource engine = se.searchResource("wikifull"); + se.searchResource("wikifull"); } catch (SearchException e) { exception = true; } From 289fd17f20c84af4c34809277bd3fb95b3dc0088 Mon Sep 17 00:00:00 2001 From: Searsia Date: Thu, 30 Mar 2017 18:09:57 +0200 Subject: [PATCH 07/51] updates without restart --- src/main/java/org/searsia/Main.java | 50 ++++++++----- .../java/org/searsia/engine/Resource.java | 73 ++++++++++++++----- .../java/org/searsia/index/ResourceIndex.java | 5 +- src/main/java/org/searsia/web/Search.java | 11 +-- .../org/searsia/web/SearsiaApplication.java | 4 +- src/test/java/org/searsia/MainTest.java | 2 +- .../java/org/searsia/engine/ResourceTest.java | 7 ++ .../org/searsia/index/TestResourceIndex.java | 1 - src/test/java/org/searsia/web/SearchTest.java | 20 ++--- 9 files changed, 115 insertions(+), 58 deletions(-) diff --git a/src/main/java/org/searsia/Main.java b/src/main/java/org/searsia/Main.java index fa01462..a8adf6e 100644 --- a/src/main/java/org/searsia/Main.java +++ b/src/main/java/org/searsia/Main.java @@ -65,6 +65,13 @@ private static void searsiaDaemon(SearchResultIndex index, ResourceIndex engines if (mother != null && random.nextBoolean()) { // sample mostly from mother engine = mother; result = engine.randomSearch(); + Resource newmother = result.getResource(); + if (newmother != null && newmother.getId().equals(mother.getId())) { + engines.putMother(newmother); // TODO myself! + } else { + LOGGER.warn("Unable to update mother: Did ids change?"); + } + getResources(mother, result, engines); } else { engine = engines.getRandom(); result = engine.randomSearch(); @@ -81,29 +88,32 @@ private static void searsiaDaemon(SearchResultIndex index, ResourceIndex engines } - private static void getResources(Resource mother, SearchResult result, ResourceIndex engines) { + private static int getResources(Resource mother, SearchResult result, ResourceIndex engines) { int i = 0; for (Hit hit: result.getHits()) { String rid = hit.getString("rid"); - if (rid != null && !engines.containsKey(rid)) { - Resource engine; - i += 1; - try { - engine = mother.searchResource(rid); - } catch (SearchException e) { - System.err.println("Warning: Not found: " + rid + ": " + e.getMessage()); - break; - } - try { - engines.put(engine); - } catch(Exception e) { - fatalError(e.getMessage()); - } + if (rid != null ) { + Resource engine = engines.get(rid); + if (engine == null || engine.getLastUpdatedSecondsAgo() > 3600) { + i += 1; + try { + engine = mother.searchResource(rid); + } catch (SearchException e) { + LOGGER.warn("Warning: Not found: " + rid + ": " + e.getMessage()); + } + try { + engines.put(engine); + LOGGER.info("Updated resource: " + rid); + } catch(Exception e) { + LOGGER.warn(e.getMessage()); + } + } } if (i > 10) { - break; // not more than the first 10. Rest will follow when needed + break; // not more than the first 10 per check } } + return i; } private static boolean sameTemplates(String uri1, String uri2, String myId) { @@ -215,6 +225,7 @@ private static void setupLogger(String path, String filename, Level level) throw new PatternLayout("%p %d{ISO8601} %m%n"), logDir.resolve("searsia.log").toString(), "'.'yyyy-MM-dd"); + // Appender appender = new ConsoleAppender(new PatternLayout("%m%n"), ConsoleAppender.SYSTEM_ERR); LOGGER.addAppender(appender); LOGGER.setLevel(level); LOGGER.warn("Searsia restart"); @@ -248,7 +259,7 @@ public static void main(String[] args) { String version = null; SearchResult result = null; try { - result = connect.searchWithoutQuery(); + result = connect.searchWithoutQuery(); mother = result.getResource(); version = result.getVersion(); } catch (SearchException e) { @@ -301,8 +312,9 @@ public static void main(String[] args) { // Start the web server String myURI = uriNormalize(options.getMyURI(), myself.getId()); try { - server = GrizzlyHttpServerFactory.createHttpServer(URI.create(myURI), - new SearsiaApplication(index, engines)); + + SearsiaApplication app = new SearsiaApplication(index, engines); + server = GrizzlyHttpServerFactory.createHttpServer(URI.create(myURI), app); } catch (Exception e) { fatalError("Server failed: " + e.getMessage()); } diff --git a/src/main/java/org/searsia/engine/Resource.java b/src/main/java/org/searsia/engine/Resource.java index 033300e..f10604b 100644 --- a/src/main/java/org/searsia/engine/Resource.java +++ b/src/main/java/org/searsia/engine/Resource.java @@ -34,6 +34,7 @@ import java.util.LinkedHashMap; import java.util.List; import java.util.Map; +import java.util.Set; import javax.xml.xpath.XPath; import javax.xml.xpath.XPathConstants; @@ -84,7 +85,8 @@ public class Resource implements Comparable { // internal data not to be shared private String nextQuery = null; private double allowance = defaultRATE / 2; - private long lastCheck = new Date().getTime(); // Unix time + private Long lastUsedCheck = new Date().getTime(); // Unix time + private Long lastUpdatedCheck = new Date().getTime(); // Unix time public Resource(String urlAPITemplate, String id) { @@ -129,8 +131,8 @@ public Resource(JSONObject jo) throws XPathExpressionException, JSONException { addHeader((String) key, (String) json.get(key)); } } - if (jo.has("parameters")) { - JSONObject json = (JSONObject) jo.get("parameters"); + if (jo.has("privateparameters")) { + JSONObject json = (JSONObject) jo.get("privateparameters"); Iterator keys = json.keys(); while (keys.hasNext()) { String key = (String) keys.next(); @@ -215,10 +217,9 @@ public void setRerank(String rerank) { this.rerank = rerank; } - - public void changeId(String id) { // BEWARE, only used in Main - this.id = id; - } + public void setLastUpdatedToNow() { + this.lastUpdatedCheck = new Date().getTime(); + } public SearchResult randomSearch() throws SearchException { @@ -440,8 +441,8 @@ private Document parseDocumentXML(String xmlString) throws IOException { private String fillTemplate(String template, String query) throws IOException { String url = template; - for (String param: this.privateParameters.keySet()) { - url = url.replaceAll("\\{" + param + "\\??\\}", this.privateParameters.get(param)); + for (String param: getPrivateParameterKeys()) { + url = url.replaceAll("\\{" + param + "\\??\\}", getPrivateParameter(param)); } url = url.replaceAll("\\{q\\??\\}", query); url = url.replaceAll("\\{[0-9A-Za-z\\-_]+\\?\\}", ""); // remove optional parameters @@ -454,8 +455,8 @@ private String fillTemplate(String template, String query) throws IOException { private SearchException createPrivateSearchException(Exception e) { String message = e.toString(); - for (String param: this.privateParameters.keySet()) { - message = message.replaceAll(this.privateParameters.get(param), "{" + param + "}"); + for (String param: getPrivateParameterKeys()) { + message = message.replaceAll(getPrivateParameter(param), "{" + param + "}"); } return new SearchException(message); } @@ -466,8 +467,8 @@ private SearchException createPrivateSearchException(Exception e) { */ private boolean rateLimitReached() { Long now = new Date().getTime(); - Long timePassed = now - this.lastCheck; - this.lastCheck = now; + Long timePassed = now - this.lastUsedCheck; + this.lastUsedCheck = now; this.allowance += (((double) timePassed / defaultPER)) * this.rate; if (this.allowance > this.rate) { this.allowance = this.rate; @@ -489,8 +490,8 @@ private URLConnection setConnectionProperties(URL url, Map heade for (Map.Entry entry : headers.entrySet()) { String value = entry.getValue(); if (value.contains("{")) { - for (String param: this.privateParameters.keySet()) { - value = value.replace("{" + param + "}", this.privateParameters.get(param)); + for (String param: getPrivateParameterKeys()) { + value = value.replace("{" + param + "}", getPrivateParameter(param)); } if (value.contains("{")) { String param = value.substring(value.indexOf("{"), value.indexOf("}") + 1); @@ -532,7 +533,7 @@ private String getCompletePage(String urlString, String postString, Map getPrivateParameterKeys() { + return this.privateParameters.keySet(); + } + + public List getExtractors() { return this.extractors; } @@ -638,7 +648,32 @@ public float getPrior() { } } - public float score(String query) { + + private Long secondsAgo(Long last) { + if (last == null) { + return null; + } else { + Long now = new Date().getTime(); + Long ago = 1 + (now - last) / 1000; + if (ago < 0 || ago > 8640000l) { // 100 days... + ago = 8640000l; + } + return ago; + } + } + + + public Long getLastUpdatedSecondsAgo() { + return secondsAgo(this.lastUpdatedCheck); + } + + + public Long getLastUsedSecondsAgo() { + return secondsAgo(this.lastUsedCheck); + } + + + public float score(String query) { float score = 0.0f; Map nameTerm = new HashMap(); String name = getName(); @@ -697,6 +732,10 @@ public JSONObject toJson() { } engine.put("headers", json); } + Long ago = this.getLastUpdatedSecondsAgo(); + if (ago != null) engine.put("updatedsecondsago", ago); + ago = this.getLastUsedSecondsAgo(); + if (ago != null) engine.put("usedsecondsago", this.getLastUsedSecondsAgo()); return engine; } diff --git a/src/main/java/org/searsia/index/ResourceIndex.java b/src/main/java/org/searsia/index/ResourceIndex.java index 6964e07..1334536 100644 --- a/src/main/java/org/searsia/index/ResourceIndex.java +++ b/src/main/java/org/searsia/index/ResourceIndex.java @@ -209,16 +209,17 @@ public void put(Resource engine) { updateResourceIndex(engine.getId(), engine); } catch (IOException e) { LOGGER.warn("Update of resource " + engine.getId() + " failed"); - // TODO Oh crap, what to do? } } - engines.put(engine.getId(), engine); + engine.setLastUpdatedToNow(); + this.engines.put(engine.getId(), engine); } public boolean containsKey(String id) { return this.engines.containsKey(id); } + public Resource get(String id) { return this.engines.get(id); } diff --git a/src/main/java/org/searsia/web/Search.java b/src/main/java/org/searsia/web/Search.java index fd062f5..d7e2e23 100644 --- a/src/main/java/org/searsia/web/Search.java +++ b/src/main/java/org/searsia/web/Search.java @@ -26,7 +26,6 @@ import javax.ws.rs.QueryParam; import javax.ws.rs.core.Response; -import org.apache.log4j.Logger; import org.json.JSONObject; import org.searsia.SearchResult; @@ -43,7 +42,7 @@ @Path("{resourceid}/search") public class Search { - private final static Logger LOGGER = Logger.getLogger(Search.class); + private final static org.apache.log4j.Logger LOGGER = org.apache.log4j.Logger.getLogger(Search.class); private ResourceIndex engines; private SearchResultIndex index; @@ -73,7 +72,7 @@ public Response query(@PathParam("resourceid") String resourceid, @QueryParam("q mother = engines.getMother(); if (!resourceid.equals(me.getId())) { engine = engines.get(resourceid); - if (engine == null) { // unknown? ask your mother + if (engine == null || engine.getLastUpdatedSecondsAgo() == null || engine.getLastUpdatedSecondsAgo() > 3600) { // unknown or old? ask your mother if (mother != null) { try { engine = mother.searchResource(resourceid); @@ -112,7 +111,7 @@ public Response query(@PathParam("resourceid") String resourceid, @QueryParam("q if (query != null && query.trim().length() > 0) { try { result = index.search(query); - } catch (IOException e) { + } catch (Exception e) { String message = "Service unavailable: " + e.getMessage(); LOGGER.warn(message); return SearsiaApplication.responseError(503, message); @@ -123,7 +122,9 @@ public Response query(@PathParam("resourceid") String resourceid, @QueryParam("q index.offer(result); // really trust mother } catch (SearchException e) { LOGGER.warn("Mother not available"); - } + } catch (Exception e) { + LOGGER.warn(e.getMessage()); + } } else { // own results? Do resource ranking. result.scoreResourceSelection(query, engines); } diff --git a/src/main/java/org/searsia/web/SearsiaApplication.java b/src/main/java/org/searsia/web/SearsiaApplication.java index d4adb3c..d4a3569 100644 --- a/src/main/java/org/searsia/web/SearsiaApplication.java +++ b/src/main/java/org/searsia/web/SearsiaApplication.java @@ -17,8 +17,6 @@ package org.searsia.web; import java.io.IOException; -import java.util.logging.Level; -import java.util.logging.Logger; import javax.ws.rs.core.Response; @@ -68,7 +66,7 @@ protected static Response jsonResponse(int status, JSONObject json) { public SearsiaApplication(SearchResultIndex index, ResourceIndex engines) throws IOException { super(); - Logger.getLogger("org.glassfish.grizzly").setLevel(Level.WARNING); + java.util.logging.Logger.getLogger("").setLevel(java.util.logging.Level.WARNING); register(new Search(index, engines)); register(new OpenSearch(engines)); } diff --git a/src/test/java/org/searsia/MainTest.java b/src/test/java/org/searsia/MainTest.java index 0cd99fa..4d1aeeb 100644 --- a/src/test/java/org/searsia/MainTest.java +++ b/src/test/java/org/searsia/MainTest.java @@ -10,7 +10,7 @@ public class MainTest { @Test public void test() { String[] args = {"--path=target/index-test/", - "--mother=http://searsia.org/searsia/v1-wiki-.json", + "--mother=http://searsia.org/searsia/v1-wiki-{q}.json", "--log=4", "--test=json", "--quiet"}; Main.main(args); Assert.assertTrue(true); // happy if we get here! diff --git a/src/test/java/org/searsia/engine/ResourceTest.java b/src/test/java/org/searsia/engine/ResourceTest.java index 37ed7bf..ce7215e 100644 --- a/src/test/java/org/searsia/engine/ResourceTest.java +++ b/src/test/java/org/searsia/engine/ResourceTest.java @@ -252,6 +252,13 @@ public void testJsonRoundtrip() throws XPathExpressionException { Assert.assertEquals("postencode", se1.getPostQueryEncode(), se2.getPostQueryEncode()); Assert.assertFalse("secret revealed", json.toString().contains(SECRET_API_KEY)); } + + @Test + public void testJsonPrivateParameter() throws XPathExpressionException { + JSONObject json = new JSONObject("{\"id\":\"test\", \"privateparameters\":{\"apikey\":\"secret\"}}"); + Resource se = new Resource(json); + Assert.assertEquals("private parameters", se.getPrivateParameter("apikey"), "secret"); + } @Test public void equalEngines1() throws XPathExpressionException { diff --git a/src/test/java/org/searsia/index/TestResourceIndex.java b/src/test/java/org/searsia/index/TestResourceIndex.java index 7b84f4a..026df77 100644 --- a/src/test/java/org/searsia/index/TestResourceIndex.java +++ b/src/test/java/org/searsia/index/TestResourceIndex.java @@ -43,7 +43,6 @@ private static Resource searsia() { private static Resource newby() { Resource e = new Resource("http://new.com/?q={q}", "new"); - e.changeId("890"); e.addPrivateParameter("apikey", "secret"); return e; } diff --git a/src/test/java/org/searsia/web/SearchTest.java b/src/test/java/org/searsia/web/SearchTest.java index 7f740d4..8d00907 100644 --- a/src/test/java/org/searsia/web/SearchTest.java +++ b/src/test/java/org/searsia/web/SearchTest.java @@ -27,8 +27,8 @@ public class SearchTest { private static ResourceIndex engines; - private static Resource utwente() { - return new Resource("https://search.utwente.nl/searsia/search.php?q={q?}&r={r?}", "utwente"); + private static Resource wiki() { + return new Resource("http://searsia.org/searsia/v1-wiki-{q}.json", "wiki"); } private static Resource wrong() { @@ -36,7 +36,7 @@ private static Resource wrong() { } private static Resource me() { - return new Resource("http://me.org?q={q}", "utwente"); + return new Resource("http://me.org?q={q}", "wiki"); } @@ -46,7 +46,7 @@ public static void setUp() throws Exception { LOGGER.addAppender(new NullAppender()); // thou shall not log index = new SearchResultIndex(PATH, INDEX, 2); engines = new ResourceIndex(PATH, INDEX); - engines.putMother(utwente()); + engines.putMother(wiki()); engines.put(wrong()); engines.putMyself(me()); } @@ -59,19 +59,19 @@ public static void lastThing() throws IOException { @Test // returns 'my' resource description public void test() throws IOException { Search search = new Search(index, engines); - Response response = search.query("utwente", ""); + Response response = search.query("wiki", ""); int status = response.getStatus(); String entity = (String) response.getEntity(); JSONObject json = new JSONObject(entity); JSONObject resource = (JSONObject) json.get("resource"); Assert.assertEquals(200, status); - Assert.assertEquals("utwente", resource.get("id")); + Assert.assertEquals("wiki", resource.get("id")); } @Test // returns local search results for 'searsia' public void testQuery() throws IOException { Search search = new Search(index, engines); - Response response = search.query("utwente", "searsia search for noobs"); + Response response = search.query("wiki", "searsia search for noobs"); int status = response.getStatus(); String entity = (String) response.getEntity(); JSONObject json = new JSONObject(entity); @@ -101,16 +101,16 @@ public void testResource() throws IOException { Assert.assertEquals(wrong().getAPITemplate(), resource.get("apitemplate")); } - @Test // returns resource 'youtube' (from mother) + @Test // returns resource 'wikididyoumean' (from mother) public void testResourceUnknown() throws IOException { Search search = new Search(index, engines); - Response response = search.query("youtube", ""); + Response response = search.query("wikididyoumean", ""); int status = response.getStatus(); String entity = (String) response.getEntity(); JSONObject json = new JSONObject(entity); JSONObject resource = (JSONObject) json.get("resource"); Assert.assertEquals(200, status); - Assert.assertEquals("Youtube", resource.get("name")); + Assert.assertEquals("Did you mean:", resource.get("name")); } @Test // returns results for the engine 'wrong' (which does not exist) From 75f4e729282b86e30279e3e339e2ab2e3afd6c80 Mon Sep 17 00:00:00 2001 From: Searsia Date: Sun, 2 Apr 2017 15:53:35 +0200 Subject: [PATCH 08/51] opensearch and proxy --- src/main/java/org/searsia/web/OpenSearch.java | 68 ++++++++++++------- src/main/java/org/searsia/web/Proxy.java | 55 +++++++++++++++ .../org/searsia/index/TestResourceIndex.java | 1 + 3 files changed, 100 insertions(+), 24 deletions(-) create mode 100644 src/main/java/org/searsia/web/Proxy.java diff --git a/src/main/java/org/searsia/web/OpenSearch.java b/src/main/java/org/searsia/web/OpenSearch.java index f461a50..ff053c1 100644 --- a/src/main/java/org/searsia/web/OpenSearch.java +++ b/src/main/java/org/searsia/web/OpenSearch.java @@ -20,9 +20,11 @@ import javax.ws.rs.GET; import javax.ws.rs.Path; +import javax.ws.rs.PathParam; import javax.ws.rs.Produces; import javax.ws.rs.core.Response; +import org.searsia.engine.Resource; import org.searsia.index.ResourceIndex; /** @@ -31,7 +33,7 @@ * @author hiemstra * */ -@Path("opensearch.xml") +@Path("{resourceid}/opensearch.xml") public class OpenSearch { private ResourceIndex engines; @@ -42,30 +44,22 @@ public OpenSearch(ResourceIndex engines) throws IOException { @GET @Produces("application/opensearchdescription+xml; charset=utf-8") - public Response get() { - String response = "\n"; - String shortName = engines.getMyself().getName(); - String favicon = engines.getMyself().getFavicon(); - String userTemplate = engines.getMyself().getUserTemplate(); - String suggestTemplate = engines.getMyself().getSuggestTemplate(); - String apiTemplate = engines.getMyself().getAPITemplate(); - String testQuery = engines.getMyself().getTestQuery(); - if (shortName == null) shortName = "Searsia"; - response += "\n"; - response += " " + xmlEncode(shortName) + "\n"; - response += " Search the web with " + xmlEncode(shortName) + "\n"; - response += " \n"; - if (userTemplate != null) response += " \n"; - if (suggestTemplate != null) response += " \n"; - if (testQuery != null) response += " \n"; - if (favicon != null) response += " " + xmlEncode(favicon) + "\n"; - response += " UTF-8\n"; - response += " UTF-8\n"; - response += "\n"; - return Response.ok(response).build(); + public Response get(@PathParam("resourceid") String resourceid) { + Resource engine; + if (resourceid.equals(engines.getMyself().getId())) { + engine = engines.getMyself(); + } else { + engine = engines.get(resourceid); + } + if (engine != null) { + String response = engineXML(engine); + return Response.ok(response).build(); + } else { + return SearsiaApplication.responseError(404, "Not found: " + resourceid); + } } - + private String xmlEncode(String text) { text = text.replaceAll("<", "<"); text = text.replaceAll(">", ">"); @@ -74,8 +68,34 @@ private String xmlEncode(String text) { private String templateEncode(String url) { url = url.replaceAll("\\{q", "{searchTerms"); - url = url.replaceAll("\\{r", "{searsia:resourceId"); return xmlEncode(url); } + private String engineXML(Resource engine) { + String response = "\n"; + String shortName = engine.getName(); + String favicon = engine.getFavicon(); + String userTemplate = engine.getUserTemplate(); + String suggestTemplate = engine.getSuggestTemplate(); + String apiTemplate = engine.getAPITemplate(); + String mimeType = engine.getMimeType(); + String postString = engine.getPostString(); + String testQuery = engine.getTestQuery(); + String method = "GET"; + if (postString != null) method = "POST"; + if (shortName == null) shortName = "Searsia"; + response += "\n"; + response += " " + xmlEncode(shortName) + "\n"; + response += " Search the web with " + xmlEncode(shortName) + "\n"; + response += " \n"; + if (userTemplate != null) response += " \n"; + if (suggestTemplate != null) response += " \n"; + if (testQuery != null) response += " \n"; + if (favicon != null) response += " " + xmlEncode(favicon) + "\n"; + response += " UTF-8\n"; + response += " UTF-8\n"; + response += "\n"; + return response; + } + } diff --git a/src/main/java/org/searsia/web/Proxy.java b/src/main/java/org/searsia/web/Proxy.java new file mode 100644 index 0000000..f78146b --- /dev/null +++ b/src/main/java/org/searsia/web/Proxy.java @@ -0,0 +1,55 @@ +package org.searsia.web; + +import java.io.IOException; +import java.io.InputStream; +import java.net.HttpURLConnection; +import java.net.URL; +import java.net.URLConnection; + +import javax.ws.rs.GET; +import javax.ws.rs.Path; +import javax.ws.rs.PathParam; +import javax.ws.rs.QueryParam; +import javax.ws.rs.core.Response; + +import org.searsia.index.ResourceIndex; + + +@Path("{resourceid}/proxy") +public class Proxy { + + private ResourceIndex engines; + + public Proxy(ResourceIndex engines) throws IOException { + this.engines = engines; + } + + @GET + public Response query(@PathParam("resourceid") String resourceid, @QueryParam("url") String url) { + try { + if (url != null && (engines.getMyself().getId().equals(resourceid) || engines.get(resourceid) != null)) { + return getWebData(url); + } else { + return SearsiaApplication.responseError(404, "Resource not found: " + resourceid); + } + } catch (Exception e) { + return SearsiaApplication.responseError(503, "Unavailable: " + e.getMessage()); + } + } + + private Response getWebData(String urlString) throws IOException { + URL url = new URL(urlString); + URLConnection connection = url.openConnection(); + connection.setRequestProperty("User-Agent", "Searsia/1.0"); + connection.setRequestProperty("Accept", "*/*"); // TODO If-Modified-Since: + connection.setReadTimeout(4000); + connection.setConnectTimeout(4000); + HttpURLConnection http = (HttpURLConnection) connection; + http.setInstanceFollowRedirects(true); + http.setRequestMethod("GET"); + http.connect(); + String contentType = http.getHeaderField("Content-Type"); + InputStream stream = http.getInputStream(); + return Response.ok(stream).header("Content-Type", contentType).build(); + } +} diff --git a/src/test/java/org/searsia/index/TestResourceIndex.java b/src/test/java/org/searsia/index/TestResourceIndex.java index 026df77..f33459f 100644 --- a/src/test/java/org/searsia/index/TestResourceIndex.java +++ b/src/test/java/org/searsia/index/TestResourceIndex.java @@ -54,6 +54,7 @@ private static Resource me() { } + @Test public static void checkFiles() throws IOException { Resource e1 = me(); Resource e2 = engines.getMyself(); From 2efdab90749db814a4e7e3ccde0505e8d9bff86e Mon Sep 17 00:00:00 2001 From: Searsia Date: Sun, 2 Apr 2017 15:55:57 +0200 Subject: [PATCH 09/51] whooops, test typo --- src/test/java/org/searsia/index/TestResourceIndex.java | 1 - 1 file changed, 1 deletion(-) diff --git a/src/test/java/org/searsia/index/TestResourceIndex.java b/src/test/java/org/searsia/index/TestResourceIndex.java index f33459f..026df77 100644 --- a/src/test/java/org/searsia/index/TestResourceIndex.java +++ b/src/test/java/org/searsia/index/TestResourceIndex.java @@ -54,7 +54,6 @@ private static Resource me() { } - @Test public static void checkFiles() throws IOException { Resource e1 = me(); Resource e2 = engines.getMyself(); From 0d8c77cc215d087700a9a3c61f23d094bc5ac19f Mon Sep 17 00:00:00 2001 From: Searsia Date: Mon, 3 Apr 2017 16:41:49 +0200 Subject: [PATCH 10/51] adds lastupdated --- .../java/org/searsia/engine/Resource.java | 24 +++++++++++++------ .../java/org/searsia/index/ResourceIndex.java | 15 ++++++++---- src/main/java/org/searsia/web/Search.java | 2 +- 3 files changed, 28 insertions(+), 13 deletions(-) diff --git a/src/main/java/org/searsia/engine/Resource.java b/src/main/java/org/searsia/engine/Resource.java index f10604b..88a51c4 100644 --- a/src/main/java/org/searsia/engine/Resource.java +++ b/src/main/java/org/searsia/engine/Resource.java @@ -27,12 +27,16 @@ import java.net.URLConnection; import java.net.URL; import java.net.URLEncoder; +import java.text.DateFormat; +import java.text.ParseException; +import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.Date; import java.util.HashMap; import java.util.Iterator; import java.util.LinkedHashMap; import java.util.List; +import java.util.Locale; import java.util.Map; import java.util.Set; @@ -59,6 +63,7 @@ public class Resource implements Comparable { // For rate limiting: Default = 1000 queries per day private final static int defaultRATE = 1000; // unit: queries private final static int defaultPER = 86400000; // unit: miliseconds (86400000 miliseconds is one day) + private final static DateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss", Locale.ROOT); // TODO: private static final Pattern queryPattern = Pattern.compile("\\{q\\??\\}"); @@ -221,6 +226,11 @@ public void setLastUpdatedToNow() { this.lastUpdatedCheck = new Date().getTime(); } + public void setLastUpdatedToDateString(String date) { + try { + this.lastUpdatedCheck = dateFormat.parse(date).getTime(); + } catch (ParseException e) { } + } public SearchResult randomSearch() throws SearchException { if (nextQuery == null) { @@ -520,7 +530,6 @@ private InputStream httpConnect(URLConnection connection, String postString) thr http.setRequestMethod("GET"); http.connect(); } - //int responseCode = http.getResponseCode(); return http.getInputStream(); } @@ -663,8 +672,12 @@ private Long secondsAgo(Long last) { } - public Long getLastUpdatedSecondsAgo() { - return secondsAgo(this.lastUpdatedCheck); + public String getLastUpdatedString() { + return dateFormat.format(new Date(this.lastUpdatedCheck)); + } + + public long getLastUpdatedSecondsAgo() { + return secondsAgo(this.lastUpdatedCheck); } @@ -732,10 +745,7 @@ public JSONObject toJson() { } engine.put("headers", json); } - Long ago = this.getLastUpdatedSecondsAgo(); - if (ago != null) engine.put("updatedsecondsago", ago); - ago = this.getLastUsedSecondsAgo(); - if (ago != null) engine.put("usedsecondsago", this.getLastUsedSecondsAgo()); + engine.put("lastupdated", this.getLastUpdatedString()); return engine; } diff --git a/src/main/java/org/searsia/index/ResourceIndex.java b/src/main/java/org/searsia/index/ResourceIndex.java index 1334536..1a90249 100644 --- a/src/main/java/org/searsia/index/ResourceIndex.java +++ b/src/main/java/org/searsia/index/ResourceIndex.java @@ -44,7 +44,6 @@ import org.apache.lucene.util.Version; import org.json.JSONException; import org.json.JSONObject; - import org.searsia.engine.Resource; /** @@ -72,8 +71,9 @@ public class ResourceIndex { * @param path path where the Searsia index resides * @param filename index file name * @throws IOException + * @throws JSONException */ - public ResourceIndex(String path, String filename) throws IOException { + public ResourceIndex(String path, String filename) throws IOException, JSONException { this.meFile = Paths.get(path, filename + ".json"); this.indexDir = Paths.get(path, filename + "_sources"); if (meFile.toFile().exists()) { @@ -97,7 +97,7 @@ private void writeMyselfFile(Resource engine) throws IOException { } - private Resource readMyselfFile(Path meFile) throws IOException { + private Resource readMyselfFile(Path meFile) throws IOException, JSONException { String content = new String(Files.readAllBytes(meFile)); Resource me = null; try { @@ -127,6 +127,9 @@ private void readResourceIndex() throws IOException { Document doc = searcher.doc(hit.doc); JSONObject json = new JSONObject(doc.get("json")); Resource engine = new Resource(json); + if (json.has("lastupdated")) { + engine.setLastUpdatedToDateString(json.getString("lastupdated")); + } this.engines.put(engine.getId(), engine); } } catch (javax.xml.xpath.XPathExpressionException e) { @@ -174,7 +177,7 @@ private boolean exists(Resource engine) { private void updateResourceIndex(String id, Resource engine) throws IOException { - Document doc = new Document(); + Document doc = new Document(); if (id != null) { JSONObject json = engine.toJson(); json.put("parameters", engine.getJsonPrivateParameters()); // we need to remember those @@ -205,13 +208,13 @@ public void put(Resource engine) { throw new RuntimeException("Local id conflict: " + engine.getId()); } if (!exists(engine)) { + engine.setLastUpdatedToNow(); try { updateResourceIndex(engine.getId(), engine); } catch (IOException e) { LOGGER.warn("Update of resource " + engine.getId() + " failed"); } } - engine.setLastUpdatedToNow(); this.engines.put(engine.getId(), engine); } @@ -263,6 +266,7 @@ public Map topValues(String queryString, int max) { } public void putMother(Resource mother) { + mother.setLastUpdatedToNow(); this.mother = mother; } @@ -270,6 +274,7 @@ public void putMyself(Resource engine) { if (get(engine.getId()) != null) { throw new RuntimeException("The server id '" + engine.getId() + "' already exists."); } + engine.setLastUpdatedToNow(); try { writeMyselfFile(engine); } catch (IOException e) { diff --git a/src/main/java/org/searsia/web/Search.java b/src/main/java/org/searsia/web/Search.java index d7e2e23..8a00c17 100644 --- a/src/main/java/org/searsia/web/Search.java +++ b/src/main/java/org/searsia/web/Search.java @@ -72,7 +72,7 @@ public Response query(@PathParam("resourceid") String resourceid, @QueryParam("q mother = engines.getMother(); if (!resourceid.equals(me.getId())) { engine = engines.get(resourceid); - if (engine == null || engine.getLastUpdatedSecondsAgo() == null || engine.getLastUpdatedSecondsAgo() > 3600) { // unknown or old? ask your mother + if (engine == null || engine.getLastUpdatedSecondsAgo() > 3600) { // unknown or old? ask your mother if (mother != null) { try { engine = mother.searchResource(resourceid); From 57568f1c14f613269ff1c250346670125bf9873c Mon Sep 17 00:00:00 2001 From: Searsia Date: Mon, 3 Apr 2017 16:42:17 +0200 Subject: [PATCH 11/51] proxy headers --- src/main/java/org/searsia/web/Proxy.java | 34 +++++++++++++++++++----- 1 file changed, 28 insertions(+), 6 deletions(-) diff --git a/src/main/java/org/searsia/web/Proxy.java b/src/main/java/org/searsia/web/Proxy.java index f78146b..f7c26e7 100644 --- a/src/main/java/org/searsia/web/Proxy.java +++ b/src/main/java/org/searsia/web/Proxy.java @@ -10,7 +10,10 @@ import javax.ws.rs.Path; import javax.ws.rs.PathParam; import javax.ws.rs.QueryParam; +import javax.ws.rs.core.Context; +import javax.ws.rs.core.HttpHeaders; import javax.ws.rs.core.Response; +import javax.ws.rs.core.Response.ResponseBuilder; import org.searsia.index.ResourceIndex; @@ -25,10 +28,14 @@ public Proxy(ResourceIndex engines) throws IOException { } @GET - public Response query(@PathParam("resourceid") String resourceid, @QueryParam("url") String url) { + public Response query(@PathParam("resourceid") String resourceid, @QueryParam("url") String url, @Context HttpHeaders headers) { try { if (url != null && (engines.getMyself().getId().equals(resourceid) || engines.get(resourceid) != null)) { - return getWebData(url); + if (headers.getRequestHeader("If-Modified-Since") != null || headers.getRequestHeader("If-None-Match") != null) { + return Response.status(304).build(); // cheating! Maybe really check if it is modified? + } else { + return getWebResponse(url); + } } else { return SearsiaApplication.responseError(404, "Resource not found: " + resourceid); } @@ -37,19 +44,34 @@ public Response query(@PathParam("resourceid") String resourceid, @QueryParam("u } } - private Response getWebData(String urlString) throws IOException { + private Response getWebResponse(String urlString) throws IOException { URL url = new URL(urlString); URLConnection connection = url.openConnection(); connection.setRequestProperty("User-Agent", "Searsia/1.0"); - connection.setRequestProperty("Accept", "*/*"); // TODO If-Modified-Since: + connection.setRequestProperty("Accept", "*/*"); connection.setReadTimeout(4000); connection.setConnectTimeout(4000); HttpURLConnection http = (HttpURLConnection) connection; http.setInstanceFollowRedirects(true); http.setRequestMethod("GET"); http.connect(); - String contentType = http.getHeaderField("Content-Type"); InputStream stream = http.getInputStream(); - return Response.ok(stream).header("Content-Type", contentType).build(); + return responseWithHeaders(http, stream).build(); } + + private ResponseBuilder responseWithHeaders(HttpURLConnection http, InputStream stream) { + ResponseBuilder builder = Response.ok(stream); + String field = http.getHeaderField("Content-Type"); + if (field != null) builder.header("Content-Type", field); + field = http.getHeaderField("Content-Length"); + if (field != null) builder.header("Content-Length", field); + field = http.getHeaderField("Expires"); + if (field != null) builder.header("Expires", field); + field = http.getHeaderField("Cache-Control"); + if (field != null) builder.header("Cache-Control", field); + field = http.getHeaderField("Last-Modified"); + if (field != null) builder.header("Last-Modified", field); + return builder; + } + } From 6ce8296ce07e95bc1795d7af883aa4a1339f3f08 Mon Sep 17 00:00:00 2001 From: Searsia Date: Thu, 6 Apr 2017 14:14:32 +0200 Subject: [PATCH 12/51] efficient resource index --- src/main/java/org/searsia/Main.java | 32 +++++--- .../java/org/searsia/index/ResourceIndex.java | 81 ++++++++++++------- 2 files changed, 73 insertions(+), 40 deletions(-) diff --git a/src/main/java/org/searsia/Main.java b/src/main/java/org/searsia/Main.java index a8adf6e..526e67f 100644 --- a/src/main/java/org/searsia/Main.java +++ b/src/main/java/org/searsia/Main.java @@ -1,5 +1,5 @@ /* - * Copyright 2016 Searsia + * Copyright 2016-2017 Searsia * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -39,7 +39,14 @@ /** - * Searsia Main class + * Searsia Main class does the following actions: + * + * 1. Connect to mother peer + * 2. If it runs in test mode, test the mother, print results and exit. + * 3. Open/create Lucene indexes + * 4. Get the 10 top resources if older than one hour + * 5. Run the web server + * 6. Run the daemon to periodically poll the mother and resources * * Start as: java -jar target/searsiaserver.jar * More info: java -jar target/searsiaserver.jar --help @@ -79,7 +86,7 @@ private static void searsiaDaemon(SearchResultIndex index, ResourceIndex engines result.addQueryResourceRankDate(engine.getId()); } index.offer(result); - LOGGER.info("Sample " + engine.getId() + ": " + result.getQuery()); + LOGGER.info("Sampled " + engine.getId() + ": " + result.getQuery()); } } catch (Exception e) { LOGGER.warn("Sampling " + engine.getId() + " failed: " + e.getMessage()); @@ -98,21 +105,24 @@ private static int getResources(Resource mother, SearchResult result, ResourceIn i += 1; try { engine = mother.searchResource(rid); - } catch (SearchException e) { - LOGGER.warn("Warning: Not found: " + rid + ": " + e.getMessage()); - } - try { - engines.put(engine); - LOGGER.info("Updated resource: " + rid); - } catch(Exception e) { - LOGGER.warn(e.getMessage()); + } catch (Exception e) { + LOGGER.warn("Warning: Update failed: " + e.getMessage()); } + if (engine != null && rid.equals(engine.getId())) { + engines.put(engine); + LOGGER.info("Updated " + rid); + } else { + LOGGER.warn("Warning: Resource not found: " + rid); + } } } if (i > 10) { break; // not more than the first 10 per check } } + if (i > 0) { + engines.flush(); + } return i; } diff --git a/src/main/java/org/searsia/index/ResourceIndex.java b/src/main/java/org/searsia/index/ResourceIndex.java index 1a90249..15e19ef 100644 --- a/src/main/java/org/searsia/index/ResourceIndex.java +++ b/src/main/java/org/searsia/index/ResourceIndex.java @@ -65,6 +65,7 @@ public class ResourceIndex { private Path meFile = null; private Path indexDir = null; private IndexWriter writer = null; + private String lastFlushed = null; /** * Reads resources from index (if they exist) @@ -128,13 +129,15 @@ private void readResourceIndex() throws IOException { JSONObject json = new JSONObject(doc.get("json")); Resource engine = new Resource(json); if (json.has("lastupdated")) { - engine.setLastUpdatedToDateString(json.getString("lastupdated")); + String lastUpdated = json.getString("lastupdated"); + if (this.lastFlushed == null || this.lastFlushed.compareTo(lastUpdated) < 0) { + this.lastFlushed = lastUpdated; + } + engine.setLastUpdatedToDateString(lastUpdated); } this.engines.put(engine.getId(), engine); } - } catch (javax.xml.xpath.XPathExpressionException e) { - throw new IOException(e); - } catch (JSONException e) { + } catch (javax.xml.xpath.XPathExpressionException | JSONException e) { throw new IOException(e); } finally { @@ -168,24 +171,9 @@ public Resource getMyself() { } - private boolean exists(Resource engine) { - for (Resource e: this.engines.values()) - if (e.equals(engine)) - return true; - return false; - } - - - private void updateResourceIndex(String id, Resource engine) throws IOException { - Document doc = new Document(); - if (id != null) { - JSONObject json = engine.toJson(); - json.put("parameters", engine.getJsonPrivateParameters()); // we need to remember those - doc.add(new StringField("id", id, Field.Store.YES)); // unique identifier - doc.add(new StoredField("json", json.toString())); - this.writer.updateDocument(new Term("id", id), doc); - } - this.writer.commit(); + private boolean equalExists(Resource engine) { + Resource old = this.engines.get(engine.getId()); + return (old != null && old.equals(engine)); } @@ -207,13 +195,9 @@ public void put(Resource engine) { if (this.me != null && engine.getId().equals(this.me.getId())) { throw new RuntimeException("Local id conflict: " + engine.getId()); } - if (!exists(engine)) { - engine.setLastUpdatedToNow(); - try { - updateResourceIndex(engine.getId(), engine); - } catch (IOException e) { - LOGGER.warn("Update of resource " + engine.getId() + " failed"); - } + engine.setLastUpdatedToNow(); + if (!equalExists(engine)) { + // TODO: engine.setLastUpdatedToNow(); } this.engines.put(engine.getId(), engine); } @@ -302,6 +286,45 @@ public void dump() { } } + private Document luceneDocument(Resource engine) { + Document doc = new Document(); + String id = engine.getId(); + JSONObject json = engine.toJson(); + json.put("parameters", engine.getJsonPrivateParameters()); // we need to remember those + doc.add(new StringField("id", id, Field.Store.YES)); // unique identifier + doc.add(new StoredField("json", json.toString())); + return doc; + } + + /** + * Flush the index updates to disk + */ + public void flush() { + try { + String lastDate = this.lastFlushed; + for (Resource engine: this.engines.values()) { + String lastUpdated = engine.getLastUpdatedString(); + if (this.lastFlushed == null || this.lastFlushed.compareTo(lastUpdated) < 0) { + if (lastDate == null || lastDate.compareTo(lastUpdated) < 0) { + lastDate = lastUpdated; + } + this.writer.updateDocument(new Term("id", engine.getId()), luceneDocument(engine)); + } + } + if (this.lastFlushed == null || this.lastFlushed.compareTo(lastDate) < 0) { + this.writer.commit(); + this.lastFlushed = lastDate; + LOGGER.info("Flushed resources to disk."); + } + } catch (Exception e) { + LOGGER.warn("Flushing resource index failed."); + } + } + + /** + * Close the index + * @throws IOException + */ public void close() throws IOException { this.writer.close(); this.mother = null; From 3bd1446e4fe5c9af6900ed7e30dec0b970664857 Mon Sep 17 00:00:00 2001 From: Searsia Date: Fri, 7 Apr 2017 11:21:03 +0200 Subject: [PATCH 13/51] simple caching --- src/main/java/org/searsia/SearchResult.java | 11 +++- .../java/org/searsia/engine/Resource.java | 4 +- .../org/searsia/index/SearchResultIndex.java | 52 ++++++++++++++++--- src/main/java/org/searsia/web/Search.java | 44 ++++++++++------ .../searsia/index/TestSearchResultIndex.java | 24 +++++++-- 5 files changed, 106 insertions(+), 29 deletions(-) diff --git a/src/main/java/org/searsia/SearchResult.java b/src/main/java/org/searsia/SearchResult.java index 6ebef60..ab24b17 100644 --- a/src/main/java/org/searsia/SearchResult.java +++ b/src/main/java/org/searsia/SearchResult.java @@ -1,5 +1,5 @@ /* - * Copyright 2016 Searsia + * Copyright 2016-2017 Searsia * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -41,6 +41,7 @@ public class SearchResult { private Resource resource; private String debugOut; private String query; + private String resourceId; private String version; public SearchResult() { @@ -75,6 +76,14 @@ public Resource getResource() { return this.resource; } + public void setResourceId(String resourceId) { + this.resourceId = resourceId; + } + + public String getResourceId() { + return this.resourceId; + } + public String getVersion() { return this.version; } diff --git a/src/main/java/org/searsia/engine/Resource.java b/src/main/java/org/searsia/engine/Resource.java index 88a51c4..a913fa9 100644 --- a/src/main/java/org/searsia/engine/Resource.java +++ b/src/main/java/org/searsia/engine/Resource.java @@ -1,5 +1,5 @@ /* - * Copyright 2016 Searsia + * Copyright 2016-2017 Searsia * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -279,6 +279,7 @@ public SearchResult search(String query, String debug) throws SearchException { result.scoreReranking(query, this.rerank); } result.setQuery(query); + result.setResourceId(this.getId()); return result; } catch (Exception e) { // catch all, also runtime exceptions throw createPrivateSearchException(e); @@ -550,6 +551,7 @@ private String getCompletePage(String urlString, String postString, Map iterator = this.queue.iterator(); + while(iterator.hasNext()) { + SearchResult result = iterator.next(); + if (query.equals(result.getQuery()) && resourceId.equals(result.getResourceId())) { + return result; + } + } + } + return null; + } /** * Flushes the queue with updates to disk * @throws IOException */ public void flush() throws IOException { - while (queue.size() > 0) { - SearchResult result = queue.poll(); + while (this.queue.size() > 0) { + SearchResult result = this.queue.poll(); storeSearchResult(result); - } + } this.hitsWriter.commit(); closeReader(); LOGGER.info("Flushed cache to index."); @@ -194,11 +232,11 @@ public void flush() throws IOException { /** * Checks if the queue is full (its size is larger than 'limit') * If so, it flushes the updates to disk. - * @return true is queue was flushed. + * @return true if queue was flushed. * @throws IOException */ public boolean checkFlush() throws IOException { - boolean full = queue.size() > limit; + boolean full = this.queue.size() > limit; if (full) { flush(); } diff --git a/src/main/java/org/searsia/web/Search.java b/src/main/java/org/searsia/web/Search.java index 8a00c17..940fc8f 100644 --- a/src/main/java/org/searsia/web/Search.java +++ b/src/main/java/org/searsia/web/Search.java @@ -1,5 +1,5 @@ /* - * Copyright 2016 Searsia + * Copyright 2016-2017 Searsia * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -27,7 +27,6 @@ import javax.ws.rs.core.Response; import org.json.JSONObject; - import org.searsia.SearchResult; import org.searsia.index.SearchResultIndex; import org.searsia.index.ResourceIndex; @@ -63,8 +62,7 @@ public Response options() { @GET @Produces(SearchResult.SEARSIA_MIME_ENCODING) public Response query(@PathParam("resourceid") String resourceid, @QueryParam("q") String query) { - LOGGER.info("Query " + resourceid + ": " + query); - + Resource me, engine, mother; SearchResult result; JSONObject json; @@ -90,21 +88,32 @@ public Response query(@PathParam("resourceid") String resourceid, @QueryParam("q engines.put(engine); } if (query != null && query.trim().length() > 0) { - try { - result = engine.search(query); - result.removeResourceRank(); // only trust your mother - json = result.toJson(); // first json for response, so - result.addQueryResourceRankDate(engine.getId()); // response will not have query + resource - index.offer(result); // maybe do this AFTER the http response is sent: https://jersey.java.net/documentation/latest/async.html (11.1.1) - json.put("resource", engine.toJson()); - return SearsiaApplication.responseOk(json); - } catch (Exception e) { - String message = "Resource @" + resourceid + " unavailable: " + e.getMessage(); - LOGGER.warn(message); - return SearsiaApplication.responseError(503, message); - } + result = index.cacheSearch(query, engine.getId()); + if (result != null) { + result.removeResourceRank(); + json = result.toJson(); + json.put("resource", engine.toJson()); + LOGGER.info("Cache " + resourceid + ": " + query); + return SearsiaApplication.responseOk(json); + } else { + try { + result = engine.search(query); + result.removeResourceRank(); // only trust your mother + json = result.toJson(); // first json for response, so + result.addQueryResourceRankDate(engine.getId()); // response will not have query + resource + index.offer(result); // maybe do this AFTER the http response is sent: https://jersey.java.net/documentation/latest/async.html (11.1.1) + json.put("resource", engine.toJson()); + LOGGER.info("Query " + resourceid + ": " + query); + return SearsiaApplication.responseOk(json); + } catch (Exception e) { + String message = "Resource @" + resourceid + " unavailable: " + e.getMessage(); + LOGGER.warn(message); + return SearsiaApplication.responseError(503, message); + } + } } else { json = new JSONObject().put("resource", engine.toJson()); + LOGGER.info("Resource " + resourceid + "."); return SearsiaApplication.responseOk(json); } } else { @@ -134,6 +143,7 @@ public Response query(@PathParam("resourceid") String resourceid, @QueryParam("q } json = result.toJson(); json.put("resource", engines.getMyself().toJson()); + LOGGER.info("Local " + resourceid + ": " + query); return SearsiaApplication.responseOk(json); } } diff --git a/src/test/java/org/searsia/index/TestSearchResultIndex.java b/src/test/java/org/searsia/index/TestSearchResultIndex.java index 1207809..b436f2d 100644 --- a/src/test/java/org/searsia/index/TestSearchResultIndex.java +++ b/src/test/java/org/searsia/index/TestSearchResultIndex.java @@ -48,10 +48,13 @@ private static SearchResult readFile(String fileString) throws IOException { finally { reader.close(); } - JSONArray hits = (new JSONObject(jsonString).getJSONArray("hits")); + JSONObject json = new JSONObject(jsonString); + JSONArray hits = json.getJSONArray("hits"); for(int i = 0; i < hits.length(); i++) { result.addHit(new Hit(hits.getJSONObject(i))); } + JSONObject resource = json.getJSONObject("resource"); + result.setResourceId(resource.getString("id")); return result; } @@ -72,7 +75,7 @@ public void testSearch1() throws Exception { public void testSearch2() throws Exception { SearchResult result = readFile("exampleSearchResult.json"); index.offer(result); - index.flush(); + index.flush(); // add it again String query = "dolf"; result = index.search(query); Assert.assertEquals(query, result.getQuery()); @@ -85,7 +88,7 @@ public void testSearch3() throws Exception { Assert.assertEquals(6, result.getHits().size()); } - @Test + @Test // test hit lookup (not used currently) public void testSearch4() throws Exception { SearchResult result = readFile("exampleSearchResult.json"); Hit hit1 = result.getHits().get(0); @@ -93,6 +96,21 @@ public void testSearch4() throws Exception { Assert.assertEquals(hit1.getTitle(), hit2.getTitle()); } + @Test // test the cache + public void testSearch5() throws Exception { + SearchResult result = readFile("exampleSearchResult.json"); + String query = "information"; + result.setQuery(query); + String resourceId = result.getResourceId(); + index.offer(result); + result = index.cacheSearch(query, resourceId); + Assert.assertEquals(10, result.getHits().size()); + result = index.cacheSearch(query, "nothing"); + Assert.assertTrue(result == null); + result = index.cacheSearch("nope", resourceId); + Assert.assertTrue(result == null); + } + /** * Can also be used from the command line to test an existing index * @param args query From 1e6541bf08da4ce1dd84145c7e2a956efcf32e3c Mon Sep 17 00:00:00 2001 From: Searsia Date: Fri, 7 Apr 2017 17:55:00 +0200 Subject: [PATCH 14/51] last changed and pretty print --- src/main/java/org/searsia/Main.java | 6 +-- .../java/org/searsia/engine/DOMBuilder.java | 1 + .../java/org/searsia/engine/Resource.java | 40 ++++++++++++++----- .../java/org/searsia/index/ResourceIndex.java | 5 ++- 4 files changed, 37 insertions(+), 15 deletions(-) diff --git a/src/main/java/org/searsia/Main.java b/src/main/java/org/searsia/Main.java index 526e67f..848fffc 100644 --- a/src/main/java/org/searsia/Main.java +++ b/src/main/java/org/searsia/Main.java @@ -120,9 +120,7 @@ private static int getResources(Resource mother, SearchResult result, ResourceIn break; // not more than the first 10 per check } } - if (i > 0) { - engines.flush(); - } + engines.flush(); return i; } @@ -198,7 +196,7 @@ private static void testMother(Resource mother, String debugInfo, Boolean isQuie } if (!isQuiet) { if (debugInfo.equals("json")) { - System.out.println(result.toJson()); + System.out.println(result.toJson().toString(2)); } else if (debugInfo.equals("xml") || debugInfo.equals("response")) { String debugOut = result.getDebugOut(); if (debugOut == null) { diff --git a/src/main/java/org/searsia/engine/DOMBuilder.java b/src/main/java/org/searsia/engine/DOMBuilder.java index f4cd18a..7a2a712 100644 --- a/src/main/java/org/searsia/engine/DOMBuilder.java +++ b/src/main/java/org/searsia/engine/DOMBuilder.java @@ -78,6 +78,7 @@ public static String DOM2String(Document document) { transformer = tf.newTransformer(); transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes"); transformer.setOutputProperty(OutputKeys.ENCODING, "UTF-8"); + transformer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount", "2"); transformer.setOutputProperty(OutputKeys.METHOD, "xml"); transformer.setOutputProperty(OutputKeys.INDENT, "yes"); StringWriter writer = new StringWriter(); diff --git a/src/main/java/org/searsia/engine/Resource.java b/src/main/java/org/searsia/engine/Resource.java index a913fa9..c61acfc 100644 --- a/src/main/java/org/searsia/engine/Resource.java +++ b/src/main/java/org/searsia/engine/Resource.java @@ -81,7 +81,7 @@ public class Resource implements Comparable { private String itemXpath = null; private String testQuery = defaultTestQuery; private List extractors = new ArrayList<>(); - private Map headers = new LinkedHashMap<>(); + private Map headers = new LinkedHashMap<>(); private Map privateParameters = new LinkedHashMap<>(); private Float prior = null; private String rerank = null; @@ -90,8 +90,11 @@ public class Resource implements Comparable { // internal data not to be shared private String nextQuery = null; private double allowance = defaultRATE / 2; - private Long lastUsedCheck = new Date().getTime(); // Unix time - private Long lastUpdatedCheck = new Date().getTime(); // Unix time + private Long lastUsed = new Date().getTime(); // Unix time + private Long lastUpdated = new Date().getTime(); // Unix time + private Long lastChanged = null; + private Integer nrOfRequests = 0; + private Integer nrOfSuccess = 0; public Resource(String urlAPITemplate, String id) { @@ -223,12 +226,22 @@ public void setRerank(String rerank) { } public void setLastUpdatedToNow() { - this.lastUpdatedCheck = new Date().getTime(); + this.lastUpdated = new Date().getTime(); } public void setLastUpdatedToDateString(String date) { try { - this.lastUpdatedCheck = dateFormat.parse(date).getTime(); + this.lastUpdated = dateFormat.parse(date).getTime(); + } catch (ParseException e) { } + } + + public void setLastChangedToNow() { + this.lastChanged = new Date().getTime(); + } + + public void setLastChangedToDateString(String date) { + try { + this.lastChanged = dateFormat.parse(date).getTime(); } catch (ParseException e) { } } @@ -478,8 +491,8 @@ private SearchException createPrivateSearchException(Exception e) { */ private boolean rateLimitReached() { Long now = new Date().getTime(); - Long timePassed = now - this.lastUsedCheck; - this.lastUsedCheck = now; + Long timePassed = now - this.lastUsed; + this.lastUsed = now; this.allowance += (((double) timePassed / defaultPER)) * this.rate; if (this.allowance > this.rate) { this.allowance = this.rate; @@ -675,16 +688,20 @@ private Long secondsAgo(Long last) { public String getLastUpdatedString() { - return dateFormat.format(new Date(this.lastUpdatedCheck)); + return dateFormat.format(new Date(this.lastUpdated)); } + public String getLastChangedString() { + return dateFormat.format(new Date(this.lastChanged)); + } + public long getLastUpdatedSecondsAgo() { - return secondsAgo(this.lastUpdatedCheck); + return secondsAgo(this.lastUpdated); } public Long getLastUsedSecondsAgo() { - return secondsAgo(this.lastUsedCheck); + return secondsAgo(this.lastUsed); } @@ -748,6 +765,9 @@ public JSONObject toJson() { engine.put("headers", json); } engine.put("lastupdated", this.getLastUpdatedString()); + if (this.lastChanged != null) { + engine.put("lastchanged", this.getLastChangedString()); + } return engine; } diff --git a/src/main/java/org/searsia/index/ResourceIndex.java b/src/main/java/org/searsia/index/ResourceIndex.java index 15e19ef..9dec10e 100644 --- a/src/main/java/org/searsia/index/ResourceIndex.java +++ b/src/main/java/org/searsia/index/ResourceIndex.java @@ -135,6 +135,9 @@ private void readResourceIndex() throws IOException { } engine.setLastUpdatedToDateString(lastUpdated); } + if (json.has("lastupdated")) { + engine.setLastUpdatedToDateString(json.getString("lastupdated")); + } this.engines.put(engine.getId(), engine); } } catch (javax.xml.xpath.XPathExpressionException | JSONException e) { @@ -197,7 +200,7 @@ public void put(Resource engine) { } engine.setLastUpdatedToNow(); if (!equalExists(engine)) { - // TODO: engine.setLastUpdatedToNow(); + engine.setLastChangedToNow(); } this.engines.put(engine.getId(), engine); } From bf7213faadfd229f176be98967defa74634d3bd2 Mon Sep 17 00:00:00 2001 From: Searsia Date: Sat, 8 Apr 2017 20:27:13 +0200 Subject: [PATCH 15/51] new ranking (work in progress), better random queries, resource updates --- src/main/java/org/searsia/Hit.java | 47 +++++--- src/main/java/org/searsia/Main.java | 12 +- src/main/java/org/searsia/SearchResult.java | 114 +++++++++++++----- .../java/org/searsia/engine/Resource.java | 10 +- .../java/org/searsia/index/ResourceIndex.java | 27 ++--- src/main/java/org/searsia/web/Search.java | 6 +- .../java/org/searsia/SearchResultTest.java | 6 +- 7 files changed, 148 insertions(+), 74 deletions(-) diff --git a/src/main/java/org/searsia/Hit.java b/src/main/java/org/searsia/Hit.java index 71ddb4e..e120674 100644 --- a/src/main/java/org/searsia/Hit.java +++ b/src/main/java/org/searsia/Hit.java @@ -71,10 +71,10 @@ public void setScore(float score) { map.put("score", score); } - public void setRank(int rank) { - map.put("rank", rank); - } - + public void setResourceScore(float score) { + map.put("rscore", score); + } + public void setTitle(String title) { map.put("title", title); } @@ -87,12 +87,24 @@ public void setUrl(String url) { map.put("url", url); } + /** + * This id of will be used the Lucene index. + * So one url may be indexed multiple times, + * once for each resource id (rid). + * @return + */ public String getId() { String result = (String) map.get("url"); + String rid = ""; if (result == null) { result = (String) map.get("title"); + } else { + rid = (String) map.get("rid"); + if (rid == null) { + rid = ""; + } } - return result; + return rid + "@" + result; } public float getScore() { @@ -104,10 +116,15 @@ public float getScore() { } } - public Integer getRank() { - return (Integer) map.get("rank"); - } - + public float getResourceScore() { + Float score = (Float) map.get("rscore"); + if (score == null) { + return 0.0f; + } else { + return score; + } + } + public Object get(String field) { return map.get(field); } @@ -158,13 +175,13 @@ public String toIndexVersion() { // TODO: special treatment for urls, etc. @Override public int compareTo(Hit hit2) { - Float score1 = getScore(); - Float score2 = hit2.getScore(); + Float score1 = getResourceScore(); + Float score2 = hit2.getResourceScore(); if (score1.compareTo(score2) == 0) { - Integer rank1 = getRank(); - Integer rank2 = hit2.getRank(); - if (rank1 != null && rank2 != null) { - return rank2.compareTo(rank1); // yes reversed! + score1 = getScore(); + score2 = hit2.getScore(); + if (score1 != null && score2 != null) { + return score1.compareTo(score2); } else { return 0; } diff --git a/src/main/java/org/searsia/Main.java b/src/main/java/org/searsia/Main.java index 848fffc..764b52d 100644 --- a/src/main/java/org/searsia/Main.java +++ b/src/main/java/org/searsia/Main.java @@ -82,8 +82,8 @@ private static void searsiaDaemon(SearchResultIndex index, ResourceIndex engines } else { engine = engines.getRandom(); result = engine.randomSearch(); - result.removeResourceRank(); // only trust your mother - result.addQueryResourceRankDate(engine.getId()); + result.removeResourceQuery(); // only trust your mother + result.addQueryResourceDate(engine.getId()); } index.offer(result); LOGGER.info("Sampled " + engine.getId() + ": " + result.getQuery()); @@ -165,8 +165,12 @@ private static void fatalError(String message) { System.exit(1); } - // for unique filename - private static String getHashString(String inputString) { + /** + * For a unique filename (public because used in searsiafedweb) + * @param inputString + * @return + */ + public static String getHashString(String inputString) { MessageDigest md; byte[] hash; try { diff --git a/src/main/java/org/searsia/SearchResult.java b/src/main/java/org/searsia/SearchResult.java index ab24b17..79218f1 100644 --- a/src/main/java/org/searsia/SearchResult.java +++ b/src/main/java/org/searsia/SearchResult.java @@ -109,35 +109,31 @@ public String getQuery() { } // TODO: maybe a list of query-resource pairs, if result found by multiple engines for multiple queries. - public void addQueryResourceRankDate(String resourceID) { - int rank = 1; + public void addQueryResourceDate(String resourceID) { String query = getQuery(); for (Hit hit: this.hits) { hit.putIfEmpty("query", query); hit.putIfEmpty("rid", resourceID); // TODO: if unknown rid, then replace! - hit.putIfEmpty("rank", rank++); hit.putIfEmpty("foundBefore", df.format(new Date())); } } - public void removeResourceRank() { + public void removeResourceQuery() { for (Hit hit: this.hits) { hit.remove("rid"); - hit.remove("rank"); + hit.remove("query"); } } - - // TODO: needs a proper implementation, refactoring, and research ;-) - // Scoring follows these rules: - // 1. hits are ordered such that the first hit per rid determines the resource ranking - // 2. if a resource has a exact query match, then these are ranked highest (given rule 1) - // 3. order by score (given rule 1 and rule 2) - // 4. TODO: not more than x (=10?) hits per resource - // 5. stop after 20 resources + + /** + * New resource ranker, adds rscore. + * @param query + * @param engines + */ public void scoreResourceSelection(String query, ResourceIndex engines) { - final float bias = 1.0f; - Map maxScore = new HashMap(); - Map topEngines = engines.topValues(query, 20); + final float boost = 1.0f; + Map maxScore = new HashMap(); + Map topEngines = engines.topValues(query, 10); for (Hit hit: this.hits) { String rid = hit.getString("rid"); if (rid != null) { @@ -145,33 +141,84 @@ public void scoreResourceSelection(String query, ResourceIndex engines) { if (engines.containsKey(rid)) { prior = engines.get(rid).getPrior(); } - float score = hit.getScore() * bias + prior; - Float top = topEngines.get(rid); - if (top != null) { - if (top > score) { - score = top; - } - topEngines.remove(rid); + Float top = topEngines.get(rid); + if (top != null) { + if (top > prior) { + prior = top; + } + topEngines.remove(rid); } + Float score = prior + hit.getScore() * boost; Float max = maxScore.get(rid); if (max == null || max < score) { - maxScore.put(rid, score); - max = score; + max = score; + maxScore.put(rid, max); } hit.setScore(score); - //hit.put("rscore", max); + hit.setResourceScore(max); + } else { + hit.setResourceScore(hit.getScore() * boost); } } for (String rid: topEngines.keySet()) { Hit hit = new Hit(); hit.put("rid", rid); hit.setScore(topEngines.get(rid)); - //hit.put("rscore", topEngines.get(rid)); + hit.put("rscore", topEngines.get(rid)); this.hits.add(hit); } Collections.sort(this.hits, Collections.reverseOrder()); } - + + /** + * TODO: needs a proper implementation, refactoring, and research ;-) + * Scoring follows these rules: + * 1. hits are ordered such that the first hit per rid determines the resource ranking + * 2. if a resource has a exact query match, then these are ranked highest (given rule 1) + * 3. order by score (given rule 1 and rule 2) + * 4. TODO: not more than x (=10?) hits per resource + * 5. stop after 20 resources + * @param query + * @param engines + */ + public void scoreResourceSelectionOld(String query, ResourceIndex engines) { + final float boost = 1.0f; + Map maxScore = new HashMap(); + Map topEngines = engines.topValues(query, 20); + for (Hit hit: this.hits) { + String rid = hit.getString("rid"); + if (rid != null) { + float prior = 0.0f; + if (engines.containsKey(rid)) { + prior = engines.get(rid).getPrior(); + } + float score = hit.getScore() * boost + prior; + Float top = topEngines.get(rid); + if (top != null) { + if (top > score) { + score = top; + } + topEngines.remove(rid); + } + Float max = maxScore.get(rid); + if (max == null || max < score) { + maxScore.put(rid, score); + max = score; + } + hit.setScore(score); + //hit.put("rscore", max); + } + } + for (String rid: topEngines.keySet()) { + Hit hit = new Hit(); + hit.put("rid", rid); + hit.setScore(topEngines.get(rid)); + //hit.put("rscore", topEngines.get(rid)); + this.hits.add(hit); + } + Collections.sort(this.hits, Collections.reverseOrder()); + } + public void scoreReranking(String query, String model) { // TODO use model SearchResult newResult = new SearchResult(); @@ -197,14 +244,21 @@ public void scoreReranking(String query, String model) { // TODO use model } - public String randomTerm() { + public String randomTerm(String notThisOne) { int size = this.hits.size(); if (size > 0) { int nr = random.nextInt(this.hits.size()); String text = this.hits.get(nr).toIndexVersion(); String terms[] = text.split(TOKENIZER); // TODO Lucene tokenizer? nr = random.nextInt(terms.length); - return terms[nr]; + String thisOne = terms[nr]; + int i = nr + 1; + while (notThisOne.equals(thisOne)) { + if (i >= terms.length) { i = 0; } + thisOne = terms[i]; + if (i == nr) { return null; } + } + return thisOne.toLowerCase(); } else { return null; } diff --git a/src/main/java/org/searsia/engine/Resource.java b/src/main/java/org/searsia/engine/Resource.java index c61acfc..fbfbc2a 100644 --- a/src/main/java/org/searsia/engine/Resource.java +++ b/src/main/java/org/searsia/engine/Resource.java @@ -246,13 +246,13 @@ public void setLastChangedToDateString(String date) { } public SearchResult randomSearch() throws SearchException { - if (nextQuery == null) { - nextQuery = this.testQuery; + if (this.nextQuery == null) { + this.nextQuery = this.testQuery; } - String thisQuery = nextQuery; - nextQuery = null; // so, nextQuery will be null in case of a searchexception + String thisQuery = this.nextQuery; + this.nextQuery = null; // so, nextQuery will be null in case of a searchexception SearchResult result = search(thisQuery); - nextQuery = result.randomTerm(); + this.nextQuery = result.randomTerm(thisQuery); return result; } diff --git a/src/main/java/org/searsia/index/ResourceIndex.java b/src/main/java/org/searsia/index/ResourceIndex.java index 9dec10e..2bcecb7 100644 --- a/src/main/java/org/searsia/index/ResourceIndex.java +++ b/src/main/java/org/searsia/index/ResourceIndex.java @@ -137,11 +137,11 @@ private void readResourceIndex() throws IOException { } if (json.has("lastupdated")) { engine.setLastUpdatedToDateString(json.getString("lastupdated")); - } + } this.engines.put(engine.getId(), engine); } } catch (javax.xml.xpath.XPathExpressionException | JSONException e) { - throw new IOException(e); + throw new IOException(e.getMessage()); } finally { reader.close(); @@ -174,12 +174,6 @@ public Resource getMyself() { } - private boolean equalExists(Resource engine) { - Resource old = this.engines.get(engine.getId()); - return (old != null && old.equals(engine)); - } - - public void delete(String id) throws IOException { Resource engine = get(id); if (engine == null) { @@ -198,11 +192,14 @@ public void put(Resource engine) { if (this.me != null && engine.getId().equals(this.me.getId())) { throw new RuntimeException("Local id conflict: " + engine.getId()); } - engine.setLastUpdatedToNow(); - if (!equalExists(engine)) { - engine.setLastChangedToNow(); + Resource old = get(engine.getId()); + if (old != null && old.equals(engine)) { // nothing new + old.setLastUpdatedToNow(); + } else { + engine.setLastUpdatedToNow(); + engine.setLastChangedToNow(); + this.engines.put(engine.getId(), engine); } - this.engines.put(engine.getId(), engine); } public boolean containsKey(String id) { @@ -231,7 +228,7 @@ public Map topValues(String queryString, int max) { int size = 0; float lastScore = -99.0f; for (Resource engine: this.engines.values()) { - float score = engine.score(queryString) + engine.getPrior(); // TODO: add bias ? + float score = engine.score(queryString) + engine.getPrior(); if (size < max || score > lastScore) { if (size < max) size++; int index = size - 1; @@ -293,7 +290,7 @@ private Document luceneDocument(Resource engine) { Document doc = new Document(); String id = engine.getId(); JSONObject json = engine.toJson(); - json.put("parameters", engine.getJsonPrivateParameters()); // we need to remember those + json.put("privateparameters", engine.getJsonPrivateParameters()); // we need to remember those doc.add(new StringField("id", id, Field.Store.YES)); // unique identifier doc.add(new StoredField("json", json.toString())); return doc; @@ -320,7 +317,7 @@ public void flush() { LOGGER.info("Flushed resources to disk."); } } catch (Exception e) { - LOGGER.warn("Flushing resource index failed."); + LOGGER.warn("Flushing resource index failed: " + e); } } diff --git a/src/main/java/org/searsia/web/Search.java b/src/main/java/org/searsia/web/Search.java index 940fc8f..080c27a 100644 --- a/src/main/java/org/searsia/web/Search.java +++ b/src/main/java/org/searsia/web/Search.java @@ -90,7 +90,7 @@ public Response query(@PathParam("resourceid") String resourceid, @QueryParam("q if (query != null && query.trim().length() > 0) { result = index.cacheSearch(query, engine.getId()); if (result != null) { - result.removeResourceRank(); + result.removeResourceQuery(); json = result.toJson(); json.put("resource", engine.toJson()); LOGGER.info("Cache " + resourceid + ": " + query); @@ -98,9 +98,9 @@ public Response query(@PathParam("resourceid") String resourceid, @QueryParam("q } else { try { result = engine.search(query); - result.removeResourceRank(); // only trust your mother + result.removeResourceQuery(); // only trust your mother json = result.toJson(); // first json for response, so - result.addQueryResourceRankDate(engine.getId()); // response will not have query + resource + result.addQueryResourceDate(engine.getId()); // response will not have query + resource index.offer(result); // maybe do this AFTER the http response is sent: https://jersey.java.net/documentation/latest/async.html (11.1.1) json.put("resource", engine.toJson()); LOGGER.info("Query " + resourceid + ": " + query); diff --git a/src/test/java/org/searsia/SearchResultTest.java b/src/test/java/org/searsia/SearchResultTest.java index 507311b..d855c9a 100644 --- a/src/test/java/org/searsia/SearchResultTest.java +++ b/src/test/java/org/searsia/SearchResultTest.java @@ -29,8 +29,10 @@ public void test3() { Hit h = new Hit("The ultimate test", "Oh yeah", "http://searsia.org", "http://searsia.org/images/search.png"); sr.addHit(h); - String term = sr.randomTerm(); - String terms = h.toIndexVersion(); + String notThis = "test"; + String term = sr.randomTerm(notThis); + String terms = h.toIndexVersion().toLowerCase(); + Assert.assertFalse(term.equals(notThis)); Assert.assertTrue(terms.contains(term)); Assert.assertTrue(sr.getHits().size() > 0); sr.scoreReranking("doesnotmatch", "or"); From a323b196438ec7b23f7e8c742d6dcdd8b9a23c83 Mon Sep 17 00:00:00 2001 From: Searsia Date: Mon, 10 Apr 2017 18:52:29 +0200 Subject: [PATCH 16/51] comments, sampling, request count --- src/main/java/org/searsia/Hit.java | 2 +- src/main/java/org/searsia/Main.java | 2 +- src/main/java/org/searsia/SearchResult.java | 10 +++++----- src/main/java/org/searsia/SearsiaOptions.java | 4 ++++ src/main/java/org/searsia/engine/Resource.java | 18 +++++++++++++----- src/main/java/org/searsia/web/Search.java | 14 +++++++------- 6 files changed, 31 insertions(+), 19 deletions(-) diff --git a/src/main/java/org/searsia/Hit.java b/src/main/java/org/searsia/Hit.java index e120674..9820f90 100644 --- a/src/main/java/org/searsia/Hit.java +++ b/src/main/java/org/searsia/Hit.java @@ -163,7 +163,7 @@ public JSONObject toJson() { return json; } - public String toIndexVersion() { // TODO: special treatment for urls, etc. + public String toIndexVersion() { // TODO: special treatment for urls, etc. and StringBuilder String result = ""; for (Object s : map.values()) { if (s instanceof String) { diff --git a/src/main/java/org/searsia/Main.java b/src/main/java/org/searsia/Main.java index 764b52d..7018a38 100644 --- a/src/main/java/org/searsia/Main.java +++ b/src/main/java/org/searsia/Main.java @@ -110,7 +110,7 @@ private static int getResources(Resource mother, SearchResult result, ResourceIn } if (engine != null && rid.equals(engine.getId())) { engines.put(engine); - LOGGER.info("Updated " + rid); + LOGGER.debug("Updated " + rid); } else { LOGGER.warn("Warning: Resource not found: " + rid); } diff --git a/src/main/java/org/searsia/SearchResult.java b/src/main/java/org/searsia/SearchResult.java index 79218f1..e5d3536 100644 --- a/src/main/java/org/searsia/SearchResult.java +++ b/src/main/java/org/searsia/SearchResult.java @@ -231,7 +231,7 @@ public void scoreReranking(String query, String model) { // TODO use model String text = hit.toIndexVersion(); for (String term: text.toLowerCase().split(TOKENIZER)) { if (queryTerms.containsKey(term)) { - score += 1.0f; + score += 1.0f; // TODO: single query term multiple times? } } if (score > 0.001f) { @@ -244,21 +244,21 @@ public void scoreReranking(String query, String model) { // TODO use model } - public String randomTerm(String notThisOne) { + public String randomTerm(String notThisOne) { // TODO: keep track of more previous random queries? int size = this.hits.size(); if (size > 0) { int nr = random.nextInt(this.hits.size()); - String text = this.hits.get(nr).toIndexVersion(); + String text = this.hits.get(nr).toIndexVersion().toLowerCase(); String terms[] = text.split(TOKENIZER); // TODO Lucene tokenizer? nr = random.nextInt(terms.length); String thisOne = terms[nr]; int i = nr + 1; - while (notThisOne.equals(thisOne)) { + while (thisOne.length() < 1 || notThisOne.equals(thisOne)) { if (i >= terms.length) { i = 0; } thisOne = terms[i]; if (i == nr) { return null; } } - return thisOne.toLowerCase(); + return thisOne; } else { return null; } diff --git a/src/main/java/org/searsia/SearsiaOptions.java b/src/main/java/org/searsia/SearsiaOptions.java index 2b1a2df..6bfcf21 100644 --- a/src/main/java/org/searsia/SearsiaOptions.java +++ b/src/main/java/org/searsia/SearsiaOptions.java @@ -162,6 +162,10 @@ private void parse(Options options, String[] args) throws IllegalArgumentExcepti } if (cmd.hasOption("m")) { motherTemplate = cmd.getOptionValue("m"); + if (!motherTemplate.matches("^https?://.*|^file:.*")) { + motherTemplate = "file:" + motherTemplate; + } + } if (cmd.hasOption("h") || cmd.getArgs().length < 0 || !cmd.hasOption("m")) { if (!cmd.hasOption("m")) { diff --git a/src/main/java/org/searsia/engine/Resource.java b/src/main/java/org/searsia/engine/Resource.java index fbfbc2a..bf0b2dc 100644 --- a/src/main/java/org/searsia/engine/Resource.java +++ b/src/main/java/org/searsia/engine/Resource.java @@ -252,7 +252,11 @@ public SearchResult randomSearch() throws SearchException { String thisQuery = this.nextQuery; this.nextQuery = null; // so, nextQuery will be null in case of a searchexception SearchResult result = search(thisQuery); - this.nextQuery = result.randomTerm(thisQuery); + if (this.testQuery.equals(thisQuery) && result.getHits().isEmpty()) { + throw new SearchException("No results for test query: " + thisQuery); + } else { + this.nextQuery = result.randomTerm(thisQuery); + } return result; } @@ -263,6 +267,8 @@ public SearchResult search(String query) throws SearchException { public SearchResult search(String query, String debug) throws SearchException { + this.nrOfRequests += 1; + SearchResult result; try { String url = fillTemplate(this.urlAPITemplate, URLEncoder.encode(query, "UTF-8")); String postString = ""; @@ -282,7 +288,6 @@ public SearchResult search(String query, String debug) throws SearchException { postString = fillTemplate(this.postString, postQuery); } String page = getCompletePage(url, postString, this.headers); - SearchResult result; if (this.mimeType != null && this.mimeType.equals(SearchResult.SEARSIA_MIME_TYPE)) { result = searsiaSearch(page, debug); } else { @@ -291,12 +296,13 @@ public SearchResult search(String query, String debug) throws SearchException { if (this.rerank != null && query != null) { result.scoreReranking(query, this.rerank); } - result.setQuery(query); - result.setResourceId(this.getId()); - return result; } catch (Exception e) { // catch all, also runtime exceptions throw createPrivateSearchException(e); } + this.nrOfSuccess += 1; + result.setQuery(query); + result.setResourceId(this.getId()); + return result; } public SearchResult searchWithoutQuery() throws SearchException { @@ -768,6 +774,8 @@ public JSONObject toJson() { if (this.lastChanged != null) { engine.put("lastchanged", this.getLastChangedString()); } + engine.put("requestsok", this.nrOfSuccess); + engine.put("requests", this.nrOfRequests); return engine; } diff --git a/src/main/java/org/searsia/web/Search.java b/src/main/java/org/searsia/web/Search.java index 080c27a..560be51 100644 --- a/src/main/java/org/searsia/web/Search.java +++ b/src/main/java/org/searsia/web/Search.java @@ -70,21 +70,21 @@ public Response query(@PathParam("resourceid") String resourceid, @QueryParam("q mother = engines.getMother(); if (!resourceid.equals(me.getId())) { engine = engines.get(resourceid); - if (engine == null || engine.getLastUpdatedSecondsAgo() > 3600) { // unknown or old? ask your mother - if (mother != null) { + if (engine == null || engine.getLastUpdatedSecondsAgo() > 7200) { // unknown or really old? ask your mother + if (mother != null) { // TODO: option for 7200 and similar value (3600) in Main try { engine = mother.searchResource(resourceid); } catch (SearchException e) { - String message = "Resource not found: @" + resourceid; + String message = "Resource not found: " + resourceid; LOGGER.warn(message); return SearsiaApplication.responseError(404, message); } } if (engine == null) { - String message = "Unknown resource identifier: @" + resourceid; + String message = "Unknown resource identifier: " + resourceid; LOGGER.warn(message); return SearsiaApplication.responseError(404, message); - } + } engines.put(engine); } if (query != null && query.trim().length() > 0) { @@ -106,7 +106,7 @@ public Response query(@PathParam("resourceid") String resourceid, @QueryParam("q LOGGER.info("Query " + resourceid + ": " + query); return SearsiaApplication.responseOk(json); } catch (Exception e) { - String message = "Resource @" + resourceid + " unavailable: " + e.getMessage(); + String message = "Resource " + resourceid + " unavailable: " + e.getMessage(); LOGGER.warn(message); return SearsiaApplication.responseError(503, message); } @@ -132,7 +132,7 @@ public Response query(@PathParam("resourceid") String resourceid, @QueryParam("q } catch (SearchException e) { LOGGER.warn("Mother not available"); } catch (Exception e) { - LOGGER.warn(e.getMessage()); + LOGGER.warn(e); } } else { // own results? Do resource ranking. result.scoreResourceSelection(query, engines); From cb2619aef038dac56e1a3ffe89a8a82c8f6fa55d Mon Sep 17 00:00:00 2001 From: Searsia Date: Wed, 12 Apr 2017 12:32:10 +0200 Subject: [PATCH 17/51] changes url scheme --- src/main/java/org/searsia/Hit.java | 31 +++++++--- src/main/java/org/searsia/Main.java | 61 +++++++++++-------- src/main/java/org/searsia/SearchResult.java | 25 +++++++- src/main/java/org/searsia/SearsiaOptions.java | 2 +- .../java/org/searsia/engine/Resource.java | 17 +++--- src/main/java/org/searsia/web/Search.java | 12 ++-- src/test/java/org/searsia/MainTest.java | 2 +- .../java/org/searsia/SearchResultTest.java | 22 ++++--- .../java/org/searsia/engine/ResourceTest.java | 18 +++--- src/test/java/org/searsia/web/SearchTest.java | 14 ++--- 10 files changed, 127 insertions(+), 77 deletions(-) diff --git a/src/main/java/org/searsia/Hit.java b/src/main/java/org/searsia/Hit.java index 9820f90..b696f62 100644 --- a/src/main/java/org/searsia/Hit.java +++ b/src/main/java/org/searsia/Hit.java @@ -141,6 +141,10 @@ public String getTitle() { return (String) map.get("title"); } + public String getRid() { + return (String) map.get("rid"); + } + @Override public String toString() { return map.entrySet().toString(); @@ -175,18 +179,25 @@ public String toIndexVersion() { // TODO: special treatment for urls, etc. and S @Override public int compareTo(Hit hit2) { - Float score1 = getResourceScore(); + Float score1 = getResourceScore(); // order by best resources Float score2 = hit2.getResourceScore(); - if (score1.compareTo(score2) == 0) { - score1 = getScore(); - score2 = hit2.getScore(); - if (score1 != null && score2 != null) { - return score1.compareTo(score2); - } else { - return 0; - } + int compare = score1.compareTo(score2); + if (compare != 0) { + return compare; } else { - return score1.compareTo(score2); + String rid1 = getRid(); // if two resources the same score + String rid2 = hit2.getRid(); + if (rid1 != null && rid2 != null && rid1.compareTo(rid2) != 0) { + return compare = rid1.compareTo(rid2); + } else { + score1 = getScore(); + score2 = hit2.getScore(); + if (score1 != null && score2 != null) { + return score1.compareTo(score2); + } else { + return 0; + } + } } } diff --git a/src/main/java/org/searsia/Main.java b/src/main/java/org/searsia/Main.java index 7018a38..f012c30 100644 --- a/src/main/java/org/searsia/Main.java +++ b/src/main/java/org/searsia/Main.java @@ -128,32 +128,41 @@ private static boolean sameTemplates(String uri1, String uri2, String myId) { if (uri1 == null) { return (uri2 == null); } else { - return uriNormalize(uri1, myId).equals(uriNormalize(uri2, myId)); + uri1 = uri1.replaceAll("\\?.*$", ""); + uri2 = uri2.replaceAll("\\?.*$", ""); + return uri1.equals(uri2); } } - private static String uriNormalize(String uri, String myId) { + private static String removeFileNameUri(String uri) { if (uri != null) { - uri = uri.replaceAll("\\?.*$", ""); - uri = uri.replaceAll("\\/?search\\/?", ""); - if (uri.endsWith(myId)) { - uri = uri.replace(myId, ""); - } + uri = uri.replaceAll("\\/[^\\/]+$", "/"); } return uri; } + + private static String lastDir(String uri) { + if (uri.contains("/")) { + uri = uri.replaceAll("\\/[^\\/]*$", ""); + uri = uri.replaceAll("^.+\\/", ""); + return uri + "/"; + } else { + return ""; + } + } - private static String uriToTemplate(String uri, String myId) { - if (!(uri == null) && !(uri.contains("{q"))) { - if (!uri.endsWith("/")) { - uri += "/"; - } - uri += myId + "/search?q={q}"; - } + private static String normalizedUriToTemplate(String uri, String rid) { + if (uri != null) { + if (uri.endsWith("/") ) { + uri += rid + ".json?q={q}"; + } else if (!uri.contains("{q")) { // check for tests on searsia.org + uri += "?q={q}"; + } + + } return uri; } - private static void printMessage(String message, Boolean isQuiet) { if (!isQuiet) { System.err.println(message); @@ -215,7 +224,7 @@ private static void testMother(Resource mother, String debugInfo, Boolean isQuie fatalError("Test failed: No results for test query."); } else { if (result.getHits().size() < 10) { - printMessage("Warning: less than 10 results; see \"testquery\" or \"rerank\".", isQuiet); + printMessage("Warning: less than 10 results for query: " + result.getQuery() + "; see \"testquery\" or \"rerank\".", isQuiet); } printMessage("Test succeeded.", isQuiet); } @@ -260,12 +269,10 @@ public static void main(String[] args) { System.exit(0); } printMessage("Searsia server " + SearsiaApplication.VERSION, options.isQuiet()); + - - - - // Connect to the mother engine and gather information from the mother. - Resource myself = null; + // Connect to the mother engine and gather information from the mother. + Resource myself = null; Resource mother = null; Resource connect = new Resource(options.getMotherTemplate(), null); String version = null; @@ -280,8 +287,8 @@ public static void main(String[] args) { if (mother == null) { fatalError("Initialization failed: JSONObject[\"resource\"] not found."); } - if (!options.getMotherTemplate().contains(mother.getId())) { - fatalError("API Template (" + options.getMotherTemplate() + ") does not contain the id (" + mother.getId() +")"); + if (!options.getMotherTemplate().matches(".*" + mother.getId() + "[^/]*$")) { + fatalError("API Template (" + options.getMotherTemplate() + "): file name must contain id (" + mother.getId() +")"); } if (version != null && !version.startsWith("v1")) { fatalError("Wrong major Searsia version " + version + ": Must be v1.0.0 or higher."); @@ -297,10 +304,10 @@ public static void main(String[] args) { // If test is set, test the mother if (options.getTestOutput() != null) { - printMessage("Testing: " + mother.getId(), options.isQuiet()); + printMessage("Testing: " + mother.getName(), options.isQuiet()); testMother(mother, options.getTestOutput(), options.isQuiet()); } else { - printMessage("Starting: " + myself.getId(), options.isQuiet()); + printMessage("Starting: " + myself.getName(), options.isQuiet()); } @@ -322,7 +329,7 @@ public static void main(String[] args) { // Start the web server - String myURI = uriNormalize(options.getMyURI(), myself.getId()); + String myURI = removeFileNameUri(options.getMyURI()) + lastDir(options.getMotherTemplate()); try { SearsiaApplication app = new SearsiaApplication(index, engines); @@ -334,7 +341,7 @@ public static void main(String[] args) { // Start the update daemon if not testing if (options.getTestOutput() == null) { - printMessage("API end point: " + uriToTemplate(myURI, myself.getId()), options.isQuiet()); + printMessage("API end point: " + normalizedUriToTemplate(myURI, myself.getId()), options.isQuiet()); printMessage("Use Ctrl+c to stop.", options.isQuiet()); try { searsiaDaemon(index, engines, options.getPollInterval()); diff --git a/src/main/java/org/searsia/SearchResult.java b/src/main/java/org/searsia/SearchResult.java index e5d3536..40f5bcc 100644 --- a/src/main/java/org/searsia/SearchResult.java +++ b/src/main/java/org/searsia/SearchResult.java @@ -125,12 +125,21 @@ public void removeResourceQuery() { } } + private void addMonitoringInfo(Hit hit, Resource resource) { + String value = resource.getLastUpdatedString(); + if (value != null) { hit.put("lastupdated", value); } + value = resource.getLastChangedString(); + if (value != null) { hit.put("lastchanged", value); } + hit.put("requestsok", resource.getNrOfSuccess()); + hit.put("requests", resource.getNrOfRequests()); + } + /** * New resource ranker, adds rscore. * @param query * @param engines */ - public void scoreResourceSelection(String query, ResourceIndex engines) { + public void scoreResourceSelection(String query, ResourceIndex engines, boolean extraInfo) { final float boost = 1.0f; Map maxScore = new HashMap(); Map topEngines = engines.topValues(query, 10); @@ -156,6 +165,9 @@ public void scoreResourceSelection(String query, ResourceIndex engines) { } hit.setScore(score); hit.setResourceScore(max); + if (extraInfo) { + addMonitoringInfo(hit, engines.get(rid)); + } } else { hit.setResourceScore(hit.getScore() * boost); } @@ -164,12 +176,21 @@ public void scoreResourceSelection(String query, ResourceIndex engines) { Hit hit = new Hit(); hit.put("rid", rid); hit.setScore(topEngines.get(rid)); - hit.put("rscore", topEngines.get(rid)); + hit.setResourceScore(topEngines.get(rid)); this.hits.add(hit); } Collections.sort(this.hits, Collections.reverseOrder()); } + /** + * New Top resources. + * @param query + * @param engines + */ + public void scoreResourceSelection(String query, ResourceIndex engines) { + scoreResourceSelection(query, engines, false); + } + /** * TODO: needs a proper implementation, refactoring, and research ;-) * Scoring follows these rules: diff --git a/src/main/java/org/searsia/SearsiaOptions.java b/src/main/java/org/searsia/SearsiaOptions.java index 6bfcf21..238d361 100644 --- a/src/main/java/org/searsia/SearsiaOptions.java +++ b/src/main/java/org/searsia/SearsiaOptions.java @@ -163,7 +163,7 @@ private void parse(Options options, String[] args) throws IllegalArgumentExcepti if (cmd.hasOption("m")) { motherTemplate = cmd.getOptionValue("m"); if (!motherTemplate.matches("^https?://.*|^file:.*")) { - motherTemplate = "file:" + motherTemplate; + motherTemplate = "file:" + motherTemplate.replace("\\", "/"); // TODO C:\file on Windows? } } diff --git a/src/main/java/org/searsia/engine/Resource.java b/src/main/java/org/searsia/engine/Resource.java index bf0b2dc..dfe172d 100644 --- a/src/main/java/org/searsia/engine/Resource.java +++ b/src/main/java/org/searsia/engine/Resource.java @@ -93,8 +93,8 @@ public class Resource implements Comparable { private Long lastUsed = new Date().getTime(); // Unix time private Long lastUpdated = new Date().getTime(); // Unix time private Long lastChanged = null; - private Integer nrOfRequests = 0; - private Integer nrOfSuccess = 0; + private int nrOfRequests = 0; + private int nrOfSuccess = 0; public Resource(String urlAPITemplate, String id) { @@ -678,6 +678,13 @@ public float getPrior() { } } + public int getNrOfRequests() { + return this.nrOfRequests; + } + + public int getNrOfSuccess() { + return this.nrOfSuccess; + } private Long secondsAgo(Long last) { if (last == null) { @@ -770,12 +777,6 @@ public JSONObject toJson() { } engine.put("headers", json); } - engine.put("lastupdated", this.getLastUpdatedString()); - if (this.lastChanged != null) { - engine.put("lastchanged", this.getLastChangedString()); - } - engine.put("requestsok", this.nrOfSuccess); - engine.put("requests", this.nrOfRequests); return engine; } diff --git a/src/main/java/org/searsia/web/Search.java b/src/main/java/org/searsia/web/Search.java index 560be51..13bfe58 100644 --- a/src/main/java/org/searsia/web/Search.java +++ b/src/main/java/org/searsia/web/Search.java @@ -38,7 +38,7 @@ * * @author Dolf Trieschnigg and Djoerd Hiemstra */ -@Path("{resourceid}/search") +@Path("{resourceid}") public class Search { private final static org.apache.log4j.Logger LOGGER = org.apache.log4j.Logger.getLogger(Search.class); @@ -62,7 +62,10 @@ public Response options() { @GET @Produces(SearchResult.SEARSIA_MIME_ENCODING) public Response query(@PathParam("resourceid") String resourceid, @QueryParam("q") String query) { - + if (!resourceid.endsWith(".json")) { + return SearsiaApplication.responseError(404, "Not found: " + resourceid); + } + resourceid = resourceid.replaceAll("\\.json$", ""); Resource me, engine, mother; SearchResult result; JSONObject json; @@ -137,9 +140,10 @@ public Response query(@PathParam("resourceid") String resourceid, @QueryParam("q } else { // own results? Do resource ranking. result.scoreResourceSelection(query, engines); } - } else { // no query? Return empty results + } else { // no query? Return empty results with extra info + boolean extraInfo = true; result = new SearchResult(); - result.scoreResourceSelection(query, engines); + result.scoreResourceSelection(query, engines, extraInfo); } json = result.toJson(); json.put("resource", engines.getMyself().toJson()); diff --git a/src/test/java/org/searsia/MainTest.java b/src/test/java/org/searsia/MainTest.java index 4d1aeeb..c8b1a7b 100644 --- a/src/test/java/org/searsia/MainTest.java +++ b/src/test/java/org/searsia/MainTest.java @@ -10,7 +10,7 @@ public class MainTest { @Test public void test() { String[] args = {"--path=target/index-test/", - "--mother=http://searsia.org/searsia/v1-wiki-{q}.json", + "--mother=http://searsia.org/searsia/wiki/index.json", "--log=4", "--test=json", "--quiet"}; Main.main(args); Assert.assertTrue(true); // happy if we get here! diff --git a/src/test/java/org/searsia/SearchResultTest.java b/src/test/java/org/searsia/SearchResultTest.java index d855c9a..55d1cca 100644 --- a/src/test/java/org/searsia/SearchResultTest.java +++ b/src/test/java/org/searsia/SearchResultTest.java @@ -1,10 +1,14 @@ package org.searsia; +import java.io.IOException; + +import javax.ws.rs.core.Response; + import org.junit.Assert; import org.junit.Test; - import org.searsia.Hit; import org.searsia.SearchResult; +import org.searsia.web.Search; public class SearchResultTest { @@ -29,15 +33,19 @@ public void test3() { Hit h = new Hit("The ultimate test", "Oh yeah", "http://searsia.org", "http://searsia.org/images/search.png"); sr.addHit(h); + h = new Hit("Another test", "yeah", "http://searsia.org/test.html", + "http://searsia.org/images/search.png"); + sr.addHit(h); String notThis = "test"; - String term = sr.randomTerm(notThis); + String term = sr.randomTerm(notThis); + Assert.assertFalse("Same random term", term.equals(notThis)); String terms = h.toIndexVersion().toLowerCase(); - Assert.assertFalse(term.equals(notThis)); - Assert.assertTrue(terms.contains(term)); - Assert.assertTrue(sr.getHits().size() > 0); + Assert.assertTrue("Index contains term", terms.contains(term)); + Assert.assertEquals("Total nr of hits", sr.getHits().size(), 2); + sr.scoreReranking("test", "or"); + Assert.assertEquals("Nr of hits after reranking", sr.getHits().size(), 2); sr.scoreReranking("doesnotmatch", "or"); - Assert.assertTrue(sr.getHits().size() == 0); + Assert.assertEquals("Query matches zero results", sr.getHits().size(), 0); } - } diff --git a/src/test/java/org/searsia/engine/ResourceTest.java b/src/test/java/org/searsia/engine/ResourceTest.java index ce7215e..c373732 100644 --- a/src/test/java/org/searsia/engine/ResourceTest.java +++ b/src/test/java/org/searsia/engine/ResourceTest.java @@ -50,21 +50,21 @@ private Resource postSearch() throws XPathExpressionException { } private Resource searsiaMimeOnlySearch() throws XPathExpressionException { - return new Resource("http://searsia.org/searsia/v1-wikididyoumean-{q?}.json", "randomid"); + return new Resource("http://searsia.org/searsia/wiki/wikididyoumean{q?}.json", "randomid"); } private Resource searsiaSearch() throws XPathExpressionException { - return new Resource("http://searsia.org/searsia/v1-wiki-{q?}.json", "wiki"); + return new Resource("http://searsia.org/searsia/wiki/index{q}.json", "index"); } private Resource xmlSearch() throws XPathExpressionException, SearchException { - Resource wiki = new Resource("http://searsia.org/searsia/v1-wiki-{q?}.json", "wiki"); - Resource wikifull = wiki.searchResource("wikifull"); + Resource wiki = new Resource("http://searsia.org/searsia/wiki/index{q?}.json", "index"); + Resource wikifull = wiki.searchResource("wikifull1"); return wikifull; } private Resource jsonSearch() throws XPathExpressionException { - Resource wiki = new Resource("http://searsia.org/searsia/v1-wikifull-{q?}.json", "wikifull"); + Resource wiki = new Resource("http://searsia.org/searsia/wiki/wikifull1{q?}.json", "wikifull1"); wiki.setMimeType("application/json"); wiki.setItemXpath("//hits"); wiki.addExtractor( @@ -77,7 +77,7 @@ private Resource jsonSearch() throws XPathExpressionException { } private Resource javascriptSearch() throws XPathExpressionException { - Resource wikifull = new Resource("http://searsia.org/searsia/v1-wikifull-{q}.js", "wikifull"); + Resource wikifull = new Resource("http://searsia.org/searsia/wiki/wikifull1{q}.js", "wikifull1"); wikifull.setMimeType("application/x-javascript"); wikifull.setItemXpath("//hits"); wikifull.addExtractor( @@ -181,7 +181,7 @@ public void testSearchSearsiaEmpty() throws XPathExpressionException, SearchExce @Test public void testSearchResource() throws XPathExpressionException, SearchException { Resource se = searsiaSearch(); - Resource engine = se.searchResource("wikifull"); + Resource engine = se.searchResource("wikifull1"); Assert.assertTrue(engine != null); } @@ -190,7 +190,7 @@ public void testSearchNoResource1() throws XPathExpressionException, SearchExcep Resource se = htmlSearch(); Boolean exception = false; try { - se.searchResource("wikifull"); + se.searchResource("wikifull1"); } catch (SearchException e) { exception = true; } @@ -202,7 +202,7 @@ public void testSearchNoResource2() throws XPathExpressionException, SearchExcep Resource se = searsiaMimeOnlySearch(); Boolean exception = false; try { - se.searchResource("wikifull"); + se.searchResource("wikifull1"); } catch (SearchException e) { exception = true; } diff --git a/src/test/java/org/searsia/web/SearchTest.java b/src/test/java/org/searsia/web/SearchTest.java index 8d00907..7644443 100644 --- a/src/test/java/org/searsia/web/SearchTest.java +++ b/src/test/java/org/searsia/web/SearchTest.java @@ -28,7 +28,7 @@ public class SearchTest { private static Resource wiki() { - return new Resource("http://searsia.org/searsia/v1-wiki-{q}.json", "wiki"); + return new Resource("http://searsia.org/searsia/wiki/wiki{q}.json", "wiki"); } private static Resource wrong() { @@ -59,7 +59,7 @@ public static void lastThing() throws IOException { @Test // returns 'my' resource description public void test() throws IOException { Search search = new Search(index, engines); - Response response = search.query("wiki", ""); + Response response = search.query("wiki.json", ""); int status = response.getStatus(); String entity = (String) response.getEntity(); JSONObject json = new JSONObject(entity); @@ -71,7 +71,7 @@ public void test() throws IOException { @Test // returns local search results for 'searsia' public void testQuery() throws IOException { Search search = new Search(index, engines); - Response response = search.query("wiki", "searsia search for noobs"); + Response response = search.query("wiki.json", "searsia search for noobs"); int status = response.getStatus(); String entity = (String) response.getEntity(); JSONObject json = new JSONObject(entity); @@ -92,7 +92,7 @@ public void testQuery() throws IOException { @Test // returns local resource 'wrong' public void testResource() throws IOException { Search search = new Search(index, engines); - Response response = search.query("wrong", ""); + Response response = search.query("wrong.json", ""); int status = response.getStatus(); String entity = (String) response.getEntity(); JSONObject json = new JSONObject(entity); @@ -104,7 +104,7 @@ public void testResource() throws IOException { @Test // returns resource 'wikididyoumean' (from mother) public void testResourceUnknown() throws IOException { Search search = new Search(index, engines); - Response response = search.query("wikididyoumean", ""); + Response response = search.query("wikididyoumean.json", ""); int status = response.getStatus(); String entity = (String) response.getEntity(); JSONObject json = new JSONObject(entity); @@ -116,11 +116,9 @@ public void testResourceUnknown() throws IOException { @Test // returns results for the engine 'wrong' (which does not exist) public void testError() throws IOException { Search search = new Search(index, engines); - Response response = search.query("wrong", "testquery"); + Response response = search.query("wrong.json", "testquery"); int status = response.getStatus(); Assert.assertEquals(503, status); } - - } From 089364ae4ad4429799d7c00435320ca209ea79ef Mon Sep 17 00:00:00 2001 From: Searsia Date: Wed, 19 Apr 2017 16:07:43 +0200 Subject: [PATCH 18/51] improved tests --- .../java/org/searsia/engine/Resource.java | 120 ++++++++++++------ .../java/org/searsia/engine/ResourceTest.java | 70 ++++------ .../org/searsia/index/TestResourceIndex.java | 56 ++++---- .../searsia/index/TestSearchResultIndex.java | 3 +- 4 files changed, 139 insertions(+), 110 deletions(-) diff --git a/src/main/java/org/searsia/engine/Resource.java b/src/main/java/org/searsia/engine/Resource.java index dfe172d..5f22273 100644 --- a/src/main/java/org/searsia/engine/Resource.java +++ b/src/main/java/org/searsia/engine/Resource.java @@ -96,7 +96,6 @@ public class Resource implements Comparable { private int nrOfRequests = 0; private int nrOfSuccess = 0; - public Resource(String urlAPITemplate, String id) { this.urlAPITemplate = urlAPITemplate; this.id = id; @@ -245,6 +244,17 @@ public void setLastChangedToDateString(String date) { } catch (ParseException e) { } } + + public Resource updateFromAPI() throws SearchException { + SearchResult result = searchWithoutQuery(); + if (result == null) { throw new SearchException("No results."); } + Resource resource = result.getResource(); + if (resource == null) { throw new SearchException("Object \"resource\" not found."); } + updateWith(resource); + return this; + } + + public SearchResult randomSearch() throws SearchException { if (this.nextQuery == null) { this.nextQuery = this.testQuery; @@ -744,42 +754,77 @@ public Resource deepcopy() { throw new RuntimeException(e); } } + + public void updateWith(Resource e2) { + if (e2.id != null) this.id = e2.id; + if (e2.name != null) this.name = e2.name; + if (e2.urlUserTemplate != null) this.urlUserTemplate = e2.urlUserTemplate; + if (e2.favicon != null) this.favicon = e2.favicon; + if (e2.banner != null) this.banner = e2.banner; + if (e2.urlAPITemplate != null) this.urlAPITemplate = e2.urlAPITemplate; + if (e2.urlSuggestTemplate != null) this.urlSuggestTemplate = e2.urlSuggestTemplate; + if (e2.mimeType != null) this.mimeType = e2.mimeType; + if (e2.rerank != null) this.rerank = e2.rerank; + if (e2.postString != null) this.postString = e2.postString; + if (e2.postQueryEncode != null) this.postQueryEncode = e2.postQueryEncode; + if (e2.testQuery != null) this.testQuery = e2.testQuery; + if (e2.prior != null) this.prior = e2.prior; + if (e2.rate != defaultRATE) this.rate = e2.rate; + if (e2.itemXpath != null) this.itemXpath = e2.itemXpath; + if (e2.extractors != null) this.extractors = e2.extractors; + if (e2.headers != null) this.headers = e2.headers; + if (e2.privateParameters != null) this.privateParameters = e2.privateParameters; + } + + + public JSONObject toJson() { + return toJsonEngine(); + } - public JSONObject toJson() { - JSONObject engine = new JSONObject(); - if (id != null) engine.put("id", id); - if (name != null) engine.put("name", name); - if (urlUserTemplate != null) engine.put("urltemplate", urlUserTemplate); - if (favicon != null) engine.put("favicon", favicon); - if (banner != null) engine.put("banner", banner); - if (urlAPITemplate != null) engine.put("apitemplate", urlAPITemplate); - if (urlSuggestTemplate != null) engine.put("suggesttemplate", urlSuggestTemplate); - if (mimeType != null) engine.put("mimetype", mimeType); - if (rerank != null) engine.put("rerank", rerank); - if (postString != null) engine.put("post", postString); - if (postQueryEncode != null) engine.put("postencode", postQueryEncode); - if (testQuery != null) engine.put("testquery", testQuery); - if (prior != null) engine.put("prior", prior); - if (rate != defaultRATE) engine.put("maxqueriesperday", rate); - if (itemXpath != null) engine.put("itempath", itemXpath); - if (extractors != null && extractors.size() > 0) { - JSONObject json = new JSONObject(); - for (TextExtractor e: extractors) { - json.put(e.getField(), e.getPath()); - } - engine.put("extractors", json); - } - if (headers != null && headers.size() > 0) { - JSONObject json = new JSONObject(); - for (String header: headers.keySet()) { - json.put(header, headers.get(header)); - } - engine.put("headers", json); - } - return engine; - } + public JSONObject toJsonEngine() { + JSONObject engine = new JSONObject(); + if (id != null) engine.put("id", id); + if (name != null) engine.put("name", name); + if (urlUserTemplate != null) engine.put("urltemplate", urlUserTemplate); + if (favicon != null) engine.put("favicon", favicon); + if (banner != null) engine.put("banner", banner); + if (urlAPITemplate != null) engine.put("apitemplate", urlAPITemplate); + if (urlSuggestTemplate != null) engine.put("suggesttemplate", urlSuggestTemplate); + if (mimeType != null) engine.put("mimetype", mimeType); + if (rerank != null) engine.put("rerank", rerank); + if (postString != null) engine.put("post", postString); + if (postQueryEncode != null) engine.put("postencode", postQueryEncode); + if (testQuery != null) engine.put("testquery", testQuery); + if (prior != null) engine.put("prior", prior); + if (rate != defaultRATE) engine.put("maxqueriesperday", rate); + if (itemXpath != null) engine.put("itempath", itemXpath); + if (extractors != null && extractors.size() > 0) { + JSONObject json = new JSONObject(); + for (TextExtractor e: extractors) { + json.put(e.getField(), e.getPath()); + } + engine.put("extractors", json); + } + if (headers != null && headers.size() > 0) { + JSONObject json = new JSONObject(); + for (String header: headers.keySet()) { + json.put(header, headers.get(header)); + } + engine.put("headers", json); + } + return engine; + } + + + @Override + public int compareTo(Resource e2) { + Float score1 = getPrior(); + Float score2 = e2.getPrior(); + return score1.compareTo(score2); + } + @Override public boolean equals(Object o) { // TODO: AARGH, can't this be done simpler? if (o == null) return false; @@ -804,14 +849,7 @@ public boolean equals(Object o) { // TODO: AARGH, can't this be done simpler? return true; } - @Override - public int compareTo(Resource e2) { - Float score1 = getPrior(); - Float score2 = e2.getPrior(); - return score1.compareTo(score2); - } - private boolean listEquals(List a, List b) { if (a == null && b == null) return true; diff --git a/src/test/java/org/searsia/engine/ResourceTest.java b/src/test/java/org/searsia/engine/ResourceTest.java index c373732..a47d121 100644 --- a/src/test/java/org/searsia/engine/ResourceTest.java +++ b/src/test/java/org/searsia/engine/ResourceTest.java @@ -17,38 +17,6 @@ public class ResourceTest { private static final String SECRET_API_KEY = "a7235cdsf43d3a2dfgeda"; - private Resource htmlSearch() throws XPathExpressionException { - Resource hiemstra = new Resource("http://wwwhome.cs.utwente.nl/~hiemstra/?s={q}&api={apikey}&p={p?}","hiemstra"); - hiemstra.setUrlUserTemplate(hiemstra.getAPITemplate()); - hiemstra.addPrivateParameter("apikey", SECRET_API_KEY); - hiemstra.addHeader("User-Agent", "Test/1.0"); - hiemstra.setPrior(0.3f); - hiemstra.setRate(133); - hiemstra.setMimeType("text/html"); - hiemstra.setFavicon("http://wwwhome.cs.utwente.nl/~hiemstra/images/ut.ico"); - hiemstra.setItemXpath("//div[@class='post']"); - hiemstra.addExtractor( - new TextExtractor("title", "./h3"), - new TextExtractor("description", "./h3/following-sibling::text()"), - new TextExtractor("url", "./h3/a/@href") - ); - return hiemstra; - } - - private Resource postSearch() throws XPathExpressionException { - Resource hiemstra = new Resource("http://wwwhome.cs.utwente.nl/~hiemstra/","hiemstrapost"); - hiemstra.setPostString("os={q}"); - hiemstra.setPostQueryEncode("application/x-www-form-urlencoded"); - hiemstra.setMimeType("application/xml"); - hiemstra.setItemXpath("//item"); - hiemstra.addExtractor( - new TextExtractor("title", "./title"), - new TextExtractor("description", "./description"), - new TextExtractor("url", "./link") - ); - return hiemstra; - } - private Resource searsiaMimeOnlySearch() throws XPathExpressionException { return new Resource("http://searsia.org/searsia/wiki/wikididyoumean{q?}.json", "randomid"); } @@ -88,6 +56,15 @@ private Resource javascriptSearch() throws XPathExpressionException { return wikifull; } + @Test + public void testje() throws XPathExpressionException, SearchException { + System.out.println(searsiaSearch().toJson()); + System.out.println(searsiaMimeOnlySearch().toJson()); + System.out.println(xmlSearch().toJson()); + System.out.println(jsonSearch().toJson()); + System.out.println(javascriptSearch().toJson()); + } + @BeforeClass public static void setUp() { Logger.getLogger("").setLevel(Level.WARNING); @@ -95,7 +72,7 @@ public static void setUp() { @Test public void testSearchSearsia() throws XPathExpressionException, SearchException { - Resource se = searsiaSearch(); + Resource se = new Resource("file:src/test/resources/index.json", null).updateFromAPI(); String query = "informat"; SearchResult result = se.search(query); Assert.assertEquals(query, result.getQuery()); @@ -104,7 +81,7 @@ public void testSearchSearsia() throws XPathExpressionException, SearchException @Test public void testSearchHtml() throws XPathExpressionException, SearchException { - Resource se = htmlSearch(); + Resource se = new Resource("file:src/test/resources/hiemstra.json", null).updateFromAPI(); SearchResult result = se.search("dolf trieschnigg", "xml"); Assert.assertEquals("text/html", se.getMimeType()); Assert.assertEquals(10, result.getHits().size()); @@ -112,7 +89,7 @@ public void testSearchHtml() throws XPathExpressionException, SearchException { @Test public void testSearchPost() throws XPathExpressionException, SearchException { - Resource se = postSearch(); + Resource se = new Resource("file:src/test/resources/hiemstrapost.json", null).updateFromAPI(); SearchResult result = se.search("dolf trieschnigg"); Assert.assertEquals("application/xml", se.getMimeType()); Assert.assertEquals(10, result.getHits().size()); @@ -128,7 +105,7 @@ public void testSearchXml() throws XPathExpressionException, SearchException { @Test public void testSearchXml2() throws XPathExpressionException, SearchException { - Resource se = htmlSearch(); + Resource se = new Resource("file:src/test/resources/hiemstra.json", null).updateFromAPI(); se.setMimeType("application/xml"); se.setRerank(null); long startTime = System.currentTimeMillis(); @@ -187,7 +164,7 @@ public void testSearchResource() throws XPathExpressionException, SearchExceptio @Test public void testSearchNoResource1() throws XPathExpressionException, SearchException { - Resource se = htmlSearch(); + Resource se = new Resource("file:src/test/resources/hiemstra.json", null).updateFromAPI(); Boolean exception = false; try { se.searchResource("wikifull1"); @@ -210,22 +187,23 @@ public void testSearchNoResource2() throws XPathExpressionException, SearchExcep } @Test - public void testSearchError() throws XPathExpressionException { - Resource se = htmlSearch(); - se.setUrlAPITemplate("http://wwwhome.cs.utwente.nl/~hiemstra/WRONG/?s={q}&api={apikey}&p={p?}"); + public void testSearchError() throws XPathExpressionException, SearchException { + Resource se = new Resource("file:src/test/resources/wrong.json", null).updateFromAPI(); String message = null; + String apiKey = se.getPrivateParameter("apikey"); try { se.search("test"); } catch (SearchException e) { message = e.getMessage(); } - Assert.assertNotNull(message); - Assert.assertFalse("error message reveals secret", message.contains(SECRET_API_KEY)); + Assert.assertNotNull("Error message", message); + Assert.assertNotNull("API key", apiKey); + Assert.assertFalse("Error message reveals secret", message.contains(apiKey)); } @Test - public void testJsonRoundtrip() throws XPathExpressionException { - Resource se1 = htmlSearch(); + public void testJsonRoundtrip() throws XPathExpressionException, SearchException { + Resource se1 = new Resource("file:src/test/resources/hiemstra.json", null).updateFromAPI(); se1.setPostString("POST"); se1.setPostQueryEncode("application/x-www-form-urlencoded"); se1.setRerank("lm"); @@ -261,8 +239,8 @@ public void testJsonPrivateParameter() throws XPathExpressionException { } @Test - public void equalEngines1() throws XPathExpressionException { - Resource se1 = htmlSearch(); + public void equalEngines1() throws XPathExpressionException, SearchException { + Resource se1 = new Resource("file:src/test/resources/hiemstra.json", null).updateFromAPI(); JSONObject json = se1.toJson(); Resource se2 = new Resource(json); Assert.assertTrue("Equals big engine", se1.equals(se2)); diff --git a/src/test/java/org/searsia/index/TestResourceIndex.java b/src/test/java/org/searsia/index/TestResourceIndex.java index 026df77..9c9ad36 100644 --- a/src/test/java/org/searsia/index/TestResourceIndex.java +++ b/src/test/java/org/searsia/index/TestResourceIndex.java @@ -2,59 +2,71 @@ import java.io.IOException; +import javax.xml.xpath.XPathExpressionException; + +import org.apache.log4j.Logger; +import org.apache.log4j.varia.NullAppender; +import org.json.JSONException; +import org.json.JSONObject; import org.junit.Assert; import org.junit.AfterClass; import org.junit.BeforeClass; import org.junit.Test; - import org.searsia.index.ResourceIndex; import org.searsia.engine.Resource; public class TestResourceIndex { + private static final Logger LOGGER = Logger.getLogger("org.searsia"); private static final String PATH = "target/index-test"; private static final String INDEX = "test"; private static ResourceIndex engines; @BeforeClass public static void setUp() throws Exception { + LOGGER.removeAllAppenders(); + LOGGER.addAppender(new NullAppender()); engines = new ResourceIndex(PATH, INDEX); Resource engine = searsia(); engines.putMother(engine); } @AfterClass - public static void lastThing() throws IOException { + public static void lastThing() throws IOException, XPathExpressionException, JSONException { engines.put(newby()); checkFiles(); } - private static Resource utwente() { - Resource e = new Resource("http://utwente.nl/search?q={q}", "567"); - e.setName("UT"); - return e; + private static Resource utwente() throws XPathExpressionException, JSONException { + JSONObject json = new JSONObject( + "{\"apitemplate\":\"http://utwente.nl/search?q={q}\",\"id\":\"567\",\"name\":\"UT\"}" + ); + return new Resource(json); } - private static Resource searsia() { - Resource e = new Resource("http://searsia.com/?q={q}", "1234"); - e.addPrivateParameter("api", "topsecret"); - return e; + private static Resource searsia() throws XPathExpressionException, JSONException { + JSONObject json = new JSONObject( + "{\"apitemplate\":\"http://searsia.com/?q={q}\",\"id\":\"1234\",\"privateparameters\":{\"api\":\"topsecret\"}}" + ); + return new Resource(json); } - private static Resource newby() { - Resource e = new Resource("http://new.com/?q={q}", "new"); - e.addPrivateParameter("apikey", "secret"); - return e; + private static Resource newby() throws XPathExpressionException, JSONException { + JSONObject json = new JSONObject( + "{\"apitemplate\":\"http://new.com/?q={q}\",\"id\":\"new\",\"privateparameters\":{\"apikey\":\"secret\"}}" + ); + return new Resource(json); } - private static Resource me() { - Resource e = new Resource("http://me.org", "me"); - e.setName("Me"); - return e; + private static Resource me() throws XPathExpressionException, JSONException { + JSONObject json = new JSONObject( + "{\"apitemplate\":\"http://me.org\",\"id\":\"me\",\"name\":\"Me\"}" + ); + return new Resource(json); } - public static void checkFiles() throws IOException { + public static void checkFiles() throws IOException, XPathExpressionException, JSONException { Resource e1 = me(); Resource e2 = engines.getMyself(); Assert.assertTrue("Trying to retrieve me", e1.equals(e2)); @@ -69,7 +81,7 @@ public static void checkFiles() throws IOException { } @Test - public void addResource() { + public void addResource() throws XPathExpressionException, JSONException { Resource e1 = utwente(); engines.put(e1); Resource e2 = engines.get(e1.getId()); @@ -77,7 +89,7 @@ public void addResource() { } @Test - public void addMe() { + public void addMe() throws XPathExpressionException, JSONException { Resource e1 = me(); engines.putMyself(e1); Resource e2 = engines.getMyself(); @@ -85,7 +97,7 @@ public void addMe() { } @Test - public void getMother() { + public void getMother() throws XPathExpressionException, JSONException { Resource e1 = searsia(); Resource e2 = engines.getMother(); Assert.assertTrue("Mother", e1.equals(e2)); diff --git a/src/test/java/org/searsia/index/TestSearchResultIndex.java b/src/test/java/org/searsia/index/TestSearchResultIndex.java index b436f2d..40cdad5 100644 --- a/src/test/java/org/searsia/index/TestSearchResultIndex.java +++ b/src/test/java/org/searsia/index/TestSearchResultIndex.java @@ -35,7 +35,8 @@ public static void setUp() throws Exception { index.offer(result); index.flush(); } - + + private static SearchResult readFile(String fileString) throws IOException { SearchResult result = new SearchResult(); String s, jsonString = ""; // TODO: Does the following file name work in Windows? From 0634cc8ce5d053538651bc6f3df436e6180f81e4 Mon Sep 17 00:00:00 2001 From: Searsia Date: Wed, 26 Apr 2017 22:20:57 +0200 Subject: [PATCH 19/51] resource now only set via url or jsonobject --- src/main/java/org/searsia/Main.java | 2 +- .../java/org/searsia/engine/Resource.java | 45 +++++---- .../java/org/searsia/engine/ResourceTest.java | 99 +++++-------------- src/test/java/org/searsia/web/SearchTest.java | 17 ++-- src/test/resources/hiemstra.json | 24 +++++ src/test/resources/hiemstracrazy.json | 29 ++++++ src/test/resources/hiemstrapost.json | 22 +++++ src/test/resources/hiemstraxml.json | 17 ++++ src/test/resources/index.json | 6 ++ src/test/resources/javascript.json | 14 +++ src/test/resources/randomid.json | 8 ++ src/test/resources/searsia.json | 16 +++ src/test/resources/wrong.json | 24 +++++ 13 files changed, 218 insertions(+), 105 deletions(-) create mode 100644 src/test/resources/hiemstra.json create mode 100644 src/test/resources/hiemstracrazy.json create mode 100644 src/test/resources/hiemstrapost.json create mode 100644 src/test/resources/hiemstraxml.json create mode 100644 src/test/resources/index.json create mode 100644 src/test/resources/javascript.json create mode 100644 src/test/resources/randomid.json create mode 100644 src/test/resources/searsia.json create mode 100644 src/test/resources/wrong.json diff --git a/src/main/java/org/searsia/Main.java b/src/main/java/org/searsia/Main.java index f012c30..642d08b 100644 --- a/src/main/java/org/searsia/Main.java +++ b/src/main/java/org/searsia/Main.java @@ -274,7 +274,7 @@ public static void main(String[] args) { // Connect to the mother engine and gather information from the mother. Resource myself = null; Resource mother = null; - Resource connect = new Resource(options.getMotherTemplate(), null); + Resource connect = new Resource(options.getMotherTemplate()); String version = null; SearchResult result = null; try { diff --git a/src/main/java/org/searsia/engine/Resource.java b/src/main/java/org/searsia/engine/Resource.java index 5f22273..b1d0760 100644 --- a/src/main/java/org/searsia/engine/Resource.java +++ b/src/main/java/org/searsia/engine/Resource.java @@ -96,9 +96,9 @@ public class Resource implements Comparable { private int nrOfRequests = 0; private int nrOfSuccess = 0; - public Resource(String urlAPITemplate, String id) { + public Resource(String urlAPITemplate) { this.urlAPITemplate = urlAPITemplate; - this.id = id; + this.id = null; this.name = null; this.mimeType = SearchResult.SEARSIA_MIME_TYPE; this.testQuery = defaultTestQuery; @@ -157,7 +157,12 @@ public Resource(JSONObject jo) throws XPathExpressionException, JSONException { public void setUrlAPITemplate(String urlTemplate) { this.urlAPITemplate = urlTemplate; } - + + + /* + * Setters no longer used: Everything now via JSON Objects + * + public void setUrlUserTemplate(String urlTemplate) { this.urlUserTemplate = urlTemplate; } @@ -198,20 +203,6 @@ public void setItemXpath(String itemXpath) { this.itemXpath = itemXpath; } - public void addExtractor(TextExtractor ... e) { - for (TextExtractor ee: e) { - this.extractors.add(ee); - } - } - - public void addHeader(String key, String value) { - this.headers.put(key, value); - } - - public void addPrivateParameter(String key, String value) { - this.privateParameters.put(key, value); - } - public void setPrior(float prior) { this.prior = prior; } @@ -223,6 +214,20 @@ public void setRate(int maxQueriesPerDay) { public void setRerank(String rerank) { this.rerank = rerank; } +*/ + private void addExtractor(TextExtractor ... e) { + for (TextExtractor ee: e) { + this.extractors.add(ee); + } + } + + private void addHeader(String key, String value) { + this.headers.put(key, value); + } + + private void addPrivateParameter(String key, String value) { + this.privateParameters.put(key, value); + } public void setLastUpdatedToNow() { this.lastUpdated = new Date().getTime(); @@ -463,11 +468,11 @@ private Document parseDocumentJavascript(String scriptString) throws IOException return DOMBuilder.string2DOM(xml); } - private Document parseDocumentJSON(String jsonString) throws IOException { + private Document parseDocumentJSON(String jsonString) throws IOException { // TODO Does not catch "/bla": true jsonString = jsonString.replaceAll("\"[^\"]*[/<>' =][^\"]*\":[ \n\r]*\"[^\"]*\",?", ""); // completely remove data with keys that have one of: /<>' = jsonString = jsonString.replaceAll("\"([0-9][^\"]*)\"[ \n\r]*:", "\"t$1\":"); // tags starting with a number are not well-formed XML - jsonString = jsonString.replaceAll("\"content\":", "\"searsia_org_json_content\":"); // work around. org.json.XML is broken: https://github.com/stleary/JSON-java/issues/286 - if (jsonString.startsWith("[")) { // turn lists into objects + jsonString = jsonString.replaceAll("\"content\":", "\"searsia_org_json_content\":"); // TODO write DOMBuilder.json2DOM(): this is a work around. org.json.XML is broken: https://github.com/stleary/JSON-java/issues/286 + if (jsonString.startsWith("[")) { // turn lists into objects jsonString = "{\"list\":" + jsonString + "}"; } String xml = "" + XML.toString(new JSONObject(jsonString)) + ""; diff --git a/src/test/java/org/searsia/engine/ResourceTest.java b/src/test/java/org/searsia/engine/ResourceTest.java index a47d121..256b617 100644 --- a/src/test/java/org/searsia/engine/ResourceTest.java +++ b/src/test/java/org/searsia/engine/ResourceTest.java @@ -16,55 +16,7 @@ public class ResourceTest { private static final String SECRET_API_KEY = "a7235cdsf43d3a2dfgeda"; - - private Resource searsiaMimeOnlySearch() throws XPathExpressionException { - return new Resource("http://searsia.org/searsia/wiki/wikididyoumean{q?}.json", "randomid"); - } - - private Resource searsiaSearch() throws XPathExpressionException { - return new Resource("http://searsia.org/searsia/wiki/index{q}.json", "index"); - } - - private Resource xmlSearch() throws XPathExpressionException, SearchException { - Resource wiki = new Resource("http://searsia.org/searsia/wiki/index{q?}.json", "index"); - Resource wikifull = wiki.searchResource("wikifull1"); - return wikifull; - } - private Resource jsonSearch() throws XPathExpressionException { - Resource wiki = new Resource("http://searsia.org/searsia/wiki/wikifull1{q?}.json", "wikifull1"); - wiki.setMimeType("application/json"); - wiki.setItemXpath("//hits"); - wiki.addExtractor( - new TextExtractor("title", "./title"), - new TextExtractor("description", "./description"), - new TextExtractor("url", "./url"), - new TextExtractor("content", "./content") - ); - return wiki; - } - - private Resource javascriptSearch() throws XPathExpressionException { - Resource wikifull = new Resource("http://searsia.org/searsia/wiki/wikifull1{q}.js", "wikifull1"); - wikifull.setMimeType("application/x-javascript"); - wikifull.setItemXpath("//hits"); - wikifull.addExtractor( - new TextExtractor("title", "./title"), - new TextExtractor("description", "./description"), - new TextExtractor("url", "./url") - ); - return wikifull; - } - - @Test - public void testje() throws XPathExpressionException, SearchException { - System.out.println(searsiaSearch().toJson()); - System.out.println(searsiaMimeOnlySearch().toJson()); - System.out.println(xmlSearch().toJson()); - System.out.println(jsonSearch().toJson()); - System.out.println(javascriptSearch().toJson()); - } - @BeforeClass public static void setUp() { Logger.getLogger("").setLevel(Level.WARNING); @@ -72,7 +24,7 @@ public static void setUp() { @Test public void testSearchSearsia() throws XPathExpressionException, SearchException { - Resource se = new Resource("file:src/test/resources/index.json", null).updateFromAPI(); + Resource se = new Resource("file:src/test/resources/index.json").updateFromAPI(); String query = "informat"; SearchResult result = se.search(query); Assert.assertEquals(query, result.getQuery()); @@ -81,7 +33,7 @@ public void testSearchSearsia() throws XPathExpressionException, SearchException @Test public void testSearchHtml() throws XPathExpressionException, SearchException { - Resource se = new Resource("file:src/test/resources/hiemstra.json", null).updateFromAPI(); + Resource se = new Resource("file:src/test/resources/hiemstra.json").updateFromAPI(); SearchResult result = se.search("dolf trieschnigg", "xml"); Assert.assertEquals("text/html", se.getMimeType()); Assert.assertEquals(10, result.getHits().size()); @@ -89,7 +41,7 @@ public void testSearchHtml() throws XPathExpressionException, SearchException { @Test public void testSearchPost() throws XPathExpressionException, SearchException { - Resource se = new Resource("file:src/test/resources/hiemstrapost.json", null).updateFromAPI(); + Resource se = new Resource("file:src/test/resources/hiemstrapost.json").updateFromAPI(); SearchResult result = se.search("dolf trieschnigg"); Assert.assertEquals("application/xml", se.getMimeType()); Assert.assertEquals(10, result.getHits().size()); @@ -97,36 +49,36 @@ public void testSearchPost() throws XPathExpressionException, SearchException { @Test public void testSearchXml() throws XPathExpressionException, SearchException { - Resource se = xmlSearch(); - SearchResult result = se.search("informat"); - Assert.assertEquals("application/xml", se.getMimeType()); + Resource se1 = new Resource("http://searsia.org/searsia/wiki/index{q}.json").updateFromAPI(); + Resource se2 = se1.searchResource("wikifull1"); + SearchResult result = se2.search("informat"); + Assert.assertEquals("application/xml", se2.getMimeType()); Assert.assertEquals(10, result.getHits().size()); } @Test public void testSearchXml2() throws XPathExpressionException, SearchException { - Resource se = new Resource("file:src/test/resources/hiemstra.json", null).updateFromAPI(); - se.setMimeType("application/xml"); - se.setRerank(null); + Resource se = new Resource("file:src/test/resources/hiemstraxml.json").updateFromAPI(); long startTime = System.currentTimeMillis(); SearchResult result = se.search("test"); + Assert.assertEquals("application/xml", se.getMimeType()); Assert.assertEquals(10, result.getHits().size()); Assert.assertFalse("Parser timed out", System.currentTimeMillis() - startTime > 10000); } @Test public void testSearchJson() throws XPathExpressionException, SearchException { - Resource se = jsonSearch(); + Resource se = new Resource("file:src/test/resources/searsia.json").updateFromAPI(); String debug = "xml"; SearchResult result = se.search("informat", debug); Assert.assertNotNull(result.getDebugOut()); Assert.assertEquals("application/json", se.getMimeType()); - Assert.assertEquals(10, result.getHits().size()); + Assert.assertTrue("Result size 10 or more", result.getHits().size() >= 10); } @Test public void testSearchJson2() throws XPathExpressionException, SearchException { - Resource se = jsonSearch(); + Resource se = new Resource("http://searsia.org/searsia/wiki/wikifull1{q}.json"); SearchResult result = se.search("json"); Assert.assertEquals(1, result.getHits().size()); Assert.assertEquals("extra content", result.getHits().get(0).getString("content")); @@ -134,14 +86,14 @@ public void testSearchJson2() throws XPathExpressionException, SearchException { @Test public void testSearchJson3() throws XPathExpressionException, SearchException { - Resource se = jsonSearch(); + Resource se = new Resource("http://searsia.org/searsia/wiki/wikifull1{q}.json"); SearchResult result = se.search("strange keys"); Assert.assertEquals(1, result.getHits().size()); } @Test public void testSearchJavascript() throws XPathExpressionException, SearchException { - Resource se = javascriptSearch(); + Resource se = new Resource("file:src/test/resources/javascript.json").updateFromAPI(); String debug = "xml"; SearchResult result = se.search("informat", debug); Assert.assertEquals("application/x-javascript", se.getMimeType()); @@ -150,21 +102,21 @@ public void testSearchJavascript() throws XPathExpressionException, SearchExcept @Test public void testSearchSearsiaEmpty() throws XPathExpressionException, SearchException { - Resource se = searsiaSearch(); + Resource se = new Resource("http://searsia.org/searsia/wiki/index{q}.json").updateFromAPI(); SearchResult result = se.searchWithoutQuery(); Assert.assertTrue(result.getHits().size() > 0); } @Test public void testSearchResource() throws XPathExpressionException, SearchException { - Resource se = searsiaSearch(); + Resource se = new Resource("file:src/test/resources/index.json").updateFromAPI(); Resource engine = se.searchResource("wikifull1"); Assert.assertTrue(engine != null); } @Test public void testSearchNoResource1() throws XPathExpressionException, SearchException { - Resource se = new Resource("file:src/test/resources/hiemstra.json", null).updateFromAPI(); + Resource se = new Resource("file:src/test/resources/hiemstra.json").updateFromAPI(); Boolean exception = false; try { se.searchResource("wikifull1"); @@ -176,7 +128,7 @@ public void testSearchNoResource1() throws XPathExpressionException, SearchExcep @Test public void testSearchNoResource2() throws XPathExpressionException, SearchException { - Resource se = searsiaMimeOnlySearch(); + Resource se = new Resource("file:src/test/resources/randomid.json").updateFromAPI(); Boolean exception = false; try { se.searchResource("wikifull1"); @@ -188,7 +140,7 @@ public void testSearchNoResource2() throws XPathExpressionException, SearchExcep @Test public void testSearchError() throws XPathExpressionException, SearchException { - Resource se = new Resource("file:src/test/resources/wrong.json", null).updateFromAPI(); + Resource se = new Resource("file:src/test/resources/wrong.json").updateFromAPI(); String message = null; String apiKey = se.getPrivateParameter("apikey"); try { @@ -203,12 +155,7 @@ public void testSearchError() throws XPathExpressionException, SearchException @Test public void testJsonRoundtrip() throws XPathExpressionException, SearchException { - Resource se1 = new Resource("file:src/test/resources/hiemstra.json", null).updateFromAPI(); - se1.setPostString("POST"); - se1.setPostQueryEncode("application/x-www-form-urlencoded"); - se1.setRerank("lm"); - se1.setBanner("me.png"); - se1.setUrlSuggestTemplate("http://whatever"); + Resource se1 = new Resource("file:src/test/resources/hiemstracrazy.json").updateFromAPI(); JSONObject json = se1.toJson(); Resource se2 = new Resource(json); Assert.assertEquals("id", se1.getId(), se2.getId()); @@ -240,15 +187,15 @@ public void testJsonPrivateParameter() throws XPathExpressionException { @Test public void equalEngines1() throws XPathExpressionException, SearchException { - Resource se1 = new Resource("file:src/test/resources/hiemstra.json", null).updateFromAPI(); + Resource se1 = new Resource("file:src/test/resources/hiemstra.json").updateFromAPI(); JSONObject json = se1.toJson(); Resource se2 = new Resource(json); Assert.assertTrue("Equals big engine", se1.equals(se2)); } @Test - public void equalEngines2() throws XPathExpressionException { - Resource se1 = searsiaSearch(); + public void equalEngines2() throws XPathExpressionException, SearchException { + Resource se1 = new Resource("file:src/test/resources/index.json").updateFromAPI(); JSONObject json = se1.toJson(); Resource se2 = new Resource(json); Assert.assertTrue("Truely Equals small engine", se1.equals(se2)); diff --git a/src/test/java/org/searsia/web/SearchTest.java b/src/test/java/org/searsia/web/SearchTest.java index 7644443..44daaaf 100644 --- a/src/test/java/org/searsia/web/SearchTest.java +++ b/src/test/java/org/searsia/web/SearchTest.java @@ -3,16 +3,17 @@ import java.io.IOException; import javax.ws.rs.core.Response; +import javax.xml.xpath.XPathExpressionException; import org.apache.log4j.Logger; import org.apache.log4j.varia.NullAppender; import org.json.JSONArray; +import org.json.JSONException; import org.json.JSONObject; import org.junit.AfterClass; import org.junit.Assert; import org.junit.BeforeClass; import org.junit.Test; - import org.searsia.index.SearchResultIndex; import org.searsia.index.ResourceIndex; import org.searsia.web.Search; @@ -27,16 +28,16 @@ public class SearchTest { private static ResourceIndex engines; - private static Resource wiki() { - return new Resource("http://searsia.org/searsia/wiki/wiki{q}.json", "wiki"); + private static Resource wiki() throws XPathExpressionException, JSONException { + return new Resource(new JSONObject("{\"apitemplate\":\"http://searsia.org/searsia/wiki/wiki{q}.json\", \"id\":\"wiki\"}")); } - private static Resource wrong() { - return new Resource("http://searsia.com/doesnotexist?q={q}", "wrong"); + private static Resource wrong() throws XPathExpressionException, JSONException { + return new Resource(new JSONObject("{\"apitemplate\":\"http://searsia.com/doesnotexist?q={q}\", \"id\":\"wrong\"}")); } - private static Resource me() { - return new Resource("http://me.org?q={q}", "wiki"); + private static Resource me() throws XPathExpressionException, JSONException { + return new Resource(new JSONObject("{\"apitemplate\":\"http://me.org?q={q}\", \"id\":\"wiki\"}")); } @@ -90,7 +91,7 @@ public void testQuery() throws IOException { } @Test // returns local resource 'wrong' - public void testResource() throws IOException { + public void testResource() throws IOException, XPathExpressionException, JSONException { Search search = new Search(index, engines); Response response = search.query("wrong.json", ""); int status = response.getStatus(); diff --git a/src/test/resources/hiemstra.json b/src/test/resources/hiemstra.json new file mode 100644 index 0000000..d547439 --- /dev/null +++ b/src/test/resources/hiemstra.json @@ -0,0 +1,24 @@ +{ + "resource": { + "id": "hiemstra", + "apitemplate": "http://wwwhome.cs.utwente.nl/~hiemstra/?s={q}&api={apikey}&p={p?}", + "extractors": { + "description": "./h3/following-sibling::text()", + "title": "./h3", + "url": "./h3/a/@href" + }, + "favicon": "http://wwwhome.cs.utwente.nl/~hiemstra/images/ut.ico", + "headers": { + "User-Agent": "Test/1.0" + }, + "itempath": "//div[@class='post']", + "maxqueriesperday": 133, + "mimetype": "text/html", + "prior": 0.3, + "privateparameters": { + "apikey": "SECRET!!" + }, + "testquery": "searsia", + "urltemplate": "http://wwwhome.cs.utwente.nl/~hiemstra/?s={q}&api={apikey}&p={p?}" + } +} diff --git a/src/test/resources/hiemstracrazy.json b/src/test/resources/hiemstracrazy.json new file mode 100644 index 0000000..a42f883 --- /dev/null +++ b/src/test/resources/hiemstracrazy.json @@ -0,0 +1,29 @@ +{ + "resource": { + "id": "hiemstrapost", + "apitemplate": "http://wwwhome.cs.utwente.nl/~hiemstra/?s={q}&api={apikey}&p={p?}", + "extractors": { + "description": "./h3/following-sibling::text()", + "title": "./h3", + "url": "./h3/a/@href" + }, + "post": "POST", + "postencode": "application/x-www-form-urlencoded", + "rerank": "lm", + "banner": "me.png", + "suggesttemplate": "http://wwwhome.cs.utwente.nl/~hiemstra/keywords.php?q=d&limit=10", + "favicon": "http://wwwhome.cs.utwente.nl/~hiemstra/images/ut.ico", + "headers": { + "User-Agent": "Test/1.0" + }, + "itempath": "//div[@class='post']", + "maxqueriesperday": 133, + "mimetype": "text/html", + "prior": 0.3, + "privateparameters": { + "apikey": "SECRET!!" + }, + "testquery": "searsia", + "urltemplate": "http://wwwhome.cs.utwente.nl/~hiemstra/?s={q}&api={apikey}&p={p?}" + } +} diff --git a/src/test/resources/hiemstrapost.json b/src/test/resources/hiemstrapost.json new file mode 100644 index 0000000..763de35 --- /dev/null +++ b/src/test/resources/hiemstrapost.json @@ -0,0 +1,22 @@ +{ + "resource": { + "id": "hiemstra", + "apitemplate": "http://wwwhome.cs.utwente.nl/~hiemstra/", + "post": "s={q}&api={apikey}&p={p?}", + "extractors": { + "description": "./h3/following-sibling::text()", + "title": "./h3", + "url": "./h3/a/@href" + }, + "favicon": "http://wwwhome.cs.utwente.nl/~hiemstra/images/ut.ico", + "itempath": "//div[@class='post']", + "maxqueriesperday": 133, + "mimetype": "application/xml", + "prior": 0.3, + "privateparameters": { + "apikey": "SECRET!!" + }, + "testquery": "searsia", + "urltemplate": "http://wwwhome.cs.utwente.nl/~hiemstra/?s={q}&api={apikey}&p={p?}" + } +} diff --git a/src/test/resources/hiemstraxml.json b/src/test/resources/hiemstraxml.json new file mode 100644 index 0000000..deb2b31 --- /dev/null +++ b/src/test/resources/hiemstraxml.json @@ -0,0 +1,17 @@ +{ + "resource": { + "id": "hiemstraxml", + "apitemplate": "http://wwwhome.cs.utwente.nl/~hiemstra/?s={q}&p={p?}", + "extractors": { + "description": "./h3/following-sibling::text()", + "title": "./h3", + "url": "./h3/a/@href" + }, + "favicon": "http://wwwhome.cs.utwente.nl/~hiemstra/images/ut.ico", + "itempath": "//div[@class='post']", + "mimetype": "application/xml", + "prior": 0.3, + "testquery": "searsia", + "urltemplate": "http://wwwhome.cs.utwente.nl/~hiemstra/?s={q}&api={apikey}&p={p?}" + } +} diff --git a/src/test/resources/index.json b/src/test/resources/index.json new file mode 100644 index 0000000..fffc537 --- /dev/null +++ b/src/test/resources/index.json @@ -0,0 +1,6 @@ +{ + "resource": { + "apitemplate": "http://searsia.org/searsia/wiki/index{q}.json", + "id": "index", + } +} diff --git a/src/test/resources/javascript.json b/src/test/resources/javascript.json new file mode 100644 index 0000000..d50beac --- /dev/null +++ b/src/test/resources/javascript.json @@ -0,0 +1,14 @@ +{ + "resource": { + "id": "javascript", + "apitemplate": "http://searsia.org/searsia/wiki/wikifull1{q}.js", + "itempath": "//hits", + "extractors": { + "description": "./description", + "title": "./title", + "url": "./url" + }, + "mimetype": "application/x-javascript", + "name": "Wiki Full 1", + } +} diff --git a/src/test/resources/randomid.json b/src/test/resources/randomid.json new file mode 100644 index 0000000..c1f2e39 --- /dev/null +++ b/src/test/resources/randomid.json @@ -0,0 +1,8 @@ +{ + "resource": { + "apitemplate": "http://searsia.org/searsia/wiki/wikididyoumean{q?}.json", + "id": "randomid", + "mimetype": "application/searsia+json", + "testquery": "searsia" + } +} diff --git a/src/test/resources/searsia.json b/src/test/resources/searsia.json new file mode 100644 index 0000000..9ac05f6 --- /dev/null +++ b/src/test/resources/searsia.json @@ -0,0 +1,16 @@ +{ + "resource": { + "id": "searsia", + "apitemplate": "http://searsia.org/searsia/search.json", + "itempath": "//hits", + "extractors": { + "description": "./description", + "title": "./title", + "url": "./url" + }, + "favicon": "http://searsia.org/images/searsia.png", + "mimetype": "application/json", + "name": "Searsia", + "urltemplate": "http://searsia.org/searsia/search.json" + } +} diff --git a/src/test/resources/wrong.json b/src/test/resources/wrong.json new file mode 100644 index 0000000..8e275d5 --- /dev/null +++ b/src/test/resources/wrong.json @@ -0,0 +1,24 @@ +{ + "resource": { + "id": "hiemstra", + "apitemplate": "http://wwwhome.cs.utwente.nl/~hiemstra/WRONG?s={q}&api={apikey}&p={p?}", + "extractors": { + "description": "./h3/following-sibling::text()", + "title": "./h3", + "url": "./h3/a/@href" + }, + "favicon": "http://wwwhome.cs.utwente.nl/~hiemstra/images/ut.ico", + "headers": { + "User-Agent": "Test/1.0" + }, + "itempath": "//div[@class='post']", + "maxqueriesperday": 133, + "mimetype": "text/html", + "prior": 0.3, + "privateparameters": { + "apikey": "SECRET!!" + }, + "testquery": "searsia", + "urltemplate": "http://wwwhome.cs.utwente.nl/~hiemstra/?s={q}&api={apikey}&p={p?}" + } +} From 6f08ab36c6ea2689b96754c3b63b2174e957e82a Mon Sep 17 00:00:00 2001 From: Searsia Date: Fri, 28 Apr 2017 11:10:57 +0200 Subject: [PATCH 20/51] new Json2DOM reader --- .../java/org/searsia/engine/DOMBuilder.java | 105 +++++++++++++++++- .../java/org/searsia/engine/Resource.java | 14 +-- 2 files changed, 102 insertions(+), 17 deletions(-) diff --git a/src/main/java/org/searsia/engine/DOMBuilder.java b/src/main/java/org/searsia/engine/DOMBuilder.java index 7a2a712..922c220 100644 --- a/src/main/java/org/searsia/engine/DOMBuilder.java +++ b/src/main/java/org/searsia/engine/DOMBuilder.java @@ -1,5 +1,6 @@ /* - * Copyright Walter Kasper + * Jsoup2DOM Copyright Walter Kasper + * Json2DOC Copyright Searsia * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -31,6 +32,8 @@ import javax.xml.transform.dom.DOMSource; import javax.xml.transform.stream.StreamResult; +import org.json.JSONArray; +import org.json.JSONObject; import org.w3c.dom.DOMException; import org.w3c.dom.Document; import org.w3c.dom.Node; @@ -108,7 +111,7 @@ public static Document jsoup2DOM(org.jsoup.nodes.Document jsoupDocument) { /* Create a document to contain the content. */ document = docBuilder.newDocument(); - createDOM(jsoupDocument, document, document, new HashMap()); + createDOMfromJsoup(jsoupDocument, document, document, new HashMap()); } catch (ParserConfigurationException pce) { throw new RuntimeException(pce); @@ -116,18 +119,46 @@ public static Document jsoup2DOM(org.jsoup.nodes.Document jsoupDocument) { return document; } + + /** + * Returns a W3C DOM that exposes the same content as the supplied Jsoup document into a W3C DOM. + * @param jsoupDocument The Jsoup document to convert. + * @return A W3C Document. + */ + public static Document json2DOM(JSONObject jsonDocument) { + + Document document = null; + + try { + + /* Obtain the document builder for the configured XML parser. */ + DocumentBuilderFactory docBuilderFactory = DocumentBuilderFactory.newInstance(); + DocumentBuilder docBuilder = docBuilderFactory.newDocumentBuilder(); + + /* Create a document to contain the content. */ + document = docBuilder.newDocument(); + org.w3c.dom.Element _e = document.createElement("root"); + document.appendChild(_e); + createDOMfromJSONObject(jsonDocument, _e, document); + } catch (ParserConfigurationException pce) { + throw new RuntimeException(pce); + } + return document; + } + + /** * The internal helper that copies content from the specified Jsoup Node into a W3C {@link Node}. * @param node The Jsoup node containing the content to copy to the specified W3C {@link Node}. * @param out The W3C {@link Node} that receives the DOM content. */ - private static void createDOM(org.jsoup.nodes.Node node, Node out, Document doc, Map ns) { + private static void createDOMfromJsoup(org.jsoup.nodes.Node node, Node out, Document doc, Map ns) { if (node instanceof org.jsoup.nodes.Document) { org.jsoup.nodes.Document d = ((org.jsoup.nodes.Document) node); for (org.jsoup.nodes.Node n : d.childNodes()) { - createDOM(n, out,doc,ns); + createDOMfromJsoup(n, out,doc,ns); } } else if (node instanceof org.jsoup.nodes.Element) { @@ -164,7 +195,7 @@ else if (!attPrefix.equals("xml")) { } for (org.jsoup.nodes.Node n : e.childNodes()) { - createDOM(n, _e, doc,ns); + createDOMfromJsoup(n, _e, doc,ns); } } else if (node instanceof org.jsoup.nodes.TextNode) { @@ -197,4 +228,66 @@ private static String getLocalName(String name) { return name; } -} + /** + * The internal helpers that copy content from the specified JSON Object into a W3C {@link Node}. + * @param json The JSON object containing the content to copy to the specified W3C {@link Node}. + * @param out The W3C {@link Node} that receives the DOM content. + */ + private static void createDOMfromJSONObject(JSONObject json, Node out, Document doc) { + for (String name : JSONObject.getNames(json)) { + Object object = json.get(name); + if (object instanceof JSONArray) { + createDOMfromJSONArray((JSONArray) object, out, doc, name); + } else { + if (object instanceof JSONObject) { + org.w3c.dom.Element _e = doc.createElement(correctXML(name)); + out.appendChild(_e); + createDOMfromJSONObject((JSONObject) object, _e, doc); + } else + createDOMfromJSONPrimitive(object, out, doc, name); + } + } + } + + private static void createDOMfromJSONArray(JSONArray json, Node out, Document doc, String name) { + for (Object o: json) { + if (o instanceof JSONArray) { + org.w3c.dom.Element _e = doc.createElement(correctXML(name)); + out.appendChild(_e); + createDOMfromJSONArray((JSONArray) o, _e, doc, "list"); + } else if (o instanceof JSONObject) { + org.w3c.dom.Element _e = doc.createElement(correctXML(name)); + out.appendChild(_e); + createDOMfromJSONObject((JSONObject) o, _e, doc); + } else { + createDOMfromJSONPrimitive(o, out, doc, name); + } + } + } + + private static void createDOMfromJSONPrimitive(Object object, Node out, Document doc, String name) { + org.w3c.dom.Element _e = doc.createElement(correctXML(name)); + out.appendChild(_e); + if (object instanceof String) { + _e.appendChild(doc.createTextNode((String) object)); + } else if (object instanceof Boolean) { + _e.appendChild(doc.createTextNode(object.toString())); + } else if (object instanceof Integer) { + _e.appendChild(doc.createTextNode(Integer.toString((Integer) object))); + } else if (object instanceof Double) { + _e.appendChild(doc.createTextNode(Double.toString((Double) object))); + } + } + + /** + * Element names can contain letters, digits, hyphens, underscores, and periods + * Element names must start with a letter or underscore + * @param name + * @return + */ + private static String correctXML(String name) { + name = name.replaceAll("[^A-Z0-9a-z\\-_\\.]|^([^A-Za-z])", "_$1"); + return name; + } + +} \ No newline at end of file diff --git a/src/main/java/org/searsia/engine/Resource.java b/src/main/java/org/searsia/engine/Resource.java index b1d0760..45cdca7 100644 --- a/src/main/java/org/searsia/engine/Resource.java +++ b/src/main/java/org/searsia/engine/Resource.java @@ -48,7 +48,6 @@ import org.json.JSONArray; import org.json.JSONException; import org.json.JSONObject; -import org.json.XML; import org.jsoup.Jsoup; import org.w3c.dom.Document; import org.w3c.dom.Node; @@ -455,7 +454,6 @@ private Document parseDocumentJavascript(String scriptString) throws IOException nrOfCurly -= 1; if (nrOfCurly == 0) { String subString = scriptString.substring(first, i + 1); - subString = subString.replaceAll("\"([0-9][^\"]*)\":", "\"t$1\":"); // tags starting with a number are not well-formed XML try { array.put(new JSONObject(subString)); } catch (JSONException e) { } @@ -464,20 +462,14 @@ private Document parseDocumentJavascript(String scriptString) throws IOException } JSONObject object = new JSONObject(); object.put("list", array); - String xml = "" + XML.toString(object) + ""; - return DOMBuilder.string2DOM(xml); + return DOMBuilder.json2DOM(object); } - private Document parseDocumentJSON(String jsonString) throws IOException { // TODO Does not catch "/bla": true - jsonString = jsonString.replaceAll("\"[^\"]*[/<>' =][^\"]*\":[ \n\r]*\"[^\"]*\",?", ""); // completely remove data with keys that have one of: /<>' = - jsonString = jsonString.replaceAll("\"([0-9][^\"]*)\"[ \n\r]*:", "\"t$1\":"); // tags starting with a number are not well-formed XML - jsonString = jsonString.replaceAll("\"content\":", "\"searsia_org_json_content\":"); // TODO write DOMBuilder.json2DOM(): this is a work around. org.json.XML is broken: https://github.com/stleary/JSON-java/issues/286 + private Document parseDocumentJSON(String jsonString) throws IOException { if (jsonString.startsWith("[")) { // turn lists into objects jsonString = "{\"list\":" + jsonString + "}"; } - String xml = "" + XML.toString(new JSONObject(jsonString)) + ""; - xml = xml.replaceAll("searsia_org_json_content>", "content>"); // use a constant for 'searsia_org_json_content'? see 5 lines above - return DOMBuilder.string2DOM(xml); + return DOMBuilder.json2DOM(new JSONObject(jsonString)); } private Document parseDocumentXML(String xmlString) throws IOException { From bde332c4326f6429b25cb98f07efc21b9cd77229 Mon Sep 17 00:00:00 2001 From: Searsia Date: Fri, 28 Apr 2017 16:56:35 +0200 Subject: [PATCH 21/51] json fixed --- .../java/org/searsia/engine/DOMBuilder.java | 25 ++++---- .../org/searsia/engine/DOMBuilderTest.java | 62 +++++++++++++++++++ 2 files changed, 76 insertions(+), 11 deletions(-) create mode 100644 src/test/java/org/searsia/engine/DOMBuilderTest.java diff --git a/src/main/java/org/searsia/engine/DOMBuilder.java b/src/main/java/org/searsia/engine/DOMBuilder.java index 922c220..362a167 100644 --- a/src/main/java/org/searsia/engine/DOMBuilder.java +++ b/src/main/java/org/searsia/engine/DOMBuilder.java @@ -129,7 +129,7 @@ public static Document jsoup2DOM(org.jsoup.nodes.Document jsoupDocument) { public static Document json2DOM(JSONObject jsonDocument) { Document document = null; - + try { /* Obtain the document builder for the configured XML parser. */ @@ -234,17 +234,20 @@ private static String getLocalName(String name) { * @param out The W3C {@link Node} that receives the DOM content. */ private static void createDOMfromJSONObject(JSONObject json, Node out, Document doc) { - for (String name : JSONObject.getNames(json)) { - Object object = json.get(name); - if (object instanceof JSONArray) { - createDOMfromJSONArray((JSONArray) object, out, doc, name); - } else { - if (object instanceof JSONObject) { - org.w3c.dom.Element _e = doc.createElement(correctXML(name)); - out.appendChild(_e); - createDOMfromJSONObject((JSONObject) object, _e, doc); - } else + String [] names = JSONObject.getNames(json); + if (names != null) { + for (String name : names) { + Object object = json.get(name); + if (object instanceof JSONArray) { + createDOMfromJSONArray((JSONArray) object, out, doc, name); + } else { + if (object instanceof JSONObject) { + org.w3c.dom.Element _e = doc.createElement(correctXML(name)); + out.appendChild(_e); + createDOMfromJSONObject((JSONObject) object, _e, doc); + } else createDOMfromJSONPrimitive(object, out, doc, name); + } } } } diff --git a/src/test/java/org/searsia/engine/DOMBuilderTest.java b/src/test/java/org/searsia/engine/DOMBuilderTest.java new file mode 100644 index 0000000..3820a3d --- /dev/null +++ b/src/test/java/org/searsia/engine/DOMBuilderTest.java @@ -0,0 +1,62 @@ +package org.searsia.engine; + +import java.io.BufferedReader; +import java.io.FileReader; +import java.io.IOException; + +import org.json.JSONObject; +import org.jsoup.Jsoup; +import org.junit.Test; +import org.w3c.dom.Document; + +import org.searsia.engine.DOMBuilder; + +public class DOMBuilderTest { + + private static String readFile(String fileName) throws IOException { + String s, result = ""; + BufferedReader reader = new BufferedReader(new FileReader("src/test/resources/" + fileName)); + try { + while ((s = reader.readLine()) != null) { + result += s; + } + } + finally { + reader.close(); + } + return result; + } + + @Test + public void testJsonFileIfExists() { + String jsonString = null; + try { + jsonString = readFile("test.json"); + } catch (IOException e) { } + if (jsonString != null) { + if (jsonString.startsWith("[")) { + jsonString = "{\"list\":" + jsonString + "}"; + } + JSONObject json = new JSONObject(jsonString); + Document doc = DOMBuilder.json2DOM(json); + String xml = DOMBuilder.DOM2String(doc); + System.out.println(xml); + } + } + + @Test + public void testHtmlFileIfExists() { + String htmlString = null; + try { + htmlString = readFile("test.html"); + } catch (IOException e) { } + if (htmlString != null) { + org.jsoup.nodes.Document jsoup = Jsoup.parse(htmlString); + Document doc = DOMBuilder.jsoup2DOM(jsoup); + String xml = DOMBuilder.DOM2String(doc); + System.out.println(xml); + } + } + + +} From 36622bb0737e712b8fd0ff277060578b12bcec76 Mon Sep 17 00:00:00 2001 From: Searsia Date: Mon, 1 May 2017 22:26:11 +0200 Subject: [PATCH 22/51] adds health info --- src/main/java/org/searsia/Main.java | 12 +- src/main/java/org/searsia/SearchResult.java | 23 +- src/main/java/org/searsia/SearsiaOptions.java | 31 ++- .../java/org/searsia/engine/DOMBuilder.java | 10 +- .../java/org/searsia/engine/Resource.java | 219 ++++++++++++------ .../java/org/searsia/index/ResourceIndex.java | 81 +++++-- src/main/java/org/searsia/web/Search.java | 31 ++- .../java/org/searsia/SearchResultTest.java | 2 +- 8 files changed, 273 insertions(+), 136 deletions(-) diff --git a/src/main/java/org/searsia/Main.java b/src/main/java/org/searsia/Main.java index 642d08b..6cfabfa 100644 --- a/src/main/java/org/searsia/Main.java +++ b/src/main/java/org/searsia/Main.java @@ -141,16 +141,6 @@ private static String removeFileNameUri(String uri) { return uri; } - private static String lastDir(String uri) { - if (uri.contains("/")) { - uri = uri.replaceAll("\\/[^\\/]*$", ""); - uri = uri.replaceAll("^.+\\/", ""); - return uri + "/"; - } else { - return ""; - } - } - private static String normalizedUriToTemplate(String uri, String rid) { if (uri != null) { if (uri.endsWith("/") ) { @@ -329,7 +319,7 @@ public static void main(String[] args) { // Start the web server - String myURI = removeFileNameUri(options.getMyURI()) + lastDir(options.getMotherTemplate()); + String myURI = removeFileNameUri(options.getMyURI()); try { SearsiaApplication app = new SearsiaApplication(index, engines); diff --git a/src/main/java/org/searsia/SearchResult.java b/src/main/java/org/searsia/SearchResult.java index 40f5bcc..395dc81 100644 --- a/src/main/java/org/searsia/SearchResult.java +++ b/src/main/java/org/searsia/SearchResult.java @@ -125,21 +125,12 @@ public void removeResourceQuery() { } } - private void addMonitoringInfo(Hit hit, Resource resource) { - String value = resource.getLastUpdatedString(); - if (value != null) { hit.put("lastupdated", value); } - value = resource.getLastChangedString(); - if (value != null) { hit.put("lastchanged", value); } - hit.put("requestsok", resource.getNrOfSuccess()); - hit.put("requests", resource.getNrOfRequests()); - } - /** * New resource ranker, adds rscore. * @param query * @param engines */ - public void scoreResourceSelection(String query, ResourceIndex engines, boolean extraInfo) { + public void scoreResourceSelection(String query, ResourceIndex engines) { final float boost = 1.0f; Map maxScore = new HashMap(); Map topEngines = engines.topValues(query, 10); @@ -165,9 +156,6 @@ public void scoreResourceSelection(String query, ResourceIndex engines, boolean } hit.setScore(score); hit.setResourceScore(max); - if (extraInfo) { - addMonitoringInfo(hit, engines.get(rid)); - } } else { hit.setResourceScore(hit.getScore() * boost); } @@ -182,15 +170,6 @@ public void scoreResourceSelection(String query, ResourceIndex engines, boolean Collections.sort(this.hits, Collections.reverseOrder()); } - /** - * New Top resources. - * @param query - * @param engines - */ - public void scoreResourceSelection(String query, ResourceIndex engines) { - scoreResourceSelection(query, engines, false); - } - /** * TODO: needs a proper implementation, refactoring, and research ;-) * Scoring follows these rules: diff --git a/src/main/java/org/searsia/SearsiaOptions.java b/src/main/java/org/searsia/SearsiaOptions.java index 238d361..12ad136 100644 --- a/src/main/java/org/searsia/SearsiaOptions.java +++ b/src/main/java/org/searsia/SearsiaOptions.java @@ -17,8 +17,8 @@ package org.searsia; import java.io.File; -import org.apache.log4j.Level; +import org.apache.log4j.Level; import org.apache.commons.cli.DefaultParser; import org.apache.commons.cli.CommandLine; import org.apache.commons.cli.CommandLineParser; @@ -37,6 +37,7 @@ public class SearsiaOptions { private String test; private Boolean quiet; private Boolean help; + private Boolean dontshare; private int cacheSize; private int pollInterval; private int logLevel; @@ -51,27 +52,43 @@ public class SearsiaOptions { public SearsiaOptions(String[] args) throws IllegalArgumentException { Options options = new Options(); options.addOption("c", "cache", true, "Set cache size (integer: number of result pages)."); - options.addOption("t", "test", true, "Print test output and exit (string: 'json', 'xml', 'response')."); + options.addOption("d", "dontshare",false, "Do not share resource definitions."); options.addOption("h", "help", false, "Show help."); options.addOption("i", "interval", true, "Set poll interval (integer: in seconds)."); options.addOption("l", "log", true, "Set log level (0=off, 1=error, 2=warn=default, 3=info, 4=debug)."); options.addOption("m", "mother", true, "Set url of mother's web service end point."); options.addOption("p", "path", true, "Set directory path to store the index."); options.addOption("q", "quiet", false, "No output to console."); + options.addOption("t", "test", true, "Print test output and exit (string: 'json', 'xml', 'response')."); options.addOption("u", "url", true, "Set url of my web service endpoint."); setDefaults(); parse(options, args); + if (myURI == null) { + myURI = "http://localhost:16842/searsia/" + lastDir(motherTemplate); + } } + private static String lastDir(String uri) { + if (uri.contains("/")) { + uri = uri.replaceAll("\\/[^\\/]*$", ""); + uri = uri.replaceAll("^.+\\/", ""); + return uri + "/"; + } else { + return ""; + } + } + + private void setDefaults() { test = null; // no test help = false; quiet = false; + dontshare = false; cacheSize = 500; pollInterval = 120; logLevel = 2; - myURI = "http://localhost:16842/searsia/"; + myURI = null; // is set in constructor motherTemplate = null; indexPath = friendlyIndexPath(); } @@ -157,6 +174,9 @@ private void parse(Options options, String[] args) throws IllegalArgumentExcepti if (cmd.hasOption("q")) { quiet = true; } + if (cmd.hasOption("d")) { + dontshare = true; + } if (cmd.hasOption("u")) { myURI = cmd.getOptionValue("u"); } @@ -226,6 +246,10 @@ public Boolean isQuiet() { return quiet; } + public Boolean isNotShared() { + return dontshare; + } + public Boolean isHelp() { return help; } @@ -240,6 +264,7 @@ public String toString() { result += "\n Poll Interval = " + getPollInterval(); result += "\n Cache Size = " + getCacheSize(); result += "\n Test Output = " + getTestOutput(); + result += "\n Do Not Share = " + isNotShared(); return result; } diff --git a/src/main/java/org/searsia/engine/DOMBuilder.java b/src/main/java/org/searsia/engine/DOMBuilder.java index 362a167..9be99af 100644 --- a/src/main/java/org/searsia/engine/DOMBuilder.java +++ b/src/main/java/org/searsia/engine/DOMBuilder.java @@ -17,7 +17,6 @@ package org.searsia.engine; -import java.io.IOException; import java.io.StringReader; import java.io.StringWriter; import java.util.HashMap; @@ -43,6 +42,11 @@ * Returns a W3C DOM for a Jsoup parsed document. * * @author Walter Kasper + * + * Returns a W3C DOM for a Json document + * + * @author Djoerd Hiemstra + * */ public class DOMBuilder { @@ -52,7 +56,7 @@ public class DOMBuilder { * @return A W3C Document. * @throws */ - public static Document string2DOM(String xmlString) throws IOException { + public static Document string2DOM(String xmlString) { Document document = null; @@ -68,7 +72,7 @@ public static Document string2DOM(String xmlString) throws IOException { DocumentBuilder docBuilder = factory.newDocumentBuilder(); document = docBuilder.parse(new InputSource(new StringReader(xmlString))); } catch (Exception e) { - throw new IOException(e); + throw new RuntimeException(e); } return document; } diff --git a/src/main/java/org/searsia/engine/Resource.java b/src/main/java/org/searsia/engine/Resource.java index 45cdca7..729ce23 100644 --- a/src/main/java/org/searsia/engine/Resource.java +++ b/src/main/java/org/searsia/engine/Resource.java @@ -66,7 +66,7 @@ public class Resource implements Comparable { // TODO: private static final Pattern queryPattern = Pattern.compile("\\{q\\??\\}"); - // data to be set by setters + // data to be set by JSON private String id = null; private String name = null; private String urlAPITemplate = null; @@ -86,14 +86,17 @@ public class Resource implements Comparable { private String rerank = null; private int rate = defaultRATE; - // internal data not to be shared - private String nextQuery = null; - private double allowance = defaultRATE / 2; - private Long lastUsed = new Date().getTime(); // Unix time - private Long lastUpdated = new Date().getTime(); // Unix time - private Long lastChanged = null; - private int nrOfRequests = 0; - private int nrOfSuccess = 0; + // internal data shared for health report + private String nextQuery = null; + private String lastMessage = null; + private double allowance = defaultRATE / 2; + private long lastUsed = new Date().getTime(); // Unix time + private long lastUsedOk = lastUsed; + private long lastUsedError = lastUsed; + private long lastUpdated = lastUsed; + private long upsince = lastUsed; + private int nrOfError = 0; + private int nrOfOk = 0; public Resource(String urlAPITemplate) { this.urlAPITemplate = urlAPITemplate; @@ -238,13 +241,13 @@ public void setLastUpdatedToDateString(String date) { } catch (ParseException e) { } } - public void setLastChangedToNow() { - this.lastChanged = new Date().getTime(); + public void setUpSinceToNow() { + this.upsince = new Date().getTime(); } - public void setLastChangedToDateString(String date) { + public void setUpSinceDateString(String date) { try { - this.lastChanged = dateFormat.parse(date).getTime(); + this.upsince = dateFormat.parse(date).getTime(); } catch (ParseException e) { } } @@ -265,12 +268,8 @@ public SearchResult randomSearch() throws SearchException { } String thisQuery = this.nextQuery; this.nextQuery = null; // so, nextQuery will be null in case of a searchexception - SearchResult result = search(thisQuery); - if (this.testQuery.equals(thisQuery) && result.getHits().isEmpty()) { - throw new SearchException("No results for test query: " + thisQuery); - } else { - this.nextQuery = result.randomTerm(thisQuery); - } + SearchResult result = search(thisQuery, null); + this.nextQuery = result.randomTerm(thisQuery); return result; } @@ -281,7 +280,11 @@ public SearchResult search(String query) throws SearchException { public SearchResult search(String query, String debug) throws SearchException { - this.nrOfRequests += 1; + if (rateLimitReached()) { + this.lastMessage = "Too many queries"; + this.lastUsedError = new Date().getTime(); + throw new SearchException(this.lastMessage); + } SearchResult result; try { String url = fillTemplate(this.urlAPITemplate, URLEncoder.encode(query, "UTF-8")); @@ -309,11 +312,18 @@ public SearchResult search(String query, String debug) throws SearchException { } if (this.rerank != null && query != null) { result.scoreReranking(query, this.rerank); + } + if (this.testQuery.equals(query) && result.getHits().isEmpty()) { + throw new SearchException("No results for test query: " + query); } } catch (Exception e) { // catch all, also runtime exceptions + this.nrOfError += 1; + this.lastUsedError = new Date().getTime(); + this.lastMessage = e.getMessage(); throw createPrivateSearchException(e); - } - this.nrOfSuccess += 1; + } + this.nrOfOk += 1; + this.lastUsedOk = new Date().getTime(); result.setQuery(query); result.setResourceId(this.getId()); return result; @@ -430,7 +440,7 @@ private Hit extractHit(Node item) throws XPathExpressionException { return hit; } - private Document parseDocumentHTML(String htmlString, String urlString) throws IOException { + private Document parseDocumentHTML(String htmlString, String urlString) { org.jsoup.nodes.Document jsoupDoc = Jsoup.parse(htmlString, urlString); return DOMBuilder.jsoup2DOM(jsoupDoc); } @@ -441,7 +451,7 @@ private Document parseDocumentHTML(String htmlString, String urlString) throws I * @return Document * @throws IOException */ - private Document parseDocumentJavascript(String scriptString) throws IOException { + private Document parseDocumentJavascript(String scriptString) { int nrOfCurly = 0; int first = -1; JSONArray array = new JSONArray(); @@ -465,18 +475,18 @@ private Document parseDocumentJavascript(String scriptString) throws IOException return DOMBuilder.json2DOM(object); } - private Document parseDocumentJSON(String jsonString) throws IOException { + private Document parseDocumentJSON(String jsonString) { if (jsonString.startsWith("[")) { // turn lists into objects jsonString = "{\"list\":" + jsonString + "}"; } return DOMBuilder.json2DOM(new JSONObject(jsonString)); } - private Document parseDocumentXML(String xmlString) throws IOException { + private Document parseDocumentXML(String xmlString) { return DOMBuilder.string2DOM(xmlString); } - private String fillTemplate(String template, String query) throws IOException { + private String fillTemplate(String template, String query) throws SearchException { String url = template; for (String param: getPrivateParameterKeys()) { url = url.replaceAll("\\{" + param + "\\??\\}", getPrivateParameter(param)); @@ -485,7 +495,7 @@ private String fillTemplate(String template, String query) throws IOException { url = url.replaceAll("\\{[0-9A-Za-z\\-_]+\\?\\}", ""); // remove optional parameters if (url.matches(".*\\{[0-9A-Za-z\\-_]+\\}.*")) { String param = url.substring(url.indexOf("{"), url.indexOf("}") + 1); - throw new IOException("Missing url parameter " + param); + throw new SearchException("Missing url parameter " + param); } return url; } @@ -566,9 +576,6 @@ private InputStream fileConnect(URLConnection connection) throws IOException { } private String getCompletePage(String urlString, String postString, Map headers) throws IOException { - if (rateLimitReached()) { - throw new IOException("Rate limited"); - } URL url = new URL(urlString); URLConnection connection = setConnectionProperties(url, headers); InputStream stream; @@ -677,6 +684,15 @@ public int getRate() { return this.rate; } + public int getAllowance() { + long timePassed = new Date().getTime() - this.lastUsed; + double currentAllowance = this.allowance + (((double) timePassed / defaultPER)) * this.rate; + if (currentAllowance > this.rate) { + return this.rate; + } + return (int) currentAllowance; + } + public float getPrior() { if (this.prior == null) { return 0.0f; @@ -685,44 +701,58 @@ public float getPrior() { } } - public int getNrOfRequests() { - return this.nrOfRequests; + public int getNrOfErrors() { + return this.nrOfError; } public int getNrOfSuccess() { - return this.nrOfSuccess; + return this.nrOfOk; } - private Long secondsAgo(Long last) { - if (last == null) { - return null; - } else { - Long now = new Date().getTime(); - Long ago = 1 + (now - last) / 1000; - if (ago < 0 || ago > 8640000l) { // 100 days... - ago = 8640000l; - } - return ago; - } + private long secondsAgo(long last) { + long now = new Date().getTime(); + long ago = 1 + (now - last) / 1000; + if (ago < 0 || ago > 8640000l) { // 100 days... + ago = 8640000l; + } + return ago; } + public String getLastError() { + return this.lastMessage; + } + + public String getLastUsedString() { + return dateFormat.format(new Date(this.lastUsed)); + } + + public String getLastSuccessDate() { + return dateFormat.format(new Date(this.lastUsedOk)); + } + + public String getLastErrorDate() { + return dateFormat.format(new Date(this.lastUsedError)); + } public String getLastUpdatedString() { return dateFormat.format(new Date(this.lastUpdated)); } - public String getLastChangedString() { - return dateFormat.format(new Date(this.lastChanged)); + public String getUpSinceString() { + return dateFormat.format(new Date(this.upsince)); } public long getLastUpdatedSecondsAgo() { return secondsAgo(this.lastUpdated); } - public Long getLastUsedSecondsAgo() { return secondsAgo(this.lastUsed); } + + public boolean isHealthy() { + return this.lastUsedOk >= this.lastUsedError; + } public float score(String query) { @@ -748,30 +778,36 @@ public Resource deepcopy() { try { return new Resource(this.toJson()); } catch (XPathExpressionException | JSONException e) { - throw new RuntimeException(e); + throw new RuntimeException(e.getMessage()); } } public void updateWith(Resource e2) { - if (e2.id != null) this.id = e2.id; - if (e2.name != null) this.name = e2.name; - if (e2.urlUserTemplate != null) this.urlUserTemplate = e2.urlUserTemplate; - if (e2.favicon != null) this.favicon = e2.favicon; - if (e2.banner != null) this.banner = e2.banner; - if (e2.urlAPITemplate != null) this.urlAPITemplate = e2.urlAPITemplate; - if (e2.urlSuggestTemplate != null) this.urlSuggestTemplate = e2.urlSuggestTemplate; - if (e2.mimeType != null) this.mimeType = e2.mimeType; - if (e2.rerank != null) this.rerank = e2.rerank; - if (e2.postString != null) this.postString = e2.postString; - if (e2.postQueryEncode != null) this.postQueryEncode = e2.postQueryEncode; - if (e2.testQuery != null) this.testQuery = e2.testQuery; - if (e2.prior != null) this.prior = e2.prior; - if (e2.rate != defaultRATE) this.rate = e2.rate; - if (e2.itemXpath != null) this.itemXpath = e2.itemXpath; - if (e2.extractors != null) this.extractors = e2.extractors; - if (e2.headers != null) this.headers = e2.headers; - if (e2.privateParameters != null) this.privateParameters = e2.privateParameters; + setLastUpdatedToNow(); + if (!equals(e2)) { + setUpSinceToNow(); + if (this.id != null && !this.id.equals(e2.id)) throw new RuntimeException("Cannot update resource ID."); + this.id = e2.id; + this.name = e2.name; + this.urlUserTemplate = e2.urlUserTemplate; + this.favicon = e2.favicon; + this.banner = e2.banner; + this.urlAPITemplate = e2.urlAPITemplate; + this.urlSuggestTemplate = e2.urlSuggestTemplate; + if (e2.mimeType == null) { this.mimeType = SearchResult.SEARSIA_MIME_TYPE; } + else { this.mimeType = e2.mimeType; } + this.rerank = e2.rerank; + this.postString = e2.postString; + this.postQueryEncode = e2.postQueryEncode; + if (e2.testQuery == null) { this.testQuery = defaultTestQuery; } else { this.testQuery = e2.testQuery; } + this.prior = e2.prior; + this.rate = e2.rate; + this.itemXpath = e2.itemXpath; + this.extractors = e2.extractors; + this.headers = e2.headers; + this.privateParameters = e2.privateParameters; + } } @@ -814,6 +850,55 @@ public JSONObject toJsonEngine() { } + public JSONObject toJsonEngineDontShare() { + JSONObject engine = new JSONObject(); + if (id != null) engine.put("id", id); + if (name != null) engine.put("name", name); + if (urlUserTemplate != null) engine.put("urltemplate", urlUserTemplate); + if (favicon != null) engine.put("favicon", favicon); + if (banner != null) engine.put("banner", banner); + if (mimeType != null && !mimeType.equals(SearchResult.SEARSIA_MIME_TYPE)) + engine.put("mimetype", mimeType); + if (rerank != null) engine.put("rerank", rerank); + if (rate != defaultRATE) engine.put("maxqueriesperday", rate); + return engine; + } + + + public JSONObject toJsonHealth() { + JSONObject health = new JSONObject(); + health.put("dayallowance", getAllowance()); + health.put("requestsok", this.nrOfOk); + health.put("requestserr", this.nrOfError); + health.put("lastsuccess", getLastSuccessDate()); + health.put("lasterror", getLastErrorDate()); + health.put("lastupdated", getLastUpdatedString()); + health.put("upsince", getUpSinceString()); + if (this.lastMessage != null) health.put("lastmessage", this.lastMessage); + return health; + } + + + /** + * Only used at startup when reading resources from disk + * @param health + * @throws ParseException + */ + public void updateHealth(JSONObject health) throws ParseException { + //try { + Integer num = health.getInt("requestsok"); + if (num != null) this.nrOfOk = num; + num = health.getInt("requestserr"); + if (num != null) this.nrOfError = num; + this.lastUsedOk = dateFormat.parse(health.getString("lastsuccess")).getTime(); + this.lastUsedError = dateFormat.parse(health.getString("lasterror")).getTime(); + this.lastUpdated = dateFormat.parse(health.getString("lastupdated")).getTime(); + this.upsince = dateFormat.parse(health.getString("upsince")).getTime(); + if (health.has("lastmessage")) this.lastMessage = health.getString("lastmessage"); + // } catch (Exception e) { } // TODO: woops? + } + + @Override public int compareTo(Resource e2) { Float score1 = getPrior(); diff --git a/src/main/java/org/searsia/index/ResourceIndex.java b/src/main/java/org/searsia/index/ResourceIndex.java index 2bcecb7..be5277c 100644 --- a/src/main/java/org/searsia/index/ResourceIndex.java +++ b/src/main/java/org/searsia/index/ResourceIndex.java @@ -127,26 +127,24 @@ private void readResourceIndex() throws IOException { for (ScoreDoc hit: hits) { Document doc = searcher.doc(hit.doc); JSONObject json = new JSONObject(doc.get("json")); - Resource engine = new Resource(json); - if (json.has("lastupdated")) { - String lastUpdated = json.getString("lastupdated"); + Resource engine = new Resource((JSONObject) json.get("resource")); + if (json.has("health")) { + engine.updateHealth((JSONObject) json.get("health")); + String lastUpdated = engine.getLastUpdatedString(); if (this.lastFlushed == null || this.lastFlushed.compareTo(lastUpdated) < 0) { this.lastFlushed = lastUpdated; } - engine.setLastUpdatedToDateString(lastUpdated); - } - if (json.has("lastupdated")) { - engine.setLastUpdatedToDateString(json.getString("lastupdated")); } this.engines.put(engine.getId(), engine); } - } catch (javax.xml.xpath.XPathExpressionException | JSONException e) { + } catch (Exception e) { throw new IOException(e.getMessage()); } finally { reader.close(); } } + private void initResourceIndex() throws IOException { Directory dir = FSDirectory.open(indexDir.toFile()); @@ -193,12 +191,10 @@ public void put(Resource engine) { throw new RuntimeException("Local id conflict: " + engine.getId()); } Resource old = get(engine.getId()); - if (old != null && old.equals(engine)) { // nothing new - old.setLastUpdatedToNow(); - } else { - engine.setLastUpdatedToNow(); - engine.setLastChangedToNow(); + if (old == null) { this.engines.put(engine.getId(), engine); + } else { + old.updateWith(engine); } } @@ -251,20 +247,24 @@ public Map topValues(String queryString, int max) { public void putMother(Resource mother) { mother.setLastUpdatedToNow(); - this.mother = mother; + if (this.mother == null) { + this.mother = mother; + } else { + this.mother.updateWith(mother); + } } - public void putMyself(Resource engine) { - if (get(engine.getId()) != null) { - throw new RuntimeException("The server id '" + engine.getId() + "' already exists."); + public void putMyself(Resource me) { + if (get(me.getId()) != null) { + throw new RuntimeException("The server id '" + me.getId() + "' already exists."); } - engine.setLastUpdatedToNow(); + me.setLastUpdatedToNow(); try { - writeMyselfFile(engine); + writeMyselfFile(me); } catch (IOException e) { - LOGGER.error("Could not write index file"); + LOGGER.error("Could not write resource index file"); } - this.me = engine; + this.me = me; } public float maxPrior() { @@ -289,8 +289,13 @@ public void dump() { private Document luceneDocument(Resource engine) { Document doc = new Document(); String id = engine.getId(); - JSONObject json = engine.toJson(); - json.put("privateparameters", engine.getJsonPrivateParameters()); // we need to remember those + JSONObject json = new JSONObject(); + JSONObject resourceJson = engine.toJsonEngine(); + resourceJson.put("privateparameters", engine.getJsonPrivateParameters()); // we need to remember those + JSONObject healthJson = engine.toJsonHealth(); + json.put("resource", resourceJson); + json.put("health", healthJson); + json.put("searsia", "v1"); doc.add(new StringField("id", id, Field.Store.YES)); // unique identifier doc.add(new StoredField("json", json.toString())); return doc; @@ -331,4 +336,34 @@ public void close() throws IOException { this.me = null; } + + public JSONObject toJsonHealth() { + String lastMessage = null; + int countOk = 0, + countError = 0; + if (this.mother.isHealthy()) { + countOk += 1; + } else { + countError += 1; + lastMessage = this.mother.getId() + " (mother): " + this.mother.getLastError(); + } + for (Resource engine: this.engines.values()) { + if (engine.isHealthy()) { + countOk += 1; + } else { + countError += 1; + if (lastMessage == null) { // last error of any engine + lastMessage = engine.getId() + ": " + engine.getLastError(); + } + } + } + JSONObject stats = new JSONObject(); + stats.put("enginesok", countOk); + stats.put("engineserr", countError); + if (lastMessage != null) { + stats.put("lastmessage", lastMessage); + } + return stats; + } + } diff --git a/src/main/java/org/searsia/web/Search.java b/src/main/java/org/searsia/web/Search.java index 13bfe58..3dd1772 100644 --- a/src/main/java/org/searsia/web/Search.java +++ b/src/main/java/org/searsia/web/Search.java @@ -17,6 +17,10 @@ package org.searsia.web; import java.io.IOException; +import java.text.DateFormat; +import java.text.SimpleDateFormat; +import java.util.Date; +import java.util.Locale; import javax.ws.rs.GET; import javax.ws.rs.OPTIONS; @@ -42,9 +46,14 @@ public class Search { private final static org.apache.log4j.Logger LOGGER = org.apache.log4j.Logger.getLogger(Search.class); + private final static DateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss", Locale.ROOT); + private final static String startTime = dateFormat.format(new Date()); private ResourceIndex engines; private SearchResultIndex index; + private long nrOfQueriesOk = 0; + private long nrOfQueriesError = 0; + public Search(SearchResultIndex index, ResourceIndex engines) throws IOException { this.engines = engines; @@ -116,18 +125,22 @@ public Response query(@PathParam("resourceid") String resourceid, @QueryParam("q } } else { json = new JSONObject().put("resource", engine.toJson()); + json.put("health", engine.toJsonHealth()); LOGGER.info("Resource " + resourceid + "."); return SearsiaApplication.responseOk(json); } } else { - if (query != null && query.trim().length() > 0) { + JSONObject healthJson = null; + if (query != null && query.trim().length() > 0) { try { result = index.search(query); } catch (Exception e) { String message = "Service unavailable: " + e.getMessage(); LOGGER.warn(message); + this.nrOfQueriesError += 1; return SearsiaApplication.responseError(503, message); } + this.nrOfQueriesOk += 1; if (result.getHits().isEmpty() && mother != null) { // empty? ask mother! try { result = mother.search(query); @@ -140,14 +153,20 @@ public Response query(@PathParam("resourceid") String resourceid, @QueryParam("q } else { // own results? Do resource ranking. result.scoreResourceSelection(query, engines); } - } else { // no query? Return empty results with extra info - boolean extraInfo = true; + } else { result = new SearchResult(); - result.scoreResourceSelection(query, engines, extraInfo); + result.scoreResourceSelection(query, engines); + healthJson = engines.toJsonHealth(); + healthJson.put("requestsok", this.nrOfQueriesOk); + healthJson.put("requestserr", this.nrOfQueriesError); + healthJson.put("upsince", startTime); } json = result.toJson(); - json.put("resource", engines.getMyself().toJson()); - LOGGER.info("Local " + resourceid + ": " + query); + json.put("resource", me.toJson()); + if (healthJson != null) { + json.put("health", healthJson); + } + LOGGER.info("Local " + resourceid + ": " + query); // TODO query can be null return SearsiaApplication.responseOk(json); } } diff --git a/src/test/java/org/searsia/SearchResultTest.java b/src/test/java/org/searsia/SearchResultTest.java index 55d1cca..1047fad 100644 --- a/src/test/java/org/searsia/SearchResultTest.java +++ b/src/test/java/org/searsia/SearchResultTest.java @@ -40,7 +40,7 @@ public void test3() { String term = sr.randomTerm(notThis); Assert.assertFalse("Same random term", term.equals(notThis)); String terms = h.toIndexVersion().toLowerCase(); - Assert.assertTrue("Index contains term", terms.contains(term)); + Assert.assertTrue("Index contains random term: " + term, terms.contains(term)); Assert.assertEquals("Total nr of hits", sr.getHits().size(), 2); sr.scoreReranking("test", "or"); Assert.assertEquals("Nr of hits after reranking", sr.getHits().size(), 2); From e32d1d959c4da3f69f9a1e1792dedef93fb9fc1f Mon Sep 17 00:00:00 2001 From: Searsia Date: Mon, 1 May 2017 23:41:23 +0200 Subject: [PATCH 23/51] fix test query check --- src/main/java/org/searsia/engine/Resource.java | 17 +++++++++-------- .../java/org/searsia/index/ResourceIndex.java | 8 ++++++-- 2 files changed, 15 insertions(+), 10 deletions(-) diff --git a/src/main/java/org/searsia/engine/Resource.java b/src/main/java/org/searsia/engine/Resource.java index 729ce23..966f9e5 100644 --- a/src/main/java/org/searsia/engine/Resource.java +++ b/src/main/java/org/searsia/engine/Resource.java @@ -269,7 +269,11 @@ public SearchResult randomSearch() throws SearchException { String thisQuery = this.nextQuery; this.nextQuery = null; // so, nextQuery will be null in case of a searchexception SearchResult result = search(thisQuery, null); - this.nextQuery = result.randomTerm(thisQuery); + if (this.testQuery.equals(thisQuery) && result.getHits().isEmpty()) { + throw new SearchException("No results for test query: " + thisQuery); + } else { + this.nextQuery = result.randomTerm(thisQuery); + } return result; } @@ -313,17 +317,14 @@ public SearchResult search(String query, String debug) throws SearchException { if (this.rerank != null && query != null) { result.scoreReranking(query, this.rerank); } - if (this.testQuery.equals(query) && result.getHits().isEmpty()) { - throw new SearchException("No results for test query: " + query); + if (!result.getHits().isEmpty()) { + this.nrOfOk += 1; // only success if at least one result + this.lastUsedOk = new Date().getTime(); } } catch (Exception e) { // catch all, also runtime exceptions - this.nrOfError += 1; - this.lastUsedError = new Date().getTime(); this.lastMessage = e.getMessage(); throw createPrivateSearchException(e); } - this.nrOfOk += 1; - this.lastUsedOk = new Date().getTime(); result.setQuery(query); result.setResourceId(this.getId()); return result; @@ -406,7 +407,7 @@ private SearchResult xpathSearch(String url, String page, String debug) document = parseDocumentJavascript(page); } else if (this.mimeType.equals("application/xml")) { document = parseDocumentXML(page); - } else if (this.mimeType.equals("text/html")){ + } else if (this.mimeType.equals("text/html")) { document = parseDocumentHTML(page, url); } else { throw new IOException("MIME Type not supported: " + this.mimeType); diff --git a/src/main/java/org/searsia/index/ResourceIndex.java b/src/main/java/org/searsia/index/ResourceIndex.java index be5277c..0e002da 100644 --- a/src/main/java/org/searsia/index/ResourceIndex.java +++ b/src/main/java/org/searsia/index/ResourceIndex.java @@ -348,14 +348,18 @@ public JSONObject toJsonHealth() { lastMessage = this.mother.getId() + " (mother): " + this.mother.getLastError(); } for (Resource engine: this.engines.values()) { + String error = engine.getLastError(); if (engine.isHealthy()) { countOk += 1; } else { countError += 1; - if (lastMessage == null) { // last error of any engine - lastMessage = engine.getId() + ": " + engine.getLastError(); + if (lastMessage == null) { + lastMessage = engine.getId() + ": " + error; } } + if (countError == 0 && lastMessage == null && error != null) { + lastMessage = engine.getId() + ": " + error; // last error of any engine + } } JSONObject stats = new JSONObject(); stats.put("enginesok", countOk); From ce27c8c3e324f32d6ce2a9638333ffee3c1bc6d5 Mon Sep 17 00:00:00 2001 From: Searsia Date: Tue, 2 May 2017 14:09:02 +0200 Subject: [PATCH 24/51] test all option --- src/main/java/org/searsia/Main.java | 52 +++++++++++++------ src/main/java/org/searsia/SearsiaOptions.java | 6 +-- .../java/org/searsia/engine/Resource.java | 4 +- 3 files changed, 42 insertions(+), 20 deletions(-) diff --git a/src/main/java/org/searsia/Main.java b/src/main/java/org/searsia/Main.java index 6cfabfa..8a31ab2 100644 --- a/src/main/java/org/searsia/Main.java +++ b/src/main/java/org/searsia/Main.java @@ -190,13 +190,29 @@ public static String getHashString(String inputString) { } - private static void testMother(Resource mother, String debugInfo, Boolean isQuiet) { - SearchResult result = null; - try { - result = mother.search(mother.getTestQuery(), debugInfo); - } catch (SearchException e) { - fatalError("Test failed: " + e.getMessage()); + private static void testAll(Resource mother, SearchResult result, Boolean isQuiet) throws SearchException { + int nrFailed = 0; + for (Hit hit: result.getHits()) { + if (hit.getRid() != null) { + try { + Resource engine = mother.searchResource(hit.getRid()); + testMother(engine, "none", isQuiet); + } catch (Exception e) { + nrFailed += 1; + printMessage("Test failed: " + e.getMessage(), isQuiet); + } + } } + if (nrFailed > 0) { + throw new SearchException(nrFailed + " engines failed."); + } + } + + + private static void testMother(Resource mother, String debugInfo, Boolean isQuiet) throws SearchException { + printMessage("Testing: " + mother.getName() + " (" + mother.getId() + ")", isQuiet); + SearchResult result = null; + result = mother.search(mother.getTestQuery(), debugInfo); if (!isQuiet) { if (debugInfo.equals("json")) { System.out.println(result.toJson().toString(2)); @@ -211,12 +227,13 @@ private static void testMother(Resource mother, String debugInfo, Boolean isQuie } System.out.flush(); if (result.getHits().isEmpty()) { - fatalError("Test failed: No results for test query."); - } else { - if (result.getHits().size() < 10) { - printMessage("Warning: less than 10 results for query: " + result.getQuery() + "; see \"testquery\" or \"rerank\".", isQuiet); - } - printMessage("Test succeeded.", isQuiet); + throw new SearchException("No results for test query."); + } + if (result.getHits().size() < 10) { + printMessage("Warning: less than 10 results for query: " + result.getQuery() + "; see \"testquery\" or \"rerank\".", isQuiet); + } + if (debugInfo.equals("all")) { + testAll(mother, result, isQuiet); } } @@ -294,8 +311,12 @@ public static void main(String[] args) { // If test is set, test the mother if (options.getTestOutput() != null) { - printMessage("Testing: " + mother.getName(), options.isQuiet()); - testMother(mother, options.getTestOutput(), options.isQuiet()); + try { + testMother(mother, options.getTestOutput(), options.isQuiet()); + printMessage("Test succeeded.", options.isQuiet()); + } catch (Exception e) { + fatalError("Test failed: " + e.getMessage()); + } } else { printMessage("Starting: " + myself.getName(), options.isQuiet()); } @@ -320,8 +341,7 @@ public static void main(String[] args) { // Start the web server String myURI = removeFileNameUri(options.getMyURI()); - try { - + try { SearsiaApplication app = new SearsiaApplication(index, engines); server = GrizzlyHttpServerFactory.createHttpServer(URI.create(myURI), app); } catch (Exception e) { diff --git a/src/main/java/org/searsia/SearsiaOptions.java b/src/main/java/org/searsia/SearsiaOptions.java index 12ad136..20066bb 100644 --- a/src/main/java/org/searsia/SearsiaOptions.java +++ b/src/main/java/org/searsia/SearsiaOptions.java @@ -59,7 +59,7 @@ public SearsiaOptions(String[] args) throws IllegalArgumentException { options.addOption("m", "mother", true, "Set url of mother's web service end point."); options.addOption("p", "path", true, "Set directory path to store the index."); options.addOption("q", "quiet", false, "No output to console."); - options.addOption("t", "test", true, "Print test output and exit (string: 'json', 'xml', 'response')."); + options.addOption("t", "test", true, "Print test output and exit (string: 'json', 'xml', 'response', 'all')."); options.addOption("u", "url", true, "Set url of my web service endpoint."); setDefaults(); parse(options, args); @@ -148,8 +148,8 @@ private void parse(Options options, String[] args) throws IllegalArgumentExcepti } if (cmd.hasOption("t")) { test = cmd.getOptionValue("t").toLowerCase(); - if (!(test.equals("json") || test.equals("xml") || test.equals("response"))) { - throw new IllegalArgumentException("Test output must be one of 'json', 'xml' or 'response'."); + if (!(test.equals("json") || test.equals("xml") || test.equals("response") || test.equals("all"))) { + throw new IllegalArgumentException("Test output must be one of 'json', 'xml', 'response', or 'all'."); } } try { diff --git a/src/main/java/org/searsia/engine/Resource.java b/src/main/java/org/searsia/engine/Resource.java index 966f9e5..ac324d8 100644 --- a/src/main/java/org/searsia/engine/Resource.java +++ b/src/main/java/org/searsia/engine/Resource.java @@ -568,6 +568,9 @@ private InputStream httpConnect(URLConnection connection, String postString) thr http.setRequestMethod("GET"); http.connect(); } + if (http.getResponseCode() == 301) { // FollowRedirects did not work?! + throw new IOException("Moved permanently"); + } return http.getInputStream(); } @@ -585,7 +588,6 @@ private String getCompletePage(String urlString, String postString, Map Date: Tue, 2 May 2017 23:14:18 +0200 Subject: [PATCH 25/51] fix reporting, more options --- src/main/java/org/searsia/Main.java | 4 ++-- src/main/java/org/searsia/SearsiaOptions.java | 15 +++++++++++++-- src/main/java/org/searsia/engine/Resource.java | 9 +++++++-- .../java/org/searsia/index/ResourceIndex.java | 18 ++++++++---------- 4 files changed, 30 insertions(+), 16 deletions(-) diff --git a/src/main/java/org/searsia/Main.java b/src/main/java/org/searsia/Main.java index 8a31ab2..3a93c12 100644 --- a/src/main/java/org/searsia/Main.java +++ b/src/main/java/org/searsia/Main.java @@ -318,7 +318,7 @@ public static void main(String[] args) { fatalError("Test failed: " + e.getMessage()); } } else { - printMessage("Starting: " + myself.getName(), options.isQuiet()); + printMessage("Starting: " + myself.getName() + " (" + myself.getId() + ")", options.isQuiet()); } @@ -357,6 +357,6 @@ public static void main(String[] args) { searsiaDaemon(index, engines, options.getPollInterval()); } catch (InterruptedException e) { } } - server.shutdownNow(); + server.shutdownNow(); // Catch ctrl+c: http://www.waelchatila.com/2006/01/13/1137143896635.html } } diff --git a/src/main/java/org/searsia/SearsiaOptions.java b/src/main/java/org/searsia/SearsiaOptions.java index 20066bb..d054d4a 100644 --- a/src/main/java/org/searsia/SearsiaOptions.java +++ b/src/main/java/org/searsia/SearsiaOptions.java @@ -38,6 +38,7 @@ public class SearsiaOptions { private Boolean quiet; private Boolean help; private Boolean dontshare; + private Boolean nohealth; private int cacheSize; private int pollInterval; private int logLevel; @@ -57,13 +58,14 @@ public SearsiaOptions(String[] args) throws IllegalArgumentException { options.addOption("i", "interval", true, "Set poll interval (integer: in seconds)."); options.addOption("l", "log", true, "Set log level (0=off, 1=error, 2=warn=default, 3=info, 4=debug)."); options.addOption("m", "mother", true, "Set url of mother's web service end point."); + options.addOption("n", "nohealth", false, "Do not share health report."); options.addOption("p", "path", true, "Set directory path to store the index."); options.addOption("q", "quiet", false, "No output to console."); options.addOption("t", "test", true, "Print test output and exit (string: 'json', 'xml', 'response', 'all')."); options.addOption("u", "url", true, "Set url of my web service endpoint."); setDefaults(); parse(options, args); - if (myURI == null) { + if (myURI == null && motherTemplate != null) { myURI = "http://localhost:16842/searsia/" + lastDir(motherTemplate); } } @@ -85,6 +87,7 @@ private void setDefaults() { help = false; quiet = false; dontshare = false; + nohealth = false; cacheSize = 500; pollInterval = 120; logLevel = 2; @@ -177,6 +180,9 @@ private void parse(Options options, String[] args) throws IllegalArgumentExcepti if (cmd.hasOption("d")) { dontshare = true; } + if (cmd.hasOption("n")) { + nohealth = true; + } if (cmd.hasOption("u")) { myURI = cmd.getOptionValue("u"); } @@ -199,7 +205,7 @@ private void parse(Options options, String[] args) throws IllegalArgumentExcepti private void help(Options options) { HelpFormatter formatter = new HelpFormatter(); - formatter.printHelp("SearsiaServer", options); + formatter.printHelp("searsiaserver", options); } public int getCacheSize() { @@ -250,6 +256,10 @@ public Boolean isNotShared() { return dontshare; } + public Boolean isNoHealthReport() { + return nohealth; + } + public Boolean isHelp() { return help; } @@ -265,6 +275,7 @@ public String toString() { result += "\n Cache Size = " + getCacheSize(); result += "\n Test Output = " + getTestOutput(); result += "\n Do Not Share = " + isNotShared(); + result += "\n No Health Rep.= " + isNoHealthReport(); return result; } diff --git a/src/main/java/org/searsia/engine/Resource.java b/src/main/java/org/searsia/engine/Resource.java index ac324d8..df7b695 100644 --- a/src/main/java/org/searsia/engine/Resource.java +++ b/src/main/java/org/searsia/engine/Resource.java @@ -270,7 +270,10 @@ public SearchResult randomSearch() throws SearchException { this.nextQuery = null; // so, nextQuery will be null in case of a searchexception SearchResult result = search(thisQuery, null); if (this.testQuery.equals(thisQuery) && result.getHits().isEmpty()) { - throw new SearchException("No results for test query: " + thisQuery); + this.nrOfError += 1; + this.lastUsedError = new Date().getTime(); + this.lastMessage = "No results for test query: " + thisQuery; + throw new SearchException(this.lastMessage); } else { this.nextQuery = result.randomTerm(thisQuery); } @@ -322,7 +325,9 @@ public SearchResult search(String query, String debug) throws SearchException { this.lastUsedOk = new Date().getTime(); } } catch (Exception e) { // catch all, also runtime exceptions - this.lastMessage = e.getMessage(); + this.nrOfError += 1; + this.lastUsedError = new Date().getTime(); + this.lastMessage = e.getMessage(); throw createPrivateSearchException(e); } result.setQuery(query); diff --git a/src/main/java/org/searsia/index/ResourceIndex.java b/src/main/java/org/searsia/index/ResourceIndex.java index 0e002da..0baff04 100644 --- a/src/main/java/org/searsia/index/ResourceIndex.java +++ b/src/main/java/org/searsia/index/ResourceIndex.java @@ -341,26 +341,24 @@ public JSONObject toJsonHealth() { String lastMessage = null; int countOk = 0, countError = 0; - if (this.mother.isHealthy()) { - countOk += 1; - } else { - countError += 1; - lastMessage = this.mother.getId() + " (mother): " + this.mother.getLastError(); - } for (Resource engine: this.engines.values()) { String error = engine.getLastError(); if (engine.isHealthy()) { countOk += 1; } else { countError += 1; - if (lastMessage == null) { - lastMessage = engine.getId() + ": " + error; - } + lastMessage = engine.getId() + ": " + error; } if (countError == 0 && lastMessage == null && error != null) { - lastMessage = engine.getId() + ": " + error; // last error of any engine + lastMessage = engine.getId() + ": " + error; // last error of any engine } } + if (this.mother.isHealthy()) { + countOk += 1; + } else { + countError += 1; + lastMessage = this.mother.getId() + " (mother): " + this.mother.getLastError(); + } JSONObject stats = new JSONObject(); stats.put("enginesok", countOk); stats.put("engineserr", countError); From 1f72ec47b67187d91523abc21cd6b3c89f2ebb40 Mon Sep 17 00:00:00 2001 From: Searsia Date: Thu, 4 May 2017 22:34:26 +0200 Subject: [PATCH 26/51] xml and index bug fixes --- src/main/java/org/searsia/Hit.java | 12 ++++++++++-- src/main/java/org/searsia/Main.java | 2 +- src/main/java/org/searsia/SearchResult.java | 8 ++++++-- src/main/java/org/searsia/engine/DOMBuilder.java | 4 ++-- src/main/java/org/searsia/web/Search.java | 8 +++----- 5 files changed, 22 insertions(+), 12 deletions(-) diff --git a/src/main/java/org/searsia/Hit.java b/src/main/java/org/searsia/Hit.java index b696f62..9069d4e 100644 --- a/src/main/java/org/searsia/Hit.java +++ b/src/main/java/org/searsia/Hit.java @@ -155,14 +155,22 @@ private String noHTML(String value) { return value.replaceAll("[<>]", ""); } - public JSONObject toJson() { + public JSONObject toJson() { + return toJson(null); + } + + + public JSONObject toJson(String ignoreKey) { JSONObject json = new JSONObject(); for (Entry e: map.entrySet()) { Object value = e.getValue(); if (value instanceof String) { value = noHTML((String) value); } - json.put(e.getKey(), value); + String key = e.getKey(); + if (!e.equals(ignoreKey)) { + json.put(e.getKey(), value); + } } return json; } diff --git a/src/main/java/org/searsia/Main.java b/src/main/java/org/searsia/Main.java index 3a93c12..8afe4f9 100644 --- a/src/main/java/org/searsia/Main.java +++ b/src/main/java/org/searsia/Main.java @@ -101,7 +101,7 @@ private static int getResources(Resource mother, SearchResult result, ResourceIn String rid = hit.getString("rid"); if (rid != null ) { Resource engine = engines.get(rid); - if (engine == null || engine.getLastUpdatedSecondsAgo() > 3600) { + if (engine == null || engine.getLastUpdatedSecondsAgo() > 7200) { // TODO: option for 7200 ? i += 1; try { engine = mother.searchResource(rid); diff --git a/src/main/java/org/searsia/SearchResult.java b/src/main/java/org/searsia/SearchResult.java index 395dc81..2f9875b 100644 --- a/src/main/java/org/searsia/SearchResult.java +++ b/src/main/java/org/searsia/SearchResult.java @@ -264,11 +264,15 @@ public String randomTerm(String notThisOne) { // TODO: keep track of more previo } } - public JSONObject toJson() { + public JSONObject toJson() { + return toJson(null); + } + + public JSONObject toJson(String ignoreKey) { JSONObject r = new JSONObject(); r.put("hits", new JSONArray()); for (Hit hit: hits) { - r.append("hits", hit.toJson()); + r.append("hits", hit.toJson(ignoreKey)); } return r; } diff --git a/src/main/java/org/searsia/engine/DOMBuilder.java b/src/main/java/org/searsia/engine/DOMBuilder.java index 9be99af..ee6da9d 100644 --- a/src/main/java/org/searsia/engine/DOMBuilder.java +++ b/src/main/java/org/searsia/engine/DOMBuilder.java @@ -168,7 +168,7 @@ private static void createDOMfromJsoup(org.jsoup.nodes.Node node, Node out, Docu } else if (node instanceof org.jsoup.nodes.Element) { org.jsoup.nodes.Element e = ((org.jsoup.nodes.Element) node); - org.w3c.dom.Element _e = doc.createElement(e.tagName()); + org.w3c.dom.Element _e = doc.createElement(correctXML(e.tagName())); out.appendChild(_e); org.jsoup.nodes.Attributes atts = e.attributes(); @@ -293,7 +293,7 @@ private static void createDOMfromJSONPrimitive(Object object, Node out, Document * @return */ private static String correctXML(String name) { - name = name.replaceAll("[^A-Z0-9a-z\\-_\\.]|^([^A-Za-z])", "_$1"); + name = name.replaceAll("[^A-Z0-9a-z\\-_\\.]|^([^A-Za-z_])", "_$1"); return name; } diff --git a/src/main/java/org/searsia/web/Search.java b/src/main/java/org/searsia/web/Search.java index 3dd1772..2f3241d 100644 --- a/src/main/java/org/searsia/web/Search.java +++ b/src/main/java/org/searsia/web/Search.java @@ -82,8 +82,8 @@ public Response query(@PathParam("resourceid") String resourceid, @QueryParam("q mother = engines.getMother(); if (!resourceid.equals(me.getId())) { engine = engines.get(resourceid); - if (engine == null || engine.getLastUpdatedSecondsAgo() > 7200) { // unknown or really old? ask your mother - if (mother != null) { // TODO: option for 7200 and similar value (3600) in Main + if (engine == null || engine.getLastUpdatedSecondsAgo() > 9600) { // unknown or really old? ask your mother + if (mother != null) { // TODO: option for 9600 and similar value (7200) in Main try { engine = mother.searchResource(resourceid); } catch (SearchException e) { @@ -102,9 +102,7 @@ public Response query(@PathParam("resourceid") String resourceid, @QueryParam("q if (query != null && query.trim().length() > 0) { result = index.cacheSearch(query, engine.getId()); if (result != null) { - result.removeResourceQuery(); - json = result.toJson(); - json.put("resource", engine.toJson()); + json = result.toJson("query"); LOGGER.info("Cache " + resourceid + ": " + query); return SearsiaApplication.responseOk(json); } else { From 4d0dfb41fa41aa0841804715ecc377ba7e2d9272 Mon Sep 17 00:00:00 2001 From: Searsia Date: Fri, 5 May 2017 23:31:14 +0200 Subject: [PATCH 27/51] resource presentation --- src/main/java/org/searsia/Main.java | 23 +++++++++------- .../java/org/searsia/engine/DOMBuilder.java | 13 +++++----- .../java/org/searsia/engine/Resource.java | 26 ++++++++++++++++++- 3 files changed, 44 insertions(+), 18 deletions(-) diff --git a/src/main/java/org/searsia/Main.java b/src/main/java/org/searsia/Main.java index 8afe4f9..d39b003 100644 --- a/src/main/java/org/searsia/Main.java +++ b/src/main/java/org/searsia/Main.java @@ -61,9 +61,11 @@ public class Main { private static void searsiaDaemon(SearchResultIndex index, ResourceIndex engines, - int pollInterval) throws InterruptedException { - Resource mother = engines.getMother(); - Resource engine = null; + SearsiaOptions options) throws InterruptedException { + Resource mother = engines.getMother(); + Resource engine = null; + int pollInterval = options.getPollInterval(); + String myUri = options.getMyURI(); while(true) { Thread.sleep(pollInterval * 1000); try { @@ -74,7 +76,8 @@ private static void searsiaDaemon(SearchResultIndex index, ResourceIndex engines result = engine.randomSearch(); Resource newmother = result.getResource(); if (newmother != null && newmother.getId().equals(mother.getId())) { - engines.putMother(newmother); // TODO myself! + engines.putMother(newmother); + engines.putMyself(newmother.getLocalResource(myUri)); } else { LOGGER.warn("Unable to update mother: Did ids change?"); } @@ -207,8 +210,8 @@ private static void testAll(Resource mother, SearchResult result, Boolean isQuie throw new SearchException(nrFailed + " engines failed."); } } - - + + private static void testMother(Resource mother, String debugInfo, Boolean isQuiet) throws SearchException { printMessage("Testing: " + mother.getName() + " (" + mother.getId() + ")", isQuiet); SearchResult result = null; @@ -300,15 +303,15 @@ public static void main(String[] args) { if (version != null && !version.startsWith("v1")) { fatalError("Wrong major Searsia version " + version + ": Must be v1.0.0 or higher."); } - myself = mother.deepcopy(); - myself.setUrlAPITemplate(options.getMyURI()); + if (mother.getAPITemplate() == null) { mother.setUrlAPITemplate(options.getMotherTemplate()); } else if (!sameTemplates(mother.getAPITemplate(), options.getMotherTemplate(), mother.getId())) { printMessage("Warning: Mother changed to " + mother.getAPITemplate(), options.isQuiet()); } - + myself = mother.getLocalResource(options.getMyURI()); + // If test is set, test the mother if (options.getTestOutput() != null) { try { @@ -354,7 +357,7 @@ public static void main(String[] args) { printMessage("API end point: " + normalizedUriToTemplate(myURI, myself.getId()), options.isQuiet()); printMessage("Use Ctrl+c to stop.", options.isQuiet()); try { - searsiaDaemon(index, engines, options.getPollInterval()); + searsiaDaemon(index, engines, options); } catch (InterruptedException e) { } } server.shutdownNow(); // Catch ctrl+c: http://www.waelchatila.com/2006/01/13/1137143896635.html diff --git a/src/main/java/org/searsia/engine/DOMBuilder.java b/src/main/java/org/searsia/engine/DOMBuilder.java index ee6da9d..0735963 100644 --- a/src/main/java/org/searsia/engine/DOMBuilder.java +++ b/src/main/java/org/searsia/engine/DOMBuilder.java @@ -192,9 +192,9 @@ else if (!attPrefix.equals("xml")) { } } try { - _e.setAttribute(attName, a.getValue()); + _e.setAttribute(attName, a.getValue()); } catch (DOMException domExcept) { - continue; + continue; } } @@ -244,12 +244,11 @@ private static void createDOMfromJSONObject(JSONObject json, Node out, Document Object object = json.get(name); if (object instanceof JSONArray) { createDOMfromJSONArray((JSONArray) object, out, doc, name); + } else if (object instanceof JSONObject) { + org.w3c.dom.Element _e = doc.createElement(correctXML(name)); + out.appendChild(_e); + createDOMfromJSONObject((JSONObject) object, _e, doc); } else { - if (object instanceof JSONObject) { - org.w3c.dom.Element _e = doc.createElement(correctXML(name)); - out.appendChild(_e); - createDOMfromJSONObject((JSONObject) object, _e, doc); - } else createDOMfromJSONPrimitive(object, out, doc, name); } } diff --git a/src/main/java/org/searsia/engine/Resource.java b/src/main/java/org/searsia/engine/Resource.java index df7b695..9904e7d 100644 --- a/src/main/java/org/searsia/engine/Resource.java +++ b/src/main/java/org/searsia/engine/Resource.java @@ -762,7 +762,31 @@ public boolean isHealthy() { return this.lastUsedOk >= this.lastUsedError; } - + + public Resource getLocalResource(String myUri) { + JSONObject json = new JSONObject(); + Resource result = null; + json.put("apitemplate", myUri); + json.put("id", this.getId()); + json.put("mimetype", SearchResult.SEARSIA_MIME_TYPE); + String value = this.getName(); + if (value != null) { json.put("name", value); } + json.put("name", this.getName()); + value = this.getBanner(); + if (value != null) { json.put("banner", value); } + value = this.getFavicon(); + if (value != null) { json.put("favicon", value); } + value = this.getSuggestTemplate(); + if (value != null) { json.put("suggesttemplate", value); } + value = this.getTestQuery(); + if (value != null) { json.put("testquery", value); } + try { + result = new Resource(json); + } catch (XPathExpressionException e) { } + return result; + } + + public float score(String query) { float score = 0.0f; Map nameTerm = new HashMap(); From 1ea3954abc228eaeb7ea4a8048352aa944357030 Mon Sep 17 00:00:00 2001 From: Searsia Date: Sat, 6 May 2017 00:05:34 +0200 Subject: [PATCH 28/51] fix my apitemplate --- src/main/java/org/searsia/SearsiaOptions.java | 8 +++++--- src/main/java/org/searsia/engine/Resource.java | 2 +- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/src/main/java/org/searsia/SearsiaOptions.java b/src/main/java/org/searsia/SearsiaOptions.java index d054d4a..76b6cb4 100644 --- a/src/main/java/org/searsia/SearsiaOptions.java +++ b/src/main/java/org/searsia/SearsiaOptions.java @@ -65,14 +65,16 @@ public SearsiaOptions(String[] args) throws IllegalArgumentException { options.addOption("u", "url", true, "Set url of my web service endpoint."); setDefaults(); parse(options, args); - if (myURI == null && motherTemplate != null) { + if (myURI == null) { myURI = "http://localhost:16842/searsia/" + lastDir(motherTemplate); + } else if (myURI.endsWith("/")) { + myURI += "/"; } } - + private static String lastDir(String uri) { - if (uri.contains("/")) { + if (uri != null && uri.contains("/")) { uri = uri.replaceAll("\\/[^\\/]*$", ""); uri = uri.replaceAll("^.+\\/", ""); return uri + "/"; diff --git a/src/main/java/org/searsia/engine/Resource.java b/src/main/java/org/searsia/engine/Resource.java index 9904e7d..4329ad2 100644 --- a/src/main/java/org/searsia/engine/Resource.java +++ b/src/main/java/org/searsia/engine/Resource.java @@ -766,7 +766,7 @@ public boolean isHealthy() { public Resource getLocalResource(String myUri) { JSONObject json = new JSONObject(); Resource result = null; - json.put("apitemplate", myUri); + json.put("apitemplate", myUri + this.getId() + ".json?q={q}"); json.put("id", this.getId()); json.put("mimetype", SearchResult.SEARSIA_MIME_TYPE); String value = this.getName(); From 106ebcd68376c0676106a92549e883a8a9911cc2 Mon Sep 17 00:00:00 2001 From: Searsia Date: Sat, 6 May 2017 13:08:19 +0200 Subject: [PATCH 29/51] documentation and minor bug fixes --- src/main/java/org/searsia/Hit.java | 27 +++++++++--- src/main/java/org/searsia/Main.java | 43 +++++++++++-------- src/main/java/org/searsia/SearchResult.java | 31 +++++++++---- src/main/java/org/searsia/SearsiaOptions.java | 34 ++++++++++++--- .../java/org/searsia/engine/DOMBuilder.java | 24 ++++++----- .../java/org/searsia/engine/Resource.java | 7 +++ .../org/searsia/engine/SearchException.java | 4 +- .../org/searsia/engine/TextExtractor.java | 37 ++++++++++------ .../org/searsia/index/SearchResultIndex.java | 8 ++-- 9 files changed, 146 insertions(+), 69 deletions(-) diff --git a/src/main/java/org/searsia/Hit.java b/src/main/java/org/searsia/Hit.java index 9069d4e..59d5dec 100644 --- a/src/main/java/org/searsia/Hit.java +++ b/src/main/java/org/searsia/Hit.java @@ -23,6 +23,13 @@ import org.json.JSONObject; +/** + * A single search hit. A hit can have any field. + * Standard fields are "title", "description", "url, "favicon", "image". + * + * @author Djoerd Hiemstra and Dolf Trieschnigg + */ + public class Hit implements Comparable { private Map map; @@ -89,9 +96,9 @@ public void setUrl(String url) { /** * This id of will be used the Lucene index. - * So one url may be indexed multiple times, + * One url may be indexed multiple times, * once for each resource id (rid). - * @return + * @return unique identifier */ public String getId() { String result = (String) map.get("url"); @@ -156,11 +163,19 @@ private String noHTML(String value) { } public JSONObject toJson() { - return toJson(null); + JSONObject json = new JSONObject(); + for (Entry e: map.entrySet()) { + Object value = e.getValue(); + if (value instanceof String) { + value = noHTML((String) value); + } + json.put(e.getKey(), value); + } + return json; } - public JSONObject toJson(String ignoreKey) { + public JSONObject toJsonNoQueryResourceId() { JSONObject json = new JSONObject(); for (Entry e: map.entrySet()) { Object value = e.getValue(); @@ -168,8 +183,8 @@ public JSONObject toJson(String ignoreKey) { value = noHTML((String) value); } String key = e.getKey(); - if (!e.equals(ignoreKey)) { - json.put(e.getKey(), value); + if (!key.equals("query") && !key.equals("rid")) { + json.put(key, value); } } return json; diff --git a/src/main/java/org/searsia/Main.java b/src/main/java/org/searsia/Main.java index d39b003..774ae8e 100644 --- a/src/main/java/org/searsia/Main.java +++ b/src/main/java/org/searsia/Main.java @@ -39,14 +39,14 @@ /** - * Searsia Main class does the following actions: + * Searsia Main class. Does the following actions: * - * 1. Connect to mother peer - * 2. If it runs in test mode, test the mother, print results and exit. - * 3. Open/create Lucene indexes - * 4. Get the 10 top resources if older than one hour - * 5. Run the web server - * 6. Run the daemon to periodically poll the mother and resources + * 1. Connect to mother peer; + * 2. If it runs in test mode, test the mother, print results and exit; + * 3. Open/create Lucene indexes; + * 4. Get the 10 top resources if not existing or too old; + * 5. Run the web server; + * 6. Run the daemon to periodically poll the mother and resources. * * Start as: java -jar target/searsiaserver.jar * More info: java -jar target/searsiaserver.jar --help @@ -91,8 +91,12 @@ private static void searsiaDaemon(SearchResultIndex index, ResourceIndex engines index.offer(result); LOGGER.info("Sampled " + engine.getId() + ": " + result.getQuery()); } - } catch (Exception e) { - LOGGER.warn("Sampling " + engine.getId() + " failed: " + e.getMessage()); + } catch (Exception e) { + if (engine != null) { + LOGGER.warn("Sampling " + engine.getId() + " failed: " + e.getMessage()); + } else { + LOGGER.warn("Flushing index to disk failed:" + e.getMessage()); + } } } } @@ -170,7 +174,7 @@ private static void fatalError(String message) { /** * For a unique filename (public because used in searsiafedweb) * @param inputString - * @return + * @return Unique hash */ public static String getHashString(String inputString) { MessageDigest md; @@ -227,8 +231,8 @@ private static void testMother(Resource mother, String debugInfo, Boolean isQuie System.out.println(debugOut); } } + System.out.flush(); } - System.out.flush(); if (result.getHits().isEmpty()) { throw new SearchException("No results for test query."); } @@ -264,7 +268,7 @@ private static void setupLogger(String path, String filename, Level level) throw public static void main(String[] args) { - ResourceIndex engines = null; + ResourceIndex engines = null; SearchResultIndex index = null; SearsiaOptions options = null; HttpServer server = null; @@ -282,10 +286,10 @@ public static void main(String[] args) { // Connect to the mother engine and gather information from the mother. - Resource myself = null; - Resource mother = null; + Resource myself = null; + Resource mother = null; Resource connect = new Resource(options.getMotherTemplate()); - String version = null; + String version = null; SearchResult result = null; try { result = connect.searchWithoutQuery(); @@ -303,13 +307,15 @@ public static void main(String[] args) { if (version != null && !version.startsWith("v1")) { fatalError("Wrong major Searsia version " + version + ": Must be v1.0.0 or higher."); } - + + if (mother.getAPITemplate() == null) { mother.setUrlAPITemplate(options.getMotherTemplate()); } else if (!sameTemplates(mother.getAPITemplate(), options.getMotherTemplate(), mother.getId())) { printMessage("Warning: Mother changed to " + mother.getAPITemplate(), options.isQuiet()); } - myself = mother.getLocalResource(options.getMyURI()); + String myURI = removeFileNameUri(options.getMyURI()); + myself = mother.getLocalResource(myURI); // If test is set, test the mother @@ -343,8 +349,7 @@ public static void main(String[] args) { // Start the web server - String myURI = removeFileNameUri(options.getMyURI()); - try { + try { SearsiaApplication app = new SearsiaApplication(index, engines); server = GrizzlyHttpServerFactory.createHttpServer(URI.create(myURI), app); } catch (Exception e) { diff --git a/src/main/java/org/searsia/SearchResult.java b/src/main/java/org/searsia/SearchResult.java index 2f9875b..f17f1b8 100644 --- a/src/main/java/org/searsia/SearchResult.java +++ b/src/main/java/org/searsia/SearchResult.java @@ -31,6 +31,13 @@ import org.searsia.index.ResourceIndex; import org.searsia.engine.Resource; +/** + * A Searsia Search result page, + * consisting of "hits", a "query" and a "resource". + * + * @author Djoerd Hiemstra and Dolf Trieschnigg + */ + public class SearchResult { public static final String SEARSIA_MIME_TYPE = "application/searsia+json"; public static final String SEARSIA_MIME_ENCODING = SEARSIA_MIME_TYPE + "; charset=utf-8"; @@ -171,13 +178,13 @@ public void scoreResourceSelection(String query, ResourceIndex engines) { } /** - * TODO: needs a proper implementation, refactoring, and research ;-) * Scoring follows these rules: - * 1. hits are ordered such that the first hit per rid determines the resource ranking - * 2. if a resource has a exact query match, then these are ranked highest (given rule 1) - * 3. order by score (given rule 1 and rule 2) - * 4. TODO: not more than x (=10?) hits per resource - * 5. stop after 20 resources + * (TODO: needs a proper implementation, refactoring, and research ;-) ) + * 1. hits are ordered such that the first hit per rid determines the resource ranking; + * 2. if a resource has a exact query match, then these are ranked highest (given rule 1); + * 3. order by score (given rule 1 and rule 2); + * 4. TODO: not more than x (=10?) hits per resource; + * 5. stop after 20 resources. * @param query * @param engines */ @@ -265,15 +272,21 @@ public String randomTerm(String notThisOne) { // TODO: keep track of more previo } public JSONObject toJson() { - return toJson(null); + return toJson(false); } - public JSONObject toJson(String ignoreKey) { + public JSONObject toJson(boolean censorQueryResourceId) { JSONObject r = new JSONObject(); r.put("hits", new JSONArray()); for (Hit hit: hits) { - r.append("hits", hit.toJson(ignoreKey)); + if (censorQueryResourceId) { + r.append("hits", hit.toJsonNoQueryResourceId()); + } else { + r.append("hits", hit.toJson()); + } } return r; } + + } diff --git a/src/main/java/org/searsia/SearsiaOptions.java b/src/main/java/org/searsia/SearsiaOptions.java index 76b6cb4..4959aff 100644 --- a/src/main/java/org/searsia/SearsiaOptions.java +++ b/src/main/java/org/searsia/SearsiaOptions.java @@ -27,7 +27,7 @@ import org.apache.commons.cli.ParseException; /** - * Searsia Server options + * Searsia Server options. * @author Djoerd Hiemstra * */ @@ -47,10 +47,11 @@ public class SearsiaOptions { private String indexPath; /** - * Takes command line options and sensible defaults - * + * Takes command line options and sensible defaults. + * @param args Command Line options + * @throws IllegalArgumentException */ - public SearsiaOptions(String[] args) throws IllegalArgumentException { + public SearsiaOptions(String[] args) throws IllegalArgumentException { Options options = new Options(); options.addOption("c", "cache", true, "Set cache size (integer: number of result pages)."); options.addOption("d", "dontshare",false, "Do not share resource definitions."); @@ -67,8 +68,6 @@ public SearsiaOptions(String[] args) throws IllegalArgumentException { parse(options, args); if (myURI == null) { myURI = "http://localhost:16842/searsia/" + lastDir(motherTemplate); - } else if (myURI.endsWith("/")) { - myURI += "/"; } } @@ -210,18 +209,37 @@ private void help(Options options) { formatter.printHelp("searsiaserver", options); } + /** + * Get the size of the SearchResult cache. + * @return cache size + */ public int getCacheSize() { return cacheSize; } + /** + * Get the test that needs to be executed. + * Possible values: "json", "xml", "response", "all", or null (no test) + * @return test + */ public String getTestOutput() { return test; } + /** + * Get log level, a value between 0 and 5 + * Possible values: 0=off, 1=error, 2=warn (default), 3=info, 4=debug, 5=trace + * @return log level + */ public int getLogLevel() { return logLevel; } + /** + * Get the log4j level. + * Possible values: off, error, warn (default), info, debug, trace + * @return log4j level + */ public Level getLoggerLevel() { switch(logLevel) { case 0 : return Level.OFF; @@ -234,6 +252,10 @@ public Level getLoggerLevel() { } } + /** + * Get poll interval (in seconds). + * @return poll interval. + */ public int getPollInterval() { return pollInterval; } diff --git a/src/main/java/org/searsia/engine/DOMBuilder.java b/src/main/java/org/searsia/engine/DOMBuilder.java index 0735963..0a0a1be 100644 --- a/src/main/java/org/searsia/engine/DOMBuilder.java +++ b/src/main/java/org/searsia/engine/DOMBuilder.java @@ -39,12 +39,9 @@ import org.xml.sax.InputSource; /** - * Returns a W3C DOM for a Jsoup parsed document. - * - * @author Walter Kasper - * - * Returns a W3C DOM for a Json document + * Returns a W3C DOM for a Jsoup parsed document or a Json parsed document. * + * @author Walter Kasper * @author Djoerd Hiemstra * */ @@ -54,7 +51,7 @@ public class DOMBuilder { * Returns a W3C DOM that exposes the content as the supplied XML string. * @param xmlString The XML string to parse. * @return A W3C Document. - * @throws + * @throws RuntimeException if not well-formed */ public static Document string2DOM(String xmlString) { @@ -78,6 +75,11 @@ public static Document string2DOM(String xmlString) { } + /** + * Returns an XML string for a W3C Document + * @param document A W3C Document + * @return XML string + */ public static String DOM2String(Document document) { TransformerFactory tf = TransformerFactory.newInstance(); Transformer transformer; @@ -126,8 +128,8 @@ public static Document jsoup2DOM(org.jsoup.nodes.Document jsoupDocument) { /** - * Returns a W3C DOM that exposes the same content as the supplied Jsoup document into a W3C DOM. - * @param jsoupDocument The Jsoup document to convert. + * Returns a W3C DOM that exposes the same content as the supplied JSON Object into a W3C DOM. + * @param jsonDocument The JSON Object to convert. * @return A W3C Document. */ public static Document json2DOM(JSONObject jsonDocument) { @@ -286,10 +288,10 @@ private static void createDOMfromJSONPrimitive(Object object, Node out, Document } /** - * Element names can contain letters, digits, hyphens, underscores, and periods + * XML Element names can contain letters, digits, hyphens, underscores, and periods * Element names must start with a letter or underscore - * @param name - * @return + * @param name XML element name + * @return correct XML element name */ private static String correctXML(String name) { name = name.replaceAll("[^A-Z0-9a-z\\-_\\.]|^([^A-Za-z_])", "_$1"); diff --git a/src/main/java/org/searsia/engine/Resource.java b/src/main/java/org/searsia/engine/Resource.java index 4329ad2..31ae035 100644 --- a/src/main/java/org/searsia/engine/Resource.java +++ b/src/main/java/org/searsia/engine/Resource.java @@ -55,6 +55,12 @@ import org.searsia.Hit; import org.searsia.SearchResult; +/** + * A Searsia Resource: A wrapper for external search engines. It can read results from + * engines that produce results in: HTML, XML, JSON, and SEARSIA JSON. + * + * @author Djoerd Hiemstra and Dolf Trieschnigg + */ public class Resource implements Comparable { public final static String defaultTestQuery = "searsia"; @@ -156,6 +162,7 @@ public Resource(JSONObject jo) throws XPathExpressionException, JSONException { } } + public void setUrlAPITemplate(String urlTemplate) { this.urlAPITemplate = urlTemplate; } diff --git a/src/main/java/org/searsia/engine/SearchException.java b/src/main/java/org/searsia/engine/SearchException.java index b3a3006..46c0806 100644 --- a/src/main/java/org/searsia/engine/SearchException.java +++ b/src/main/java/org/searsia/engine/SearchException.java @@ -16,7 +16,9 @@ package org.searsia.engine; - +/** + * A Searsia Search Exception + */ public class SearchException extends Exception { private static final long serialVersionUID = -7429746644586456271L; diff --git a/src/main/java/org/searsia/engine/TextExtractor.java b/src/main/java/org/searsia/engine/TextExtractor.java index cb39252..8bad627 100644 --- a/src/main/java/org/searsia/engine/TextExtractor.java +++ b/src/main/java/org/searsia/engine/TextExtractor.java @@ -27,13 +27,18 @@ import org.searsia.Hit; +/** + * Manage XPath queries and extract the hit fields. + * + * @author Dolf Trieschnigg + * @author Djoerd Hiemstra + */ public class TextExtractor { private String field; private String xpath; private XPathExpression compiledXpath; - - private boolean trim = true; + public TextExtractor(String field, String xpath) throws XPathExpressionException { this.field = field; @@ -45,6 +50,12 @@ public TextExtractor(String field, String xpath) throws XPathExpressionException } + /** + * Modifies hit by adding result for the text extractor + * @param item An XML context element + * @param hit An updated hit + * @throws XPathExpressionException + */ public void extract(Node item, Hit hit) throws XPathExpressionException { String resultString = ""; // TODO: StringBuilder try { @@ -66,32 +77,30 @@ public void extract(Node item, Hit hit) throws XPathExpressionException { } } - /** - * processes the match found with the XPath - * - * By default, uses the trim attribute to indicate whether the match should be trimmed - * Note: the string can be null - * - * @param s - * @return - */ private String processMatch(String s) { s = s.replaceAll("(?i)]*>||||", ""); // No HTML, please: spans removed s = s.replaceAll("<[^>]+>|\ufffd", " "); // all other tags or unicode replace character replaced by a space - if (trim) { - s = s.trim(); - } + s = s.trim(); // TODO multiple spaces, \\s ? return s; } + /** + * Get the field for the text extractor + * @return field + */ public String getField() { return field; } + /** + * Get the XPath query for the text extractor + * @return XPath query + */ public String getPath() { return xpath; } + @Override public boolean equals(Object o) { TextExtractor e = (TextExtractor) o; if (!getField().equals(e.getField())) return false; diff --git a/src/main/java/org/searsia/index/SearchResultIndex.java b/src/main/java/org/searsia/index/SearchResultIndex.java index ab60744..0700429 100644 --- a/src/main/java/org/searsia/index/SearchResultIndex.java +++ b/src/main/java/org/searsia/index/SearchResultIndex.java @@ -178,7 +178,7 @@ public SearchResult search (String queryString, int hitsPerPage) throws IOExcept /** * Get Hit by Lucene id. Used for tests only * @param hitId - * @return + * @return hit * @throws IOException */ protected Hit getHit(String hitId) throws IOException { @@ -199,13 +199,15 @@ protected Hit getHit(String hitId) throws IOException { /** * Searches the queue for a cached result + * TODO: Is this thread safe in case of a concurrent cash flush? See: + * https://docs.oracle.com/javase/8/docs/api/java/util/concurrent/package-summary.html#Weakly * @param query - * @return + * @return search result page */ public SearchResult cacheSearch(String query, String resourceId) { if (query != null && resourceId != null) { // TODO: make more efficient with initial HashMap check query+id Iterator iterator = this.queue.iterator(); - while(iterator.hasNext()) { + while (iterator.hasNext()) { SearchResult result = iterator.next(); if (query.equals(result.getQuery()) && resourceId.equals(result.getResourceId())) { return result; From 2b8ca381ef5378da8bad9256c2e59b21f5b5380c Mon Sep 17 00:00:00 2001 From: Searsia Date: Tue, 9 May 2017 21:37:42 +0200 Subject: [PATCH 30/51] tests and cache fix --- src/main/java/org/searsia/Main.java | 3 +- .../org/searsia/index/SearchResultIndex.java | 1 + src/main/java/org/searsia/web/Search.java | 4 +- .../java/org/searsia/SearchResultTest.java | 8 +-- .../searsia/index/TestSearchResultIndex.java | 11 ++-- src/test/java/org/searsia/web/SearchTest.java | 50 +++++++++++++++++-- src/test/resources/exampleSearchResult.json | 3 +- 7 files changed, 64 insertions(+), 16 deletions(-) diff --git a/src/main/java/org/searsia/Main.java b/src/main/java/org/searsia/Main.java index 774ae8e..a6bbe27 100644 --- a/src/main/java/org/searsia/Main.java +++ b/src/main/java/org/searsia/Main.java @@ -73,6 +73,7 @@ private static void searsiaDaemon(SearchResultIndex index, ResourceIndex engines SearchResult result = null; if (mother != null && random.nextBoolean()) { // sample mostly from mother engine = mother; + LOGGER.trace("Next: mother sample"); result = engine.randomSearch(); Resource newmother = result.getResource(); if (newmother != null && newmother.getId().equals(mother.getId())) { @@ -84,6 +85,7 @@ private static void searsiaDaemon(SearchResultIndex index, ResourceIndex engines getResources(mother, result, engines); } else { engine = engines.getRandom(); + LOGGER.trace("Next sample: " + engine.getId()); result = engine.randomSearch(); result.removeResourceQuery(); // only trust your mother result.addQueryResourceDate(engine.getId()); @@ -260,7 +262,6 @@ private static void setupLogger(String path, String filename, Level level) throw new PatternLayout("%p %d{ISO8601} %m%n"), logDir.resolve("searsia.log").toString(), "'.'yyyy-MM-dd"); - // Appender appender = new ConsoleAppender(new PatternLayout("%m%n"), ConsoleAppender.SYSTEM_ERR); LOGGER.addAppender(appender); LOGGER.setLevel(level); LOGGER.warn("Searsia restart"); diff --git a/src/main/java/org/searsia/index/SearchResultIndex.java b/src/main/java/org/searsia/index/SearchResultIndex.java index 0700429..461cba6 100644 --- a/src/main/java/org/searsia/index/SearchResultIndex.java +++ b/src/main/java/org/searsia/index/SearchResultIndex.java @@ -142,6 +142,7 @@ private void storeSearchResult(SearchResult result) throws IOException { } public void offer(SearchResult result) { + // assert(result.getQuery() != null && result.getResourceId() != null); this.queue.offer(result); } diff --git a/src/main/java/org/searsia/web/Search.java b/src/main/java/org/searsia/web/Search.java index 2f3241d..c80f9f2 100644 --- a/src/main/java/org/searsia/web/Search.java +++ b/src/main/java/org/searsia/web/Search.java @@ -102,7 +102,9 @@ public Response query(@PathParam("resourceid") String resourceid, @QueryParam("q if (query != null && query.trim().length() > 0) { result = index.cacheSearch(query, engine.getId()); if (result != null) { - json = result.toJson("query"); + boolean censorQueryResourceId = true; + json = result.toJson(censorQueryResourceId); + json.put("resource", engine.toJson()); LOGGER.info("Cache " + resourceid + ": " + query); return SearsiaApplication.responseOk(json); } else { diff --git a/src/test/java/org/searsia/SearchResultTest.java b/src/test/java/org/searsia/SearchResultTest.java index 1047fad..b217740 100644 --- a/src/test/java/org/searsia/SearchResultTest.java +++ b/src/test/java/org/searsia/SearchResultTest.java @@ -1,14 +1,9 @@ package org.searsia; -import java.io.IOException; - -import javax.ws.rs.core.Response; - import org.junit.Assert; import org.junit.Test; import org.searsia.Hit; import org.searsia.SearchResult; -import org.searsia.web.Search; public class SearchResultTest { @@ -33,13 +28,14 @@ public void test3() { Hit h = new Hit("The ultimate test", "Oh yeah", "http://searsia.org", "http://searsia.org/images/search.png"); sr.addHit(h); + String terms = h.toIndexVersion().toLowerCase(); h = new Hit("Another test", "yeah", "http://searsia.org/test.html", "http://searsia.org/images/search.png"); sr.addHit(h); + terms += " " + h.toIndexVersion().toLowerCase(); String notThis = "test"; String term = sr.randomTerm(notThis); Assert.assertFalse("Same random term", term.equals(notThis)); - String terms = h.toIndexVersion().toLowerCase(); Assert.assertTrue("Index contains random term: " + term, terms.contains(term)); Assert.assertEquals("Total nr of hits", sr.getHits().size(), 2); sr.scoreReranking("test", "or"); diff --git a/src/test/java/org/searsia/index/TestSearchResultIndex.java b/src/test/java/org/searsia/index/TestSearchResultIndex.java index 40cdad5..bc7fbd5 100644 --- a/src/test/java/org/searsia/index/TestSearchResultIndex.java +++ b/src/test/java/org/searsia/index/TestSearchResultIndex.java @@ -30,7 +30,7 @@ public class TestSearchResultIndex { public static void setUp() throws Exception { LOGGER.removeAllAppenders(); LOGGER.addAppender(new NullAppender()); - index = new SearchResultIndex(PATH, INDEX, 2); + index = new SearchResultIndex(PATH, INDEX, 10); SearchResult result = readFile("exampleSearchResult.json"); index.offer(result); index.flush(); @@ -56,6 +56,7 @@ private static SearchResult readFile(String fileString) throws IOException { } JSONObject resource = json.getJSONObject("resource"); result.setResourceId(resource.getString("id")); + result.setQuery(json.getString("query")); return result; } @@ -76,11 +77,15 @@ public void testSearch1() throws Exception { public void testSearch2() throws Exception { SearchResult result = readFile("exampleSearchResult.json"); index.offer(result); - index.flush(); // add it again String query = "dolf"; + String resourceId = result.getResourceId(); + SearchResult result2 = index.cacheSearch(query, resourceId); + Assert.assertEquals(query, result2.getQuery()); + Assert.assertEquals("Cache result size", 10, result2.getHits().size()); + index.flush(); result = index.search(query); Assert.assertEquals(query, result.getQuery()); - Assert.assertEquals(1, result.getHits().size()); + Assert.assertEquals("Index result size", 1, result.getHits().size()); } @Test diff --git a/src/test/java/org/searsia/web/SearchTest.java b/src/test/java/org/searsia/web/SearchTest.java index 44daaaf..5ad01a7 100644 --- a/src/test/java/org/searsia/web/SearchTest.java +++ b/src/test/java/org/searsia/web/SearchTest.java @@ -5,7 +5,11 @@ import javax.ws.rs.core.Response; import javax.xml.xpath.XPathExpressionException; +import org.apache.log4j.Appender; +import org.apache.log4j.ConsoleAppender; +import org.apache.log4j.Level; import org.apache.log4j.Logger; +import org.apache.log4j.PatternLayout; import org.apache.log4j.varia.NullAppender; import org.json.JSONArray; import org.json.JSONException; @@ -20,6 +24,8 @@ import org.searsia.engine.Resource; public class SearchTest { + + private static boolean letsLog = false; private static final Logger LOGGER = Logger.getLogger("org.searsia"); private static final String PATH = "target/index-test"; @@ -33,7 +39,11 @@ private static Resource wiki() throws XPathExpressionException, JSONException { } private static Resource wrong() throws XPathExpressionException, JSONException { - return new Resource(new JSONObject("{\"apitemplate\":\"http://searsia.com/doesnotexist?q={q}\", \"id\":\"wrong\"}")); + return new Resource(new JSONObject("{\"apitemplate\":\"http://doesnotexist.com/wrong?q={q}\", \"id\":\"wrong\"}")); + } + + private static Resource ok() throws XPathExpressionException, JSONException { + return new Resource(new JSONObject("{\"apitemplate\":\"http://searsia.org/searsia/wiki/wikifull1{q}.json\", \"id\":\"wikifull1\"}")); } private static Resource me() throws XPathExpressionException, JSONException { @@ -43,12 +53,20 @@ private static Resource me() throws XPathExpressionException, JSONException { @BeforeClass public static void setUp() throws Exception { + Appender appender = null; LOGGER.removeAllAppenders(); - LOGGER.addAppender(new NullAppender()); // thou shall not log - index = new SearchResultIndex(PATH, INDEX, 2); + if (letsLog) { + appender = new ConsoleAppender(new PatternLayout("%m%n"), ConsoleAppender.SYSTEM_ERR); + } else { + appender = new NullAppender(); // thou shall not log + } + LOGGER.addAppender(appender); + LOGGER.setLevel(Level.ALL); + index = new SearchResultIndex(PATH, INDEX, 10); engines = new ResourceIndex(PATH, INDEX); engines.putMother(wiki()); - engines.put(wrong()); + engines.put(wrong()); + engines.put(ok()); engines.putMyself(me()); } @@ -88,6 +106,7 @@ public void testQuery() throws IOException { Assert.assertEquals(200, status); Assert.assertTrue(hits.length() > 0); Assert.assertEquals("http://searsia.org", url); + Assert.assertNotNull(json.get("resource")); } @Test // returns local resource 'wrong' @@ -122,4 +141,27 @@ public void testError() throws IOException { Assert.assertEquals(503, status); } + @Test // returns results for the engine 'wikifull1' + public void testOk() throws IOException { + Search search = new Search(index, engines); + Response response = search.query("wikifull1.json", "informat"); + int status = response.getStatus(); + String entity = (String) response.getEntity(); + JSONObject json = new JSONObject(entity); + Assert.assertEquals(200, status); + Assert.assertNotNull(json.get("hits")); + Assert.assertNotNull(json.get("resource")); + LOGGER.trace("Query result: " + json); + + response = search.query("wikifull1.json", "informat"); + status = response.getStatus(); + entity = (String) response.getEntity(); + json = new JSONObject(entity); + Assert.assertEquals(200, status); + Assert.assertNotNull(json.get("hits")); + Assert.assertNotNull(json.get("resource")); + LOGGER.trace("Cache result: " + json); + } + + } diff --git a/src/test/resources/exampleSearchResult.json b/src/test/resources/exampleSearchResult.json index 3a051e8..16f5ce9 100644 --- a/src/test/resources/exampleSearchResult.json +++ b/src/test/resources/exampleSearchResult.json @@ -56,5 +56,6 @@ "urltemplate": "http:\/\/wwwhome.cs.utwente.nl\/~hiemstra\/?s={q}", "favicon": "http:\/\/wwwhome.cs.utwente.nl\/~hiemstra\/images\/ut.ico", "name": "Djoerd Hiemstra" - } + }, + "query": "dolf" } From 586a818cefd0f82837faeb74be6e80818866bccb Mon Sep 17 00:00:00 2001 From: Searsia Date: Thu, 11 May 2017 15:06:29 +0200 Subject: [PATCH 31/51] random term and apitemplate bugs --- src/main/java/org/searsia/Hit.java | 10 ++++++++++ src/main/java/org/searsia/Main.java | 3 +++ src/main/java/org/searsia/SearchResult.java | 5 +++-- src/main/java/org/searsia/SearsiaOptions.java | 4 ++-- 4 files changed, 18 insertions(+), 4 deletions(-) diff --git a/src/main/java/org/searsia/Hit.java b/src/main/java/org/searsia/Hit.java index 59d5dec..fa7ec8d 100644 --- a/src/main/java/org/searsia/Hit.java +++ b/src/main/java/org/searsia/Hit.java @@ -200,6 +200,16 @@ public String toIndexVersion() { // TODO: special treatment for urls, etc. and S return result.trim(); } + public String toTitleDescriptionIndexVersion() { + String result = (String) this.get("title"); + String desc = (String) this.get("description"); + if (result == null) { result = ""; } + if (desc != null) { + result += " " + desc; + } + return result.trim(); + } + @Override public int compareTo(Hit hit2) { Float score1 = getResourceScore(); // order by best resources diff --git a/src/main/java/org/searsia/Main.java b/src/main/java/org/searsia/Main.java index a6bbe27..446f4a7 100644 --- a/src/main/java/org/searsia/Main.java +++ b/src/main/java/org/searsia/Main.java @@ -77,6 +77,9 @@ private static void searsiaDaemon(SearchResultIndex index, ResourceIndex engines result = engine.randomSearch(); Resource newmother = result.getResource(); if (newmother != null && newmother.getId().equals(mother.getId())) { + if (newmother.getAPITemplate() == null) { + newmother.setUrlAPITemplate(mother.getAPITemplate()); + } engines.putMother(newmother); engines.putMyself(newmother.getLocalResource(myUri)); } else { diff --git a/src/main/java/org/searsia/SearchResult.java b/src/main/java/org/searsia/SearchResult.java index f17f1b8..a374ff0 100644 --- a/src/main/java/org/searsia/SearchResult.java +++ b/src/main/java/org/searsia/SearchResult.java @@ -255,15 +255,16 @@ public String randomTerm(String notThisOne) { // TODO: keep track of more previo int size = this.hits.size(); if (size > 0) { int nr = random.nextInt(this.hits.size()); - String text = this.hits.get(nr).toIndexVersion().toLowerCase(); + String text = this.hits.get(nr).toTitleDescriptionIndexVersion().toLowerCase(); String terms[] = text.split(TOKENIZER); // TODO Lucene tokenizer? nr = random.nextInt(terms.length); String thisOne = terms[nr]; int i = nr + 1; while (thisOne.length() < 1 || notThisOne.equals(thisOne)) { if (i >= terms.length) { i = 0; } - thisOne = terms[i]; + else { i += 1; } if (i == nr) { return null; } + thisOne = terms[i]; } return thisOne; } else { diff --git a/src/main/java/org/searsia/SearsiaOptions.java b/src/main/java/org/searsia/SearsiaOptions.java index 4959aff..2c0b4de 100644 --- a/src/main/java/org/searsia/SearsiaOptions.java +++ b/src/main/java/org/searsia/SearsiaOptions.java @@ -159,8 +159,8 @@ private void parse(Options options, String[] args) throws IllegalArgumentExcepti try { if (cmd.hasOption("i")) { pollInterval = new Integer(cmd.getOptionValue("i")); - if (pollInterval < 5) { - pollInterval = 5; + if (pollInterval < 10) { + pollInterval = 10; } } if (cmd.hasOption("l")) { From 2c0e6b164596715edd0bf1d4f57bc302e8f4bed9 Mon Sep 17 00:00:00 2001 From: Searsia Date: Sun, 14 May 2017 22:54:35 +0200 Subject: [PATCH 32/51] random rank --- src/main/java/org/searsia/SearchResult.java | 26 +++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/src/main/java/org/searsia/SearchResult.java b/src/main/java/org/searsia/SearchResult.java index a374ff0..1833c6a 100644 --- a/src/main/java/org/searsia/SearchResult.java +++ b/src/main/java/org/searsia/SearchResult.java @@ -226,8 +226,30 @@ public void scoreResourceSelectionOld(String query, ResourceIndex engines) { Collections.sort(this.hits, Collections.reverseOrder()); } - - public void scoreReranking(String query, String model) { // TODO use model + + public void scoreReranking(String query, String model) { + if ("random".equals(model)) { + scoreRerankingRandom(); + } else { + scoreRerankingRest(query); + } + } + + + private void scoreRerankingRandom() { + Hit hit; + int i, j, + size = this.hits.size(); + for (i = 0; i < size; i += 1) { + j = random.nextInt(size); + hit = this.hits.get(i); + this.hits.set(i, this.hits.get(j)); + this.hits.set(j, hit); + } + } + + + private void scoreRerankingRest(String query) { SearchResult newResult = new SearchResult(); Map queryTerms = new HashMap(); for (String term: query.toLowerCase().split(TOKENIZER)) { From 34db912c41abfdb4dbdc364a5bce3bd79242c6cb Mon Sep 17 00:00:00 2001 From: Searsia Date: Sun, 21 May 2017 11:42:33 +0200 Subject: [PATCH 33/51] template handling --- src/main/java/org/searsia/Main.java | 2 +- src/main/java/org/searsia/engine/Resource.java | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/src/main/java/org/searsia/Main.java b/src/main/java/org/searsia/Main.java index 446f4a7..98b5ae4 100644 --- a/src/main/java/org/searsia/Main.java +++ b/src/main/java/org/searsia/Main.java @@ -336,7 +336,7 @@ public static void main(String[] args) { // Create or open indexes. The filename appends the MD5 of the id so we don't confuse indexes - String fileName = myself.getId() + "_" + getHashString(options.getMotherTemplate()); + String fileName = myself.getId() + "_" + getHashString(mother.getAPITemplate()); String path = options.getIndexPath(); Level level = options.getLoggerLevel(); try { diff --git a/src/main/java/org/searsia/engine/Resource.java b/src/main/java/org/searsia/engine/Resource.java index 31ae035..e0678af 100644 --- a/src/main/java/org/searsia/engine/Resource.java +++ b/src/main/java/org/searsia/engine/Resource.java @@ -301,6 +301,9 @@ public SearchResult search(String query, String debug) throws SearchException { } SearchResult result; try { + if (this.urlAPITemplate == null) { + throw new SearchException("No API Template"); + } String url = fillTemplate(this.urlAPITemplate, URLEncoder.encode(query, "UTF-8")); String postString = ""; String postQuery; From 7604c29f03bd1dbbb92f36f0b27a5a8629cfc6af Mon Sep 17 00:00:00 2001 From: Searsia Date: Thu, 25 May 2017 16:30:10 +0200 Subject: [PATCH 34/51] sampling and warning --- src/main/java/org/searsia/Main.java | 2 ++ src/main/java/org/searsia/SearchResult.java | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/main/java/org/searsia/Main.java b/src/main/java/org/searsia/Main.java index 98b5ae4..bd429b9 100644 --- a/src/main/java/org/searsia/Main.java +++ b/src/main/java/org/searsia/Main.java @@ -243,6 +243,8 @@ private static void testMother(Resource mother, String debugInfo, Boolean isQuie } if (result.getHits().size() < 10) { printMessage("Warning: less than 10 results for query: " + result.getQuery() + "; see \"testquery\" or \"rerank\".", isQuiet); + } else if (result.getHits().size() > 49) { + printMessage("Warning: more than 49 results for query: " + result.getQuery(), isQuiet); } if (debugInfo.equals("all")) { testAll(mother, result, isQuiet); diff --git a/src/main/java/org/searsia/SearchResult.java b/src/main/java/org/searsia/SearchResult.java index 1833c6a..5c80143 100644 --- a/src/main/java/org/searsia/SearchResult.java +++ b/src/main/java/org/searsia/SearchResult.java @@ -281,9 +281,9 @@ public String randomTerm(String notThisOne) { // TODO: keep track of more previo String terms[] = text.split(TOKENIZER); // TODO Lucene tokenizer? nr = random.nextInt(terms.length); String thisOne = terms[nr]; - int i = nr + 1; + int i = nr; while (thisOne.length() < 1 || notThisOne.equals(thisOne)) { - if (i >= terms.length) { i = 0; } + if (i + 1 >= terms.length) { i = 0; } else { i += 1; } if (i == nr) { return null; } thisOne = terms[i]; From 80fb3626c4c720f24ae575cefa79b22bdec977b9 Mon Sep 17 00:00:00 2001 From: Searsia Date: Mon, 29 May 2017 13:13:27 +0200 Subject: [PATCH 35/51] deleted resources --- src/main/java/org/searsia/Main.java | 20 ++- .../java/org/searsia/engine/Resource.java | 132 ++++++++++++------ .../java/org/searsia/index/ResourceIndex.java | 16 ++- src/main/java/org/searsia/web/Search.java | 18 ++- src/test/java/org/searsia/web/SearchTest.java | 19 ++- 5 files changed, 140 insertions(+), 65 deletions(-) diff --git a/src/main/java/org/searsia/Main.java b/src/main/java/org/searsia/Main.java index bd429b9..7e2fad1 100644 --- a/src/main/java/org/searsia/Main.java +++ b/src/main/java/org/searsia/Main.java @@ -117,12 +117,16 @@ private static int getResources(Resource mother, SearchResult result, ResourceIn i += 1; try { engine = mother.searchResource(rid); - } catch (Exception e) { + } catch (SearchException e) { LOGGER.warn("Warning: Update failed: " + e.getMessage()); } if (engine != null && rid.equals(engine.getId())) { engines.put(engine); - LOGGER.debug("Updated " + rid); + if (engine.isDeleted()) { + LOGGER.debug("Deleted: " + rid); + } else { + LOGGER.debug("Updated: " + rid); + } } else { LOGGER.warn("Warning: Resource not found: " + rid); } @@ -322,10 +326,17 @@ public static void main(String[] args) { } String myURI = removeFileNameUri(options.getMyURI()); myself = mother.getLocalResource(myURI); + String fileName = myself.getId() + "_" + getHashString(mother.getAPITemplate()); + String path = options.getIndexPath(); + Level level = options.getLoggerLevel(); - // If test is set, test the mother + // If test is set, test the mother if (options.getTestOutput() != null) { + String tmpDir = System.getProperty("java.io.tmpdir"); + if (tmpDir != null) { + path = tmpDir; + } try { testMother(mother, options.getTestOutput(), options.isQuiet()); printMessage("Test succeeded.", options.isQuiet()); @@ -338,9 +349,6 @@ public static void main(String[] args) { // Create or open indexes. The filename appends the MD5 of the id so we don't confuse indexes - String fileName = myself.getId() + "_" + getHashString(mother.getAPITemplate()); - String path = options.getIndexPath(); - Level level = options.getLoggerLevel(); try { setupLogger(path, fileName, level); engines = new ResourceIndex(path, fileName); diff --git a/src/main/java/org/searsia/engine/Resource.java b/src/main/java/org/searsia/engine/Resource.java index e0678af..86c72e0 100644 --- a/src/main/java/org/searsia/engine/Resource.java +++ b/src/main/java/org/searsia/engine/Resource.java @@ -64,6 +64,7 @@ public class Resource implements Comparable { public final static String defaultTestQuery = "searsia"; + public final static String goneErrorMessage = "Searsia Gone"; // For rate limiting: Default = 1000 queries per day private final static int defaultRATE = 1000; // unit: queries @@ -91,7 +92,8 @@ public class Resource implements Comparable { private Float prior = null; private String rerank = null; private int rate = defaultRATE; - + private boolean deleted = false; + // internal data shared for health report private String nextQuery = null; private String lastMessage = null; @@ -101,8 +103,8 @@ public class Resource implements Comparable { private long lastUsedError = lastUsed; private long lastUpdated = lastUsed; private long upsince = lastUsed; - private int nrOfError = 0; - private int nrOfOk = 0; + private int nrOfError = 0; + private int nrOfOk = 0; public Resource(String urlAPITemplate) { this.urlAPITemplate = urlAPITemplate; @@ -128,6 +130,7 @@ public Resource(JSONObject jo) throws XPathExpressionException, JSONException { if (jo.has("rerank")) this.rerank = jo.getString("rerank"); if (jo.has("banner")) this.banner = jo.getString("banner"); if (jo.has("itempath")) this.itemXpath = jo.getString("itempath"); + if (jo.has("deleted")) this.deleted = jo.getBoolean("deleted"); if (jo.has("prior")) this.prior = new Float(jo.getDouble("prior")); if (jo.has("maxqueriesperday")) this.rate = jo.getInt("maxqueriesperday"); if (jo.has("extractors")) { @@ -379,12 +382,31 @@ public Resource searchResource(String resourceid) throws SearchException { if (json.has("resource")) { engine = new Resource(json.getJSONObject("resource")); } - return engine; + } catch (IOException e) { + String message = e.getMessage(); + if (message != null && message.equals(goneErrorMessage)) { // TODO: not using error message like this?? + engine = deletedResource(resourceid); + } else { + throw createPrivateSearchException(e); + } } catch (Exception e) { - throw createPrivateSearchException(e); + throw createPrivateSearchException(e); } + return engine; + } + + private Resource deletedResource(String resourceid) throws SearchException { + Resource engine = null; + JSONObject json = new JSONObject(); + json.put("id", resourceid); + json.put("deleted", true); + try { + engine = new Resource(json); + } catch (XPathExpressionException e) { + throw new SearchException(e); + } + return engine; } - private SearchResult searsiaSearch(String jsonPage, String debug) throws XPathExpressionException, JSONException { SearchResult result = new SearchResult(); @@ -583,12 +605,16 @@ private InputStream httpConnect(URLConnection connection, String postString) thr http.setRequestMethod("GET"); http.connect(); } - if (http.getResponseCode() == 301) { // FollowRedirects did not work?! + int responseCode = http.getResponseCode(); + if (responseCode == 301) { // FollowRedirects did not work?! throw new IOException("Moved permanently"); } + if (responseCode == 410) { // Gone: we will use this special error message. + throw new IOException(goneErrorMessage); + } return http.getInputStream(); } - + private InputStream fileConnect(URLConnection connection) throws IOException { String fileName = connection.getURL().getFile(); return new FileInputStream(new File(fileName)); @@ -702,6 +728,10 @@ public int getRate() { return this.rate; } + public boolean isDeleted() { + return this.deleted; + } + public int getAllowance() { long timePassed = new Date().getTime() - this.lastUsed; double currentAllowance = this.allowance + (((double) timePassed / defaultPER)) * this.rate; @@ -825,12 +855,15 @@ public Resource deepcopy() { } - public void updateWith(Resource e2) { + public void updateWith(Resource e2) { // TODO: bad idea in multi-threaded app!? setLastUpdatedToNow(); if (!equals(e2)) { - setUpSinceToNow(); if (this.id != null && !this.id.equals(e2.id)) throw new RuntimeException("Cannot update resource ID."); + setUpSinceToNow(); + this.nrOfOk = 0; + this.nrOfError = 0; this.id = e2.id; + this.deleted = e2.deleted; this.name = e2.name; this.urlUserTemplate = e2.urlUserTemplate; this.favicon = e2.favicon; @@ -848,7 +881,7 @@ public void updateWith(Resource e2) { this.itemXpath = e2.itemXpath; this.extractors = e2.extractors; this.headers = e2.headers; - this.privateParameters = e2.privateParameters; + this.privateParameters = e2.privateParameters; } } @@ -859,34 +892,38 @@ public JSONObject toJson() { public JSONObject toJsonEngine() { JSONObject engine = new JSONObject(); - if (id != null) engine.put("id", id); - if (name != null) engine.put("name", name); - if (urlUserTemplate != null) engine.put("urltemplate", urlUserTemplate); - if (favicon != null) engine.put("favicon", favicon); - if (banner != null) engine.put("banner", banner); - if (urlAPITemplate != null) engine.put("apitemplate", urlAPITemplate); - if (urlSuggestTemplate != null) engine.put("suggesttemplate", urlSuggestTemplate); - if (mimeType != null) engine.put("mimetype", mimeType); - if (rerank != null) engine.put("rerank", rerank); - if (postString != null) engine.put("post", postString); - if (postQueryEncode != null) engine.put("postencode", postQueryEncode); - if (testQuery != null) engine.put("testquery", testQuery); - if (prior != null) engine.put("prior", prior); - if (rate != defaultRATE) engine.put("maxqueriesperday", rate); - if (itemXpath != null) engine.put("itempath", itemXpath); - if (extractors != null && extractors.size() > 0) { - JSONObject json = new JSONObject(); - for (TextExtractor e: extractors) { - json.put(e.getField(), e.getPath()); + if (id != null) engine.put("id", id); + if (deleted) { + engine.put("deleted", true); + } else { + if (name != null) engine.put("name", name); + if (urlUserTemplate != null) engine.put("urltemplate", urlUserTemplate); + if (favicon != null) engine.put("favicon", favicon); + if (banner != null) engine.put("banner", banner); + if (urlAPITemplate != null) engine.put("apitemplate", urlAPITemplate); + if (urlSuggestTemplate != null) engine.put("suggesttemplate", urlSuggestTemplate); + if (mimeType != null) engine.put("mimetype", mimeType); + if (rerank != null) engine.put("rerank", rerank); + if (postString != null) engine.put("post", postString); + if (postQueryEncode != null) engine.put("postencode", postQueryEncode); + if (testQuery != null) engine.put("testquery", testQuery); + if (prior != null) engine.put("prior", prior); + if (rate != defaultRATE) engine.put("maxqueriesperday", rate); + if (itemXpath != null) engine.put("itempath", itemXpath); + if (extractors != null && extractors.size() > 0) { + JSONObject json = new JSONObject(); + for (TextExtractor e: extractors) { + json.put(e.getField(), e.getPath()); + } + engine.put("extractors", json); } - engine.put("extractors", json); - } - if (headers != null && headers.size() > 0) { - JSONObject json = new JSONObject(); - for (String header: headers.keySet()) { - json.put(header, headers.get(header)); + if (headers != null && headers.size() > 0) { + JSONObject json = new JSONObject(); + for (String header: headers.keySet()) { + json.put(header, headers.get(header)); + } + engine.put("headers", json); } - engine.put("headers", json); } return engine; } @@ -895,14 +932,18 @@ public JSONObject toJsonEngine() { public JSONObject toJsonEngineDontShare() { JSONObject engine = new JSONObject(); if (id != null) engine.put("id", id); - if (name != null) engine.put("name", name); - if (urlUserTemplate != null) engine.put("urltemplate", urlUserTemplate); - if (favicon != null) engine.put("favicon", favicon); - if (banner != null) engine.put("banner", banner); - if (mimeType != null && !mimeType.equals(SearchResult.SEARSIA_MIME_TYPE)) - engine.put("mimetype", mimeType); - if (rerank != null) engine.put("rerank", rerank); - if (rate != defaultRATE) engine.put("maxqueriesperday", rate); + if (deleted) { + engine.put("deleted", true); + } else { + if (name != null) engine.put("name", name); + if (urlUserTemplate != null) engine.put("urltemplate", urlUserTemplate); + if (favicon != null) engine.put("favicon", favicon); + if (banner != null) engine.put("banner", banner); + if (mimeType != null && !mimeType.equals(SearchResult.SEARSIA_MIME_TYPE)) + engine.put("mimetype", mimeType); + if (rerank != null) engine.put("rerank", rerank); + if (rate != defaultRATE) engine.put("maxqueriesperday", rate); + } return engine; } @@ -954,6 +995,7 @@ public boolean equals(Object o) { // TODO: AARGH, can't this be done simpler? if (o == null) return false; Resource e = (Resource) o; if (!stringEquals(this.getId(), e.getId())) return false; + if (this.isDeleted() != e.isDeleted()) return false; if (!stringEquals(this.getName(), e.getName())) return false; if (!stringEquals(this.getMimeType(), e.getMimeType())) return false; if (!stringEquals(this.getRerank(), e.getRerank())) return false; diff --git a/src/main/java/org/searsia/index/ResourceIndex.java b/src/main/java/org/searsia/index/ResourceIndex.java index 0baff04..ea7ee61 100644 --- a/src/main/java/org/searsia/index/ResourceIndex.java +++ b/src/main/java/org/searsia/index/ResourceIndex.java @@ -211,10 +211,18 @@ public Resource getRandom() { Object[] keys = this.engines.keySet().toArray(); if (keys.length > 0) { int nr = random.nextInt(keys.length); - return this.engines.get(keys[nr]); - } else { - return getMother(); - } + int i = nr + 1; + Resource engine = this.engines.get(keys[nr]); + while (engine.isDeleted() && i != nr) { + if (i >= keys.length) { i = 0; } + engine = this.engines.get(keys[i]); + i += 1; + } + if (!engine.isDeleted()) { + return engine; + } + } + return getMother(); } // Efficiency can be gained here? diff --git a/src/main/java/org/searsia/web/Search.java b/src/main/java/org/searsia/web/Search.java index c80f9f2..54e2d0c 100644 --- a/src/main/java/org/searsia/web/Search.java +++ b/src/main/java/org/searsia/web/Search.java @@ -85,20 +85,26 @@ public Response query(@PathParam("resourceid") String resourceid, @QueryParam("q if (engine == null || engine.getLastUpdatedSecondsAgo() > 9600) { // unknown or really old? ask your mother if (mother != null) { // TODO: option for 9600 and similar value (7200) in Main try { - engine = mother.searchResource(resourceid); + Resource newEngine = mother.searchResource(resourceid); + engine = newEngine; + engines.put(engine); } catch (SearchException e) { - String message = "Resource not found: " + resourceid; - LOGGER.warn(message); - return SearsiaApplication.responseError(404, message); + if (engine != null) { + LOGGER.warn("Not found at mother: " + resourceid); + } } } if (engine == null) { - String message = "Unknown resource identifier: " + resourceid; + String message = "Not found: " + resourceid; LOGGER.warn(message); return SearsiaApplication.responseError(404, message); } - engines.put(engine); } + if (engine.isDeleted()) { + String message = "Gone: " + resourceid; + LOGGER.warn(message); + return SearsiaApplication.responseError(410, message); + } if (query != null && query.trim().length() > 0) { result = index.cacheSearch(query, engine.getId()); if (result != null) { diff --git a/src/test/java/org/searsia/web/SearchTest.java b/src/test/java/org/searsia/web/SearchTest.java index 5ad01a7..2fc62e8 100644 --- a/src/test/java/org/searsia/web/SearchTest.java +++ b/src/test/java/org/searsia/web/SearchTest.java @@ -39,18 +39,21 @@ private static Resource wiki() throws XPathExpressionException, JSONException { } private static Resource wrong() throws XPathExpressionException, JSONException { - return new Resource(new JSONObject("{\"apitemplate\":\"http://doesnotexist.com/wrong?q={q}\", \"id\":\"wrong\"}")); + return new Resource(new JSONObject("{\"apitemplate\":\"http://reallyreallydoesnotexist.com/wrong?q={q}\", \"id\":\"wrong\"}")); } private static Resource ok() throws XPathExpressionException, JSONException { return new Resource(new JSONObject("{\"apitemplate\":\"http://searsia.org/searsia/wiki/wikifull1{q}.json\", \"id\":\"wikifull1\"}")); } + private static Resource okDeleted() throws XPathExpressionException, JSONException { + return new Resource(new JSONObject("{\"deleted\":true, \"id\":\"wikifull1\"}")); + } + private static Resource me() throws XPathExpressionException, JSONException { return new Resource(new JSONObject("{\"apitemplate\":\"http://me.org?q={q}\", \"id\":\"wiki\"}")); } - - + @BeforeClass public static void setUp() throws Exception { Appender appender = null; @@ -142,7 +145,7 @@ public void testError() throws IOException { } @Test // returns results for the engine 'wikifull1' - public void testOk() throws IOException { + public void testOk() throws IOException, XPathExpressionException, JSONException { Search search = new Search(index, engines); Response response = search.query("wikifull1.json", "informat"); int status = response.getStatus(); @@ -161,6 +164,14 @@ public void testOk() throws IOException { Assert.assertNotNull(json.get("hits")); Assert.assertNotNull(json.get("resource")); LOGGER.trace("Cache result: " + json); + + engines.put(okDeleted()); + response = search.query("wikifull1.json", "informat"); + status = response.getStatus(); + entity = (String) response.getEntity(); + json = new JSONObject(entity); + Assert.assertEquals(410, status); + LOGGER.trace("No result: " + json); } From 45b25443ec5e9cd2013c46fba45cdb6520444990 Mon Sep 17 00:00:00 2001 From: Searsia Date: Tue, 30 May 2017 19:51:24 +0200 Subject: [PATCH 36/51] url check, logging --- src/main/java/org/searsia/Main.java | 2 +- src/main/java/org/searsia/SearsiaOptions.java | 22 ++++++++++++------- .../java/org/searsia/engine/ResourceTest.java | 7 +++--- 3 files changed, 18 insertions(+), 13 deletions(-) diff --git a/src/main/java/org/searsia/Main.java b/src/main/java/org/searsia/Main.java index 7e2fad1..2ff59b8 100644 --- a/src/main/java/org/searsia/Main.java +++ b/src/main/java/org/searsia/Main.java @@ -286,7 +286,7 @@ public static void main(String[] args) { // Get options. This will also set the default options. try { options = new SearsiaOptions(args); - } catch (IllegalArgumentException e) { + } catch (Exception e) { fatalError(e.getMessage()); } if (options.isHelp()) { diff --git a/src/main/java/org/searsia/SearsiaOptions.java b/src/main/java/org/searsia/SearsiaOptions.java index 2c0b4de..d0266c7 100644 --- a/src/main/java/org/searsia/SearsiaOptions.java +++ b/src/main/java/org/searsia/SearsiaOptions.java @@ -17,6 +17,8 @@ package org.searsia; import java.io.File; +import java.net.MalformedURLException; +import java.net.URL; import org.apache.log4j.Level; import org.apache.commons.cli.DefaultParser; @@ -50,17 +52,18 @@ public class SearsiaOptions { * Takes command line options and sensible defaults. * @param args Command Line options * @throws IllegalArgumentException + * @throws MalformedURLException */ - public SearsiaOptions(String[] args) throws IllegalArgumentException { + public SearsiaOptions(String[] args) throws IllegalArgumentException, MalformedURLException { Options options = new Options(); options.addOption("c", "cache", true, "Set cache size (integer: number of result pages)."); - options.addOption("d", "dontshare",false, "Do not share resource definitions."); + options.addOption("d", "dontshare",false, "Do not share resource definitions."); // TODO options.addOption("h", "help", false, "Show help."); options.addOption("i", "interval", true, "Set poll interval (integer: in seconds)."); options.addOption("l", "log", true, "Set log level (0=off, 1=error, 2=warn=default, 3=info, 4=debug)."); options.addOption("m", "mother", true, "Set url of mother's web service end point."); options.addOption("n", "nohealth", false, "Do not share health report."); - options.addOption("p", "path", true, "Set directory path to store the index."); + options.addOption("p", "path", true, "Set directory path to store the index."); // TODO options.addOption("q", "quiet", false, "No output to console."); options.addOption("t", "test", true, "Print test output and exit (string: 'json', 'xml', 'response', 'all')."); options.addOption("u", "url", true, "Set url of my web service endpoint."); @@ -72,11 +75,14 @@ public SearsiaOptions(String[] args) throws IllegalArgumentException { } - private static String lastDir(String uri) { - if (uri != null && uri.contains("/")) { - uri = uri.replaceAll("\\/[^\\/]*$", ""); - uri = uri.replaceAll("^.+\\/", ""); - return uri + "/"; + private static String lastDir(String urlString) throws MalformedURLException { + urlString = urlString.replaceAll("\\{[0-9A-Za-z\\-_]+\\?\\}", ""); + URL url = new URL(urlString); + String path = url.getPath(); + if (path != null && path.contains("/")) { + path = path.replaceAll("\\/[^\\/]*$", ""); // remove file + path = path.replaceAll("^.+\\/", ""); // remove trailing directories + return path + "/"; } else { return ""; } diff --git a/src/test/java/org/searsia/engine/ResourceTest.java b/src/test/java/org/searsia/engine/ResourceTest.java index 256b617..8b91363 100644 --- a/src/test/java/org/searsia/engine/ResourceTest.java +++ b/src/test/java/org/searsia/engine/ResourceTest.java @@ -1,13 +1,12 @@ package org.searsia.engine; +import org.apache.log4j.Level; +import org.apache.log4j.Logger; import org.json.JSONObject; import org.junit.Assert; import org.junit.BeforeClass; import org.junit.Test; -import java.util.logging.Level; -import java.util.logging.Logger; - import javax.xml.xpath.XPathExpressionException; import org.searsia.SearchResult; @@ -19,7 +18,7 @@ public class ResourceTest { @BeforeClass public static void setUp() { - Logger.getLogger("").setLevel(Level.WARNING); + Logger.getLogger("").setLevel(Level.WARN); } @Test From b9c70f67fe261250f8e595fea8702c9f68834362 Mon Sep 17 00:00:00 2001 From: Searsia Date: Fri, 9 Jun 2017 11:01:07 +0200 Subject: [PATCH 37/51] Redirect, no query, refactoring Adds a permanent redirect for the server root to 'id.json'; Refactors Search.java in two (still large) subroutines; No longer stores 'query' as part of a hit. --- src/main/java/org/searsia/Main.java | 4 +- src/main/java/org/searsia/SearchResult.java | 8 +- .../java/org/searsia/index/ResourceIndex.java | 1 + src/main/java/org/searsia/web/Redirect.java | 29 +++ src/main/java/org/searsia/web/Search.java | 206 +++++++++--------- .../org/searsia/web/SearsiaApplication.java | 1 + 6 files changed, 144 insertions(+), 105 deletions(-) create mode 100644 src/main/java/org/searsia/web/Redirect.java diff --git a/src/main/java/org/searsia/Main.java b/src/main/java/org/searsia/Main.java index 2ff59b8..fbe0957 100644 --- a/src/main/java/org/searsia/Main.java +++ b/src/main/java/org/searsia/Main.java @@ -90,8 +90,8 @@ private static void searsiaDaemon(SearchResultIndex index, ResourceIndex engines engine = engines.getRandom(); LOGGER.trace("Next sample: " + engine.getId()); result = engine.randomSearch(); - result.removeResourceQuery(); // only trust your mother - result.addQueryResourceDate(engine.getId()); + result.removeResource(); // only trust your mother + result.addResourceDate(engine.getId()); } index.offer(result); LOGGER.info("Sampled " + engine.getId() + ": " + result.getQuery()); diff --git a/src/main/java/org/searsia/SearchResult.java b/src/main/java/org/searsia/SearchResult.java index 5c80143..bf3489a 100644 --- a/src/main/java/org/searsia/SearchResult.java +++ b/src/main/java/org/searsia/SearchResult.java @@ -116,19 +116,17 @@ public String getQuery() { } // TODO: maybe a list of query-resource pairs, if result found by multiple engines for multiple queries. - public void addQueryResourceDate(String resourceID) { - String query = getQuery(); + public void addResourceDate(String resourceID) { for (Hit hit: this.hits) { - hit.putIfEmpty("query", query); hit.putIfEmpty("rid", resourceID); // TODO: if unknown rid, then replace! hit.putIfEmpty("foundBefore", df.format(new Date())); } } - public void removeResourceQuery() { + public void removeResource() { for (Hit hit: this.hits) { hit.remove("rid"); - hit.remove("query"); + hit.remove("query"); // for legacy reasons, we added the query to the result before } } diff --git a/src/main/java/org/searsia/index/ResourceIndex.java b/src/main/java/org/searsia/index/ResourceIndex.java index ea7ee61..d43e92d 100644 --- a/src/main/java/org/searsia/index/ResourceIndex.java +++ b/src/main/java/org/searsia/index/ResourceIndex.java @@ -339,6 +339,7 @@ public void flush() { * @throws IOException */ public void close() throws IOException { + this.flush(); this.writer.close(); this.mother = null; this.me = null; diff --git a/src/main/java/org/searsia/web/Redirect.java b/src/main/java/org/searsia/web/Redirect.java new file mode 100644 index 0000000..bf63683 --- /dev/null +++ b/src/main/java/org/searsia/web/Redirect.java @@ -0,0 +1,29 @@ +package org.searsia.web; + +import java.io.IOException; + +import javax.ws.rs.GET; +import javax.ws.rs.Path; +import javax.ws.rs.core.Response; + + +@Path("/") +public class Redirect { + + String id; + + public Redirect(String id) throws IOException { + this.id = id; + } + + @GET + public Response redirect() { + return Response + .status(301) + .entity("") + .header("Access-Control-Allow-Origin", "*") + .header("Location", this.id + ".json") + .build(); + } + +} \ No newline at end of file diff --git a/src/main/java/org/searsia/web/Search.java b/src/main/java/org/searsia/web/Search.java index 54e2d0c..9926d1a 100644 --- a/src/main/java/org/searsia/web/Search.java +++ b/src/main/java/org/searsia/web/Search.java @@ -75,106 +75,116 @@ public Response query(@PathParam("resourceid") String resourceid, @QueryParam("q return SearsiaApplication.responseError(404, "Not found: " + resourceid); } resourceid = resourceid.replaceAll("\\.json$", ""); - Resource me, engine, mother; - SearchResult result; - JSONObject json; - me = engines.getMyself(); - mother = engines.getMother(); + Resource me = engines.getMyself(); if (!resourceid.equals(me.getId())) { - engine = engines.get(resourceid); - if (engine == null || engine.getLastUpdatedSecondsAgo() > 9600) { // unknown or really old? ask your mother - if (mother != null) { // TODO: option for 9600 and similar value (7200) in Main - try { - Resource newEngine = mother.searchResource(resourceid); - engine = newEngine; - engines.put(engine); - } catch (SearchException e) { - if (engine != null) { - LOGGER.warn("Not found at mother: " + resourceid); - } - } - } - if (engine == null) { - String message = "Not found: " + resourceid; - LOGGER.warn(message); - return SearsiaApplication.responseError(404, message); - } - } - if (engine.isDeleted()) { - String message = "Gone: " + resourceid; - LOGGER.warn(message); - return SearsiaApplication.responseError(410, message); - } - if (query != null && query.trim().length() > 0) { - result = index.cacheSearch(query, engine.getId()); - if (result != null) { - boolean censorQueryResourceId = true; - json = result.toJson(censorQueryResourceId); - json.put("resource", engine.toJson()); - LOGGER.info("Cache " + resourceid + ": " + query); - return SearsiaApplication.responseOk(json); - } else { - try { - result = engine.search(query); - result.removeResourceQuery(); // only trust your mother - json = result.toJson(); // first json for response, so - result.addQueryResourceDate(engine.getId()); // response will not have query + resource - index.offer(result); // maybe do this AFTER the http response is sent: https://jersey.java.net/documentation/latest/async.html (11.1.1) - json.put("resource", engine.toJson()); - LOGGER.info("Query " + resourceid + ": " + query); - return SearsiaApplication.responseOk(json); - } catch (Exception e) { - String message = "Resource " + resourceid + " unavailable: " + e.getMessage(); - LOGGER.warn(message); - return SearsiaApplication.responseError(503, message); - } - } - } else { - json = new JSONObject().put("resource", engine.toJson()); - json.put("health", engine.toJsonHealth()); - LOGGER.info("Resource " + resourceid + "."); - return SearsiaApplication.responseOk(json); - } + return getRemoteResults(resourceid, query); } else { - JSONObject healthJson = null; - if (query != null && query.trim().length() > 0) { - try { - result = index.search(query); - } catch (Exception e) { - String message = "Service unavailable: " + e.getMessage(); - LOGGER.warn(message); - this.nrOfQueriesError += 1; - return SearsiaApplication.responseError(503, message); - } - this.nrOfQueriesOk += 1; - if (result.getHits().isEmpty() && mother != null) { // empty? ask mother! - try { - result = mother.search(query); - index.offer(result); // really trust mother - } catch (SearchException e) { - LOGGER.warn("Mother not available"); - } catch (Exception e) { - LOGGER.warn(e); - } - } else { // own results? Do resource ranking. - result.scoreResourceSelection(query, engines); - } - } else { - result = new SearchResult(); - result.scoreResourceSelection(query, engines); - healthJson = engines.toJsonHealth(); - healthJson.put("requestsok", this.nrOfQueriesOk); - healthJson.put("requestserr", this.nrOfQueriesError); - healthJson.put("upsince", startTime); - } - json = result.toJson(); - json.put("resource", me.toJson()); - if (healthJson != null) { - json.put("health", healthJson); - } - LOGGER.info("Local " + resourceid + ": " + query); // TODO query can be null - return SearsiaApplication.responseOk(json); + return getLocalResults(query); } } - + + private Response getRemoteResults(String resourceid, String query) { + Resource engine = engines.get(resourceid); + Resource mother = engines.getMother(); + JSONObject json = null; + if (engine == null || engine.getLastUpdatedSecondsAgo() > 9600) { // unknown or really old? ask your mother + if (mother != null) { // TODO: option for 9600 and similar value (7200) in Main + try { + Resource newEngine = mother.searchResource(resourceid); + engine = newEngine; + engines.put(engine); + } catch (SearchException e) { + if (engine != null) { + LOGGER.warn("Not found at mother: " + resourceid); + } + } + } + if (engine == null) { + String message = "Not found: " + resourceid; + LOGGER.warn(message); + return SearsiaApplication.responseError(404, message); + } + } + if (engine.isDeleted()) { + String message = "Gone: " + resourceid; + LOGGER.warn(message); + return SearsiaApplication.responseError(410, message); + } + if (query != null && query.trim().length() > 0) { + SearchResult result = index.cacheSearch(query, engine.getId()); + if (result != null) { + boolean censorQueryResourceId = true; + json = result.toJson(censorQueryResourceId); + json.put("resource", engine.toJson()); + LOGGER.info("Cache " + resourceid + ": " + query); + return SearsiaApplication.responseOk(json); + } else { + try { + result = engine.search(query); + result.removeResource(); // only trust your mother + json = result.toJson(); // first json for response, so + result.addResourceDate(engine.getId()); // response will not have query + resource + index.offer(result); // maybe do this AFTER the http response is sent: https://jersey.java.net/documentation/latest/async.html (11.1.1) + json.put("resource", engine.toJson()); + LOGGER.info("Query " + resourceid + ": " + query); + return SearsiaApplication.responseOk(json); + } catch (Exception e) { + String message = "Resource " + resourceid + " unavailable: " + e.getMessage(); + LOGGER.warn(message); + return SearsiaApplication.responseError(503, message); + } + } + } else { + json = new JSONObject().put("resource", engine.toJson()); + json.put("health", engine.toJsonHealth()); + LOGGER.info("Resource " + resourceid + "."); + return SearsiaApplication.responseOk(json); + } + } + + private Response getLocalResults(String query) { + JSONObject json = null, healthJson = null; + Resource mother = engines.getMother(); + Resource me = engines.getMyself(); + SearchResult result = null; + if (query != null && query.trim().length() > 0) { + try { + result = index.search(query); + } catch (Exception e) { + String message = "Service unavailable: " + e.getMessage(); + LOGGER.warn(message); + this.nrOfQueriesError += 1; + return SearsiaApplication.responseError(503, message); + } + this.nrOfQueriesOk += 1; + if (result.getHits().isEmpty() && mother != null) { // empty? ask mother! + try { + result = mother.search(query); + index.offer(result); // really trust mother + } catch (SearchException e) { + LOGGER.warn("Mother not available"); + } catch (Exception e) { + LOGGER.warn(e); + } + } else { // own results? Do resource ranking. + result.scoreResourceSelection(query, engines); + } + } else { // no query: create a 'resource only' result, plus health report + result = new SearchResult(); + result.scoreResourceSelection(query, engines); + healthJson = engines.toJsonHealth(); + healthJson.put("requestsok", this.nrOfQueriesOk); + healthJson.put("requestserr", this.nrOfQueriesError); + healthJson.put("upsince", startTime); + LOGGER.info("Local:" + query); + } + json = result.toJson(); + json.put("resource", me.toJson()); + if (healthJson != null) { + json.put("health", healthJson); + } + LOGGER.info("Local."); + return SearsiaApplication.responseOk(json); + } + } diff --git a/src/main/java/org/searsia/web/SearsiaApplication.java b/src/main/java/org/searsia/web/SearsiaApplication.java index d4a3569..fc69e02 100644 --- a/src/main/java/org/searsia/web/SearsiaApplication.java +++ b/src/main/java/org/searsia/web/SearsiaApplication.java @@ -69,6 +69,7 @@ public SearsiaApplication(SearchResultIndex index, ResourceIndex engines) throws java.util.logging.Logger.getLogger("").setLevel(java.util.logging.Level.WARNING); register(new Search(index, engines)); register(new OpenSearch(engines)); + register(new Redirect(engines.getMyself().getId())); } } From 82df7ba00417261dea1dadbd768f7b373edc049c Mon Sep 17 00:00:00 2001 From: Searsia Date: Wed, 14 Jun 2017 21:43:31 +0200 Subject: [PATCH 38/51] Export index, apitemplate * Adds a new option '--export' that exports the index to standard out and exits. * Removes the apitemplate from 'mySelf' engine definition and report * Fixes a small issue with options u --- src/main/java/org/searsia/Main.java | 23 +++++++++--- src/main/java/org/searsia/SearsiaOptions.java | 36 +++++++++++-------- .../java/org/searsia/engine/Resource.java | 3 +- 3 files changed, 40 insertions(+), 22 deletions(-) diff --git a/src/main/java/org/searsia/Main.java b/src/main/java/org/searsia/Main.java index fbe0957..1c23e5f 100644 --- a/src/main/java/org/searsia/Main.java +++ b/src/main/java/org/searsia/Main.java @@ -65,7 +65,6 @@ private static void searsiaDaemon(SearchResultIndex index, ResourceIndex engines Resource mother = engines.getMother(); Resource engine = null; int pollInterval = options.getPollInterval(); - String myUri = options.getMyURI(); while(true) { Thread.sleep(pollInterval * 1000); try { @@ -81,7 +80,7 @@ private static void searsiaDaemon(SearchResultIndex index, ResourceIndex engines newmother.setUrlAPITemplate(mother.getAPITemplate()); } engines.putMother(newmother); - engines.putMyself(newmother.getLocalResource(myUri)); + engines.putMyself(newmother.getLocalResource()); } else { LOGGER.warn("Unable to update mother: Did ids change?"); } @@ -324,8 +323,7 @@ public static void main(String[] args) { } else if (!sameTemplates(mother.getAPITemplate(), options.getMotherTemplate(), mother.getId())) { printMessage("Warning: Mother changed to " + mother.getAPITemplate(), options.isQuiet()); } - String myURI = removeFileNameUri(options.getMyURI()); - myself = mother.getLocalResource(myURI); + myself = mother.getLocalResource(); String fileName = myself.getId() + "_" + getHashString(mother.getAPITemplate()); String path = options.getIndexPath(); Level level = options.getLoggerLevel(); @@ -360,9 +358,24 @@ public static void main(String[] args) { engines.putMyself(myself); getResources(mother, result, engines); - + + // Export index and exit + if (options.isExport()) { + printMessage("Exporting index...", options.isQuiet()); + try { + engines.dump(); + engines.close(); + index.dump(); + index.close(); + } catch (IOException e) { + fatalError("Index export failed: " + e.getMessage()); + } + printMessage("Done.", options.isQuiet()); + System.exit(0); + } // Start the web server + String myURI = removeFileNameUri(options.getMyURI()); try { SearsiaApplication app = new SearsiaApplication(index, engines); server = GrizzlyHttpServerFactory.createHttpServer(URI.create(myURI), app); diff --git a/src/main/java/org/searsia/SearsiaOptions.java b/src/main/java/org/searsia/SearsiaOptions.java index d0266c7..3f1611d 100644 --- a/src/main/java/org/searsia/SearsiaOptions.java +++ b/src/main/java/org/searsia/SearsiaOptions.java @@ -40,6 +40,7 @@ public class SearsiaOptions { private Boolean quiet; private Boolean help; private Boolean dontshare; + private Boolean export; private Boolean nohealth; private int cacheSize; private int pollInterval; @@ -58,6 +59,7 @@ public SearsiaOptions(String[] args) throws IllegalArgumentException, MalformedU Options options = new Options(); options.addOption("c", "cache", true, "Set cache size (integer: number of result pages)."); options.addOption("d", "dontshare",false, "Do not share resource definitions."); // TODO + options.addOption("e", "export", false, "Export index to stdout and exit."); options.addOption("h", "help", false, "Show help."); options.addOption("i", "interval", true, "Set poll interval (integer: in seconds)."); options.addOption("l", "log", true, "Set log level (0=off, 1=error, 2=warn=default, 3=info, 4=debug)."); @@ -69,7 +71,7 @@ public SearsiaOptions(String[] args) throws IllegalArgumentException, MalformedU options.addOption("u", "url", true, "Set url of my web service endpoint."); setDefaults(); parse(options, args); - if (myURI == null) { + if (myURI == null && motherTemplate != null) { myURI = "http://localhost:16842/searsia/" + lastDir(motherTemplate); } } @@ -94,6 +96,7 @@ private void setDefaults() { help = false; quiet = false; dontshare = false; + export = false; nohealth = false; cacheSize = 500; pollInterval = 120; @@ -162,21 +165,17 @@ private void parse(Options options, String[] args) throws IllegalArgumentExcepti throw new IllegalArgumentException("Test output must be one of 'json', 'xml', 'response', or 'all'."); } } - try { - if (cmd.hasOption("i")) { - pollInterval = new Integer(cmd.getOptionValue("i")); - if (pollInterval < 10) { - pollInterval = 10; - } + if (cmd.hasOption("i")) { + pollInterval = new Integer(cmd.getOptionValue("i")); + if (pollInterval < 10) { + pollInterval = 10; } - if (cmd.hasOption("l")) { - logLevel = new Integer(cmd.getOptionValue("l")); - if (logLevel < 0) { - logLevel = 0; - } + } + if (cmd.hasOption("l")) { + logLevel = new Integer(cmd.getOptionValue("l")); + if (logLevel < 0) { + logLevel = 0; } - } catch (IllegalArgumentException e) { - throw new IllegalArgumentException(e.getMessage()); } if (cmd.hasOption("p")) { indexPath = cmd.getOptionValue("p"); @@ -187,6 +186,9 @@ private void parse(Options options, String[] args) throws IllegalArgumentExcepti if (cmd.hasOption("d")) { dontshare = true; } + if (cmd.hasOption("e")) { + export = true; + } if (cmd.hasOption("n")) { nohealth = true; } @@ -285,7 +287,11 @@ public Boolean isQuiet() { public Boolean isNotShared() { return dontshare; } - + + public Boolean isExport() { + return export; + } + public Boolean isNoHealthReport() { return nohealth; } diff --git a/src/main/java/org/searsia/engine/Resource.java b/src/main/java/org/searsia/engine/Resource.java index 86c72e0..1bbcc8b 100644 --- a/src/main/java/org/searsia/engine/Resource.java +++ b/src/main/java/org/searsia/engine/Resource.java @@ -803,10 +803,9 @@ public boolean isHealthy() { } - public Resource getLocalResource(String myUri) { + public Resource getLocalResource() { JSONObject json = new JSONObject(); Resource result = null; - json.put("apitemplate", myUri + this.getId() + ".json?q={q}"); json.put("id", this.getId()); json.put("mimetype", SearchResult.SEARSIA_MIME_TYPE); String value = this.getName(); From c9c78dac30b6b7f0d347186d67d16550b09417f6 Mon Sep 17 00:00:00 2001 From: Searsia Date: Thu, 22 Jun 2017 14:25:17 +0200 Subject: [PATCH 39/51] Charset and attribute checks Checks for the system charset in the Main (when exporting index) Check for charset returned by a resource (currently only iso latin 1) Check for attribute in unit test --- src/main/java/org/searsia/Main.java | 4 +++ .../java/org/searsia/engine/Resource.java | 29 ++++++++++++------- .../searsia/index/TestSearchResultIndex.java | 4 ++- 3 files changed, 26 insertions(+), 11 deletions(-) diff --git a/src/main/java/org/searsia/Main.java b/src/main/java/org/searsia/Main.java index 1c23e5f..8c50590 100644 --- a/src/main/java/org/searsia/Main.java +++ b/src/main/java/org/searsia/Main.java @@ -361,6 +361,10 @@ public static void main(String[] args) { // Export index and exit if (options.isExport()) { + String encoding = System.getProperties().getProperty("file.encoding"); + if (encoding == null || !encoding.equals("UTF-8")) { + printMessage("Warning: Unknown encoding. Set JVM encoding with '-Dfile.encoding=UTF-8'", options.isQuiet()); + } printMessage("Exporting index...", options.isQuiet()); try { engines.dump(); diff --git a/src/main/java/org/searsia/engine/Resource.java b/src/main/java/org/searsia/engine/Resource.java index 1bbcc8b..c0e0df5 100644 --- a/src/main/java/org/searsia/engine/Resource.java +++ b/src/main/java/org/searsia/engine/Resource.java @@ -21,7 +21,6 @@ import java.io.File; import java.io.FileInputStream; import java.io.IOException; -import java.io.InputStream; import java.io.InputStreamReader; import java.net.HttpURLConnection; import java.net.URLConnection; @@ -590,7 +589,16 @@ private URLConnection setConnectionProperties(URL url, Map heade return connection; } - private InputStream httpConnect(URLConnection connection, String postString) throws IOException { + private String correctContentType(String contentType) { // TODO more charsets + if (contentType != null && contentType.toLowerCase().contains("charset=iso-8859-1")) { + contentType = "ISO-8859-1"; + } else { + contentType = "UTF-8"; + } + return contentType; + } + + private InputStreamReader httpConnect(URLConnection connection, String postString) throws IOException { HttpURLConnection http = (HttpURLConnection) connection; http.setInstanceFollowRedirects(true); if (postString != null && !postString.equals("")) { @@ -609,27 +617,28 @@ private InputStream httpConnect(URLConnection connection, String postString) thr if (responseCode == 301) { // FollowRedirects did not work?! throw new IOException("Moved permanently"); } - if (responseCode == 410) { // Gone: we will use this special error message. + if (responseCode == 410) { // Gone: we will use this special error message elsewhere in this code. throw new IOException(goneErrorMessage); } - return http.getInputStream(); + String contentType = correctContentType(http.getHeaderField("Content-Type")); + return new InputStreamReader(http.getInputStream(), contentType); } - private InputStream fileConnect(URLConnection connection) throws IOException { + private InputStreamReader fileConnect(URLConnection connection) throws IOException { String fileName = connection.getURL().getFile(); - return new FileInputStream(new File(fileName)); + return new InputStreamReader(new FileInputStream(new File(fileName)), "UTF-8"); } private String getCompletePage(String urlString, String postString, Map headers) throws IOException { URL url = new URL(urlString); URLConnection connection = setConnectionProperties(url, headers); - InputStream stream; + InputStreamReader reader; if (url.getProtocol().equals("file")) { - stream = fileConnect(connection); + reader = fileConnect(connection); } else { - stream = httpConnect(connection, postString); + reader = httpConnect(connection, postString); } - BufferedReader in = new BufferedReader(new InputStreamReader(stream, "UTF-8")); + BufferedReader in = new BufferedReader(reader); StringBuilder page = new StringBuilder(); if (in != null) { String inputLine; diff --git a/src/test/java/org/searsia/index/TestSearchResultIndex.java b/src/test/java/org/searsia/index/TestSearchResultIndex.java index bc7fbd5..8e9ec36 100644 --- a/src/test/java/org/searsia/index/TestSearchResultIndex.java +++ b/src/test/java/org/searsia/index/TestSearchResultIndex.java @@ -56,7 +56,9 @@ private static SearchResult readFile(String fileString) throws IOException { } JSONObject resource = json.getJSONObject("resource"); result.setResourceId(resource.getString("id")); - result.setQuery(json.getString("query")); + if (json.has("query")) { + result.setQuery(json.getString("query")); + } return result; } From 8a18a7b22e8bf0914b55b34cfb80b957500d7d7b Mon Sep 17 00:00:00 2001 From: Searsia Date: Fri, 30 Jun 2017 17:23:51 +0200 Subject: [PATCH 40/51] paging, error message, ranking Adds paging for local search Fixes error messages that may leak secrets Changes ranking to idf ranking. TODO: get code in one place, now: in SearchResultIndex - see BM25Similarity and in SearchResult - see boost in scoreResourceSelection. --- src/main/java/org/searsia/SearchResult.java | 124 +++++++++--------- .../java/org/searsia/engine/Resource.java | 18 ++- .../org/searsia/index/SearchResultIndex.java | 4 +- src/main/java/org/searsia/web/Search.java | 41 ++++-- src/test/java/org/searsia/web/SearchTest.java | 20 +-- 5 files changed, 123 insertions(+), 84 deletions(-) diff --git a/src/main/java/org/searsia/SearchResult.java b/src/main/java/org/searsia/SearchResult.java index bf3489a..fb2155a 100644 --- a/src/main/java/org/searsia/SearchResult.java +++ b/src/main/java/org/searsia/SearchResult.java @@ -130,15 +130,23 @@ public void removeResource() { } } + /* ******************************************************************* + * Code below reranks search results for resource selection + * *******************************************************************/ + + /** * New resource ranker, adds rscore. * @param query * @param engines */ - public void scoreResourceSelection(String query, ResourceIndex engines) { - final float boost = 1.0f; - Map maxScore = new HashMap(); - Map topEngines = engines.topValues(query, 10); + public void scoreResourceSelection(String query, ResourceIndex engines, int max, int start) { + SearchResult newResult = new SearchResult(); + final float boost = 0.05f; + final int maxSize = max + start; + Map maxScores = new HashMap(); + Map resourceReturned = new HashMap(); + Map topEngines = engines.topValues(query, maxSize); for (Hit hit: this.hits) { String rid = hit.getString("rid"); if (rid != null) { @@ -153,78 +161,69 @@ public void scoreResourceSelection(String query, ResourceIndex engines) { } topEngines.remove(rid); } + Integer returned = resourceReturned.get(rid); + if (returned == null) { + returned = 0; + } + resourceReturned.put(rid, returned + 1); Float score = prior + hit.getScore() * boost; - Float max = maxScore.get(rid); - if (max == null || max < score) { - max = score; - maxScore.put(rid, max); - } + Float maxScore = maxScores.get(rid); + if (maxScore == null || maxScore < score) { + maxScore = score; + maxScores.put(rid, maxScore); + returned = 0; // this is the best one, so we will add it below no matter what + } hit.setScore(score); - hit.setResourceScore(max); + hit.setResourceScore(maxScore); + if (returned < 4) { // at most 4 results per resource + newResult.addHit(hit); + } } else { hit.setResourceScore(hit.getScore() * boost); + newResult.addHit(hit); } } - for (String rid: topEngines.keySet()) { - Hit hit = new Hit(); + for (String rid: topEngines.keySet()) { + Hit hit = new Hit(); hit.put("rid", rid); hit.setScore(topEngines.get(rid)); hit.setResourceScore(topEngines.get(rid)); - this.hits.add(hit); - } + newResult.addHit(hit); + } + this.hits = newResult.hits; Collections.sort(this.hits, Collections.reverseOrder()); + selectBestResources(max, start); // TODO: efficiently combine this with sort? } - - /** - * Scoring follows these rules: - * (TODO: needs a proper implementation, refactoring, and research ;-) ) - * 1. hits are ordered such that the first hit per rid determines the resource ranking; - * 2. if a resource has a exact query match, then these are ranked highest (given rule 1); - * 3. order by score (given rule 1 and rule 2); - * 4. TODO: not more than x (=10?) hits per resource; - * 5. stop after 20 resources. - * @param query - * @param engines + + /** + * Selects the 'max' best resources, starting at resource 'start' + * Hits MUST be sorted already on rid (rscore). + * @param max + * @param start */ - public void scoreResourceSelectionOld(String query, ResourceIndex engines) { - final float boost = 1.0f; - Map maxScore = new HashMap(); - Map topEngines = engines.topValues(query, 20); + private void selectBestResources(int max, int start) { + String rid, previousRid = null; + int rFound = 0; + int rNeeded = start + max; + int first = 0, i = 0; for (Hit hit: this.hits) { - String rid = hit.getString("rid"); - if (rid != null) { - float prior = 0.0f; - if (engines.containsKey(rid)) { - prior = engines.get(rid).getPrior(); - } - float score = hit.getScore() * boost + prior; - Float top = topEngines.get(rid); - if (top != null) { - if (top > score) { - score = top; - } - topEngines.remove(rid); - } - Float max = maxScore.get(rid); - if (max == null || max < score) { - maxScore.put(rid, score); - max = score; - } - hit.setScore(score); - //hit.put("rscore", max); + rid = hit.getRid(); + if (rid != null && !rid.equals(previousRid)) { + previousRid = rid; + if (start > 0 && rFound == start) { first = i; } + rFound += 1; + if (rFound > rNeeded) { break; } } + i += 1; } - for (String rid: topEngines.keySet()) { - Hit hit = new Hit(); - hit.put("rid", rid); - hit.setScore(topEngines.get(rid)); - //hit.put("rscore", topEngines.get(rid)); - this.hits.add(hit); + if (rFound <= start) { + this.hits.clear(); + } else { + this.hits = this.hits.subList(first, i); } - Collections.sort(this.hits, Collections.reverseOrder()); - } - - + } + + public void scoreReranking(String query, String model) { if ("random".equals(model)) { scoreRerankingRandom(); @@ -271,6 +270,11 @@ private void scoreRerankingRest(String query) { } + /* ******************************************************************* + * End of reranking code + * *******************************************************************/ + + public String randomTerm(String notThisOne) { // TODO: keep track of more previous random queries? int size = this.hits.size(); if (size > 0) { diff --git a/src/main/java/org/searsia/engine/Resource.java b/src/main/java/org/searsia/engine/Resource.java index c0e0df5..fe7e266 100644 --- a/src/main/java/org/searsia/engine/Resource.java +++ b/src/main/java/org/searsia/engine/Resource.java @@ -339,8 +339,9 @@ public SearchResult search(String query, String debug) throws SearchException { } catch (Exception e) { // catch all, also runtime exceptions this.nrOfError += 1; this.lastUsedError = new Date().getTime(); - this.lastMessage = e.getMessage(); - throw createPrivateSearchException(e); + SearchException se = createPrivateSearchException(e); + this.lastMessage = se.getMessage(); + throw se; } result.setQuery(query); result.setResourceId(this.getId()); @@ -849,7 +850,7 @@ public float score(String query) { score += 2.0f; // some arbitrary number } } - } + } return score; } @@ -862,7 +863,10 @@ public Resource deepcopy() { } } - + /** + * Update resource + * @param e2 + */ public void updateWith(Resource e2) { // TODO: bad idea in multi-threaded app!? setLastUpdatedToNow(); if (!equals(e2)) { @@ -870,6 +874,7 @@ public void updateWith(Resource e2) { // TODO: bad idea in multi-threaded app!? setUpSinceToNow(); this.nrOfOk = 0; this.nrOfError = 0; + this.lastMessage = null; this.id = e2.id; this.deleted = e2.deleted; this.name = e2.name; @@ -892,6 +897,11 @@ public void updateWith(Resource e2) { // TODO: bad idea in multi-threaded app!? this.privateParameters = e2.privateParameters; } } + + public void updateAllowance(Resource e2) { + if (this.id != null && !this.id.equals(e2.id)) throw new RuntimeException("Cannot update resource ID."); + this.allowance = e2.allowance; + } public JSONObject toJson() { diff --git a/src/main/java/org/searsia/index/SearchResultIndex.java b/src/main/java/org/searsia/index/SearchResultIndex.java index 461cba6..ba6af89 100644 --- a/src/main/java/org/searsia/index/SearchResultIndex.java +++ b/src/main/java/org/searsia/index/SearchResultIndex.java @@ -42,6 +42,7 @@ import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.TopScoreDocCollector; +import org.apache.lucene.search.similarities.BM25Similarity; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.util.Version; import org.searsia.Hit; @@ -101,6 +102,7 @@ private void openWriter() throws IOException { private void openReader() throws IOException { this.hitsReader = DirectoryReader.open(FSDirectory.open(this.hitsDirectory)); this.hitsSearcher = new IndexSearcher(this.hitsReader); + this.hitsSearcher.setSimilarity(new BM25Similarity(0.0f, 0.0f)); // simple idf scoring //searcher.setSimilarity(new BM25Similarity(1.2f, 0.75f)); // k1, b //searcher.setSimilarity(new LMDirichletSimilarity(200f)); // mu //searcher.setSimilarity(new LMJelinekMercerSimilarity(0.5f)); // lambda @@ -147,7 +149,7 @@ public void offer(SearchResult result) { } public SearchResult search (String queryString) throws IOException { - return search(queryString, 40); + return search(queryString, 80); } public SearchResult search (String queryString, int hitsPerPage) throws IOException { diff --git a/src/main/java/org/searsia/web/Search.java b/src/main/java/org/searsia/web/Search.java index 9926d1a..188f740 100644 --- a/src/main/java/org/searsia/web/Search.java +++ b/src/main/java/org/searsia/web/Search.java @@ -70,16 +70,39 @@ public Response options() { @GET @Produces(SearchResult.SEARSIA_MIME_ENCODING) - public Response query(@PathParam("resourceid") String resourceid, @QueryParam("q") String query) { + public Response query(@PathParam("resourceid") String resourceid, + @QueryParam("q") String searchTerms, + @QueryParam("resources") String countResources, + @QueryParam("page") String pageOffset) { if (!resourceid.endsWith(".json")) { return SearsiaApplication.responseError(404, "Not found: " + resourceid); } resourceid = resourceid.replaceAll("\\.json$", ""); Resource me = engines.getMyself(); if (!resourceid.equals(me.getId())) { - return getRemoteResults(resourceid, query); + return getRemoteResults(resourceid, searchTerms); } else { - return getLocalResults(query); + Integer max = 10, start = 0; + if (countResources != null) { + try { + max = Integer.parseInt(countResources); + } catch (NumberFormatException e) { + max = 10; + } + if (max > 1000) { max = 1000; } + if (max < 1) { max = 1; } + } + if (pageOffset != null) { + try { + start = Integer.parseInt(pageOffset); + start = (start - 1) * max; // openSearch standard default starts at 1 + } catch (NumberFormatException e) { + start = 0; + } + if (start > 99) { start = 99; } + if (start < 0) { start = 0; } + } + return getLocalResults(searchTerms, max, start); } } @@ -123,7 +146,7 @@ private Response getRemoteResults(String resourceid, String query) { result = engine.search(query); result.removeResource(); // only trust your mother json = result.toJson(); // first json for response, so - result.addResourceDate(engine.getId()); // response will not have query + resource + result.addResourceDate(engine.getId()); // response will not have resource id + date index.offer(result); // maybe do this AFTER the http response is sent: https://jersey.java.net/documentation/latest/async.html (11.1.1) json.put("resource", engine.toJson()); LOGGER.info("Query " + resourceid + ": " + query); @@ -142,7 +165,7 @@ private Response getRemoteResults(String resourceid, String query) { } } - private Response getLocalResults(String query) { + private Response getLocalResults(String query, int max, int start) { JSONObject json = null, healthJson = null; Resource mother = engines.getMother(); Resource me = engines.getMyself(); @@ -167,23 +190,23 @@ private Response getLocalResults(String query) { LOGGER.warn(e); } } else { // own results? Do resource ranking. - result.scoreResourceSelection(query, engines); + result.scoreResourceSelection(query, engines, max, start); } + LOGGER.info("Local: " + query); } else { // no query: create a 'resource only' result, plus health report result = new SearchResult(); - result.scoreResourceSelection(query, engines); + result.scoreResourceSelection(null, engines, max, start); healthJson = engines.toJsonHealth(); healthJson.put("requestsok", this.nrOfQueriesOk); healthJson.put("requestserr", this.nrOfQueriesError); healthJson.put("upsince", startTime); - LOGGER.info("Local:" + query); + LOGGER.info("Local."); } json = result.toJson(); json.put("resource", me.toJson()); if (healthJson != null) { json.put("health", healthJson); } - LOGGER.info("Local."); return SearsiaApplication.responseOk(json); } diff --git a/src/test/java/org/searsia/web/SearchTest.java b/src/test/java/org/searsia/web/SearchTest.java index 2fc62e8..0d86ee6 100644 --- a/src/test/java/org/searsia/web/SearchTest.java +++ b/src/test/java/org/searsia/web/SearchTest.java @@ -81,7 +81,7 @@ public static void lastThing() throws IOException { @Test // returns 'my' resource description public void test() throws IOException { Search search = new Search(index, engines); - Response response = search.query("wiki.json", ""); + Response response = search.query("wiki.json", "", null, null); int status = response.getStatus(); String entity = (String) response.getEntity(); JSONObject json = new JSONObject(entity); @@ -93,7 +93,7 @@ public void test() throws IOException { @Test // returns local search results for 'searsia' public void testQuery() throws IOException { Search search = new Search(index, engines); - Response response = search.query("wiki.json", "searsia search for noobs"); + Response response = search.query("wiki.json", "searsia search for noobs", null, null); int status = response.getStatus(); String entity = (String) response.getEntity(); JSONObject json = new JSONObject(entity); @@ -107,15 +107,15 @@ public void testQuery() throws IOException { } } Assert.assertEquals(200, status); - Assert.assertTrue(hits.length() > 0); + Assert.assertTrue(hits.length() == 3); Assert.assertEquals("http://searsia.org", url); - Assert.assertNotNull(json.get("resource")); + Assert.assertNotNull(json.get("resource")); } @Test // returns local resource 'wrong' public void testResource() throws IOException, XPathExpressionException, JSONException { Search search = new Search(index, engines); - Response response = search.query("wrong.json", ""); + Response response = search.query("wrong.json", "", null, null); int status = response.getStatus(); String entity = (String) response.getEntity(); JSONObject json = new JSONObject(entity); @@ -127,7 +127,7 @@ public void testResource() throws IOException, XPathExpressionException, JSONExc @Test // returns resource 'wikididyoumean' (from mother) public void testResourceUnknown() throws IOException { Search search = new Search(index, engines); - Response response = search.query("wikididyoumean.json", ""); + Response response = search.query("wikididyoumean.json", "", null, null); int status = response.getStatus(); String entity = (String) response.getEntity(); JSONObject json = new JSONObject(entity); @@ -139,7 +139,7 @@ public void testResourceUnknown() throws IOException { @Test // returns results for the engine 'wrong' (which does not exist) public void testError() throws IOException { Search search = new Search(index, engines); - Response response = search.query("wrong.json", "testquery"); + Response response = search.query("wrong.json", "testquery", null, null); int status = response.getStatus(); Assert.assertEquals(503, status); } @@ -147,7 +147,7 @@ public void testError() throws IOException { @Test // returns results for the engine 'wikifull1' public void testOk() throws IOException, XPathExpressionException, JSONException { Search search = new Search(index, engines); - Response response = search.query("wikifull1.json", "informat"); + Response response = search.query("wikifull1.json", "informat", null, null); int status = response.getStatus(); String entity = (String) response.getEntity(); JSONObject json = new JSONObject(entity); @@ -156,7 +156,7 @@ public void testOk() throws IOException, XPathExpressionException, JSONException Assert.assertNotNull(json.get("resource")); LOGGER.trace("Query result: " + json); - response = search.query("wikifull1.json", "informat"); + response = search.query("wikifull1.json", "informat", null, null); status = response.getStatus(); entity = (String) response.getEntity(); json = new JSONObject(entity); @@ -166,7 +166,7 @@ public void testOk() throws IOException, XPathExpressionException, JSONException LOGGER.trace("Cache result: " + json); engines.put(okDeleted()); - response = search.query("wikifull1.json", "informat"); + response = search.query("wikifull1.json", "informat", null, null); status = response.getStatus(); entity = (String) response.getEntity(); json = new JSONObject(entity); From 18af1e2ec97f355c990ed0424cead352ac7c6f05 Mon Sep 17 00:00:00 2001 From: Searsia Date: Thu, 13 Jul 2017 12:04:12 +0200 Subject: [PATCH 41/51] Improved ranking and result pages * Improved ranking and result pages * Removing results from deleted resources * Robust casting of scores in hits (including unit test) * Adaption of priors towards more successful resources (rich get richer?) --- src/main/java/org/searsia/Hit.java | 49 +++++++++++-------- src/main/java/org/searsia/SearchResult.java | 10 ++-- .../java/org/searsia/engine/Resource.java | 35 ++++++++++--- .../java/org/searsia/index/ResourceIndex.java | 49 ++++++++++++++----- src/main/java/org/searsia/web/Search.java | 3 +- .../java/org/searsia/SearchResultTest.java | 25 ++++++++-- .../org/searsia/index/TestResourceIndex.java | 4 +- 7 files changed, 124 insertions(+), 51 deletions(-) diff --git a/src/main/java/org/searsia/Hit.java b/src/main/java/org/searsia/Hit.java index fa7ec8d..a170640 100644 --- a/src/main/java/org/searsia/Hit.java +++ b/src/main/java/org/searsia/Hit.java @@ -113,23 +113,31 @@ public String getId() { } return rid + "@" + result; } - + + private float getFloatValue(String field) { + Float score = 0.0f; + Object scoreObject = map.get(field); + if (scoreObject instanceof Float) { + score = (float) scoreObject; + } else if (scoreObject instanceof Double) { + score = new Float((double) scoreObject); + } else if (scoreObject instanceof Integer) { + score = new Float((int) scoreObject); + } else if (scoreObject instanceof String) { + try { + score = Float.parseFloat((String) scoreObject); + } catch (NumberFormatException e) { } + } + return score; + } + + public float getScore() { - Float score = (Float) map.get("score"); - if (score == null) { - return 0.0f; - } else { - return score; - } + return getFloatValue("score"); } public float getResourceScore() { - Float score = (Float) map.get("rscore"); - if (score == null) { - return 0.0f; - } else { - return score; - } + return getFloatValue("rscore"); } public Object get(String field) { @@ -220,16 +228,15 @@ public int compareTo(Hit hit2) { } else { String rid1 = getRid(); // if two resources the same score String rid2 = hit2.getRid(); - if (rid1 != null && rid2 != null && rid1.compareTo(rid2) != 0) { - return compare = rid1.compareTo(rid2); + if (rid1 != null && rid2 != null) { + compare = rid1.compareTo(rid2); + } + if (compare != 0) { + return compare; } else { - score1 = getScore(); + score1 = getScore(); // cannot be null score2 = hit2.getScore(); - if (score1 != null && score2 != null) { - return score1.compareTo(score2); - } else { - return 0; - } + return score1.compareTo(score2); } } } diff --git a/src/main/java/org/searsia/SearchResult.java b/src/main/java/org/searsia/SearchResult.java index fb2155a..a7b363b 100644 --- a/src/main/java/org/searsia/SearchResult.java +++ b/src/main/java/org/searsia/SearchResult.java @@ -146,12 +146,14 @@ public void scoreResourceSelection(String query, ResourceIndex engines, int max, final int maxSize = max + start; Map maxScores = new HashMap(); Map resourceReturned = new HashMap(); - Map topEngines = engines.topValues(query, maxSize); + Map topEngines = engines.topValuesNotDeleted(query, maxSize); for (Hit hit: this.hits) { String rid = hit.getString("rid"); if (rid != null) { - float prior = 0.0f; - if (engines.containsKey(rid)) { + Resource engine = engines.get(rid); + float prior = 0.0f; + if (engine != null) { + if (engine.isDeleted()) { continue; } // cached result from a deleted resource prior = engines.get(rid).getPrior(); } Float top = topEngines.get(rid); @@ -201,7 +203,7 @@ public void scoreResourceSelection(String query, ResourceIndex engines, int max, * @param max * @param start */ - private void selectBestResources(int max, int start) { + public void selectBestResources(int max, int start) { String rid, previousRid = null; int rFound = 0; int rNeeded = start + max; diff --git a/src/main/java/org/searsia/engine/Resource.java b/src/main/java/org/searsia/engine/Resource.java index fe7e266..7ad2b0d 100644 --- a/src/main/java/org/searsia/engine/Resource.java +++ b/src/main/java/org/searsia/engine/Resource.java @@ -751,12 +751,22 @@ public int getAllowance() { return (int) currentAllowance; } + private float getExactPrior() { + float prior = 0.0f; + if (this.prior != null) { + prior = this.prior; + } + return prior; + } + public float getPrior() { - if (this.prior == null) { - return 0.0f; - } else { - return this.prior; - } + float prior = 0.0f; + if (this.prior != null) { + prior = this.prior; + } + prior += this.nrOfOk * 0.00001f; // add a tiny amount of success... + prior -= this.nrOfError * 0.00001f; + return prior; } public int getNrOfErrors() { @@ -1004,7 +1014,18 @@ public void updateHealth(JSONObject health) throws ParseException { public int compareTo(Resource e2) { Float score1 = getPrior(); Float score2 = e2.getPrior(); - return score1.compareTo(score2); + int compare = score1.compareTo(score2); + if (compare != 0) { + return compare; + } else { + String rid1 = getId(); // we need a full ordering + String rid2 = e2.getId(); + if (rid1 != null && rid2 != null) { + return rid1.compareTo(rid2); + } else { + return 0; + } + } } @@ -1027,7 +1048,7 @@ public boolean equals(Object o) { // TODO: AARGH, can't this be done simpler? if (!stringEquals(this.getUserTemplate(), e.getUserTemplate())) return false; if (!stringEquals(this.getSuggestTemplate(), e.getSuggestTemplate())) return false; if (this.getRate() != e.getRate()) return false; - if (Math.abs(this.getPrior() - e.getPrior()) > 0.0001f) return false; + if (Math.abs(this.getExactPrior() - e.getExactPrior()) > 0.001f) return false; if (!listEquals(this.getExtractors(), e.getExtractors())) return false; if (!mapEquals(this.getHeaders(), e.getHeaders())) return false; return true; diff --git a/src/main/java/org/searsia/index/ResourceIndex.java b/src/main/java/org/searsia/index/ResourceIndex.java index d43e92d..336c1ac 100644 --- a/src/main/java/org/searsia/index/ResourceIndex.java +++ b/src/main/java/org/searsia/index/ResourceIndex.java @@ -68,7 +68,7 @@ public class ResourceIndex { private String lastFlushed = null; /** - * Reads resources from index (if they exist) + * Creates index or reads resources from index (if it exist) * @param path path where the Searsia index resides * @param filename index file name * @throws IOException @@ -171,18 +171,25 @@ public Resource getMyself() { return this.me; } - + /** + * Delete resource from index (not used, instead use resource.deleted) + * @param id + * @throws IOException + */ public void delete(String id) throws IOException { Resource engine = get(id); if (engine == null) { - throw new IOException("Resouce '" + id + "' not found"); + throw new IOException("Resource '" + id + "' not found"); } this.engines.remove(id); this.writer.deleteDocuments(new Term("id", id)); this.writer.commit(); } - - + + /** + * Adds resource to index or update it. + * @param engine + */ public void put(Resource engine) { if (this.mother != null && engine.getId().equals(this.mother.getId())) { throw new RuntimeException("Mother id conflict: " + engine.getId()); @@ -198,22 +205,35 @@ public void put(Resource engine) { } } + /** + * Checks existence of resource + * @param id + * @return + */ public boolean containsKey(String id) { return this.engines.containsKey(id); } - + /** + * Get resource + * @param id + * @return + */ public Resource get(String id) { return this.engines.get(id); } + /** + * Get a random resource. If it is not there, return the mother. + * @return + */ public Resource getRandom() { Object[] keys = this.engines.keySet().toArray(); if (keys.length > 0) { int nr = random.nextInt(keys.length); int i = nr + 1; Resource engine = this.engines.get(keys[nr]); - while (engine.isDeleted() && i != nr) { + while (engine.isDeleted() && i != nr) { // if deleted, pick next if (i >= keys.length) { i = 0; } engine = this.engines.get(keys[i]); i += 1; @@ -226,17 +246,20 @@ public Resource getRandom() { } // Efficiency can be gained here? - public Map topValues(String queryString, int max) { - Float[] topScores = new Float[max]; + public Map topValuesNotDeleted(String queryString, int max) { + float[] topScores = new float[max]; Resource[] topEngines = new Resource[max]; int size = 0; float lastScore = -99.0f; + String lastId = ""; for (Resource engine: this.engines.values()) { + if (engine.isDeleted()) { continue; } float score = engine.score(queryString) + engine.getPrior(); - if (size < max || score > lastScore) { + String id = engine.getId(); + if (size < max || (score > lastScore || (score == lastScore && id.compareTo(lastId) > 0))) { if (size < max) size++; int index = size - 1; - while(index > 0 && topScores[index - 1] < score) { + while(index > 0 && (topScores[index - 1] < score || (topScores[index - 1] == score && id.compareTo(topEngines[index - 1].getId()) > 0))) { topScores[index] = topScores[index - 1]; topEngines[index] = topEngines[index - 1]; index -= 1; @@ -244,6 +267,7 @@ public Map topValues(String queryString, int max) { topScores[index] = score; topEngines[index] = engine; lastScore = topScores[size - 1]; + lastId = topEngines[size - 1].getId(); } } Map result = new LinkedHashMap(); @@ -279,7 +303,7 @@ public float maxPrior() { float max = 0.0f; for (Resource e: this.engines.values()) { if (e.getPrior() > max) { - max = e.getPrior(); + max = e.getPrior(); } } return max; @@ -351,6 +375,7 @@ public JSONObject toJsonHealth() { int countOk = 0, countError = 0; for (Resource engine: this.engines.values()) { + if (engine.isDeleted()) { continue; } String error = engine.getLastError(); if (engine.isHealthy()) { countOk += 1; diff --git a/src/main/java/org/searsia/web/Search.java b/src/main/java/org/searsia/web/Search.java index 188f740..f46612b 100644 --- a/src/main/java/org/searsia/web/Search.java +++ b/src/main/java/org/searsia/web/Search.java @@ -189,9 +189,8 @@ private Response getLocalResults(String query, int max, int start) { } catch (Exception e) { LOGGER.warn(e); } - } else { // own results? Do resource ranking. - result.scoreResourceSelection(query, engines, max, start); } + result.scoreResourceSelection(query, engines, max, start); LOGGER.info("Local: " + query); } else { // no query: create a 'resource only' result, plus health report result = new SearchResult(); diff --git a/src/test/java/org/searsia/SearchResultTest.java b/src/test/java/org/searsia/SearchResultTest.java index b217740..1c792d2 100644 --- a/src/test/java/org/searsia/SearchResultTest.java +++ b/src/test/java/org/searsia/SearchResultTest.java @@ -8,7 +8,7 @@ public class SearchResultTest { @Test - public void test1() { + public void testSimple() { SearchResult sr = new SearchResult(); Hit h = new Hit(); h.put("title", "boo"); @@ -17,13 +17,13 @@ public void test1() { } @Test - public void test2() { + public void testEmpty() { SearchResult sr = new SearchResult(); Assert.assertEquals("{\"hits\":[]}", sr.toJson().toString()); } @Test - public void test3() { + public void testSampleAndRerank() { SearchResult sr = new SearchResult(); Hit h = new Hit("The ultimate test", "Oh yeah", "http://searsia.org", "http://searsia.org/images/search.png"); @@ -44,4 +44,23 @@ public void test3() { Assert.assertEquals("Query matches zero results", sr.getHits().size(), 0); } + @Test + public void testCast() { + float score = 0.1f; + Hit h1 = new Hit(); + h1.put("score", score); + Assert.assertEquals(score, h1.getScore(), 0.0001f); + Hit h2 = new Hit(); + h2.put("score", Float.toString(score)); + Assert.assertEquals(score, h2.getScore(), 0.0001f); + Hit h3 = new Hit(); + h3.put("score", "wrong means zero"); + Assert.assertEquals(0.0f, h3.getScore(), 0.0001f); + Hit h4 = new Hit("{\"title\":\"boo\",\"score\":1.0}"); + Assert.assertEquals(1.0f, h4.getScore(), 0.0001f); + Hit h5 = new Hit("{\"title\":\"boo\",\"score\":1}"); + Assert.assertEquals(1.0f, h5.getScore(), 0.0001f); + Hit h6 = new Hit("{\"title\":\"boo\",\"score\":9.7E-4}"); + Assert.assertTrue(h6.getScore() > 0.0f); + } } diff --git a/src/test/java/org/searsia/index/TestResourceIndex.java b/src/test/java/org/searsia/index/TestResourceIndex.java index 9c9ad36..50e33ee 100644 --- a/src/test/java/org/searsia/index/TestResourceIndex.java +++ b/src/test/java/org/searsia/index/TestResourceIndex.java @@ -76,8 +76,8 @@ public static void checkFiles() throws IOException, XPathExpressionException, JS Assert.assertTrue("No private parameters expected", e4.getJsonPrivateParameters() == null); Resource e6 = engines.get(newby().getId()); Assert.assertTrue("Private parameters expected", e6.getJsonPrivateParameters() != null); - Assert.assertTrue("Top 1", engines.topValues("anything", 1).size() == 1); - Assert.assertTrue("Top 2", engines.topValues(null, 2).size() == 2); + Assert.assertTrue("Top 1", engines.topValuesNotDeleted("anything", 1).size() == 1); + Assert.assertTrue("Top 2", engines.topValuesNotDeleted(null, 2).size() == 2); } @Test From 0c5b2921b27d8b2610a4aa18595afe74f05c873a Mon Sep 17 00:00:00 2001 From: Searsia Date: Thu, 13 Jul 2017 13:57:03 +0200 Subject: [PATCH 42/51] paging fix --- src/main/java/org/searsia/web/Search.java | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/main/java/org/searsia/web/Search.java b/src/main/java/org/searsia/web/Search.java index f46612b..c10e3cd 100644 --- a/src/main/java/org/searsia/web/Search.java +++ b/src/main/java/org/searsia/web/Search.java @@ -89,7 +89,7 @@ public Response query(@PathParam("resourceid") String resourceid, } catch (NumberFormatException e) { max = 10; } - if (max > 1000) { max = 1000; } + if (max > 200) { max = 200; } // FedWeb14 has about 150 if (max < 1) { max = 1; } } if (pageOffset != null) { @@ -99,7 +99,6 @@ public Response query(@PathParam("resourceid") String resourceid, } catch (NumberFormatException e) { start = 0; } - if (start > 99) { start = 99; } if (start < 0) { start = 0; } } return getLocalResults(searchTerms, max, start); @@ -165,7 +164,7 @@ private Response getRemoteResults(String resourceid, String query) { } } - private Response getLocalResults(String query, int max, int start) { + private Response getLocalResults(String query, int max, int start) { JSONObject json = null, healthJson = null; Resource mother = engines.getMother(); Resource me = engines.getMyself(); From 782e10cae1f2ac24903a2b0f42f69a80b369d44b Mon Sep 17 00:00:00 2001 From: Searsia Date: Fri, 8 Sep 2017 11:43:26 +0200 Subject: [PATCH 43/51] Remove HTML, no redirects, and more tests --- src/main/java/org/searsia/Hit.java | 12 ++++++++--- src/main/java/org/searsia/web/Redirect.java | 13 ++++++++++++ .../java/org/searsia/engine/ResourceTest.java | 21 +++++++++++++++++-- 3 files changed, 41 insertions(+), 5 deletions(-) diff --git a/src/main/java/org/searsia/Hit.java b/src/main/java/org/searsia/Hit.java index a170640..0db42df 100644 --- a/src/main/java/org/searsia/Hit.java +++ b/src/main/java/org/searsia/Hit.java @@ -51,7 +51,12 @@ public Hit(JSONObject json) { Iterator keys = json.keys(); while (keys.hasNext()) { String key = (String) keys.next(); - map.put(key, json.get(key)); + Object value = json.get(key); + if (value instanceof String) { + map.put(key, noHTML((String) value)); + } else if (value instanceof Number || value instanceof Boolean) { + map.put(key, value); + } } } @@ -165,8 +170,9 @@ public String toString() { return map.entrySet().toString(); } - private String noHTML(String value) { - value = value.replaceAll("<[^>]+>", ""); // no HTML + private String noHTML(String value) { // TODO: also in TextExtractor?? + value = value.replaceAll("(?i)]*>||||", ""); // No HTML, please: spans removed + value = value.replaceAll("<[^>]+>|&#?[0-9a-zA-Z]{1,9};", ""); // no HTML return value.replaceAll("[<>]", ""); } diff --git a/src/main/java/org/searsia/web/Redirect.java b/src/main/java/org/searsia/web/Redirect.java index bf63683..a3060e5 100644 --- a/src/main/java/org/searsia/web/Redirect.java +++ b/src/main/java/org/searsia/web/Redirect.java @@ -4,8 +4,11 @@ import javax.ws.rs.GET; import javax.ws.rs.Path; +import javax.ws.rs.Produces; import javax.ws.rs.core.Response; +import org.searsia.SearchResult; + @Path("/") public class Redirect { @@ -17,6 +20,16 @@ public Redirect(String id) throws IOException { } @GET + @Produces(SearchResult.SEARSIA_MIME_ENCODING) + public Response notFound() { + return SearsiaApplication.responseError(404, "Not found"); + } + + /** + * Redirect, not used because it does not always behave well in + * case web servers do a simple rewrite of URLs. + * @return + */ public Response redirect() { return Response .status(301) diff --git a/src/test/java/org/searsia/engine/ResourceTest.java b/src/test/java/org/searsia/engine/ResourceTest.java index 8b91363..2239888 100644 --- a/src/test/java/org/searsia/engine/ResourceTest.java +++ b/src/test/java/org/searsia/engine/ResourceTest.java @@ -65,6 +65,14 @@ public void testSearchXml2() throws XPathExpressionException, SearchException { Assert.assertFalse("Parser timed out", System.currentTimeMillis() - startTime > 10000); } + @Test + public void testSearchXml3() throws XPathExpressionException, SearchException { + Resource se1 = new Resource("http://searsia.org/searsia/wiki/cse1{q}.json").updateFromAPI(); + SearchResult result = se1.search("life"); + Assert.assertEquals("application/xml", se1.getMimeType()); + Assert.assertEquals(10, result.getHits().size()); + } + @Test public void testSearchJson() throws XPathExpressionException, SearchException { Resource se = new Resource("file:src/test/resources/searsia.json").updateFromAPI(); @@ -84,13 +92,22 @@ public void testSearchJson2() throws XPathExpressionException, SearchException { } @Test - public void testSearchJson3() throws XPathExpressionException, SearchException { + public void testSearchJsonStrangeKeys() throws XPathExpressionException, SearchException { Resource se = new Resource("http://searsia.org/searsia/wiki/wikifull1{q}.json"); SearchResult result = se.search("strange keys"); Assert.assertEquals(1, result.getHits().size()); } - @Test + @Test + public void testSearchJsonHtmlAndlinks() throws XPathExpressionException, SearchException { + Resource se = new Resource("http://searsia.org/searsia/wiki/wikifull1{q}.json"); + SearchResult result = se.search("html and links"); + Assert.assertEquals(2, result.getHits().size()); + Assert.assertEquals("Another test for Searsia", result.getHits().get(0).getTitle()); + Assert.assertEquals("mailto:info@searsia.org", result.getHits().get(1).getString("url")); // TODO getUrl instead of getString + } + + @Test public void testSearchJavascript() throws XPathExpressionException, SearchException { Resource se = new Resource("file:src/test/resources/javascript.json").updateFromAPI(); String debug = "xml"; From bc8cb19745d84b7b05e77b9e532da4e773912b60 Mon Sep 17 00:00:00 2001 From: Searsia Date: Wed, 20 Sep 2017 09:10:46 +0200 Subject: [PATCH 44/51] Errors and results * Main: better error message * Resource: fix counting rate-limit errors * SearchResult: return resuls if no samples from resource --- src/main/java/org/searsia/Main.java | 6 +++++- src/main/java/org/searsia/SearchResult.java | 4 ++-- src/main/java/org/searsia/engine/Resource.java | 10 ++++------ 3 files changed, 11 insertions(+), 9 deletions(-) diff --git a/src/main/java/org/searsia/Main.java b/src/main/java/org/searsia/Main.java index 8c50590..9c83c0d 100644 --- a/src/main/java/org/searsia/Main.java +++ b/src/main/java/org/searsia/Main.java @@ -242,7 +242,11 @@ private static void testMother(Resource mother, String debugInfo, Boolean isQuie System.out.flush(); } if (result.getHits().isEmpty()) { - throw new SearchException("No results for test query."); + String tip = ""; + if (mother.getRerank() != null) { + tip = " Try removing rerank."; + } + throw new SearchException("No results for test query." + tip); } if (result.getHits().size() < 10) { printMessage("Warning: less than 10 results for query: " + result.getQuery() + "; see \"testquery\" or \"rerank\".", isQuiet); diff --git a/src/main/java/org/searsia/SearchResult.java b/src/main/java/org/searsia/SearchResult.java index a7b363b..6d1cda4 100644 --- a/src/main/java/org/searsia/SearchResult.java +++ b/src/main/java/org/searsia/SearchResult.java @@ -217,8 +217,8 @@ public void selectBestResources(int max, int start) { if (rFound > rNeeded) { break; } } i += 1; - } - if (rFound <= start) { + } + if (rFound < start) { this.hits.clear(); } else { this.hits = this.hits.subList(first, i); diff --git a/src/main/java/org/searsia/engine/Resource.java b/src/main/java/org/searsia/engine/Resource.java index 7ad2b0d..c3f4f53 100644 --- a/src/main/java/org/searsia/engine/Resource.java +++ b/src/main/java/org/searsia/engine/Resource.java @@ -296,14 +296,12 @@ public SearchResult search(String query) throws SearchException { public SearchResult search(String query, String debug) throws SearchException { - if (rateLimitReached()) { - this.lastMessage = "Too many queries"; - this.lastUsedError = new Date().getTime(); - throw new SearchException(this.lastMessage); - } SearchResult result; try { - if (this.urlAPITemplate == null) { + if (rateLimitReached()) { + throw new SearchException("Too many queries"); + } + if (this.urlAPITemplate == null) { throw new SearchException("No API Template"); } String url = fillTemplate(this.urlAPITemplate, URLEncoder.encode(query, "UTF-8")); From 1c5b2cb9246629256741094e706f217464f36f84 Mon Sep 17 00:00:00 2001 From: Searsia Date: Thu, 21 Sep 2017 19:11:22 +0200 Subject: [PATCH 45/51] proxy and lenient urls --- src/main/java/org/searsia/web/OpenSearch.java | 4 +- src/main/java/org/searsia/web/Proxy.java | 77 +++++++++++++++---- src/main/java/org/searsia/web/Search.java | 3 - 3 files changed, 64 insertions(+), 20 deletions(-) diff --git a/src/main/java/org/searsia/web/OpenSearch.java b/src/main/java/org/searsia/web/OpenSearch.java index ff053c1..ff26098 100644 --- a/src/main/java/org/searsia/web/OpenSearch.java +++ b/src/main/java/org/searsia/web/OpenSearch.java @@ -33,7 +33,7 @@ * @author hiemstra * */ -@Path("{resourceid}/opensearch.xml") +@Path("{resourceid}") public class OpenSearch { private ResourceIndex engines; @@ -42,7 +42,7 @@ public OpenSearch(ResourceIndex engines) throws IOException { this.engines = engines; } - @GET + @GET @Path("opensearch.xml") @Produces("application/opensearchdescription+xml; charset=utf-8") public Response get(@PathParam("resourceid") String resourceid) { Resource engine; diff --git a/src/main/java/org/searsia/web/Proxy.java b/src/main/java/org/searsia/web/Proxy.java index f7c26e7..5e3ef67 100644 --- a/src/main/java/org/searsia/web/Proxy.java +++ b/src/main/java/org/searsia/web/Proxy.java @@ -1,10 +1,13 @@ package org.searsia.web; +import java.io.ByteArrayOutputStream; import java.io.IOException; import java.io.InputStream; import java.net.HttpURLConnection; import java.net.URL; import java.net.URLConnection; +import java.util.HashMap; +import java.util.Map; import javax.ws.rs.GET; import javax.ws.rs.Path; @@ -15,36 +18,81 @@ import javax.ws.rs.core.Response; import javax.ws.rs.core.Response.ResponseBuilder; +import org.searsia.engine.Resource; import org.searsia.index.ResourceIndex; -@Path("{resourceid}/proxy") +@Path("{resourceid}") public class Proxy { private ResourceIndex engines; + private Map iconStore = new HashMap(); public Proxy(ResourceIndex engines) throws IOException { this.engines = engines; } - @GET + @GET @Path("proxy") public Response query(@PathParam("resourceid") String resourceid, @QueryParam("url") String url, @Context HttpHeaders headers) { try { - if (url != null && (engines.getMyself().getId().equals(resourceid) || engines.get(resourceid) != null)) { - if (headers.getRequestHeader("If-Modified-Since") != null || headers.getRequestHeader("If-None-Match") != null) { - return Response.status(304).build(); // cheating! Maybe really check if it is modified? - } else { - return getWebResponse(url); - } + if (headers.getRequestHeader("If-Modified-Since") != null || headers.getRequestHeader("If-None-Match") != null) { + return Response.notModified().build(); } else { - return SearsiaApplication.responseError(404, "Resource not found: " + resourceid); + return getWebResponse(url); + } + } catch (Exception e) { + return Response.status(503).build(); + } + } + + @GET @Path("icon") + public Response icon(@PathParam("resourceid") String resourceid, @Context HttpHeaders headers) { + try { + if (headers.getRequestHeader("If-Modified-Since") != null || headers.getRequestHeader("If-None-Match") != null) { + return Response.notModified().build(); + } else { + return getWebIcon(resourceid); } } catch (Exception e) { - return SearsiaApplication.responseError(503, "Unavailable: " + e.getMessage()); + return Response.status(503).build(); // unavailable } } + private Response getWebIcon(String resourceid) throws IOException { + Resource engine = engines.get(resourceid); + if (engine == null) { + engine = engines.getMyself(); + if (engine == null || !engine.getId().equals(resourceid)) { + return Response.status(404).build(); + } + } + String iconFile = engine.getFavicon(); + if (iconFile == null) { + return Response.status(503).build(); + } + return getIconResponse(iconFile); + } + + private Response getIconResponse(String urlString) throws IOException { + HttpURLConnection http = getHttp(urlString); + InputStream stream = http.getInputStream(); + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + byte[] buffer = new byte[1024]; // Experiment with this value + int bytesRead; + while ((bytesRead = stream.read(buffer)) != -1) { + baos.write(buffer, 0, bytesRead); + } + return Response.ok(baos.toByteArray(), "image/png").build(); + } + + private Response getWebResponse(String urlString) throws IOException { + HttpURLConnection http = getHttp(urlString); + InputStream stream = http.getInputStream(); + return responseWithHeaders(http, stream).build(); + } + + private HttpURLConnection getHttp(String urlString) throws IOException { URL url = new URL(urlString); URLConnection connection = url.openConnection(); connection.setRequestProperty("User-Agent", "Searsia/1.0"); @@ -55,10 +103,9 @@ private Response getWebResponse(String urlString) throws IOException { http.setInstanceFollowRedirects(true); http.setRequestMethod("GET"); http.connect(); - InputStream stream = http.getInputStream(); - return responseWithHeaders(http, stream).build(); + return http; } - + private ResponseBuilder responseWithHeaders(HttpURLConnection http, InputStream stream) { ResponseBuilder builder = Response.ok(stream); String field = http.getHeaderField("Content-Type"); @@ -73,5 +120,5 @@ private ResponseBuilder responseWithHeaders(HttpURLConnection http, InputStream if (field != null) builder.header("Last-Modified", field); return builder; } - -} + +} \ No newline at end of file diff --git a/src/main/java/org/searsia/web/Search.java b/src/main/java/org/searsia/web/Search.java index c10e3cd..5b2a9c0 100644 --- a/src/main/java/org/searsia/web/Search.java +++ b/src/main/java/org/searsia/web/Search.java @@ -74,9 +74,6 @@ public Response query(@PathParam("resourceid") String resourceid, @QueryParam("q") String searchTerms, @QueryParam("resources") String countResources, @QueryParam("page") String pageOffset) { - if (!resourceid.endsWith(".json")) { - return SearsiaApplication.responseError(404, "Not found: " + resourceid); - } resourceid = resourceid.replaceAll("\\.json$", ""); Resource me = engines.getMyself(); if (!resourceid.equals(me.getId())) { From b22c8b4741743809c407ef3f006fc92b9a0912c5 Mon Sep 17 00:00:00 2001 From: Searsia Date: Thu, 21 Sep 2017 19:14:36 +0200 Subject: [PATCH 46/51] restricted error messages --- src/main/java/org/searsia/Main.java | 2 +- .../java/org/searsia/engine/SearchException.java | 13 +++++++++++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/src/main/java/org/searsia/Main.java b/src/main/java/org/searsia/Main.java index 9c83c0d..336bf25 100644 --- a/src/main/java/org/searsia/Main.java +++ b/src/main/java/org/searsia/Main.java @@ -343,7 +343,7 @@ public static void main(String[] args) { testMother(mother, options.getTestOutput(), options.isQuiet()); printMessage("Test succeeded.", options.isQuiet()); } catch (Exception e) { - fatalError("Test failed: " + e.getMessage()); + fatalError("Test failed: " + e.getLocalizedMessage()); } } else { printMessage("Starting: " + myself.getName() + " (" + myself.getId() + ")", options.isQuiet()); diff --git a/src/main/java/org/searsia/engine/SearchException.java b/src/main/java/org/searsia/engine/SearchException.java index 46c0806..2430777 100644 --- a/src/main/java/org/searsia/engine/SearchException.java +++ b/src/main/java/org/searsia/engine/SearchException.java @@ -31,4 +31,17 @@ public SearchException(String message) { super(message); } + @Override + public String getMessage() { + String message = super.getMessage(); + message = message.replaceAll("^[A-Za-z\\.]*\\.", ""); // removes Java package names + message = message.replaceAll(":? ?https?:[^ ]+", ""); // removes URLs (which may contain API keys) + return message; + } + + @Override + public String getLocalizedMessage() { // misusing Localization for full error message + return super.getMessage(); + } + } From d97a86c86cfacb815f48915dc04a512e13452925 Mon Sep 17 00:00:00 2001 From: Searsia Date: Fri, 22 Sep 2017 11:32:30 +0200 Subject: [PATCH 47/51] v1, proxy, health, share options --- pom.xml | 2 +- src/main/java/org/searsia/Main.java | 4 +- src/main/java/org/searsia/SearsiaOptions.java | 26 ++++-- .../java/org/searsia/engine/Resource.java | 22 +++-- src/main/java/org/searsia/web/OpenSearch.java | 10 ++- src/main/java/org/searsia/web/Proxy.java | 81 ++++++++++++++----- src/main/java/org/searsia/web/Search.java | 34 +++++--- .../org/searsia/web/SearsiaApplication.java | 14 +++- src/test/java/org/searsia/web/SearchTest.java | 20 ++--- 9 files changed, 143 insertions(+), 70 deletions(-) diff --git a/pom.xml b/pom.xml index bbcd173..045c05f 100644 --- a/pom.xml +++ b/pom.xml @@ -3,7 +3,7 @@ 4.0.0 org.searsia searsiaserver - 1.0.0 + 1.0.1 3 diff --git a/src/main/java/org/searsia/Main.java b/src/main/java/org/searsia/Main.java index 336bf25..674723f 100644 --- a/src/main/java/org/searsia/Main.java +++ b/src/main/java/org/searsia/Main.java @@ -159,7 +159,7 @@ private static String removeFileNameUri(String uri) { private static String normalizedUriToTemplate(String uri, String rid) { if (uri != null) { if (uri.endsWith("/") ) { - uri += rid + ".json?q={q}"; + uri += rid + "?q={q}"; } else if (!uri.contains("{q")) { // check for tests on searsia.org uri += "?q={q}"; } @@ -385,7 +385,7 @@ public static void main(String[] args) { // Start the web server String myURI = removeFileNameUri(options.getMyURI()); try { - SearsiaApplication app = new SearsiaApplication(index, engines); + SearsiaApplication app = new SearsiaApplication(index, engines, options); server = GrizzlyHttpServerFactory.createHttpServer(URI.create(myURI), app); } catch (Exception e) { fatalError("Server failed: " + e.getMessage()); diff --git a/src/main/java/org/searsia/SearsiaOptions.java b/src/main/java/org/searsia/SearsiaOptions.java index 3f1611d..53bf233 100644 --- a/src/main/java/org/searsia/SearsiaOptions.java +++ b/src/main/java/org/searsia/SearsiaOptions.java @@ -36,6 +36,7 @@ public class SearsiaOptions { /* See setDefaults() below */ + private Boolean anonymous; private String test; private Boolean quiet; private Boolean help; @@ -57,6 +58,7 @@ public class SearsiaOptions { */ public SearsiaOptions(String[] args) throws IllegalArgumentException, MalformedURLException { Options options = new Options(); + options.addOption("a", "anonymous",false, "Anonymous traffic by proxying all calls."); // TODO options.addOption("c", "cache", true, "Set cache size (integer: number of result pages)."); options.addOption("d", "dontshare",false, "Do not share resource definitions."); // TODO options.addOption("e", "export", false, "Export index to stdout and exit."); @@ -65,7 +67,7 @@ public SearsiaOptions(String[] args) throws IllegalArgumentException, MalformedU options.addOption("l", "log", true, "Set log level (0=off, 1=error, 2=warn=default, 3=info, 4=debug)."); options.addOption("m", "mother", true, "Set url of mother's web service end point."); options.addOption("n", "nohealth", false, "Do not share health report."); - options.addOption("p", "path", true, "Set directory path to store the index."); // TODO + options.addOption("p", "path", true, "Set directory path to store the index."); options.addOption("q", "quiet", false, "No output to console."); options.addOption("t", "test", true, "Print test output and exit (string: 'json', 'xml', 'response', 'all')."); options.addOption("u", "url", true, "Set url of my web service endpoint."); @@ -75,7 +77,13 @@ public SearsiaOptions(String[] args) throws IllegalArgumentException, MalformedU myURI = "http://localhost:16842/searsia/" + lastDir(motherTemplate); } } - + + /** + * Default options, to be used for unit tests only. + */ + public SearsiaOptions() { + setDefaults(); + } private static String lastDir(String urlString) throws MalformedURLException { urlString = urlString.replaceAll("\\{[0-9A-Za-z\\-_]+\\?\\}", ""); @@ -92,6 +100,7 @@ private static String lastDir(String urlString) throws MalformedURLException { private void setDefaults() { + anonymous = false; test = null; // no test help = false; quiet = false; @@ -117,10 +126,9 @@ private boolean pathExists(String path) { private String friendlyIndexPath() { String path; String file = "searsia"; + String os = System.getProperty("os.name").toLowerCase(); String home = System.getProperty("user.home"); if (home == null || !pathExists(home)) home = "."; - - String os = System.getProperty("os.name").toLowerCase(); if (os.contains("win")) { // On Windows path = System.getenv("AppData"); if (!pathExists(path)) { @@ -152,7 +160,9 @@ private void parse(Options options, String[] args) throws IllegalArgumentExcepti } catch (ParseException e) { throw new IllegalArgumentException(e.getMessage() + " (use '-h' for help)"); } - + if (cmd.hasOption("a")) { + anonymous = true; + } if (cmd.hasOption("c")) { cacheSize = new Integer(cmd.getOptionValue("c")); if (cacheSize < 30) { @@ -200,7 +210,6 @@ private void parse(Options options, String[] args) throws IllegalArgumentExcepti if (!motherTemplate.matches("^https?://.*|^file:.*")) { motherTemplate = "file:" + motherTemplate.replace("\\", "/"); // TODO C:\file on Windows? } - } if (cmd.hasOption("h") || cmd.getArgs().length < 0 || !cmd.hasOption("m")) { if (!cmd.hasOption("m")) { @@ -280,6 +289,10 @@ public String getIndexPath() { return indexPath; } + public Boolean isAnonymous() { + return anonymous; + } + public Boolean isQuiet() { return quiet; } @@ -310,6 +323,7 @@ public String toString() { result += "\n Poll Interval = " + getPollInterval(); result += "\n Cache Size = " + getCacheSize(); result += "\n Test Output = " + getTestOutput(); + result += "\n Anonymous = " + isAnonymous(); result += "\n Do Not Share = " + isNotShared(); result += "\n No Health Rep.= " + isNoHealthReport(); return result; diff --git a/src/main/java/org/searsia/engine/Resource.java b/src/main/java/org/searsia/engine/Resource.java index c3f4f53..cb40fae 100644 --- a/src/main/java/org/searsia/engine/Resource.java +++ b/src/main/java/org/searsia/engine/Resource.java @@ -957,7 +957,7 @@ public JSONObject toJsonEngine() { public JSONObject toJsonEngineDontShare() { JSONObject engine = new JSONObject(); - if (id != null) engine.put("id", id); + if (id != null) engine.put("id", id); if (deleted) { engine.put("deleted", true); } else { @@ -994,17 +994,15 @@ public JSONObject toJsonHealth() { * @throws ParseException */ public void updateHealth(JSONObject health) throws ParseException { - //try { - Integer num = health.getInt("requestsok"); - if (num != null) this.nrOfOk = num; - num = health.getInt("requestserr"); - if (num != null) this.nrOfError = num; - this.lastUsedOk = dateFormat.parse(health.getString("lastsuccess")).getTime(); - this.lastUsedError = dateFormat.parse(health.getString("lasterror")).getTime(); - this.lastUpdated = dateFormat.parse(health.getString("lastupdated")).getTime(); - this.upsince = dateFormat.parse(health.getString("upsince")).getTime(); - if (health.has("lastmessage")) this.lastMessage = health.getString("lastmessage"); - // } catch (Exception e) { } // TODO: woops? + Integer num = health.getInt("requestsok"); + if (num != null) this.nrOfOk = num; + num = health.getInt("requestserr"); + if (num != null) this.nrOfError = num; + this.lastUsedOk = dateFormat.parse(health.getString("lastsuccess")).getTime(); + this.lastUsedError = dateFormat.parse(health.getString("lasterror")).getTime(); + this.lastUpdated = dateFormat.parse(health.getString("lastupdated")).getTime(); + this.upsince = dateFormat.parse(health.getString("upsince")).getTime(); + if (health.has("lastmessage")) this.lastMessage = health.getString("lastmessage"); } diff --git a/src/main/java/org/searsia/web/OpenSearch.java b/src/main/java/org/searsia/web/OpenSearch.java index ff26098..d89be39 100644 --- a/src/main/java/org/searsia/web/OpenSearch.java +++ b/src/main/java/org/searsia/web/OpenSearch.java @@ -37,9 +37,11 @@ public class OpenSearch { private ResourceIndex engines; + private boolean dontshare; - public OpenSearch(ResourceIndex engines) throws IOException { - this.engines = engines; + public OpenSearch(ResourceIndex engines, boolean dontshare) throws IOException { + this.engines = engines; + this.dontshare = dontshare; } @GET @Path("opensearch.xml") @@ -87,7 +89,9 @@ private String engineXML(Resource engine) { response += "\n"; response += " " + xmlEncode(shortName) + "\n"; response += " Search the web with " + xmlEncode(shortName) + "\n"; - response += " \n"; + if(!dontshare) { + response += " \n"; + } if (userTemplate != null) response += " \n"; if (suggestTemplate != null) response += " \n"; if (testQuery != null) response += " \n"; diff --git a/src/main/java/org/searsia/web/Proxy.java b/src/main/java/org/searsia/web/Proxy.java index 5e3ef67..ccd69c0 100644 --- a/src/main/java/org/searsia/web/Proxy.java +++ b/src/main/java/org/searsia/web/Proxy.java @@ -1,3 +1,19 @@ +/* + * Copyright 2016-2017 Searsia + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + package org.searsia.web; import java.io.ByteArrayOutputStream; @@ -6,8 +22,13 @@ import java.net.HttpURLConnection; import java.net.URL; import java.net.URLConnection; +import java.text.DateFormat; +import java.text.SimpleDateFormat; +import java.util.Date; import java.util.HashMap; +import java.util.Locale; import java.util.Map; +import java.util.TimeZone; import javax.ws.rs.GET; import javax.ws.rs.Path; @@ -21,40 +42,52 @@ import org.searsia.engine.Resource; import org.searsia.index.ResourceIndex; - +/** + * Provides a proxy for any (image) url + * and a special caching proxy for the resources' fav icons. + */ @Path("{resourceid}") public class Proxy { private ResourceIndex engines; - private Map iconStore = new HashMap(); - + private Map iconStore = new HashMap(); + private String lastModified = null; + public Proxy(ResourceIndex engines) throws IOException { + DateFormat dateFormat = new SimpleDateFormat("EEE, FF MMM yyyy hh:mm:ss zzz", Locale.ROOT); + dateFormat.setTimeZone(TimeZone.getTimeZone("GMT")); + this.lastModified = dateFormat.format(new Date()); this.engines = engines; } @GET @Path("proxy") - public Response query(@PathParam("resourceid") String resourceid, @QueryParam("url") String url, @Context HttpHeaders headers) { + public Response query(@PathParam("resourceid") String resourceid, + @QueryParam("url") String url, + @Context HttpHeaders headers) { try { - if (headers.getRequestHeader("If-Modified-Since") != null || headers.getRequestHeader("If-None-Match") != null) { + if (headers.getRequestHeader("If-Modified-Since") != null + || headers.getRequestHeader("If-None-Match") != null) { return Response.notModified().build(); } else { - return getWebResponse(url); + return getStreamResponse(url); } } catch (Exception e) { - return Response.status(503).build(); + return Response.status(503).build(); // 503 = unavailable } } @GET @Path("icon") - public Response icon(@PathParam("resourceid") String resourceid, @Context HttpHeaders headers) { + public Response icon(@PathParam("resourceid") String resourceid, + @Context HttpHeaders headers) { try { - if (headers.getRequestHeader("If-Modified-Since") != null || headers.getRequestHeader("If-None-Match") != null) { + if (headers.getRequestHeader("If-Modified-Since") != null + || headers.getRequestHeader("If-None-Match") != null) { return Response.notModified().build(); } else { return getWebIcon(resourceid); } } catch (Exception e) { - return Response.status(503).build(); // unavailable + return Response.status(503).build(); } } @@ -70,26 +103,34 @@ private Response getWebIcon(String resourceid) throws IOException { if (iconFile == null) { return Response.status(503).build(); } - return getIconResponse(iconFile); + ResponseBuilder builder = iconStore.get(resourceid); + if (builder == null) { + builder = getCachedBuilder(iconFile); + iconStore.put(resourceid, builder); + } + return builder.build(); } - private Response getIconResponse(String urlString) throws IOException { + private ResponseBuilder getCachedBuilder(String urlString) throws IOException { HttpURLConnection http = getHttp(urlString); + String contentType = http.getHeaderField("Content-Type"); + if (contentType == null) { + contentType = "image/png"; + } InputStream stream = http.getInputStream(); ByteArrayOutputStream baos = new ByteArrayOutputStream(); - byte[] buffer = new byte[1024]; // Experiment with this value + byte[] buffer = new byte[1024]; int bytesRead; while ((bytesRead = stream.read(buffer)) != -1) { baos.write(buffer, 0, bytesRead); } - return Response.ok(baos.toByteArray(), "image/png").build(); + return Response.ok(baos.toByteArray(), contentType) + .header("Last-Modified", lastModified); } - - private Response getWebResponse(String urlString) throws IOException { + private Response getStreamResponse(String urlString) throws IOException { HttpURLConnection http = getHttp(urlString); - InputStream stream = http.getInputStream(); - return responseWithHeaders(http, stream).build(); + return getStreamBuilder(http).build(); } private HttpURLConnection getHttp(String urlString) throws IOException { @@ -106,8 +147,8 @@ private HttpURLConnection getHttp(String urlString) throws IOException { return http; } - private ResponseBuilder responseWithHeaders(HttpURLConnection http, InputStream stream) { - ResponseBuilder builder = Response.ok(stream); + private ResponseBuilder getStreamBuilder(HttpURLConnection http) throws IOException { + ResponseBuilder builder = Response.ok(http.getInputStream()); String field = http.getHeaderField("Content-Type"); if (field != null) builder.header("Content-Type", field); field = http.getHeaderField("Content-Length"); diff --git a/src/main/java/org/searsia/web/Search.java b/src/main/java/org/searsia/web/Search.java index 5b2a9c0..652a0d5 100644 --- a/src/main/java/org/searsia/web/Search.java +++ b/src/main/java/org/searsia/web/Search.java @@ -32,6 +32,7 @@ import org.json.JSONObject; import org.searsia.SearchResult; +import org.searsia.SearsiaOptions; import org.searsia.index.SearchResultIndex; import org.searsia.index.ResourceIndex; import org.searsia.engine.Resource; @@ -51,13 +52,15 @@ public class Search { private ResourceIndex engines; private SearchResultIndex index; + private SearsiaOptions options; private long nrOfQueriesOk = 0; private long nrOfQueriesError = 0; - public Search(SearchResultIndex index, ResourceIndex engines) throws IOException { + public Search(SearchResultIndex index, ResourceIndex engines, SearsiaOptions options) throws IOException { this.engines = engines; - this.index = index; + this.index = index; + this.options = options; } @OPTIONS @@ -134,9 +137,7 @@ private Response getRemoteResults(String resourceid, String query) { if (result != null) { boolean censorQueryResourceId = true; json = result.toJson(censorQueryResourceId); - json.put("resource", engine.toJson()); LOGGER.info("Cache " + resourceid + ": " + query); - return SearsiaApplication.responseOk(json); } else { try { result = engine.search(query); @@ -144,9 +145,7 @@ private Response getRemoteResults(String resourceid, String query) { json = result.toJson(); // first json for response, so result.addResourceDate(engine.getId()); // response will not have resource id + date index.offer(result); // maybe do this AFTER the http response is sent: https://jersey.java.net/documentation/latest/async.html (11.1.1) - json.put("resource", engine.toJson()); LOGGER.info("Query " + resourceid + ": " + query); - return SearsiaApplication.responseOk(json); } catch (Exception e) { String message = "Resource " + resourceid + " unavailable: " + e.getMessage(); LOGGER.warn(message); @@ -154,11 +153,18 @@ private Response getRemoteResults(String resourceid, String query) { } } } else { - json = new JSONObject().put("resource", engine.toJson()); - json.put("health", engine.toJsonHealth()); + json = new JSONObject(); + if (!options.isNoHealthReport()) { + json.put("health", engine.toJsonHealth()); + } LOGGER.info("Resource " + resourceid + "."); - return SearsiaApplication.responseOk(json); } + if (options.isNotShared()) { + json.put("resource", engine.toJsonEngineDontShare()); + } else { + json.put("resource", engine.toJson()); + } + return SearsiaApplication.responseOk(json); } private Response getLocalResults(String query, int max, int start) { @@ -191,10 +197,12 @@ private Response getLocalResults(String query, int max, int start) { } else { // no query: create a 'resource only' result, plus health report result = new SearchResult(); result.scoreResourceSelection(null, engines, max, start); - healthJson = engines.toJsonHealth(); - healthJson.put("requestsok", this.nrOfQueriesOk); - healthJson.put("requestserr", this.nrOfQueriesError); - healthJson.put("upsince", startTime); + if (!this.options.isNoHealthReport()) { + healthJson = engines.toJsonHealth(); + healthJson.put("requestsok", this.nrOfQueriesOk); + healthJson.put("requestserr", this.nrOfQueriesError); + healthJson.put("upsince", startTime); + } LOGGER.info("Local."); } json = result.toJson(); diff --git a/src/main/java/org/searsia/web/SearsiaApplication.java b/src/main/java/org/searsia/web/SearsiaApplication.java index fc69e02..1098069 100644 --- a/src/main/java/org/searsia/web/SearsiaApplication.java +++ b/src/main/java/org/searsia/web/SearsiaApplication.java @@ -22,6 +22,7 @@ import org.glassfish.jersey.server.ResourceConfig; import org.json.JSONObject; +import org.searsia.SearsiaOptions; import org.searsia.index.SearchResultIndex; import org.searsia.index.ResourceIndex; @@ -32,7 +33,7 @@ */ public class SearsiaApplication extends ResourceConfig { - public static final String VERSION = "v1.0.0"; + public static final String VERSION = "v1.0.1"; protected static Response responseOk(JSONObject json) { json.put("searsia", VERSION); @@ -64,11 +65,16 @@ protected static Response jsonResponse(int status, JSONObject json) { .build(); } - public SearsiaApplication(SearchResultIndex index, ResourceIndex engines) throws IOException { + public SearsiaApplication(SearchResultIndex index, + ResourceIndex engines, + SearsiaOptions options) throws IOException { super(); java.util.logging.Logger.getLogger("").setLevel(java.util.logging.Level.WARNING); - register(new Search(index, engines)); - register(new OpenSearch(engines)); + register(new Search(index, engines, options)); + register(new OpenSearch(engines, options.isNotShared())); + if (options.isAnonymous()) { + register(new Proxy(engines)); + } register(new Redirect(engines.getMyself().getId())); } diff --git a/src/test/java/org/searsia/web/SearchTest.java b/src/test/java/org/searsia/web/SearchTest.java index 0d86ee6..3870b4f 100644 --- a/src/test/java/org/searsia/web/SearchTest.java +++ b/src/test/java/org/searsia/web/SearchTest.java @@ -18,6 +18,7 @@ import org.junit.Assert; import org.junit.BeforeClass; import org.junit.Test; +import org.searsia.SearsiaOptions; import org.searsia.index.SearchResultIndex; import org.searsia.index.ResourceIndex; import org.searsia.web.Search; @@ -32,8 +33,9 @@ public class SearchTest { private static final String INDEX = "test2"; private static SearchResultIndex index; private static ResourceIndex engines; - - + private static SearsiaOptions options; + + private static Resource wiki() throws XPathExpressionException, JSONException { return new Resource(new JSONObject("{\"apitemplate\":\"http://searsia.org/searsia/wiki/wiki{q}.json\", \"id\":\"wiki\"}")); } @@ -67,6 +69,7 @@ public static void setUp() throws Exception { LOGGER.setLevel(Level.ALL); index = new SearchResultIndex(PATH, INDEX, 10); engines = new ResourceIndex(PATH, INDEX); + options = new SearsiaOptions(); engines.putMother(wiki()); engines.put(wrong()); engines.put(ok()); @@ -80,7 +83,7 @@ public static void lastThing() throws IOException { @Test // returns 'my' resource description public void test() throws IOException { - Search search = new Search(index, engines); + Search search = new Search(index, engines, options); Response response = search.query("wiki.json", "", null, null); int status = response.getStatus(); String entity = (String) response.getEntity(); @@ -92,7 +95,7 @@ public void test() throws IOException { @Test // returns local search results for 'searsia' public void testQuery() throws IOException { - Search search = new Search(index, engines); + Search search = new Search(index, engines, options); Response response = search.query("wiki.json", "searsia search for noobs", null, null); int status = response.getStatus(); String entity = (String) response.getEntity(); @@ -114,7 +117,7 @@ public void testQuery() throws IOException { @Test // returns local resource 'wrong' public void testResource() throws IOException, XPathExpressionException, JSONException { - Search search = new Search(index, engines); + Search search = new Search(index, engines, options); Response response = search.query("wrong.json", "", null, null); int status = response.getStatus(); String entity = (String) response.getEntity(); @@ -126,7 +129,7 @@ public void testResource() throws IOException, XPathExpressionException, JSONExc @Test // returns resource 'wikididyoumean' (from mother) public void testResourceUnknown() throws IOException { - Search search = new Search(index, engines); + Search search = new Search(index, engines, options); Response response = search.query("wikididyoumean.json", "", null, null); int status = response.getStatus(); String entity = (String) response.getEntity(); @@ -138,7 +141,7 @@ public void testResourceUnknown() throws IOException { @Test // returns results for the engine 'wrong' (which does not exist) public void testError() throws IOException { - Search search = new Search(index, engines); + Search search = new Search(index, engines, options); Response response = search.query("wrong.json", "testquery", null, null); int status = response.getStatus(); Assert.assertEquals(503, status); @@ -146,7 +149,7 @@ public void testError() throws IOException { @Test // returns results for the engine 'wikifull1' public void testOk() throws IOException, XPathExpressionException, JSONException { - Search search = new Search(index, engines); + Search search = new Search(index, engines, options); Response response = search.query("wikifull1.json", "informat", null, null); int status = response.getStatus(); String entity = (String) response.getEntity(); @@ -174,5 +177,4 @@ public void testOk() throws IOException, XPathExpressionException, JSONException LOGGER.trace("No result: " + json); } - } From e10e50b4f6f2d7f208a3d2da8dc52cbf1e1f3862 Mon Sep 17 00:00:00 2001 From: Searsia Date: Fri, 29 Sep 2017 16:59:05 +0200 Subject: [PATCH 48/51] minor proxy and api change --- src/main/java/org/searsia/Main.java | 3 +- src/main/java/org/searsia/SearsiaOptions.java | 28 +++++------ src/main/java/org/searsia/web/OpenSearch.java | 13 +++--- src/main/java/org/searsia/web/Proxy.java | 46 +++++++++---------- src/main/java/org/searsia/web/Redirect.java | 2 +- src/main/java/org/searsia/web/Search.java | 35 +++++++------- src/test/java/org/searsia/web/SearchTest.java | 14 ++++++ 7 files changed, 80 insertions(+), 61 deletions(-) diff --git a/src/main/java/org/searsia/Main.java b/src/main/java/org/searsia/Main.java index 674723f..33adccb 100644 --- a/src/main/java/org/searsia/Main.java +++ b/src/main/java/org/searsia/Main.java @@ -394,7 +394,8 @@ public static void main(String[] args) { // Start the update daemon if not testing if (options.getTestOutput() == null) { - printMessage("API end point: " + normalizedUriToTemplate(myURI, myself.getId()), options.isQuiet()); + String myAPI = normalizedUriToTemplate(myURI + "searsia/", myself.getId()); + printMessage("API end point: " + myAPI, options.isQuiet()); printMessage("Use Ctrl+c to stop.", options.isQuiet()); try { searsiaDaemon(index, engines, options); diff --git a/src/main/java/org/searsia/SearsiaOptions.java b/src/main/java/org/searsia/SearsiaOptions.java index 53bf233..ac1257f 100644 --- a/src/main/java/org/searsia/SearsiaOptions.java +++ b/src/main/java/org/searsia/SearsiaOptions.java @@ -73,8 +73,8 @@ public SearsiaOptions(String[] args) throws IllegalArgumentException, MalformedU options.addOption("u", "url", true, "Set url of my web service endpoint."); setDefaults(); parse(options, args); - if (myURI == null && motherTemplate != null) { - myURI = "http://localhost:16842/searsia/" + lastDir(motherTemplate); + if (myURI == null) { + myURI = "http://localhost:16842/"; } } @@ -85,20 +85,22 @@ public SearsiaOptions() { setDefaults(); } - private static String lastDir(String urlString) throws MalformedURLException { + private String rootDir() { + String rootDir = "searsia"; + String urlString = getMotherTemplate(); urlString = urlString.replaceAll("\\{[0-9A-Za-z\\-_]+\\?\\}", ""); - URL url = new URL(urlString); - String path = url.getPath(); - if (path != null && path.contains("/")) { - path = path.replaceAll("\\/[^\\/]*$", ""); // remove file - path = path.replaceAll("^.+\\/", ""); // remove trailing directories - return path + "/"; - } else { - return ""; - } + try { + URL url = new URL(urlString); + String path = url.getPath(); + if (path != null && path.contains("/")) { + path = path.replaceAll("\\/[^\\/]*$", ""); // remove file + path = path.replaceAll("^.+\\/", ""); // remove trailing directories + rootDir = path + "/"; + } + } catch (MalformedURLException e) { } + return rootDir; } - private void setDefaults() { anonymous = false; test = null; // no test diff --git a/src/main/java/org/searsia/web/OpenSearch.java b/src/main/java/org/searsia/web/OpenSearch.java index d89be39..f308d97 100644 --- a/src/main/java/org/searsia/web/OpenSearch.java +++ b/src/main/java/org/searsia/web/OpenSearch.java @@ -33,7 +33,7 @@ * @author hiemstra * */ -@Path("{resourceid}") +@Path("opensearch") public class OpenSearch { private ResourceIndex engines; @@ -44,18 +44,19 @@ public OpenSearch(ResourceIndex engines, boolean dontshare) throws IOException { this.dontshare = dontshare; } - @GET @Path("opensearch.xml") + @GET @Path("{resourceid}") @Produces("application/opensearchdescription+xml; charset=utf-8") public Response get(@PathParam("resourceid") String resourceid) { - Resource engine; + resourceid = resourceid.replaceAll("\\.xml$", ""); + Resource engine = null; if (resourceid.equals(engines.getMyself().getId())) { engine = engines.getMyself(); } else { engine = engines.get(resourceid); } if (engine != null) { - String response = engineXML(engine); - return Response.ok(response).build(); + String xmlString = engineXML(engine); + return Response.ok(xmlString).build(); } else { return SearsiaApplication.responseError(404, "Not found: " + resourceid); } @@ -89,7 +90,7 @@ private String engineXML(Resource engine) { response += "\n"; response += " " + xmlEncode(shortName) + "\n"; response += " Search the web with " + xmlEncode(shortName) + "\n"; - if(!dontshare) { + if(!dontshare && apiTemplate != null) { // TODO: own api or foward API? response += " \n"; } if (userTemplate != null) response += " \n"; diff --git a/src/main/java/org/searsia/web/Proxy.java b/src/main/java/org/searsia/web/Proxy.java index ccd69c0..11710c6 100644 --- a/src/main/java/org/searsia/web/Proxy.java +++ b/src/main/java/org/searsia/web/Proxy.java @@ -46,7 +46,7 @@ * Provides a proxy for any (image) url * and a special caching proxy for the resources' fav icons. */ -@Path("{resourceid}") +@Path("images") public class Proxy { private ResourceIndex engines; @@ -60,23 +60,25 @@ public Proxy(ResourceIndex engines) throws IOException { this.engines = engines; } - @GET @Path("proxy") - public Response query(@PathParam("resourceid") String resourceid, - @QueryParam("url") String url, + @GET + public Response query(@QueryParam("url") String url, @Context HttpHeaders headers) { + if (url == null) { + return Response.status(404).build(); + } try { if (headers.getRequestHeader("If-Modified-Since") != null || headers.getRequestHeader("If-None-Match") != null) { return Response.notModified().build(); } else { - return getStreamResponse(url); + return getStreamBuilder(url).build(); } } catch (Exception e) { return Response.status(503).build(); // 503 = unavailable } } - @GET @Path("icon") + @GET @Path("{resourceid}") public Response icon(@PathParam("resourceid") String resourceid, @Context HttpHeaders headers) { try { @@ -125,12 +127,23 @@ private ResponseBuilder getCachedBuilder(String urlString) throws IOException { baos.write(buffer, 0, bytesRead); } return Response.ok(baos.toByteArray(), contentType) - .header("Last-Modified", lastModified); + .header("Last-Modified", lastModified); } - private Response getStreamResponse(String urlString) throws IOException { + private ResponseBuilder getStreamBuilder(String urlString) throws IOException { HttpURLConnection http = getHttp(urlString); - return getStreamBuilder(http).build(); + ResponseBuilder builder = Response.ok(http.getInputStream()); + String field = http.getHeaderField("Content-Type"); + if (field != null) builder.header("Content-Type", field); + field = http.getHeaderField("Content-Length"); + if (field != null) builder.header("Content-Length", field); + field = http.getHeaderField("Expires"); + if (field != null) builder.header("Expires", field); + field = http.getHeaderField("Cache-Control"); + if (field != null) builder.header("Cache-Control", field); + field = http.getHeaderField("Last-Modified"); + if (field != null) builder.header("Last-Modified", field); + return builder; } private HttpURLConnection getHttp(String urlString) throws IOException { @@ -146,20 +159,5 @@ private HttpURLConnection getHttp(String urlString) throws IOException { http.connect(); return http; } - - private ResponseBuilder getStreamBuilder(HttpURLConnection http) throws IOException { - ResponseBuilder builder = Response.ok(http.getInputStream()); - String field = http.getHeaderField("Content-Type"); - if (field != null) builder.header("Content-Type", field); - field = http.getHeaderField("Content-Length"); - if (field != null) builder.header("Content-Length", field); - field = http.getHeaderField("Expires"); - if (field != null) builder.header("Expires", field); - field = http.getHeaderField("Cache-Control"); - if (field != null) builder.header("Cache-Control", field); - field = http.getHeaderField("Last-Modified"); - if (field != null) builder.header("Last-Modified", field); - return builder; - } } \ No newline at end of file diff --git a/src/main/java/org/searsia/web/Redirect.java b/src/main/java/org/searsia/web/Redirect.java index a3060e5..475b830 100644 --- a/src/main/java/org/searsia/web/Redirect.java +++ b/src/main/java/org/searsia/web/Redirect.java @@ -10,7 +10,7 @@ import org.searsia.SearchResult; -@Path("/") +@Path("searsia") public class Redirect { String id; diff --git a/src/main/java/org/searsia/web/Search.java b/src/main/java/org/searsia/web/Search.java index 652a0d5..5b24bc3 100644 --- a/src/main/java/org/searsia/web/Search.java +++ b/src/main/java/org/searsia/web/Search.java @@ -43,7 +43,8 @@ * * @author Dolf Trieschnigg and Djoerd Hiemstra */ -@Path("{resourceid}") + +@Path("searsia") public class Search { private final static org.apache.log4j.Logger LOGGER = org.apache.log4j.Logger.getLogger(Search.class); @@ -52,18 +53,20 @@ public class Search { private ResourceIndex engines; private SearchResultIndex index; - private SearsiaOptions options; + private boolean health; + private boolean shared; private long nrOfQueriesOk = 0; private long nrOfQueriesError = 0; public Search(SearchResultIndex index, ResourceIndex engines, SearsiaOptions options) throws IOException { - this.engines = engines; - this.index = index; - this.options = options; + this.engines = engines; + this.index = index; + this.health = !options.isNoHealthReport(); + this.shared = !options.isNotShared(); } - @OPTIONS + @OPTIONS @Path("{resourceid}") public Response options() { return Response.status(Response.Status.NO_CONTENT) .header("Access-Control-Allow-Origin", "*") @@ -71,12 +74,12 @@ public Response options() { .build(); } - @GET + @GET @Path("{resourceid}") @Produces(SearchResult.SEARSIA_MIME_ENCODING) - public Response query(@PathParam("resourceid") String resourceid, - @QueryParam("q") String searchTerms, - @QueryParam("resources") String countResources, - @QueryParam("page") String pageOffset) { + public Response query(@PathParam("resourceid") String resourceid, + @QueryParam("q") String searchTerms, + @QueryParam("resources") String countResources, + @QueryParam("page") String pageOffset) { resourceid = resourceid.replaceAll("\\.json$", ""); Resource me = engines.getMyself(); if (!resourceid.equals(me.getId())) { @@ -154,15 +157,15 @@ private Response getRemoteResults(String resourceid, String query) { } } else { json = new JSONObject(); - if (!options.isNoHealthReport()) { + if (this.health) { json.put("health", engine.toJsonHealth()); } LOGGER.info("Resource " + resourceid + "."); } - if (options.isNotShared()) { - json.put("resource", engine.toJsonEngineDontShare()); - } else { + if (this.shared) { json.put("resource", engine.toJson()); + } else { + json.put("resource", engine.toJsonEngineDontShare()); } return SearsiaApplication.responseOk(json); } @@ -197,7 +200,7 @@ private Response getLocalResults(String query, int max, int start) { } else { // no query: create a 'resource only' result, plus health report result = new SearchResult(); result.scoreResourceSelection(null, engines, max, start); - if (!this.options.isNoHealthReport()) { + if (this.health) { healthJson = engines.toJsonHealth(); healthJson.put("requestsok", this.nrOfQueriesOk); healthJson.put("requestserr", this.nrOfQueriesError); diff --git a/src/test/java/org/searsia/web/SearchTest.java b/src/test/java/org/searsia/web/SearchTest.java index 3870b4f..b02fe28 100644 --- a/src/test/java/org/searsia/web/SearchTest.java +++ b/src/test/java/org/searsia/web/SearchTest.java @@ -124,9 +124,23 @@ public void testResource() throws IOException, XPathExpressionException, JSONExc JSONObject json = new JSONObject(entity); JSONObject resource = (JSONObject) json.get("resource"); Assert.assertEquals(200, status); + Assert.assertTrue(json.has("health")); Assert.assertEquals(wrong().getAPITemplate(), resource.get("apitemplate")); } + @Test // returns local resource 'wrong' without apitemplate and health + public void testResourceNoSharing() throws IOException, XPathExpressionException, JSONException { + String[] args = {"-d", "-n", "-m=http://searsia.org/searsia/wiki/wiki{q}.json"}; + SearsiaOptions newOptions = new SearsiaOptions(args); + Search search = new Search(index, engines, newOptions); + Response response = search.query("wrong.json", "", null, null); + String entity = (String) response.getEntity(); + JSONObject json = new JSONObject(entity); + JSONObject resource = (JSONObject) json.get("resource"); + Assert.assertFalse(json.has("health")); + Assert.assertFalse(resource.has("apitemplate")); + } + @Test // returns resource 'wikididyoumean' (from mother) public void testResourceUnknown() throws IOException { Search search = new Search(index, engines, options); From 0a02111e970df0c55f96b892e28bdd1ba0c24f2e Mon Sep 17 00:00:00 2001 From: Searsia Date: Tue, 10 Oct 2017 19:04:25 +0200 Subject: [PATCH 49/51] resource index more lenient on index error --- .../java/org/searsia/index/ResourceIndex.java | 46 ++++++++++++------- 1 file changed, 29 insertions(+), 17 deletions(-) diff --git a/src/main/java/org/searsia/index/ResourceIndex.java b/src/main/java/org/searsia/index/ResourceIndex.java index 336c1ac..d7506fb 100644 --- a/src/main/java/org/searsia/index/ResourceIndex.java +++ b/src/main/java/org/searsia/index/ResourceIndex.java @@ -20,10 +20,13 @@ import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; +import java.text.ParseException; import java.util.LinkedHashMap; import java.util.Map; import java.util.Random; +import javax.xml.xpath.XPathExpressionException; + import org.apache.log4j.Logger; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; @@ -72,16 +75,20 @@ public class ResourceIndex { * @param path path where the Searsia index resides * @param filename index file name * @throws IOException - * @throws JSONException */ - public ResourceIndex(String path, String filename) throws IOException, JSONException { + public ResourceIndex(String path, String filename) throws IOException { this.meFile = Paths.get(path, filename + ".json"); this.indexDir = Paths.get(path, filename + "_sources"); if (meFile.toFile().exists()) { - this.me = readMyselfFile(meFile); + try { + this.me = readMyselfFile(meFile); + } catch (IOException e) { + LOGGER.warn("Myself not found: " + e.getMessage()); + meFile.toFile().delete(); + } } if (this.indexDir.toFile().exists()) { - readResourceIndex(); + readResourceIndex(); } else { this.indexDir.toFile().mkdir(); } @@ -98,7 +105,7 @@ private void writeMyselfFile(Resource engine) throws IOException { } - private Resource readMyselfFile(Path meFile) throws IOException, JSONException { + private Resource readMyselfFile(Path meFile) throws IOException { String content = new String(Files.readAllBytes(meFile)); Resource me = null; try { @@ -106,6 +113,8 @@ private Resource readMyselfFile(Path meFile) throws IOException, JSONException { me = new Resource(json); } catch (javax.xml.xpath.XPathExpressionException e) { throw new IOException(e); + } catch (JSONException e) { + throw new IOException(e); } return me; } @@ -126,21 +135,24 @@ private void readResourceIndex() throws IOException { ScoreDoc[] hits = searcher.search(new MatchAllDocsQuery(), MAX_SOURCE_CACHE).scoreDocs; for (ScoreDoc hit: hits) { Document doc = searcher.doc(hit.doc); - JSONObject json = new JSONObject(doc.get("json")); - Resource engine = new Resource((JSONObject) json.get("resource")); - if (json.has("health")) { - engine.updateHealth((JSONObject) json.get("health")); - String lastUpdated = engine.getLastUpdatedString(); - if (this.lastFlushed == null || this.lastFlushed.compareTo(lastUpdated) < 0) { - this.lastFlushed = lastUpdated; + try{ + JSONObject json = new JSONObject(doc.get("json")); + Resource engine = new Resource((JSONObject) json.get("resource")); + if (json.has("health")) { + engine.updateHealth((JSONObject) json.get("health")); + String lastUpdated = engine.getLastUpdatedString(); + if (this.lastFlushed == null || this.lastFlushed.compareTo(lastUpdated) < 0) { + this.lastFlushed = lastUpdated; + } } + this.engines.put(engine.getId(), engine); + } catch (XPathExpressionException | JSONException | ParseException e) { + LOGGER.warn("Garbled index: " + e.getLocalizedMessage()); } - this.engines.put(engine.getId(), engine); } - } catch (Exception e) { - throw new IOException(e.getMessage()); - } - finally { + } catch (IOException e) { + throw new IOException(e); + } finally { reader.close(); } } From b86f8f4ae3d31747c9bc61b27934b1aa9a4b47c1 Mon Sep 17 00:00:00 2001 From: Searsia Date: Wed, 22 Nov 2017 19:23:09 +0100 Subject: [PATCH 50/51] Signatures, Health, Reranking --- pom.xml | 4 +- src/main/java/org/searsia/SearchResult.java | 25 +++- .../java/org/searsia/engine/Resource.java | 40 +++++- .../java/org/searsia/engine/Signatures.java | 127 ++++++++++++++++++ .../java/org/searsia/SearchResultTest.java | 6 +- .../org/searsia/engine/SignaturesTest.java | 26 ++++ 6 files changed, 218 insertions(+), 10 deletions(-) create mode 100644 src/main/java/org/searsia/engine/Signatures.java create mode 100644 src/test/java/org/searsia/engine/SignaturesTest.java diff --git a/pom.xml b/pom.xml index 045c05f..4831956 100644 --- a/pom.xml +++ b/pom.xml @@ -5,7 +5,7 @@ searsiaserver 1.0.1 - 3 + 3.0 UTF-8 @@ -19,7 +19,7 @@ org.apache.maven.plugins maven-assembly-plugin - 2.6 + 3.1.0 jar-with-dependencies diff --git a/src/main/java/org/searsia/SearchResult.java b/src/main/java/org/searsia/SearchResult.java index 6d1cda4..26dd90c 100644 --- a/src/main/java/org/searsia/SearchResult.java +++ b/src/main/java/org/searsia/SearchResult.java @@ -229,11 +229,21 @@ public void selectBestResources(int max, int start) { public void scoreReranking(String query, String model) { if ("random".equals(model)) { scoreRerankingRandom(); + } else if ("bestrandom".equals(model)) { + scoreRerankingBestRandom(query); } else { scoreRerankingRest(query); } } + private void scoreRerankingBestRandom(String query) { + scoreRerankingRandom(); + scoreRerankingGeneral(query, 10); + } + + private void scoreRerankingRest(String query) { + scoreRerankingGeneral(query, 0); + } private void scoreRerankingRandom() { Hit hit; @@ -247,21 +257,28 @@ private void scoreRerankingRandom() { } } - - private void scoreRerankingRest(String query) { + private void scoreRerankingGeneral(String query, int count) { SearchResult newResult = new SearchResult(); Map queryTerms = new HashMap(); for (String term: query.toLowerCase().split(TOKENIZER)) { - queryTerms.put(term, 0.01f); // TODO df from Lucene index? + queryTerms.put(term, 0.1f); // TODO idf from Lucene index }; for (Hit hit: this.hits) { float score = 0.0f; String text = hit.toIndexVersion(); + for (String term: queryTerms.keySet()) { + queryTerms.put(term, 0.1f); + } for (String term: text.toLowerCase().split(TOKENIZER)) { if (queryTerms.containsKey(term)) { - score += 1.0f; // TODO: single query term multiple times? + score += queryTerms.get(term); + queryTerms.put(term, 0.0f); } } + if (count > 0) { + score += 0.01f; + count -= 1; + } if (score > 0.001f) { hit.put("score", score); newResult.addHit(hit); diff --git a/src/main/java/org/searsia/engine/Resource.java b/src/main/java/org/searsia/engine/Resource.java index cb40fae..6fa86ba 100644 --- a/src/main/java/org/searsia/engine/Resource.java +++ b/src/main/java/org/searsia/engine/Resource.java @@ -80,6 +80,7 @@ public class Resource implements Comparable { private String urlSuggestTemplate = null; private String mimeType = null; private String postString = null; + private String signature = null; private String postQueryEncode = null; private String favicon = null; private String banner = null; @@ -121,6 +122,7 @@ public Resource(JSONObject jo) throws XPathExpressionException, JSONException { if (jo.has("mimetype")) this.mimeType = jo.getString("mimetype"); if (jo.has("post")) this.postString = jo.getString("post"); if (jo.has("postencode")) this.postQueryEncode = jo.getString("postencode"); + if (jo.has("signature")) this.signature = jo.getString("signature"); if (jo.has("name")) this.name = jo.getString("name"); if (jo.has("testquery")) this.testQuery = jo.getString("testquery"); if (jo.has("urltemplate")) this.urlUserTemplate = jo.getString("urltemplate"); @@ -533,6 +535,10 @@ private String fillTemplate(String template, String query) throws SearchExceptio String param = url.substring(url.indexOf("{"), url.indexOf("}") + 1); throw new SearchException("Missing url parameter " + param); } + String signature = getSignatureName(); + if (signature != null) { + url = Signatures.sign(url, signature, getSignatureKey()); + } return url; } @@ -659,6 +665,35 @@ public String getName() { return this.name; } + public String getSignature() { + return this.signature; + } + + public String getSignatureName() { + if (this.signature == null) { + return null; + } + int begin = this.signature.indexOf("("); + if (begin == -1) { + return this.signature; + } else { + return this.signature.substring(0, begin); + } + } + + public String getSignatureKey() { + int begin = this.signature.indexOf("("); + if (begin == -1) { + return null; + } else { + String key = this.signature.substring(begin + 1, this.signature.length() -1); + if (key.startsWith("{")) { + key = this.privateParameters.get(key.substring(1, key.length() - 1)); + } + return key; + } + } + public String getUserTemplate() { return this.urlUserTemplate; } @@ -817,7 +852,7 @@ public Long getLastUsedSecondsAgo() { } public boolean isHealthy() { - return this.lastUsedOk >= this.lastUsedError; + return this.lastUsedOk >= this.lastUsedError || this.nrOfError == 0; } @@ -886,6 +921,7 @@ public void updateWith(Resource e2) { // TODO: bad idea in multi-threaded app!? this.id = e2.id; this.deleted = e2.deleted; this.name = e2.name; + this.signature = e2.signature; this.urlUserTemplate = e2.urlUserTemplate; this.favicon = e2.favicon; this.banner = e2.banner; @@ -923,6 +959,7 @@ public JSONObject toJsonEngine() { engine.put("deleted", true); } else { if (name != null) engine.put("name", name); + if (signature != null) engine.put("signature", signature); if (urlUserTemplate != null) engine.put("urltemplate", urlUserTemplate); if (favicon != null) engine.put("favicon", favicon); if (banner != null) engine.put("banner", banner); @@ -1032,6 +1069,7 @@ public boolean equals(Object o) { // TODO: AARGH, can't this be done simpler? if (!stringEquals(this.getId(), e.getId())) return false; if (this.isDeleted() != e.isDeleted()) return false; if (!stringEquals(this.getName(), e.getName())) return false; + if (!stringEquals(this.getSignature(), e.getSignature())) return false; if (!stringEquals(this.getMimeType(), e.getMimeType())) return false; if (!stringEquals(this.getRerank(), e.getRerank())) return false; if (!stringEquals(this.getFavicon(), e.getFavicon())) return false; diff --git a/src/main/java/org/searsia/engine/Signatures.java b/src/main/java/org/searsia/engine/Signatures.java new file mode 100644 index 0000000..a911c07 --- /dev/null +++ b/src/main/java/org/searsia/engine/Signatures.java @@ -0,0 +1,127 @@ +package org.searsia.engine; + +import java.io.UnsupportedEncodingException; +import java.net.URL; +import java.net.URLEncoder; +import java.text.DateFormat; +import java.text.SimpleDateFormat; +import java.util.Base64; +import java.util.Calendar; +import java.util.HashMap; +import java.util.Iterator; +import java.util.Map; +import java.util.SortedMap; +import java.util.TimeZone; +import java.util.TreeMap; + +import javax.crypto.Mac; +import javax.crypto.spec.SecretKeySpec; + + +public class Signatures { + + private static final String UTF8_CHARSET = "UTF-8"; + private static final String HMAC_SHA256_ALGORITHM = "HmacSHA256"; + + /** + * Signing a web request using Amazon's HmacSHA256. + * Example code from: + * http://docs.aws.amazon.com/AWSECommerceService/latest/DG/AuthJavaSampleSig2.html + * For more information: + * https://tools.ietf.org/html/rfc2104 + */ + public static String sign(String urlString, String algorithm, String secretKey) { + if (algorithm != null && !algorithm.equals(HMAC_SHA256_ALGORITHM)) { + throw new RuntimeException("Unsupported signature: " + algorithm); + } + if (secretKey == null) { + throw new RuntimeException("Signature key not found."); + } + Mac mac = null; + URL url = null; + try { + byte[] secretyKeyBytes = secretKey.getBytes(UTF8_CHARSET); + SecretKeySpec secretKeySpec = new SecretKeySpec(secretyKeyBytes, HMAC_SHA256_ALGORITHM); + mac = Mac.getInstance(HMAC_SHA256_ALGORITHM); + mac.init(secretKeySpec); + url = new URL(urlString); + } catch (Exception e) { // UnsupportedEncodingException, NoSuchAlgorithmException, InvalidKeyException, MalformedURLException + throw new RuntimeException(e); + } + String protocol = url.getProtocol(); + String host = url.getHost(); + String path = url.getPath(); + String query = url.getQuery(); + + Map params = new HashMap(); + for (String pair: query.split("&")) { + String[] keyValue = pair.split("="); + params.put(keyValue[0], keyValue[1]); + } + params.putIfAbsent("Timestamp", timestamp()); + + SortedMap sortedParamMap = new TreeMap(params); + String canonicalQS = canonicalize(sortedParamMap); + String toSign = "GET\n" + host + "\n" + path + "\n" + canonicalQS; + String hmac = hmac(mac, toSign); + String sig = percentEncodeRfc3986(hmac); + return protocol + "://" + host + path + "?" + canonicalQS + "&Signature=" + sig; + } + + private static String timestamp() { + String timestamp = null; + Calendar cal = Calendar.getInstance(); + DateFormat dfm = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'"); + dfm.setTimeZone(TimeZone.getTimeZone("GMT")); + timestamp = dfm.format(cal.getTime()); + return timestamp; + } + + private static String hmac(Mac mac, String stringToSign) { + String signature = null; + byte[] data; + byte[] rawHmac; + try { + data = stringToSign.getBytes(UTF8_CHARSET); + rawHmac = mac.doFinal(data); + Base64.Encoder encoder = Base64.getEncoder(); + signature = new String(encoder.encode(rawHmac)); + } catch (UnsupportedEncodingException e) { + throw new RuntimeException(e); + } + return signature; + } + + private static String canonicalize(SortedMap sortedParamMap) { + if (sortedParamMap.isEmpty()) { + return ""; + } + StringBuffer buffer = new StringBuffer(); + Iterator> iter = sortedParamMap.entrySet().iterator(); + while (iter.hasNext()) { + Map.Entry kvpair = iter.next(); + buffer.append(percentEncodeRfc3986(kvpair.getKey())); + buffer.append("="); + buffer.append(percentEncodeRfc3986(kvpair.getValue())); + if (iter.hasNext()) { + buffer.append("&"); + } + } + String canonical = buffer.toString(); + return canonical; + } + + private static String percentEncodeRfc3986(String s) { + String out; + try { + out = URLEncoder.encode(s, UTF8_CHARSET) + .replace("+", "%20") + .replace("*", "%2A") + .replace("%7E", "~"); + } catch (UnsupportedEncodingException e) { + throw new RuntimeException(e); + } + return out; + } + +} diff --git a/src/test/java/org/searsia/SearchResultTest.java b/src/test/java/org/searsia/SearchResultTest.java index 1c792d2..f460123 100644 --- a/src/test/java/org/searsia/SearchResultTest.java +++ b/src/test/java/org/searsia/SearchResultTest.java @@ -37,11 +37,11 @@ public void testSampleAndRerank() { String term = sr.randomTerm(notThis); Assert.assertFalse("Same random term", term.equals(notThis)); Assert.assertTrue("Index contains random term: " + term, terms.contains(term)); - Assert.assertEquals("Total nr of hits", sr.getHits().size(), 2); + Assert.assertEquals("Total nr of hits", 2, sr.getHits().size()); sr.scoreReranking("test", "or"); - Assert.assertEquals("Nr of hits after reranking", sr.getHits().size(), 2); + Assert.assertEquals("Nr of hits after reranking", 2, sr.getHits().size()); sr.scoreReranking("doesnotmatch", "or"); - Assert.assertEquals("Query matches zero results", sr.getHits().size(), 0); + Assert.assertEquals("Query matches zero results", 0, sr.getHits().size()); } @Test diff --git a/src/test/java/org/searsia/engine/SignaturesTest.java b/src/test/java/org/searsia/engine/SignaturesTest.java new file mode 100644 index 0000000..89f3dda --- /dev/null +++ b/src/test/java/org/searsia/engine/SignaturesTest.java @@ -0,0 +1,26 @@ +package org.searsia.engine; + +import org.junit.Assert; +import org.junit.Test; + +import org.searsia.engine.Signatures; + +public class SignaturesTest { + + /** + * Signing a web request using Amazon's HMAC-SHA256. + * Example string from: + * http://docs.aws.amazon.com/AWSECommerceService/latest/DG/rest-signature.html + * For more information: + * https://tools.ietf.org/html/rfc2104 + */ + @Test + public void testAmazonHMACSHA256() { + String secretKey = "1234567890"; + String requestUrl = "http://webservices.amazon.com/onca/xml?Service=AWSECommerceService&AWSAccessKeyId=AKIAIOSFODNN7EXAMPLE&AssociateTag=mytag-20&Operation=ItemLookup&ItemId=0679722769&ResponseGroup=Images,ItemAttributes,Offers,Reviews&Version=2013-08-01&Timestamp=2014-08-18T12:00:00Z"; + String targetUrl = "http://webservices.amazon.com/onca/xml?AWSAccessKeyId=AKIAIOSFODNN7EXAMPLE&AssociateTag=mytag-20&ItemId=0679722769&Operation=ItemLookup&ResponseGroup=Images%2CItemAttributes%2COffers%2CReviews&Service=AWSECommerceService&Timestamp=2014-08-18T12%3A00%3A00Z&Version=2013-08-01&Signature=j7bZM0LXZ9eXeZruTqWm2DIvDYVUU3wxPPpp%2BiXxzQc%3D"; + String signedUrl = Signatures.sign(requestUrl, "HmacSHA256", secretKey); + Assert.assertEquals("Signed request", targetUrl, signedUrl); + } + +} From e9508098d2976470d321dc0864da73bb656ccaa0 Mon Sep 17 00:00:00 2001 From: Searsia Date: Fri, 8 Dec 2017 12:53:18 +0100 Subject: [PATCH 51/51] Preparing for release * updated README * new version number * test all using paging now * supports opensearch url templates * improved error messages * lenient accept headers default * removed proxy and anonymous (moved to client) --- README.md | 19 +- pom.xml | 2 +- src/main/java/org/searsia/Main.java | 83 ++++++--- src/main/java/org/searsia/SearsiaOptions.java | 36 +--- .../java/org/searsia/engine/Resource.java | 37 +++- src/main/java/org/searsia/web/OpenSearch.java | 2 +- src/main/java/org/searsia/web/Proxy.java | 163 ------------------ src/main/java/org/searsia/web/Search.java | 10 +- .../org/searsia/web/SearsiaApplication.java | 5 +- .../java/org/searsia/engine/ResourceTest.java | 12 +- .../org/searsia/index/TestResourceIndex.java | 6 +- src/test/java/org/searsia/web/SearchTest.java | 10 +- src/test/resources/exampleSearchResult.json | 2 +- src/test/resources/hiemstra.json | 4 +- src/test/resources/hiemstracrazy.json | 4 +- src/test/resources/hiemstrapost.json | 4 +- src/test/resources/hiemstraxml.json | 4 +- src/test/resources/index.json | 4 +- src/test/resources/javascript.json | 4 +- src/test/resources/randomid.json | 2 +- src/test/resources/wrong.json | 4 +- 21 files changed, 142 insertions(+), 275 deletions(-) delete mode 100644 src/main/java/org/searsia/web/Proxy.java diff --git a/README.md b/README.md index f4dcb8d..219da8e 100644 --- a/README.md +++ b/README.md @@ -2,15 +2,18 @@ Searsia Server ============== http://searsia.org -Usage: +Usage: + Build with: `mvn package` -+ Run with: `java -jar target/searsiaserver.jar` ++ Run with: `java -jar target/searsiaserver.jar -m ` + Done. -Connect to the server with the [Federated Web Search Client][1]. -More information can be found in the [Searsia Documentation][2], -or you may ask a question under [Searsia Server Issues][3]. +The option `-m` is required: It connects your server to an +existing Searsia server, see [Searsia server options][1]. +Connect to your server with the [Federated Web Search Client][2]. +More information can be found in the [Searsia Documentation][3], +or you may ask a question under [Searsia Server Issues][4]. -[1]: http://github.com/searsia/searsiaclient "Searsia Client" -[2]: http://searsia.org "Searsia Documentation" -[3]: http://github.com/searsia/searsiaserver/issues "Issues" +[1]: http://searsia.org/start.html#server +[2]: http://github.com/searsia/searsiaclient "Searsia Client" +[3]: http://searsia.org "Searsia Documentation" +[4]: http://github.com/searsia/searsiaserver/issues "Issues" diff --git a/pom.xml b/pom.xml index 4831956..a2656f3 100644 --- a/pom.xml +++ b/pom.xml @@ -3,7 +3,7 @@ 4.0.0 org.searsia searsiaserver - 1.0.1 + 1.0.2 3.0 diff --git a/src/main/java/org/searsia/Main.java b/src/main/java/org/searsia/Main.java index 33adccb..70d0760 100644 --- a/src/main/java/org/searsia/Main.java +++ b/src/main/java/org/searsia/Main.java @@ -22,6 +22,8 @@ import java.nio.file.Path; import java.nio.file.Paths; import java.security.MessageDigest; +import java.util.HashMap; +import java.util.Map; import java.util.Random; import org.apache.log4j.Appender; @@ -159,9 +161,9 @@ private static String removeFileNameUri(String uri) { private static String normalizedUriToTemplate(String uri, String rid) { if (uri != null) { if (uri.endsWith("/") ) { - uri += rid + "?q={q}"; - } else if (!uri.contains("{q")) { // check for tests on searsia.org - uri += "?q={q}"; + uri += rid + "?q={searchTerms}&page={startPage?}"; + } else if (!uri.contains("{q") && !uri.contains("{searchTerms")) { // check for tests on searsia.org + uri += "?q={searchTerms}&page={startPage?}"; } } @@ -207,25 +209,45 @@ public static String getHashString(String inputString) { private static void testAll(Resource mother, SearchResult result, Boolean isQuiet) throws SearchException { int nrFailed = 0; - for (Hit hit: result.getHits()) { - if (hit.getRid() != null) { - try { - Resource engine = mother.searchResource(hit.getRid()); - testMother(engine, "none", isQuiet); - } catch (Exception e) { - nrFailed += 1; - printMessage("Test failed: " + e.getMessage(), isQuiet); + boolean isDone = false; + int startPage = mother.getIndexOffset(); + Map tested = new HashMap(); + tested.put(mother.getId(), true); + while (!result.getHits().isEmpty() && !isDone) { + isDone = true; + for (Hit hit: result.getHits()) { + String rid = hit.getRid(); + if (rid != null && !tested.containsKey(rid)) { + tested.put(rid, true); + isDone = false; + Resource engine = null; + try { + engine = mother.searchResource(hit.getRid()); + testEngine(engine, "none", isQuiet); + } catch (Exception e) { + nrFailed += 1; + if (engine == null) { // resource not found, so test did not even start + printMessage("Testing: " + hit.getRid(), isQuiet); + } + printMessage("Test failed: " + e.getMessage(), isQuiet); + } } - } - } + } + startPage += 1; + try { + result = mother.search(mother.getTestQuery(), "all", startPage); + } catch (Exception e) { + throw new SearchException("Mother error: " + e.getMessage()); + } + } if (nrFailed > 0) { throw new SearchException(nrFailed + " engines failed."); } } - private static void testMother(Resource mother, String debugInfo, Boolean isQuiet) throws SearchException { - printMessage("Testing: " + mother.getName() + " (" + mother.getId() + ")", isQuiet); + private static void testEngine(Resource mother, String debugInfo, Boolean isQuiet) throws SearchException { + printMessage("Testing: " + mother.getId() + " (" + mother.getName() + ")", isQuiet); SearchResult result = null; result = mother.search(mother.getTestQuery(), debugInfo); if (!isQuiet) { @@ -249,12 +271,22 @@ private static void testMother(Resource mother, String debugInfo, Boolean isQuie throw new SearchException("No results for test query." + tip); } if (result.getHits().size() < 10) { - printMessage("Warning: less than 10 results for query: " + result.getQuery() + "; see \"testquery\" or \"rerank\".", isQuiet); + printMessage("Warning: less than 10 results for query '" + result.getQuery() + "'; see \"testquery\" or \"rerank\".", isQuiet); } else if (result.getHits().size() > 49) { - printMessage("Warning: more than 49 results for query: " + result.getQuery(), isQuiet); + printMessage("Warning: more than 49 results for query '" + result.getQuery() + "'", isQuiet); } if (debugInfo.equals("all")) { - testAll(mother, result, isQuiet); + String rid = null; + if (result.getResource() != null) { + rid = result.getResource().getId(); + } + if (rid != null && rid.equals(mother.getId())) { // do not trust resources if the mother API provides another ID than the mother ID + testAll(mother, result, isQuiet); + } else if (rid == null ){ + printMessage("Warning: no resources available.", isQuiet); + } else { + printMessage("Warning: no resources. ID '" + mother.getId() + "' changed to '" + rid + "'", isQuiet); + } } } @@ -317,15 +349,20 @@ public static void main(String[] args) { if (!options.getMotherTemplate().matches(".*" + mother.getId() + "[^/]*$")) { fatalError("API Template (" + options.getMotherTemplate() + "): file name must contain id (" + mother.getId() +")"); } - if (version != null && !version.startsWith("v1")) { - fatalError("Wrong major Searsia version " + version + ": Must be v1.0.0 or higher."); + if (version == null || !version.startsWith("v1")) { + fatalError("Wrong major Searsia version. Must be v1.x.x."); } if (mother.getAPITemplate() == null) { mother.setUrlAPITemplate(options.getMotherTemplate()); - } else if (!sameTemplates(mother.getAPITemplate(), options.getMotherTemplate(), mother.getId())) { - printMessage("Warning: Mother changed to " + mother.getAPITemplate(), options.isQuiet()); + } else { + if (!sameTemplates(mother.getAPITemplate(), options.getMotherTemplate(), mother.getId())) { + printMessage("Warning: Mother changed to " + mother.getAPITemplate(), options.isQuiet()); + } + if (mother.getAPITemplate().contains("{q")) { + printMessage("Warning: API Template parameter {q} is deprecated. Use {searchTerms}.", options.isQuiet()); + } } myself = mother.getLocalResource(); String fileName = myself.getId() + "_" + getHashString(mother.getAPITemplate()); @@ -340,7 +377,7 @@ public static void main(String[] args) { path = tmpDir; } try { - testMother(mother, options.getTestOutput(), options.isQuiet()); + testEngine(mother, options.getTestOutput(), options.isQuiet()); printMessage("Test succeeded.", options.isQuiet()); } catch (Exception e) { fatalError("Test failed: " + e.getLocalizedMessage()); diff --git a/src/main/java/org/searsia/SearsiaOptions.java b/src/main/java/org/searsia/SearsiaOptions.java index ac1257f..6bc4503 100644 --- a/src/main/java/org/searsia/SearsiaOptions.java +++ b/src/main/java/org/searsia/SearsiaOptions.java @@ -18,7 +18,6 @@ import java.io.File; import java.net.MalformedURLException; -import java.net.URL; import org.apache.log4j.Level; import org.apache.commons.cli.DefaultParser; @@ -36,7 +35,6 @@ public class SearsiaOptions { /* See setDefaults() below */ - private Boolean anonymous; private String test; private Boolean quiet; private Boolean help; @@ -58,19 +56,18 @@ public class SearsiaOptions { */ public SearsiaOptions(String[] args) throws IllegalArgumentException, MalformedURLException { Options options = new Options(); - options.addOption("a", "anonymous",false, "Anonymous traffic by proxying all calls."); // TODO options.addOption("c", "cache", true, "Set cache size (integer: number of result pages)."); - options.addOption("d", "dontshare",false, "Do not share resource definitions."); // TODO + options.addOption("d", "dontshare",false, "Do not share resource definitions."); options.addOption("e", "export", false, "Export index to stdout and exit."); options.addOption("h", "help", false, "Show help."); options.addOption("i", "interval", true, "Set poll interval (integer: in seconds)."); options.addOption("l", "log", true, "Set log level (0=off, 1=error, 2=warn=default, 3=info, 4=debug)."); - options.addOption("m", "mother", true, "Set url of mother's web service end point."); + options.addOption("m", "mother", true, "Set url of mother's api web service end point."); options.addOption("n", "nohealth", false, "Do not share health report."); options.addOption("p", "path", true, "Set directory path to store the index."); options.addOption("q", "quiet", false, "No output to console."); options.addOption("t", "test", true, "Print test output and exit (string: 'json', 'xml', 'response', 'all')."); - options.addOption("u", "url", true, "Set url of my web service endpoint."); + options.addOption("u", "url", true, "Set url of my api web service endpoint."); setDefaults(); parse(options, args); if (myURI == null) { @@ -85,24 +82,7 @@ public SearsiaOptions() { setDefaults(); } - private String rootDir() { - String rootDir = "searsia"; - String urlString = getMotherTemplate(); - urlString = urlString.replaceAll("\\{[0-9A-Za-z\\-_]+\\?\\}", ""); - try { - URL url = new URL(urlString); - String path = url.getPath(); - if (path != null && path.contains("/")) { - path = path.replaceAll("\\/[^\\/]*$", ""); // remove file - path = path.replaceAll("^.+\\/", ""); // remove trailing directories - rootDir = path + "/"; - } - } catch (MalformedURLException e) { } - return rootDir; - } - private void setDefaults() { - anonymous = false; test = null; // no test help = false; quiet = false; @@ -162,9 +142,6 @@ private void parse(Options options, String[] args) throws IllegalArgumentExcepti } catch (ParseException e) { throw new IllegalArgumentException(e.getMessage() + " (use '-h' for help)"); } - if (cmd.hasOption("a")) { - anonymous = true; - } if (cmd.hasOption("c")) { cacheSize = new Integer(cmd.getOptionValue("c")); if (cacheSize < 30) { @@ -215,7 +192,7 @@ private void parse(Options options, String[] args) throws IllegalArgumentExcepti } if (cmd.hasOption("h") || cmd.getArgs().length < 0 || !cmd.hasOption("m")) { if (!cmd.hasOption("m")) { - System.out.println("Please provide mother's url template (use '-m')."); + System.out.println("Please provide mother's api url template (use '-m')."); } help(options); help = true; @@ -291,10 +268,6 @@ public String getIndexPath() { return indexPath; } - public Boolean isAnonymous() { - return anonymous; - } - public Boolean isQuiet() { return quiet; } @@ -325,7 +298,6 @@ public String toString() { result += "\n Poll Interval = " + getPollInterval(); result += "\n Cache Size = " + getCacheSize(); result += "\n Test Output = " + getTestOutput(); - result += "\n Anonymous = " + isAnonymous(); result += "\n Do Not Share = " + isNotShared(); result += "\n No Health Rep.= " + isNoHealthReport(); return result; diff --git a/src/main/java/org/searsia/engine/Resource.java b/src/main/java/org/searsia/engine/Resource.java index 6fa86ba..db76d28 100644 --- a/src/main/java/org/searsia/engine/Resource.java +++ b/src/main/java/org/searsia/engine/Resource.java @@ -70,7 +70,7 @@ public class Resource implements Comparable { private final static int defaultPER = 86400000; // unit: miliseconds (86400000 miliseconds is one day) private final static DateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss", Locale.ROOT); - // TODO: private static final Pattern queryPattern = Pattern.compile("\\{q\\??\\}"); + // TODO: private static final Pattern queryPattern = Pattern.compile("\\{searchTerms\??\\}"); // data to be set by JSON private String id = null; @@ -293,11 +293,15 @@ public SearchResult randomSearch() throws SearchException { public SearchResult search(String query) throws SearchException { - return search(query, null); + return search(query, null, null); } public SearchResult search(String query, String debug) throws SearchException { + return search(query, debug, null); + } + + public SearchResult search(String query, String debug, Integer startPage) throws SearchException { SearchResult result; try { if (rateLimitReached()) { @@ -306,7 +310,7 @@ public SearchResult search(String query, String debug) throws SearchException { if (this.urlAPITemplate == null) { throw new SearchException("No API Template"); } - String url = fillTemplate(this.urlAPITemplate, URLEncoder.encode(query, "UTF-8")); + String url = fillTemplate(this.urlAPITemplate, URLEncoder.encode(query, "UTF-8"), startPage); String postString = ""; String postQuery; if (this.postString != null && !this.postString.equals("")) { @@ -321,7 +325,7 @@ public SearchResult search(String query, String debug) throws SearchException { } else { postQuery = URLEncoder.encode(query, "UTF-8"); } - postString = fillTemplate(this.postString, postQuery); + postString = fillTemplate(this.postString, postQuery, startPage); } String page = getCompletePage(url, postString, this.headers); if (this.mimeType != null && this.mimeType.equals(SearchResult.SEARSIA_MIME_TYPE)) { @@ -376,7 +380,7 @@ public Resource searchResource(String resourceid) throws SearchException { try { String newRid = URLEncoder.encode(resourceid, "UTF-8"); url = url.substring(0, lastIndex) + url.substring(lastIndex).replaceFirst(rid, newRid); - url = url.replaceAll("\\{[0-9A-Za-z\\-_]+\\?\\}|\\{q\\}", ""); // remove optional parameters and query + url = fillTemplate(url, "", null); String jsonPage = getCompletePage(url, this.postString, this.headers); JSONObject json = new JSONObject(jsonPage); if (json.has("resource")) { @@ -462,7 +466,7 @@ private SearchResult xpathSearch(String url, String page, String debug) } XPathFactory xFactory = XPathFactory.newInstance(); XPath xpath = xFactory.newXPath(); - NodeList xmlNodeList = (NodeList) xpath.evaluate(itemXpath, document, XPathConstants.NODESET); + NodeList xmlNodeList = (NodeList) xpath.evaluate(this.itemXpath, document, XPathConstants.NODESET); for (int i = 0; i < xmlNodeList.getLength() && i < 30; i++) { Node item = xmlNodeList.item(i); result.addHit(extractHit(item)); @@ -525,11 +529,22 @@ private Document parseDocumentXML(String xmlString) { } private String fillTemplate(String template, String query) throws SearchException { + return fillTemplate(template, query, null); + } + + private String fillTemplate(String template, String query, Integer startPage) throws SearchException { String url = template; for (String param: getPrivateParameterKeys()) { url = url.replaceAll("\\{" + param + "\\??\\}", getPrivateParameter(param)); } - url = url.replaceAll("\\{q\\??\\}", query); + url = url.replaceAll("\\{searchTerms\\??\\}", query); // opensearch standard + url = url.replaceAll("\\{q\\??\\}", query); // old Searsia + if (startPage == null) { + startPage = this.getIndexOffset(); + url = url.replaceAll("\\{startPage\\}", startPage.toString()); + } else { + url = url.replaceAll("\\{startPage\\??\\}", startPage.toString()); + } url = url.replaceAll("\\{[0-9A-Za-z\\-_]+\\?\\}", ""); // remove optional parameters if (url.matches(".*\\{[0-9A-Za-z\\-_]+\\}.*")) { String param = url.substring(url.indexOf("{"), url.indexOf("}") + 1); @@ -544,6 +559,7 @@ private String fillTemplate(String template, String query) throws SearchExceptio private SearchException createPrivateSearchException(Exception e) { String message = e.toString(); + message = message.replaceAll("java\\.[a-z]+\\.", ""); for (String param: getPrivateParameterKeys()) { message = message.replaceAll(getPrivateParameter(param), "{" + param + "}"); } @@ -570,11 +586,10 @@ private boolean rateLimitReached() { } } - private URLConnection setConnectionProperties(URL url, Map headers) throws IOException { URLConnection connection = url.openConnection(); connection.setRequestProperty("User-Agent", "Searsia/1.0"); - connection.setRequestProperty("Accept", this.mimeType); //TODO: "*/*" + connection.setRequestProperty("Accept", this.mimeType + "; q=1.0, */*; q=0.5"); connection.setRequestProperty("Accept-Language", "en-US,en;q=0.5"); // TODO: from browser? for (Map.Entry entry : headers.entrySet()) { String value = entry.getValue(); @@ -851,6 +866,10 @@ public Long getLastUsedSecondsAgo() { return secondsAgo(this.lastUsed); } + public int getIndexOffset() { + return 1; // TODO: indexOffSet of opensearch url template syntax + } + public boolean isHealthy() { return this.lastUsedOk >= this.lastUsedError || this.nrOfError == 0; } diff --git a/src/main/java/org/searsia/web/OpenSearch.java b/src/main/java/org/searsia/web/OpenSearch.java index f308d97..b628327 100644 --- a/src/main/java/org/searsia/web/OpenSearch.java +++ b/src/main/java/org/searsia/web/OpenSearch.java @@ -70,7 +70,7 @@ private String xmlEncode(String text) { } private String templateEncode(String url) { - url = url.replaceAll("\\{q", "{searchTerms"); + url = url.replaceAll("\\{q", "{searchTerms"); // backwards compatible with Searsia v0.x return xmlEncode(url); } diff --git a/src/main/java/org/searsia/web/Proxy.java b/src/main/java/org/searsia/web/Proxy.java deleted file mode 100644 index 11710c6..0000000 --- a/src/main/java/org/searsia/web/Proxy.java +++ /dev/null @@ -1,163 +0,0 @@ -/* - * Copyright 2016-2017 Searsia - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.searsia.web; - -import java.io.ByteArrayOutputStream; -import java.io.IOException; -import java.io.InputStream; -import java.net.HttpURLConnection; -import java.net.URL; -import java.net.URLConnection; -import java.text.DateFormat; -import java.text.SimpleDateFormat; -import java.util.Date; -import java.util.HashMap; -import java.util.Locale; -import java.util.Map; -import java.util.TimeZone; - -import javax.ws.rs.GET; -import javax.ws.rs.Path; -import javax.ws.rs.PathParam; -import javax.ws.rs.QueryParam; -import javax.ws.rs.core.Context; -import javax.ws.rs.core.HttpHeaders; -import javax.ws.rs.core.Response; -import javax.ws.rs.core.Response.ResponseBuilder; - -import org.searsia.engine.Resource; -import org.searsia.index.ResourceIndex; - -/** - * Provides a proxy for any (image) url - * and a special caching proxy for the resources' fav icons. - */ -@Path("images") -public class Proxy { - - private ResourceIndex engines; - private Map iconStore = new HashMap(); - private String lastModified = null; - - public Proxy(ResourceIndex engines) throws IOException { - DateFormat dateFormat = new SimpleDateFormat("EEE, FF MMM yyyy hh:mm:ss zzz", Locale.ROOT); - dateFormat.setTimeZone(TimeZone.getTimeZone("GMT")); - this.lastModified = dateFormat.format(new Date()); - this.engines = engines; - } - - @GET - public Response query(@QueryParam("url") String url, - @Context HttpHeaders headers) { - if (url == null) { - return Response.status(404).build(); - } - try { - if (headers.getRequestHeader("If-Modified-Since") != null - || headers.getRequestHeader("If-None-Match") != null) { - return Response.notModified().build(); - } else { - return getStreamBuilder(url).build(); - } - } catch (Exception e) { - return Response.status(503).build(); // 503 = unavailable - } - } - - @GET @Path("{resourceid}") - public Response icon(@PathParam("resourceid") String resourceid, - @Context HttpHeaders headers) { - try { - if (headers.getRequestHeader("If-Modified-Since") != null - || headers.getRequestHeader("If-None-Match") != null) { - return Response.notModified().build(); - } else { - return getWebIcon(resourceid); - } - } catch (Exception e) { - return Response.status(503).build(); - } - } - - private Response getWebIcon(String resourceid) throws IOException { - Resource engine = engines.get(resourceid); - if (engine == null) { - engine = engines.getMyself(); - if (engine == null || !engine.getId().equals(resourceid)) { - return Response.status(404).build(); - } - } - String iconFile = engine.getFavicon(); - if (iconFile == null) { - return Response.status(503).build(); - } - ResponseBuilder builder = iconStore.get(resourceid); - if (builder == null) { - builder = getCachedBuilder(iconFile); - iconStore.put(resourceid, builder); - } - return builder.build(); - } - - private ResponseBuilder getCachedBuilder(String urlString) throws IOException { - HttpURLConnection http = getHttp(urlString); - String contentType = http.getHeaderField("Content-Type"); - if (contentType == null) { - contentType = "image/png"; - } - InputStream stream = http.getInputStream(); - ByteArrayOutputStream baos = new ByteArrayOutputStream(); - byte[] buffer = new byte[1024]; - int bytesRead; - while ((bytesRead = stream.read(buffer)) != -1) { - baos.write(buffer, 0, bytesRead); - } - return Response.ok(baos.toByteArray(), contentType) - .header("Last-Modified", lastModified); - } - - private ResponseBuilder getStreamBuilder(String urlString) throws IOException { - HttpURLConnection http = getHttp(urlString); - ResponseBuilder builder = Response.ok(http.getInputStream()); - String field = http.getHeaderField("Content-Type"); - if (field != null) builder.header("Content-Type", field); - field = http.getHeaderField("Content-Length"); - if (field != null) builder.header("Content-Length", field); - field = http.getHeaderField("Expires"); - if (field != null) builder.header("Expires", field); - field = http.getHeaderField("Cache-Control"); - if (field != null) builder.header("Cache-Control", field); - field = http.getHeaderField("Last-Modified"); - if (field != null) builder.header("Last-Modified", field); - return builder; - } - - private HttpURLConnection getHttp(String urlString) throws IOException { - URL url = new URL(urlString); - URLConnection connection = url.openConnection(); - connection.setRequestProperty("User-Agent", "Searsia/1.0"); - connection.setRequestProperty("Accept", "*/*"); - connection.setReadTimeout(4000); - connection.setConnectTimeout(4000); - HttpURLConnection http = (HttpURLConnection) connection; - http.setInstanceFollowRedirects(true); - http.setRequestMethod("GET"); - http.connect(); - return http; - } - -} \ No newline at end of file diff --git a/src/main/java/org/searsia/web/Search.java b/src/main/java/org/searsia/web/Search.java index 5b24bc3..ca9dc65 100644 --- a/src/main/java/org/searsia/web/Search.java +++ b/src/main/java/org/searsia/web/Search.java @@ -74,12 +74,14 @@ public Response options() { .build(); } + // TODO: gives 406 not acceptable whith "Accept: application/json" + @GET @Path("{resourceid}") @Produces(SearchResult.SEARSIA_MIME_ENCODING) public Response query(@PathParam("resourceid") String resourceid, @QueryParam("q") String searchTerms, @QueryParam("resources") String countResources, - @QueryParam("page") String pageOffset) { + @QueryParam("page") String startPage) { resourceid = resourceid.replaceAll("\\.json$", ""); Resource me = engines.getMyself(); if (!resourceid.equals(me.getId())) { @@ -95,10 +97,10 @@ public Response query(@PathParam("resourceid") String resourceid, if (max > 200) { max = 200; } // FedWeb14 has about 150 if (max < 1) { max = 1; } } - if (pageOffset != null) { + if (startPage != null) { try { - start = Integer.parseInt(pageOffset); - start = (start - 1) * max; // openSearch standard default starts at 1 + start = Integer.parseInt(startPage); + start = (start - me.getIndexOffset()) * max; // openSearch standard default starts at 1 } catch (NumberFormatException e) { start = 0; } diff --git a/src/main/java/org/searsia/web/SearsiaApplication.java b/src/main/java/org/searsia/web/SearsiaApplication.java index 1098069..eac13d0 100644 --- a/src/main/java/org/searsia/web/SearsiaApplication.java +++ b/src/main/java/org/searsia/web/SearsiaApplication.java @@ -33,7 +33,7 @@ */ public class SearsiaApplication extends ResourceConfig { - public static final String VERSION = "v1.0.1"; + public static final String VERSION = "v1.0.2"; protected static Response responseOk(JSONObject json) { json.put("searsia", VERSION); @@ -72,9 +72,6 @@ public SearsiaApplication(SearchResultIndex index, java.util.logging.Logger.getLogger("").setLevel(java.util.logging.Level.WARNING); register(new Search(index, engines, options)); register(new OpenSearch(engines, options.isNotShared())); - if (options.isAnonymous()) { - register(new Proxy(engines)); - } register(new Redirect(engines.getMyself().getId())); } diff --git a/src/test/java/org/searsia/engine/ResourceTest.java b/src/test/java/org/searsia/engine/ResourceTest.java index 2239888..df14e73 100644 --- a/src/test/java/org/searsia/engine/ResourceTest.java +++ b/src/test/java/org/searsia/engine/ResourceTest.java @@ -48,7 +48,7 @@ public void testSearchPost() throws XPathExpressionException, SearchException { @Test public void testSearchXml() throws XPathExpressionException, SearchException { - Resource se1 = new Resource("http://searsia.org/searsia/wiki/index{q}.json").updateFromAPI(); + Resource se1 = new Resource("http://searsia.org/searsia/wiki/index{searchTerms}.json").updateFromAPI(); Resource se2 = se1.searchResource("wikifull1"); SearchResult result = se2.search("informat"); Assert.assertEquals("application/xml", se2.getMimeType()); @@ -67,7 +67,7 @@ public void testSearchXml2() throws XPathExpressionException, SearchException { @Test public void testSearchXml3() throws XPathExpressionException, SearchException { - Resource se1 = new Resource("http://searsia.org/searsia/wiki/cse1{q}.json").updateFromAPI(); + Resource se1 = new Resource("http://searsia.org/searsia/wiki/cse1{searchTerms}.json").updateFromAPI(); SearchResult result = se1.search("life"); Assert.assertEquals("application/xml", se1.getMimeType()); Assert.assertEquals(10, result.getHits().size()); @@ -85,7 +85,7 @@ public void testSearchJson() throws XPathExpressionException, SearchException { @Test public void testSearchJson2() throws XPathExpressionException, SearchException { - Resource se = new Resource("http://searsia.org/searsia/wiki/wikifull1{q}.json"); + Resource se = new Resource("http://searsia.org/searsia/wiki/wikifull1{searchTerms}.json"); SearchResult result = se.search("json"); Assert.assertEquals(1, result.getHits().size()); Assert.assertEquals("extra content", result.getHits().get(0).getString("content")); @@ -93,14 +93,14 @@ public void testSearchJson2() throws XPathExpressionException, SearchException { @Test public void testSearchJsonStrangeKeys() throws XPathExpressionException, SearchException { - Resource se = new Resource("http://searsia.org/searsia/wiki/wikifull1{q}.json"); + Resource se = new Resource("http://searsia.org/searsia/wiki/wikifull1{searchTerms}.json"); SearchResult result = se.search("strange keys"); Assert.assertEquals(1, result.getHits().size()); } @Test public void testSearchJsonHtmlAndlinks() throws XPathExpressionException, SearchException { - Resource se = new Resource("http://searsia.org/searsia/wiki/wikifull1{q}.json"); + Resource se = new Resource("http://searsia.org/searsia/wiki/wikifull1{searchTerms}.json"); SearchResult result = se.search("html and links"); Assert.assertEquals(2, result.getHits().size()); Assert.assertEquals("Another test for Searsia", result.getHits().get(0).getTitle()); @@ -118,7 +118,7 @@ public void testSearchJavascript() throws XPathExpressionException, SearchExcept @Test public void testSearchSearsiaEmpty() throws XPathExpressionException, SearchException { - Resource se = new Resource("http://searsia.org/searsia/wiki/index{q}.json").updateFromAPI(); + Resource se = new Resource("http://searsia.org/searsia/wiki/index{searchTerms}.json").updateFromAPI(); SearchResult result = se.searchWithoutQuery(); Assert.assertTrue(result.getHits().size() > 0); } diff --git a/src/test/java/org/searsia/index/TestResourceIndex.java b/src/test/java/org/searsia/index/TestResourceIndex.java index 50e33ee..52cd597 100644 --- a/src/test/java/org/searsia/index/TestResourceIndex.java +++ b/src/test/java/org/searsia/index/TestResourceIndex.java @@ -39,21 +39,21 @@ public static void lastThing() throws IOException, XPathExpressionException, JSO private static Resource utwente() throws XPathExpressionException, JSONException { JSONObject json = new JSONObject( - "{\"apitemplate\":\"http://utwente.nl/search?q={q}\",\"id\":\"567\",\"name\":\"UT\"}" + "{\"apitemplate\":\"http://utwente.nl/search?q={searchTerms}\",\"id\":\"567\",\"name\":\"UT\"}" ); return new Resource(json); } private static Resource searsia() throws XPathExpressionException, JSONException { JSONObject json = new JSONObject( - "{\"apitemplate\":\"http://searsia.com/?q={q}\",\"id\":\"1234\",\"privateparameters\":{\"api\":\"topsecret\"}}" + "{\"apitemplate\":\"http://searsia.com/?q={searchTerms}\",\"id\":\"1234\",\"privateparameters\":{\"api\":\"topsecret\"}}" ); return new Resource(json); } private static Resource newby() throws XPathExpressionException, JSONException { JSONObject json = new JSONObject( - "{\"apitemplate\":\"http://new.com/?q={q}\",\"id\":\"new\",\"privateparameters\":{\"apikey\":\"secret\"}}" + "{\"apitemplate\":\"http://new.com/?q={searchTerms}\",\"id\":\"new\",\"privateparameters\":{\"apikey\":\"secret\"}}" ); return new Resource(json); } diff --git a/src/test/java/org/searsia/web/SearchTest.java b/src/test/java/org/searsia/web/SearchTest.java index b02fe28..e01d667 100644 --- a/src/test/java/org/searsia/web/SearchTest.java +++ b/src/test/java/org/searsia/web/SearchTest.java @@ -37,15 +37,15 @@ public class SearchTest { private static Resource wiki() throws XPathExpressionException, JSONException { - return new Resource(new JSONObject("{\"apitemplate\":\"http://searsia.org/searsia/wiki/wiki{q}.json\", \"id\":\"wiki\"}")); + return new Resource(new JSONObject("{\"apitemplate\":\"http://searsia.org/searsia/wiki/wiki{searchTerms}.json\", \"id\":\"wiki\"}")); } private static Resource wrong() throws XPathExpressionException, JSONException { - return new Resource(new JSONObject("{\"apitemplate\":\"http://reallyreallydoesnotexist.com/wrong?q={q}\", \"id\":\"wrong\"}")); + return new Resource(new JSONObject("{\"apitemplate\":\"http://reallyreallydoesnotexist.com/wrong?q={searchTerms}\", \"id\":\"wrong\"}")); } private static Resource ok() throws XPathExpressionException, JSONException { - return new Resource(new JSONObject("{\"apitemplate\":\"http://searsia.org/searsia/wiki/wikifull1{q}.json\", \"id\":\"wikifull1\"}")); + return new Resource(new JSONObject("{\"apitemplate\":\"http://searsia.org/searsia/wiki/wikifull1{searchTerms}.json\", \"id\":\"wikifull1\"}")); } private static Resource okDeleted() throws XPathExpressionException, JSONException { @@ -53,7 +53,7 @@ private static Resource okDeleted() throws XPathExpressionException, JSONExcepti } private static Resource me() throws XPathExpressionException, JSONException { - return new Resource(new JSONObject("{\"apitemplate\":\"http://me.org?q={q}\", \"id\":\"wiki\"}")); + return new Resource(new JSONObject("{\"apitemplate\":\"http://me.org?q={searchTerms}\", \"id\":\"wiki\"}")); } @BeforeClass @@ -130,7 +130,7 @@ public void testResource() throws IOException, XPathExpressionException, JSONExc @Test // returns local resource 'wrong' without apitemplate and health public void testResourceNoSharing() throws IOException, XPathExpressionException, JSONException { - String[] args = {"-d", "-n", "-m=http://searsia.org/searsia/wiki/wiki{q}.json"}; + String[] args = {"-d", "-n", "-m=http://searsia.org/searsia/wiki/wiki{searchTerms}.json"}; SearsiaOptions newOptions = new SearsiaOptions(args); Search search = new Search(index, engines, newOptions); Response response = search.query("wrong.json", "", null, null); diff --git a/src/test/resources/exampleSearchResult.json b/src/test/resources/exampleSearchResult.json index 16f5ce9..92088ba 100644 --- a/src/test/resources/exampleSearchResult.json +++ b/src/test/resources/exampleSearchResult.json @@ -53,7 +53,7 @@ ], "resource": { "id": "nl.utwente.hiemstra", - "urltemplate": "http:\/\/wwwhome.cs.utwente.nl\/~hiemstra\/?s={q}", + "urltemplate": "http:\/\/wwwhome.cs.utwente.nl\/~hiemstra\/?s={searchTerms}", "favicon": "http:\/\/wwwhome.cs.utwente.nl\/~hiemstra\/images\/ut.ico", "name": "Djoerd Hiemstra" }, diff --git a/src/test/resources/hiemstra.json b/src/test/resources/hiemstra.json index d547439..064fec1 100644 --- a/src/test/resources/hiemstra.json +++ b/src/test/resources/hiemstra.json @@ -1,7 +1,7 @@ { "resource": { "id": "hiemstra", - "apitemplate": "http://wwwhome.cs.utwente.nl/~hiemstra/?s={q}&api={apikey}&p={p?}", + "apitemplate": "http://wwwhome.cs.utwente.nl/~hiemstra/?s={searchTerms}&api={apikey}&paged={startPage?}", "extractors": { "description": "./h3/following-sibling::text()", "title": "./h3", @@ -19,6 +19,6 @@ "apikey": "SECRET!!" }, "testquery": "searsia", - "urltemplate": "http://wwwhome.cs.utwente.nl/~hiemstra/?s={q}&api={apikey}&p={p?}" + "urltemplate": "http://wwwhome.cs.utwente.nl/~hiemstra/?s={searchTerms}&api={apikey}&paged={startPage?}" } } diff --git a/src/test/resources/hiemstracrazy.json b/src/test/resources/hiemstracrazy.json index a42f883..85706d5 100644 --- a/src/test/resources/hiemstracrazy.json +++ b/src/test/resources/hiemstracrazy.json @@ -1,7 +1,7 @@ { "resource": { "id": "hiemstrapost", - "apitemplate": "http://wwwhome.cs.utwente.nl/~hiemstra/?s={q}&api={apikey}&p={p?}", + "apitemplate": "http://wwwhome.cs.utwente.nl/~hiemstra/?s={searchTerms}&api={apikey}&paged={startPage?}", "extractors": { "description": "./h3/following-sibling::text()", "title": "./h3", @@ -24,6 +24,6 @@ "apikey": "SECRET!!" }, "testquery": "searsia", - "urltemplate": "http://wwwhome.cs.utwente.nl/~hiemstra/?s={q}&api={apikey}&p={p?}" + "urltemplate": "http://wwwhome.cs.utwente.nl/~hiemstra/?s={searchTerms}&api={apikey}&paged={startPage?}" } } diff --git a/src/test/resources/hiemstrapost.json b/src/test/resources/hiemstrapost.json index 763de35..63a01ae 100644 --- a/src/test/resources/hiemstrapost.json +++ b/src/test/resources/hiemstrapost.json @@ -2,7 +2,7 @@ "resource": { "id": "hiemstra", "apitemplate": "http://wwwhome.cs.utwente.nl/~hiemstra/", - "post": "s={q}&api={apikey}&p={p?}", + "post": "s={searchTerms}&api={apikey}&paged={startPage?}", "extractors": { "description": "./h3/following-sibling::text()", "title": "./h3", @@ -17,6 +17,6 @@ "apikey": "SECRET!!" }, "testquery": "searsia", - "urltemplate": "http://wwwhome.cs.utwente.nl/~hiemstra/?s={q}&api={apikey}&p={p?}" + "urltemplate": "http://wwwhome.cs.utwente.nl/~hiemstra/?s={searchTerms}&api={apikey}&paged={startPage?}" } } diff --git a/src/test/resources/hiemstraxml.json b/src/test/resources/hiemstraxml.json index deb2b31..c1ad8cd 100644 --- a/src/test/resources/hiemstraxml.json +++ b/src/test/resources/hiemstraxml.json @@ -1,7 +1,7 @@ { "resource": { "id": "hiemstraxml", - "apitemplate": "http://wwwhome.cs.utwente.nl/~hiemstra/?s={q}&p={p?}", + "apitemplate": "http://wwwhome.cs.utwente.nl/~hiemstra/?s={searchTerms}&paged={startPage?}", "extractors": { "description": "./h3/following-sibling::text()", "title": "./h3", @@ -12,6 +12,6 @@ "mimetype": "application/xml", "prior": 0.3, "testquery": "searsia", - "urltemplate": "http://wwwhome.cs.utwente.nl/~hiemstra/?s={q}&api={apikey}&p={p?}" + "urltemplate": "http://wwwhome.cs.utwente.nl/~hiemstra/?s={searchTerms}&api={apikey}&paged={startPage?}" } } diff --git a/src/test/resources/index.json b/src/test/resources/index.json index fffc537..11bd1a9 100644 --- a/src/test/resources/index.json +++ b/src/test/resources/index.json @@ -1,6 +1,6 @@ { "resource": { - "apitemplate": "http://searsia.org/searsia/wiki/index{q}.json", - "id": "index", + "apitemplate": "http://searsia.org/searsia/wiki/index{searchTerms}.json", + "id": "index" } } diff --git a/src/test/resources/javascript.json b/src/test/resources/javascript.json index d50beac..54eb1c4 100644 --- a/src/test/resources/javascript.json +++ b/src/test/resources/javascript.json @@ -1,7 +1,7 @@ { "resource": { "id": "javascript", - "apitemplate": "http://searsia.org/searsia/wiki/wikifull1{q}.js", + "apitemplate": "http://searsia.org/searsia/wiki/wikifull1{searchTerms}.js", "itempath": "//hits", "extractors": { "description": "./description", @@ -9,6 +9,6 @@ "url": "./url" }, "mimetype": "application/x-javascript", - "name": "Wiki Full 1", + "name": "Wiki Full 1" } } diff --git a/src/test/resources/randomid.json b/src/test/resources/randomid.json index c1f2e39..5380e4e 100644 --- a/src/test/resources/randomid.json +++ b/src/test/resources/randomid.json @@ -1,6 +1,6 @@ { "resource": { - "apitemplate": "http://searsia.org/searsia/wiki/wikididyoumean{q?}.json", + "apitemplate": "http://searsia.org/searsia/wiki/wikididyoumean{searchTerms?}.json", "id": "randomid", "mimetype": "application/searsia+json", "testquery": "searsia" diff --git a/src/test/resources/wrong.json b/src/test/resources/wrong.json index 8e275d5..4120cf4 100644 --- a/src/test/resources/wrong.json +++ b/src/test/resources/wrong.json @@ -1,7 +1,7 @@ { "resource": { "id": "hiemstra", - "apitemplate": "http://wwwhome.cs.utwente.nl/~hiemstra/WRONG?s={q}&api={apikey}&p={p?}", + "apitemplate": "http://wwwhome.cs.utwente.nl/~hiemstra/WRONG?s={searchTerms}&api={apikey}&paged={startPage?}", "extractors": { "description": "./h3/following-sibling::text()", "title": "./h3", @@ -19,6 +19,6 @@ "apikey": "SECRET!!" }, "testquery": "searsia", - "urltemplate": "http://wwwhome.cs.utwente.nl/~hiemstra/?s={q}&api={apikey}&p={p?}" + "urltemplate": "http://wwwhome.cs.utwente.nl/~hiemstra/?s={searchTerms}&api={apikey}&paged={startPage?}" } }