diff --git a/README.md b/README.md index f4dcb8d..219da8e 100644 --- a/README.md +++ b/README.md @@ -2,15 +2,18 @@ Searsia Server ============== http://searsia.org -Usage: +Usage: + Build with: `mvn package` -+ Run with: `java -jar target/searsiaserver.jar` ++ Run with: `java -jar target/searsiaserver.jar -m ` + Done. -Connect to the server with the [Federated Web Search Client][1]. -More information can be found in the [Searsia Documentation][2], -or you may ask a question under [Searsia Server Issues][3]. +The option `-m` is required: It connects your server to an +existing Searsia server, see [Searsia server options][1]. +Connect to your server with the [Federated Web Search Client][2]. +More information can be found in the [Searsia Documentation][3], +or you may ask a question under [Searsia Server Issues][4]. -[1]: http://github.com/searsia/searsiaclient "Searsia Client" -[2]: http://searsia.org "Searsia Documentation" -[3]: http://github.com/searsia/searsiaserver/issues "Issues" +[1]: http://searsia.org/start.html#server +[2]: http://github.com/searsia/searsiaclient "Searsia Client" +[3]: http://searsia.org "Searsia Documentation" +[4]: http://github.com/searsia/searsiaserver/issues "Issues" diff --git a/pom.xml b/pom.xml index 9b98878..a2656f3 100644 --- a/pom.xml +++ b/pom.xml @@ -3,9 +3,9 @@ 4.0.0 org.searsia searsiaserver - 0.4.1 + 1.0.2 - 3 + 3.0 UTF-8 @@ -19,7 +19,7 @@ org.apache.maven.plugins maven-assembly-plugin - 2.6 + 3.1.0 jar-with-dependencies diff --git a/src/main/java/org/searsia/Hit.java b/src/main/java/org/searsia/Hit.java index 71ddb4e..0db42df 100644 --- a/src/main/java/org/searsia/Hit.java +++ b/src/main/java/org/searsia/Hit.java @@ -23,6 +23,13 @@ import org.json.JSONObject; +/** + * A single search hit. A hit can have any field. + * Standard fields are "title", "description", "url", "favicon", "image". 
+ * + * @author Djoerd Hiemstra and Dolf Trieschnigg + */ + public class Hit implements Comparable { private Map map; @@ -44,7 +51,12 @@ public Hit(JSONObject json) { Iterator keys = json.keys(); while (keys.hasNext()) { String key = (String) keys.next(); - map.put(key, json.get(key)); + Object value = json.get(key); + if (value instanceof String) { + map.put(key, noHTML((String) value)); + } else if (value instanceof Number || value instanceof Boolean) { + map.put(key, value); + } } } @@ -71,10 +83,10 @@ public void setScore(float score) { map.put("score", score); } - public void setRank(int rank) { - map.put("rank", rank); - } - + public void setResourceScore(float score) { + map.put("rscore", score); + } + public void setTitle(String title) { map.put("title", title); } @@ -87,27 +99,52 @@ public void setUrl(String url) { map.put("url", url); } + /** + * This id will be used in the Lucene index. + * One url may be indexed multiple times, + * once for each resource id (rid). + * @return unique identifier + */ public String getId() { String result = (String) map.get("url"); + String rid = ""; if (result == null) { result = (String) map.get("title"); - } - return result; - } - - public float getScore() { - Float score = (Float) map.get("score"); - if (score == null) { - return 0.0f; } else { - return score; + rid = (String) map.get("rid"); + if (rid == null) { + rid = ""; + } } + return rid + "@" + result; } - - public Integer getRank() { - return (Integer) map.get("rank"); + + private float getFloatValue(String field) { + Float score = 0.0f; + Object scoreObject = map.get(field); + if (scoreObject instanceof Float) { + score = (float) scoreObject; + } else if (scoreObject instanceof Double) { + score = new Float((double) scoreObject); + } else if (scoreObject instanceof Integer) { + score = new Float((int) scoreObject); + } else if (scoreObject instanceof String) { + try { + score = Float.parseFloat((String) scoreObject); + } catch (NumberFormatException e) { } + } 
+ return score; + } + + + public float getScore() { + return getFloatValue("score"); } + public float getResourceScore() { + return getFloatValue("rscore"); + } + public Object get(String field) { return map.get(field); } @@ -124,29 +161,50 @@ public String getTitle() { return (String) map.get("title"); } + public String getRid() { + return (String) map.get("rid"); + } + @Override public String toString() { return map.entrySet().toString(); } - private String noHTML(String value) { - value = value.replaceAll("<[^>]+>", ""); // no HTML + private String noHTML(String value) { // TODO: also in TextExtractor?? + value = value.replaceAll("(?i)]*>||||", ""); // No HTML, please: spans removed + value = value.replaceAll("<[^>]+>|&#?[0-9a-zA-Z]{1,9};", ""); // no HTML return value.replaceAll("[<>]", ""); } - public JSONObject toJson() { + public JSONObject toJson() { + JSONObject json = new JSONObject(); + for (Entry e: map.entrySet()) { + Object value = e.getValue(); + if (value instanceof String) { + value = noHTML((String) value); + } + json.put(e.getKey(), value); + } + return json; + } + + + public JSONObject toJsonNoQueryResourceId() { JSONObject json = new JSONObject(); for (Entry e: map.entrySet()) { Object value = e.getValue(); if (value instanceof String) { value = noHTML((String) value); } - json.put(e.getKey(), value); + String key = e.getKey(); + if (!key.equals("query") && !key.equals("rid")) { + json.put(key, value); + } } return json; } - public String toIndexVersion() { // TODO: special treatment for urls, etc. + public String toIndexVersion() { // TODO: special treatment for urls, etc. and StringBuilder String result = ""; for (Object s : map.values()) { if (s instanceof String) { @@ -156,20 +214,36 @@ public String toIndexVersion() { // TODO: special treatment for urls, etc. 
return result.trim(); } + public String toTitleDescriptionIndexVersion() { + String result = (String) this.get("title"); + String desc = (String) this.get("description"); + if (result == null) { result = ""; } + if (desc != null) { + result += " " + desc; + } + return result.trim(); + } + @Override public int compareTo(Hit hit2) { - Float score1 = getScore(); - Float score2 = hit2.getScore(); - if (score1.compareTo(score2) == 0) { - Integer rank1 = getRank(); - Integer rank2 = hit2.getRank(); - if (rank1 != null && rank2 != null) { - return rank2.compareTo(rank1); // yes reversed! - } else { - return 0; - } + Float score1 = getResourceScore(); // order by best resources + Float score2 = hit2.getResourceScore(); + int compare = score1.compareTo(score2); + if (compare != 0) { + return compare; } else { - return score1.compareTo(score2); + String rid1 = getRid(); // if two resources the same score + String rid2 = hit2.getRid(); + if (rid1 != null && rid2 != null) { + compare = rid1.compareTo(rid2); + } + if (compare != 0) { + return compare; + } else { + score1 = getScore(); // cannot be null + score2 = hit2.getScore(); + return score1.compareTo(score2); + } } } diff --git a/src/main/java/org/searsia/Main.java b/src/main/java/org/searsia/Main.java index 5b9857f..70d0760 100644 --- a/src/main/java/org/searsia/Main.java +++ b/src/main/java/org/searsia/Main.java @@ -1,5 +1,5 @@ /* - * Copyright 2016 Searsia + * Copyright 2016-2017 Searsia * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -21,9 +21,9 @@ import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; -import java.text.DateFormat; -import java.text.SimpleDateFormat; -import java.util.Date; +import java.security.MessageDigest; +import java.util.HashMap; +import java.util.Map; import java.util.Random; import org.apache.log4j.Appender; @@ -33,7 +33,6 @@ import org.apache.log4j.PatternLayout; import org.glassfish.grizzly.http.server.HttpServer; import org.glassfish.jersey.grizzly2.httpserver.GrizzlyHttpServerFactory; -import org.json.JSONObject; import org.searsia.index.SearchResultIndex; import org.searsia.index.ResourceIndex; import org.searsia.web.SearsiaApplication; @@ -42,7 +41,14 @@ /** - * Searsia Main class + * Searsia Main class. Does the following actions: + * + * 1. Connect to mother peer; + * 2. If it runs in test mode, test the mother, print results and exit; + * 3. Open/create Lucene indexes; + * 4. Get the 10 top resources if not existing or too old; + * 5. Run the web server; + * 6. Run the daemon to periodically poll the mother and resources. 
* * Start as: java -jar target/searsiaserver.jar * More info: java -jar target/searsiaserver.jar --help @@ -53,92 +59,236 @@ public class Main { private static final Logger LOGGER = Logger.getLogger("org.searsia"); - private static final DateFormat df = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss.S"); - private static Random random = new Random(); + private static Random random = new Random(); private static void searsiaDaemon(SearchResultIndex index, ResourceIndex engines, - int pollInterval) throws InterruptedException { - Resource mother = engines.getMother(); - Resource engine = null; + SearsiaOptions options) throws InterruptedException { + Resource mother = engines.getMother(); + Resource engine = null; + int pollInterval = options.getPollInterval(); while(true) { Thread.sleep(pollInterval * 1000); try { - if (!index.check()) { + if (!index.checkFlush()) { SearchResult result = null; if (mother != null && random.nextBoolean()) { // sample mostly from mother engine = mother; + LOGGER.trace("Next: mother sample"); result = engine.randomSearch(); + Resource newmother = result.getResource(); + if (newmother != null && newmother.getId().equals(mother.getId())) { + if (newmother.getAPITemplate() == null) { + newmother.setUrlAPITemplate(mother.getAPITemplate()); + } + engines.putMother(newmother); + engines.putMyself(newmother.getLocalResource()); + } else { + LOGGER.warn("Unable to update mother: Did ids change?"); + } + getResources(mother, result, engines); } else { engine = engines.getRandom(); + LOGGER.trace("Next sample: " + engine.getId()); result = engine.randomSearch(); - result.removeResourceRank(); // only trust your mother - result.addQueryResourceRankDate(engine.getId()); + result.removeResource(); // only trust your mother + result.addResourceDate(engine.getId()); } index.offer(result); - logSample(engine.getId(), result.getQuery()); + LOGGER.info("Sampled " + engine.getId() + ": " + result.getQuery()); + } + } catch (Exception e) { + if (engine != null) 
{ + LOGGER.warn("Sampling " + engine.getId() + " failed: " + e.getMessage()); + } else { + LOGGER.warn("Flushing index to disk failed:" + e.getMessage()); } - } catch (Exception e) { - logWarning("Sampling " + engine.getId() + " failed: " + e.getMessage()); } } } - private static void getResources(Resource mother, SearchResult result, ResourceIndex engines) { + private static int getResources(Resource mother, SearchResult result, ResourceIndex engines) { int i = 0; for (Hit hit: result.getHits()) { String rid = hit.getString("rid"); - if (rid != null && !engines.containsKey(rid)) { - Resource engine; - i += 1; - try { - engine = mother.searchResource(rid); - } catch (SearchException e) { - System.err.println("Warning: Unable to get resources from " + mother.getId()); - break; - } - try { - engines.put(engine); - } catch(Exception e) { - System.err.println("Error: " + e.getMessage()); - System.exit(1); - } + if (rid != null ) { + Resource engine = engines.get(rid); + if (engine == null || engine.getLastUpdatedSecondsAgo() > 7200) { // TODO: option for 7200 ? + i += 1; + try { + engine = mother.searchResource(rid); + } catch (SearchException e) { + LOGGER.warn("Warning: Update failed: " + e.getMessage()); + } + if (engine != null && rid.equals(engine.getId())) { + engines.put(engine); + if (engine.isDeleted()) { + LOGGER.debug("Deleted: " + rid); + } else { + LOGGER.debug("Updated: " + rid); + } + } else { + LOGGER.warn("Warning: Resource not found: " + rid); + } + } } if (i > 10) { - break; // not more than the first 10. 
+ break; // not more than the first 10 per check } } + engines.flush(); + return i; } - - private static String uriToTemplate(String uri) { - if (!(uri == null) && !(uri.contains("{q"))) { - if (!uri.endsWith("/")) { - uri += "/"; - } - uri += "search?q={q?}&r={r?}"; - } + private static boolean sameTemplates(String uri1, String uri2, String myId) { + if (uri1 == null) { + return (uri2 == null); + } else { + uri1 = uri1.replaceAll("\\?.*$", ""); + uri2 = uri2.replaceAll("\\?.*$", ""); + return uri1.equals(uri2); + } + } + + private static String removeFileNameUri(String uri) { + if (uri != null) { + uri = uri.replaceAll("\\/[^\\/]+$", "/"); + } + return uri; + } + + private static String normalizedUriToTemplate(String uri, String rid) { + if (uri != null) { + if (uri.endsWith("/") ) { + uri += rid + "?q={searchTerms}&page={startPage?}"; + } else if (!uri.contains("{q") && !uri.contains("{searchTerms")) { // check for tests on searsia.org + uri += "?q={searchTerms}&page={startPage?}"; + } + + } return uri; } - - private static void logWarning(String message) { - JSONObject r = new JSONObject(); - r.put("time", df.format(new Date())); - r.put("message", message); - LOGGER.warn(r.toString()); + private static void printMessage(String message, Boolean isQuiet) { + if (!isQuiet) { + System.err.println(message); + } } - - - private static void logSample(String resourceid, String query) { - JSONObject r = new JSONObject(); - r.put("time", df.format(new Date())); - r.put("sample", resourceid); - r.put("query", query); - LOGGER.info(r.toString()); + + private static void fatalError(String message) { + System.err.println("ERROR: " + message); + System.exit(1); } + /** + * For a unique filename (public because used in searsiafedweb) + * @param inputString + * @return Unique hash + */ + public static String getHashString(String inputString) { + MessageDigest md; + byte[] hash; + try { + md = MessageDigest.getInstance("MD5"); + } catch (java.security.NoSuchAlgorithmException e) 
{ + throw new RuntimeException(e); + } + try { + hash = md.digest(inputString.getBytes("UTF-8")); + } catch (java.io.UnsupportedEncodingException e) { + throw new RuntimeException(e); + } + StringBuilder sb = new StringBuilder(); + for(byte b : hash){ + sb.append(String.format("%02x", b & 0xff)); + } + return sb.toString(); + } + + + private static void testAll(Resource mother, SearchResult result, Boolean isQuiet) throws SearchException { + int nrFailed = 0; + boolean isDone = false; + int startPage = mother.getIndexOffset(); + Map tested = new HashMap(); + tested.put(mother.getId(), true); + while (!result.getHits().isEmpty() && !isDone) { + isDone = true; + for (Hit hit: result.getHits()) { + String rid = hit.getRid(); + if (rid != null && !tested.containsKey(rid)) { + tested.put(rid, true); + isDone = false; + Resource engine = null; + try { + engine = mother.searchResource(hit.getRid()); + testEngine(engine, "none", isQuiet); + } catch (Exception e) { + nrFailed += 1; + if (engine == null) { // resource not found, so test did not even start + printMessage("Testing: " + hit.getRid(), isQuiet); + } + printMessage("Test failed: " + e.getMessage(), isQuiet); + } + } + } + startPage += 1; + try { + result = mother.search(mother.getTestQuery(), "all", startPage); + } catch (Exception e) { + throw new SearchException("Mother error: " + e.getMessage()); + } + } + if (nrFailed > 0) { + throw new SearchException(nrFailed + " engines failed."); + } + } + + + private static void testEngine(Resource mother, String debugInfo, Boolean isQuiet) throws SearchException { + printMessage("Testing: " + mother.getId() + " (" + mother.getName() + ")", isQuiet); + SearchResult result = null; + result = mother.search(mother.getTestQuery(), debugInfo); + if (!isQuiet) { + if (debugInfo.equals("json")) { + System.out.println(result.toJson().toString(2)); + } else if (debugInfo.equals("xml") || debugInfo.equals("response")) { + String debugOut = result.getDebugOut(); + if (debugOut == 
null) { + System.out.println ("Warning: No " + debugInfo + " output."); + } else { + System.out.println(debugOut); + } + } + System.out.flush(); + } + if (result.getHits().isEmpty()) { + String tip = ""; + if (mother.getRerank() != null) { + tip = " Try removing rerank."; + } + throw new SearchException("No results for test query." + tip); + } + if (result.getHits().size() < 10) { + printMessage("Warning: less than 10 results for query '" + result.getQuery() + "'; see \"testquery\" or \"rerank\".", isQuiet); + } else if (result.getHits().size() > 49) { + printMessage("Warning: more than 49 results for query '" + result.getQuery() + "'", isQuiet); + } + if (debugInfo.equals("all")) { + String rid = null; + if (result.getResource() != null) { + rid = result.getResource().getId(); + } + if (rid != null && rid.equals(mother.getId())) { // do not trust resources if the mother API provides another ID than the mother ID + testAll(mother, result, isQuiet); + } else if (rid == null ){ + printMessage("Warning: no resources available.", isQuiet); + } else { + printMessage("Warning: no resources. 
ID '" + mother.getId() + "' changed to '" + rid + "'", isQuiet); + } + } + } /** * Attaches a rolling file logger for search queries @@ -147,23 +297,23 @@ private static void logSample(String resourceid, String query) { * @param filename * @throws IOException */ - private static void setupQueryLogger(String path, String filename, Level level) throws IOException { - Path querylogDir = Paths.get(path, filename + "_log"); - if (!Files.exists(querylogDir)) { - Files.createDirectories(querylogDir); + private static void setupLogger(String path, String filename, Level level) throws IOException { + Path logDir = Paths.get(path, filename + "_log"); + if (!Files.exists(logDir)) { + Files.createDirectories(logDir); } Appender appender = new DailyRollingFileAppender( - new PatternLayout("%m%n"), - querylogDir.resolve("queries.log").toString(), + new PatternLayout("%p %d{ISO8601} %m%n"), + logDir.resolve("searsia.log").toString(), "'.'yyyy-MM-dd"); LOGGER.addAppender(appender); LOGGER.setLevel(level); - logWarning("Searsia restart"); + LOGGER.warn("Searsia restart"); } public static void main(String[] args) { - ResourceIndex engines = null; + ResourceIndex engines = null; SearchResultIndex index = null; SearsiaOptions options = null; HttpServer server = null; @@ -171,139 +321,123 @@ public static void main(String[] args) { // Get options. This will also set the default options. try { options = new SearsiaOptions(args); - } catch (IllegalArgumentException e) { - System.exit(1); + } catch (Exception e) { + fatalError(e.getMessage()); } - - if (!options.isQuiet()) { - System.err.println("Searsia server " + SearsiaApplication.VERSION); + if (options.isHelp()) { + System.exit(0); } + printMessage("Searsia server " + SearsiaApplication.VERSION, options.isQuiet()); + + + // Connect to the mother engine and gather information from the mother. 
+ Resource myself = null; + Resource mother = null; + Resource connect = new Resource(options.getMotherTemplate()); + String version = null; + SearchResult result = null; + try { + result = connect.searchWithoutQuery(); + mother = result.getResource(); + version = result.getVersion(); + } catch (SearchException e) { + fatalError("Connection failed: " + e.getMessage()); + } + if (mother == null) { + fatalError("Initialization failed: JSONObject[\"resource\"] not found."); + } + if (!options.getMotherTemplate().matches(".*" + mother.getId() + "[^/]*$")) { + fatalError("API Template (" + options.getMotherTemplate() + "): file name must contain id (" + mother.getId() +")"); + } + if (version == null || !version.startsWith("v1")) { + fatalError("Wrong major Searsia version. Must be v1.x.x."); + } - // Connect to the mother engine and gather information from the mother. - String motherTemplate = options.getMotherTemplate(); - Resource mother = null; - SearchResult result = null; - if (motherTemplate != null) { - mother = new Resource(motherTemplate); - try { - result = mother.search(); - } catch (SearchException e) { - System.err.println("Error: Connection failed: " + e.getMessage()); - System.exit(1); - } - Resource newMother = result.getResource(); - if (newMother != null) { - String id = newMother.getId(); - if (id != null) { - mother.changeId(id); - } - mother.setPrior(newMother.getPrior()); - mother.setName (newMother.getName()); - mother.setFavicon(newMother.getFavicon()); - mother.setBanner(newMother.getBanner()); - mother.setTestQuery(newMother.getTestQuery()); - mother.setUrlUserTemplate(newMother.getUserTemplate()); - mother.setUrlSuggestTemplate(newMother.getSuggestTemplate()); - } - if (!options.isQuiet()) { - System.err.println("Connected to: " + mother.getId()); + + if (mother.getAPITemplate() == null) { + mother.setUrlAPITemplate(options.getMotherTemplate()); + } else { + if (!sameTemplates(mother.getAPITemplate(), options.getMotherTemplate(), 
mother.getId())) { + printMessage("Warning: Mother changed to " + mother.getAPITemplate(), options.isQuiet()); + } + if (mother.getAPITemplate().contains("{q")) { + printMessage("Warning: API Template parameter {q} is deprecated. Use {searchTerms}.", options.isQuiet()); } } + myself = mother.getLocalResource(); + String fileName = myself.getId() + "_" + getHashString(mother.getAPITemplate()); + String path = options.getIndexPath(); + Level level = options.getLoggerLevel(); - // This is about me: - String myURI = options.getMyURI(); - String myTemplate = uriToTemplate(myURI); - Resource me = null; - String myId = options.getMyName(); - if (myId == null) { - if (motherTemplate != null) { - myId = mother.getId(); // no Id and mother? Take my mother's name - me = new Resource(myTemplate, myId); - } else { - me = new Resource(myTemplate); - myId = me.getId(); // no Id and no mother?, this will result in a random Id - } + + // If test is set, test the mother + if (options.getTestOutput() != null) { + String tmpDir = System.getProperty("java.io.tmpdir"); + if (tmpDir != null) { + path = tmpDir; + } + try { + testEngine(mother, options.getTestOutput(), options.isQuiet()); + printMessage("Test succeeded.", options.isQuiet()); + } catch (Exception e) { + fatalError("Test failed: " + e.getLocalizedMessage()); + } } else { - me = new Resource(myTemplate, myId); + printMessage("Starting: " + myself.getName() + " (" + myself.getId() + ")", options.isQuiet()); } - String prefix; - if (motherTemplate != null) { - prefix = mother.getMD5(); - } else { - prefix = "local"; - } - - - // Create or open indexes. The index name combines the mother unique MD5 with the local Id; - // MD5, so we will not mix indexes of we have two mothers with the same name. 
- String fileName = prefix + "_" + myId; - String path = options.getIndexPath(); - Level level = options.getLoggerLevel(); - try { - engines = new ResourceIndex(path, fileName); - index = new SearchResultIndex(path, fileName, options.getCacheSize()); - setupQueryLogger(path, fileName, level); - } catch (Exception e) { - System.err.println("Setup failed: " + e.getMessage()); - System.exit(1); - } - - // My mother: Remember her, and ask her for advice - if (mother != null) { - try { - engines.putMother(mother); - } catch (Exception e) { - System.err.println("Error: " + e.getMessage()); - System.exit(1); - } - getResources(mother, result, engines); - } - // Myself: - Resource newMe = engines.getMyself(); - if (newMe != null) { - me.setName (newMe.getName()); - me.setFavicon(newMe.getFavicon()); - me.setBanner(newMe.getBanner()); - me.setTestQuery(newMe.getTestQuery()); - me.setUrlUserTemplate(newMe.getUserTemplate()); - me.setUrlSuggestTemplate(newMe.getSuggestTemplate()); - } else if (mother != null) { - me.setName(mother.getName()); - me.setFavicon(mother.getFavicon()); // first time? get images from mother. - me.setBanner(mother.getBanner()); - me.setUrlSuggestTemplate(mother.getSuggestTemplate()); - } - me.setPrior(engines.maxPrior()); - try { - engines.putMyself(me); - } catch (Exception e) { - System.err.println("Error: " + e.getMessage()); - System.exit(1); + // Create or open indexes. 
The filename appends the MD5 of the id so we don't confuse indexes + try { + setupLogger(path, fileName, level); + engines = new ResourceIndex(path, fileName); + index = new SearchResultIndex(path, fileName, options.getCacheSize()); + } catch (Exception e) { + fatalError("Setup failed: " + e.getMessage()); } - + engines.putMother(mother); + engines.putMyself(myself); + + getResources(mother, result, engines); + + // Export index and exit + if (options.isExport()) { + String encoding = System.getProperties().getProperty("file.encoding"); + if (encoding == null || !encoding.equals("UTF-8")) { + printMessage("Warning: Unknown encoding. Set JVM encoding with '-Dfile.encoding=UTF-8'", options.isQuiet()); + } + printMessage("Exporting index...", options.isQuiet()); + try { + engines.dump(); + engines.close(); + index.dump(); + index.close(); + } catch (IOException e) { + fatalError("Index export failed: " + e.getMessage()); + } + printMessage("Done.", options.isQuiet()); + System.exit(0); + } // Start the web server - Boolean openWide = options.openedWide(); + String myURI = removeFileNameUri(options.getMyURI()); try { - server = GrizzlyHttpServerFactory.createHttpServer(URI.create(myURI), - new SearsiaApplication(index, engines, openWide)); + SearsiaApplication app = new SearsiaApplication(index, engines, options); + server = GrizzlyHttpServerFactory.createHttpServer(URI.create(myURI), app); } catch (Exception e) { - System.err.println("Server failed: " + e.getMessage()); - System.exit(1); - } - if (!options.isQuiet()) { - System.err.println("API template: " + uriToTemplate(myURI)); - System.err.println("Use Ctrl+c to stop."); + fatalError("Server failed: " + e.getMessage()); } - // Start the update daemon - if (!options.isExit()) { + + // Start the update daemon if not testing + if (options.getTestOutput() == null) { + String myAPI = normalizedUriToTemplate(myURI + "searsia/", myself.getId()); + printMessage("API end point: " + myAPI, options.isQuiet()); + 
printMessage("Use Ctrl+c to stop.", options.isQuiet()); try { - searsiaDaemon(index, engines, options.getPollInterval()); - } catch (InterruptedException e) { } + searsiaDaemon(index, engines, options); + } catch (InterruptedException e) { } } - server.shutdownNow(); + server.shutdownNow(); // Catch ctrl+c: http://www.waelchatila.com/2006/01/13/1137143896635.html } -} +} diff --git a/src/main/java/org/searsia/SearchResult.java b/src/main/java/org/searsia/SearchResult.java index 88f6f06..26dd90c 100644 --- a/src/main/java/org/searsia/SearchResult.java +++ b/src/main/java/org/searsia/SearchResult.java @@ -1,5 +1,5 @@ /* - * Copyright 2016 Searsia + * Copyright 2016-2017 Searsia * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -31,6 +31,13 @@ import org.searsia.index.ResourceIndex; import org.searsia.engine.Resource; +/** + * A Searsia Search result page, + * consisting of "hits", a "query" and a "resource". 
+ * + * @author Djoerd Hiemstra and Dolf Trieschnigg + */ + public class SearchResult { public static final String SEARSIA_MIME_TYPE = "application/searsia+json"; public static final String SEARSIA_MIME_ENCODING = SEARSIA_MIME_TYPE + "; charset=utf-8"; @@ -39,8 +46,10 @@ public class SearchResult { private List hits; private Random random; private Resource resource; - private String xmlOut; + private String debugOut; private String query; + private String resourceId; + private String version; public SearchResult() { this(null); @@ -51,7 +60,8 @@ public SearchResult(Hit hit) { this.random = new Random(); this.resource = null; this.query = null; - this.xmlOut = null; + this.version = null; + this.debugOut = null; if (hit != null) { this.hits.add(hit); } @@ -72,13 +82,29 @@ public void setResource(Resource resource) { public Resource getResource() { return this.resource; } + + public void setResourceId(String resourceId) { + this.resourceId = resourceId; + } + + public String getResourceId() { + return this.resourceId; + } + + public String getVersion() { + return this.version; + } + + public void setVersion(String version) { + this.version = version; + } - public void setXmlOut(String xmlOut) { - this.xmlOut = xmlOut; + public void setDebugOut(String debugOut) { + this.debugOut = debugOut; } - public String getXmlOut() { - return this.xmlOut; + public String getDebugOut() { + return this.debugOut; } public void setQuery(String query) { @@ -90,84 +116,169 @@ public String getQuery() { } // TODO: maybe a list of query-resource pairs, if result found by multiple engines for multiple queries. - public void addQueryResourceRankDate(String resourceID) { - int rank = 1; - String query = getQuery(); + public void addResourceDate(String resourceID) { for (Hit hit: this.hits) { - hit.putIfEmpty("query", query); hit.putIfEmpty("rid", resourceID); // TODO: if unknown rid, then replace! 
- hit.putIfEmpty("rank", rank++); hit.putIfEmpty("foundBefore", df.format(new Date())); } } - public void removeResourceRank() { + public void removeResource() { for (Hit hit: this.hits) { hit.remove("rid"); - hit.remove("rank"); + hit.remove("query"); // for legacy reasons, we added the query to the result before } } + + /* ******************************************************************* + * Code below reranks search results for resource selection + * *******************************************************************/ + - // TODO: needs a proper implementation, refactoring, and research ;-) - // Scoring follows these rules: - // 1. hits are ordered such that the first hit per rid determines the resource ranking - // 2. if a resource has a exact query match, then these are ranked highest (given rule 1) - // 3. order by score (given rule 1 and rule 2) - // 4. TODO: not more than x (=10?) hits per resource - // 5. stop after 20 resources - public void scoreResourceSelection(String query, ResourceIndex engines) { - final float bias = 1.0f; - Map maxScore = new HashMap(); - Map topEngines = engines.topValues(query, 20); + /** + * New resource ranker, adds rscore. 
+ * @param query + * @param engines + */ + public void scoreResourceSelection(String query, ResourceIndex engines, int max, int start) { + SearchResult newResult = new SearchResult(); + final float boost = 0.05f; + final int maxSize = max + start; + Map maxScores = new HashMap(); + Map resourceReturned = new HashMap(); + Map topEngines = engines.topValuesNotDeleted(query, maxSize); for (Hit hit: this.hits) { String rid = hit.getString("rid"); if (rid != null) { - float prior = 0.0f; - if (engines.containsKey(rid)) { + Resource engine = engines.get(rid); + float prior = 0.0f; + if (engine != null) { + if (engine.isDeleted()) { continue; } // cached result from a deleted resource prior = engines.get(rid).getPrior(); } - float score = hit.getScore() * bias + prior; - Float top = topEngines.get(rid); - if (top != null) { - if (top > score) { - score = top; - } - topEngines.remove(rid); + Float top = topEngines.get(rid); + if (top != null) { + if (top > prior) { + prior = top; + } + topEngines.remove(rid); + } + Integer returned = resourceReturned.get(rid); + if (returned == null) { + returned = 0; + } + resourceReturned.put(rid, returned + 1); + Float score = prior + hit.getScore() * boost; + Float maxScore = maxScores.get(rid); + if (maxScore == null || maxScore < score) { + maxScore = score; + maxScores.put(rid, maxScore); + returned = 0; // this is the best one, so we will add it below no matter what } - Float max = maxScore.get(rid); - if (max == null || max < score) { - maxScore.put(rid, score); - max = score; - } hit.setScore(score); - //hit.put("rscore", max); + hit.setResourceScore(maxScore); + if (returned < 4) { // at most 4 results per resource + newResult.addHit(hit); + } + } else { + hit.setResourceScore(hit.getScore() * boost); + newResult.addHit(hit); } } - for (String rid: topEngines.keySet()) { - Hit hit = new Hit(); + for (String rid: topEngines.keySet()) { + Hit hit = new Hit(); hit.put("rid", rid); hit.setScore(topEngines.get(rid)); - 
//hit.put("rscore", topEngines.get(rid)); - this.hits.add(hit); - } + hit.setResourceScore(topEngines.get(rid)); + newResult.addHit(hit); + } + this.hits = newResult.hits; Collections.sort(this.hits, Collections.reverseOrder()); + selectBestResources(max, start); // TODO: efficiently combine this with sort? + } + + /** + * Selects the 'max' best resources, starting at resource 'start' + * Hits MUST be sorted already on rid (rscore). + * @param max + * @param start + */ + public void selectBestResources(int max, int start) { + String rid, previousRid = null; + int rFound = 0; + int rNeeded = start + max; + int first = 0, i = 0; + for (Hit hit: this.hits) { + rid = hit.getRid(); + if (rid != null && !rid.equals(previousRid)) { + previousRid = rid; + if (start > 0 && rFound == start) { first = i; } + rFound += 1; + if (rFound > rNeeded) { break; } + } + i += 1; + } + if (rFound < start) { + this.hits.clear(); + } else { + this.hits = this.hits.subList(first, i); + } } - public void scoreReranking(String query, String model) { // TODO use model + public void scoreReranking(String query, String model) { + if ("random".equals(model)) { + scoreRerankingRandom(); + } else if ("bestrandom".equals(model)) { + scoreRerankingBestRandom(query); + } else { + scoreRerankingRest(query); + } + } + + private void scoreRerankingBestRandom(String query) { + scoreRerankingRandom(); + scoreRerankingGeneral(query, 10); + } + + private void scoreRerankingRest(String query) { + scoreRerankingGeneral(query, 0); + } + + private void scoreRerankingRandom() { + Hit hit; + int i, j, + size = this.hits.size(); + for (i = 0; i < size; i += 1) { + j = random.nextInt(size); + hit = this.hits.get(i); + this.hits.set(i, this.hits.get(j)); + this.hits.set(j, hit); + } + } + + private void scoreRerankingGeneral(String query, int count) { SearchResult newResult = new SearchResult(); Map queryTerms = new HashMap(); for (String term: query.toLowerCase().split(TOKENIZER)) { - queryTerms.put(term, 0.01f); 
// TODO df from Lucene index? + queryTerms.put(term, 0.1f); // TODO idf from Lucene index }; for (Hit hit: this.hits) { float score = 0.0f; String text = hit.toIndexVersion(); + for (String term: queryTerms.keySet()) { + queryTerms.put(term, 0.1f); + } for (String term: text.toLowerCase().split(TOKENIZER)) { if (queryTerms.containsKey(term)) { - score += 1.0f; + score += queryTerms.get(term); + queryTerms.put(term, 0.0f); } } + if (count > 0) { + score += 0.01f; + count -= 1; + } if (score > 0.001f) { hit.put("score", score); newResult.addHit(hit); @@ -178,25 +289,48 @@ public void scoreReranking(String query, String model) { // TODO use model } - public String randomTerm() { + /* ******************************************************************* + * End of reranking code + * *******************************************************************/ + + + public String randomTerm(String notThisOne) { // TODO: keep track of more previous random queries? int size = this.hits.size(); if (size > 0) { int nr = random.nextInt(this.hits.size()); - String text = this.hits.get(nr).toIndexVersion(); + String text = this.hits.get(nr).toTitleDescriptionIndexVersion().toLowerCase(); String terms[] = text.split(TOKENIZER); // TODO Lucene tokenizer? 
nr = random.nextInt(terms.length); - return terms[nr]; + String thisOne = terms[nr]; + int i = nr; + while (thisOne.length() < 1 || notThisOne.equals(thisOne)) { + if (i + 1 >= terms.length) { i = 0; } + else { i += 1; } + if (i == nr) { return null; } + thisOne = terms[i]; + } + return thisOne; } else { return null; } } - public JSONObject toJson() { + public JSONObject toJson() { + return toJson(false); + } + + public JSONObject toJson(boolean censorQueryResourceId) { JSONObject r = new JSONObject(); r.put("hits", new JSONArray()); for (Hit hit: hits) { - r.append("hits", hit.toJson()); + if (censorQueryResourceId) { + r.append("hits", hit.toJsonNoQueryResourceId()); + } else { + r.append("hits", hit.toJson()); + } } return r; } + + } diff --git a/src/main/java/org/searsia/SearsiaOptions.java b/src/main/java/org/searsia/SearsiaOptions.java index 8a2bd91..6bc4503 100644 --- a/src/main/java/org/searsia/SearsiaOptions.java +++ b/src/main/java/org/searsia/SearsiaOptions.java @@ -17,8 +17,9 @@ package org.searsia; import java.io.File; -import org.apache.log4j.Level; +import java.net.MalformedURLException; +import org.apache.log4j.Level; import org.apache.commons.cli.DefaultParser; import org.apache.commons.cli.CommandLine; import org.apache.commons.cli.CommandLineParser; @@ -27,57 +28,73 @@ import org.apache.commons.cli.ParseException; /** - * Searsia Server options + * Searsia Server options. * @author Djoerd Hiemstra * */ public class SearsiaOptions { /* See setDefaults() below */ - private Boolean openWide; - private Boolean exit; + private String test; private Boolean quiet; + private Boolean help; + private Boolean dontshare; + private Boolean export; + private Boolean nohealth; private int cacheSize; private int pollInterval; private int logLevel; private String myURI; private String motherTemplate; private String indexPath; - private String myName; /** - * Takes command line options and sensible defaults - * + * Takes command line options and sensible defaults. 
+ * @param args Command Line options + * @throws IllegalArgumentException + * @throws MalformedURLException */ - public SearsiaOptions(String[] args) throws IllegalArgumentException { + public SearsiaOptions(String[] args) throws IllegalArgumentException, MalformedURLException { Options options = new Options(); options.addOption("c", "cache", true, "Set cache size (integer: number of result pages)."); - options.addOption("e", "exit", false, "Exit immediately after startup."); + options.addOption("d", "dontshare",false, "Do not share resource definitions."); + options.addOption("e", "export", false, "Export index to stdout and exit."); options.addOption("h", "help", false, "Show help."); options.addOption("i", "interval", true, "Set poll interval (integer: in seconds)."); options.addOption("l", "log", true, "Set log level (0=off, 1=error, 2=warn=default, 3=info, 4=debug)."); - options.addOption("m", "mother", true, "Set api template of the mother. ('none' for standalone)"); - options.addOption("n", "name", true, "Set my id (name)."); - options.addOption("o", "open", false, "Open the system for on-line updates (be careful!)"); - options.addOption("p", "path", true, "Set index path."); - options.addOption("q", "quiet", false, "No output on console."); - options.addOption("u", "url", true, "Set url of my web service endpoint."); + options.addOption("m", "mother", true, "Set url of mother's api web service end point."); + options.addOption("n", "nohealth", false, "Do not share health report."); + options.addOption("p", "path", true, "Set directory path to store the index."); + options.addOption("q", "quiet", false, "No output to console."); + options.addOption("t", "test", true, "Print test output and exit (string: 'json', 'xml', 'response', 'all')."); + options.addOption("u", "url", true, "Set url of my api web service endpoint."); setDefaults(); parse(options, args); + if (myURI == null) { + myURI = "http://localhost:16842/"; + } } - + /** + * Default options, to be 
used for unit tests only. + */ + public SearsiaOptions() { + setDefaults(); + } + private void setDefaults() { - openWide = false; - exit = false; + test = null; // no test + help = false; quiet = false; + dontshare = false; + export = false; + nohealth = false; cacheSize = 500; pollInterval = 120; logLevel = 2; - myURI = "http://localhost:16842/searsia/"; - motherTemplate = "https://search.utwente.nl/searsia/search?q={q?}&r={r?}"; + myURI = null; // is set in constructor + motherTemplate = null; indexPath = friendlyIndexPath(); - myName = null; } @@ -91,10 +108,9 @@ private boolean pathExists(String path) { private String friendlyIndexPath() { String path; String file = "searsia"; + String os = System.getProperty("os.name").toLowerCase(); String home = System.getProperty("user.home"); if (home == null || !pathExists(home)) home = "."; - - String os = System.getProperty("os.name").toLowerCase(); if (os.contains("win")) { // On Windows path = System.getenv("AppData"); if (!pathExists(path)) { @@ -124,49 +140,31 @@ private void parse(Options options, String[] args) throws IllegalArgumentExcepti try { cmd = parser.parse(options, args); } catch (ParseException e) { - help(options); - throw new IllegalArgumentException(e); + throw new IllegalArgumentException(e.getMessage() + " (use '-h' for help)"); } - if (cmd.hasOption("c")) { cacheSize = new Integer(cmd.getOptionValue("c")); if (cacheSize < 30) { cacheSize = 30; } } - if (cmd.hasOption("e")) { - exit = true; - } - if (cmd.hasOption("h") || cmd.getArgs().length > 0) { - help(options); - throw new IllegalArgumentException("Help!"); // misusing exceptions :-( - } - try { - if (cmd.hasOption("i")) { - pollInterval = new Integer(cmd.getOptionValue("i")); - if (pollInterval < 5) { - pollInterval = 5; - } - } - if (cmd.hasOption("l")) { - logLevel = new Integer(cmd.getOptionValue("l")); - if (logLevel < 0) { - logLevel = 0; - } + if (cmd.hasOption("t")) { + test = cmd.getOptionValue("t").toLowerCase(); + if 
(!(test.equals("json") || test.equals("xml") || test.equals("response") || test.equals("all"))) { + throw new IllegalArgumentException("Test output must be one of 'json', 'xml', 'response', or 'all'."); } - } catch (IllegalArgumentException e) { - help(options); - throw new IllegalArgumentException(e); } - if (cmd.hasOption("m")) { - motherTemplate = cmd.getOptionValue("m"); - if (motherTemplate.equals("none")) motherTemplate = null; - } - if (cmd.hasOption("n")) { - myName = cmd.getOptionValue("n"); + if (cmd.hasOption("i")) { + pollInterval = new Integer(cmd.getOptionValue("i")); + if (pollInterval < 10) { + pollInterval = 10; + } } - if (cmd.hasOption("o")) { - openWide = true; + if (cmd.hasOption("l")) { + logLevel = new Integer(cmd.getOptionValue("l")); + if (logLevel < 0) { + logLevel = 0; + } } if (cmd.hasOption("p")) { indexPath = cmd.getOptionValue("p"); @@ -174,29 +172,70 @@ private void parse(Options options, String[] args) throws IllegalArgumentExcepti if (cmd.hasOption("q")) { quiet = true; } + if (cmd.hasOption("d")) { + dontshare = true; + } + if (cmd.hasOption("e")) { + export = true; + } + if (cmd.hasOption("n")) { + nohealth = true; + } if (cmd.hasOption("u")) { myURI = cmd.getOptionValue("u"); } + if (cmd.hasOption("m")) { + motherTemplate = cmd.getOptionValue("m"); + if (!motherTemplate.matches("^https?://.*|^file:.*")) { + motherTemplate = "file:" + motherTemplate.replace("\\", "/"); // TODO C:\file on Windows? + } + } + if (cmd.hasOption("h") || cmd.getArgs().length < 0 || !cmd.hasOption("m")) { + if (!cmd.hasOption("m")) { + System.out.println("Please provide mother's api url template (use '-m')."); + } + help(options); + help = true; + } } private void help(Options options) { HelpFormatter formatter = new HelpFormatter(); - formatter.printHelp("SearsiaServer", options); + formatter.printHelp("searsiaserver", options); } + /** + * Get the size of the SearchResult cache. 
+ * @return cache size + */ public int getCacheSize() { return cacheSize; } - public Boolean isExit() { - return exit; + /** + * Get the test that needs to be executed. + * Possible values: "json", "xml", "response", "all", or null (no test) + * @return test + */ + public String getTestOutput() { + return test; } + /** + * Get log level, a value between 0 and 5 + * Possible values: 0=off, 1=error, 2=warn (default), 3=info, 4=debug, 5=trace + * @return log level + */ public int getLogLevel() { return logLevel; } + /** + * Get the log4j level. + * Possible values: off, error, warn (default), info, debug, trace + * @return log4j level + */ public Level getLoggerLevel() { switch(logLevel) { case 0 : return Level.OFF; @@ -209,6 +248,10 @@ public Level getLoggerLevel() { } } + /** + * Get poll interval (in seconds). + * @return poll interval. + */ public int getPollInterval() { return pollInterval; } @@ -225,29 +268,38 @@ public String getIndexPath() { return indexPath; } - public String getMyName() { - return myName; + public Boolean isQuiet() { + return quiet; } - public Boolean openedWide() { - return openWide; + public Boolean isNotShared() { + return dontshare; } - public Boolean isQuiet() { - return quiet; + public Boolean isExport() { + return export; + } + + public Boolean isNoHealthReport() { + return nohealth; } + public Boolean isHelp() { + return help; + } + @Override public String toString() { String result = "SearsiaOptions:"; result += "\n Log Level = " + getLoggerLevel(); result += "\n Base Url = " + getMyURI(); result += "\n Mother = " + getMotherTemplate(); - result += "\n Index Name = " + getMyName(); result += "\n Index Path = " + getIndexPath(); result += "\n Poll Interval = " + getPollInterval(); - result += "\n Allows update = " + openedWide(); result += "\n Cache Size = " + getCacheSize(); + result += "\n Test Output = " + getTestOutput(); + result += "\n Do Not Share = " + isNotShared(); + result += "\n No Health Rep.= " + isNoHealthReport(); 
return result; } diff --git a/src/main/java/org/searsia/engine/DOMBuilder.java b/src/main/java/org/searsia/engine/DOMBuilder.java index 6a738b8..0a0a1be 100644 --- a/src/main/java/org/searsia/engine/DOMBuilder.java +++ b/src/main/java/org/searsia/engine/DOMBuilder.java @@ -1,5 +1,6 @@ /* - * Copyright Walter Kasper + * Jsoup2DOM Copyright Walter Kasper + * Json2DOC Copyright Searsia * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,7 +17,6 @@ package org.searsia.engine; -import java.io.IOException; import java.io.StringReader; import java.io.StringWriter; import java.util.HashMap; @@ -31,15 +31,19 @@ import javax.xml.transform.dom.DOMSource; import javax.xml.transform.stream.StreamResult; +import org.json.JSONArray; +import org.json.JSONObject; import org.w3c.dom.DOMException; import org.w3c.dom.Document; import org.w3c.dom.Node; import org.xml.sax.InputSource; /** - * Returns a W3C DOM for a Jsoup parsed document. + * Returns a W3C DOM for a Jsoup parsed document or a Json parsed document. + * + * @author Walter Kasper + * @author Djoerd Hiemstra * - * @author Walter Kasper */ public class DOMBuilder { @@ -47,9 +51,9 @@ public class DOMBuilder { * Returns a W3C DOM that exposes the content as the supplied XML string. * @param xmlString The XML string to parse. * @return A W3C Document. 
- * @throws + * @throws RuntimeException if not well-formed */ - public static Document string2DOM(String xmlString) throws IOException { + public static Document string2DOM(String xmlString) { Document document = null; @@ -65,18 +69,27 @@ public static Document string2DOM(String xmlString) throws IOException { DocumentBuilder docBuilder = factory.newDocumentBuilder(); document = docBuilder.parse(new InputSource(new StringReader(xmlString))); } catch (Exception e) { - throw new IOException(e); + throw new RuntimeException(e); } return document; } + /** + * Returns an XML string for a W3C Document + * @param document A W3C Document + * @return XML string + */ public static String DOM2String(Document document) { TransformerFactory tf = TransformerFactory.newInstance(); Transformer transformer; try { transformer = tf.newTransformer(); transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes"); + transformer.setOutputProperty(OutputKeys.ENCODING, "UTF-8"); + transformer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount", "2"); + transformer.setOutputProperty(OutputKeys.METHOD, "xml"); + transformer.setOutputProperty(OutputKeys.INDENT, "yes"); StringWriter writer = new StringWriter(); transformer.transform(new DOMSource(document), new StreamResult(writer)); String output = writer.getBuffer().toString(); @@ -104,7 +117,7 @@ public static Document jsoup2DOM(org.jsoup.nodes.Document jsoupDocument) { /* Create a document to contain the content. */ document = docBuilder.newDocument(); - createDOM(jsoupDocument, document, document, new HashMap()); + createDOMfromJsoup(jsoupDocument, document, document, new HashMap()); } catch (ParserConfigurationException pce) { throw new RuntimeException(pce); @@ -112,24 +125,52 @@ public static Document jsoup2DOM(org.jsoup.nodes.Document jsoupDocument) { return document; } + + /** + * Returns a W3C DOM that exposes the same content as the supplied JSON Object into a W3C DOM. 
+ * @param jsonDocument The JSON Object to convert. + * @return A W3C Document. + */ + public static Document json2DOM(JSONObject jsonDocument) { + + Document document = null; + + try { + + /* Obtain the document builder for the configured XML parser. */ + DocumentBuilderFactory docBuilderFactory = DocumentBuilderFactory.newInstance(); + DocumentBuilder docBuilder = docBuilderFactory.newDocumentBuilder(); + + /* Create a document to contain the content. */ + document = docBuilder.newDocument(); + org.w3c.dom.Element _e = document.createElement("root"); + document.appendChild(_e); + createDOMfromJSONObject(jsonDocument, _e, document); + } catch (ParserConfigurationException pce) { + throw new RuntimeException(pce); + } + return document; + } + + /** * The internal helper that copies content from the specified Jsoup Node into a W3C {@link Node}. * @param node The Jsoup node containing the content to copy to the specified W3C {@link Node}. * @param out The W3C {@link Node} that receives the DOM content. 
*/ - private static void createDOM(org.jsoup.nodes.Node node, Node out, Document doc, Map ns) { + private static void createDOMfromJsoup(org.jsoup.nodes.Node node, Node out, Document doc, Map ns) { if (node instanceof org.jsoup.nodes.Document) { org.jsoup.nodes.Document d = ((org.jsoup.nodes.Document) node); for (org.jsoup.nodes.Node n : d.childNodes()) { - createDOM(n, out,doc,ns); + createDOMfromJsoup(n, out,doc,ns); } } else if (node instanceof org.jsoup.nodes.Element) { org.jsoup.nodes.Element e = ((org.jsoup.nodes.Element) node); - org.w3c.dom.Element _e = doc.createElement(e.tagName()); + org.w3c.dom.Element _e = doc.createElement(correctXML(e.tagName())); out.appendChild(_e); org.jsoup.nodes.Attributes atts = e.attributes(); @@ -153,14 +194,14 @@ else if (!attPrefix.equals("xml")) { } } try { - _e.setAttribute(attName, a.getValue()); + _e.setAttribute(attName, a.getValue()); } catch (DOMException domExcept) { - continue; + continue; } } for (org.jsoup.nodes.Node n : e.childNodes()) { - createDOM(n, _e, doc,ns); + createDOMfromJsoup(n, _e, doc,ns); } } else if (node instanceof org.jsoup.nodes.TextNode) { @@ -193,4 +234,68 @@ private static String getLocalName(String name) { return name; } -} + /** + * The internal helpers that copy content from the specified JSON Object into a W3C {@link Node}. + * @param json The JSON object containing the content to copy to the specified W3C {@link Node}. + * @param out The W3C {@link Node} that receives the DOM content. 
+ */ + private static void createDOMfromJSONObject(JSONObject json, Node out, Document doc) { + String [] names = JSONObject.getNames(json); + if (names != null) { + for (String name : names) { + Object object = json.get(name); + if (object instanceof JSONArray) { + createDOMfromJSONArray((JSONArray) object, out, doc, name); + } else if (object instanceof JSONObject) { + org.w3c.dom.Element _e = doc.createElement(correctXML(name)); + out.appendChild(_e); + createDOMfromJSONObject((JSONObject) object, _e, doc); + } else { + createDOMfromJSONPrimitive(object, out, doc, name); + } + } + } + } + + private static void createDOMfromJSONArray(JSONArray json, Node out, Document doc, String name) { + for (Object o: json) { + if (o instanceof JSONArray) { + org.w3c.dom.Element _e = doc.createElement(correctXML(name)); + out.appendChild(_e); + createDOMfromJSONArray((JSONArray) o, _e, doc, "list"); + } else if (o instanceof JSONObject) { + org.w3c.dom.Element _e = doc.createElement(correctXML(name)); + out.appendChild(_e); + createDOMfromJSONObject((JSONObject) o, _e, doc); + } else { + createDOMfromJSONPrimitive(o, out, doc, name); + } + } + } + + private static void createDOMfromJSONPrimitive(Object object, Node out, Document doc, String name) { + org.w3c.dom.Element _e = doc.createElement(correctXML(name)); + out.appendChild(_e); + if (object instanceof String) { + _e.appendChild(doc.createTextNode((String) object)); + } else if (object instanceof Boolean) { + _e.appendChild(doc.createTextNode(object.toString())); + } else if (object instanceof Integer) { + _e.appendChild(doc.createTextNode(Integer.toString((Integer) object))); + } else if (object instanceof Double) { + _e.appendChild(doc.createTextNode(Double.toString((Double) object))); + } + } + + /** + * XML Element names can contain letters, digits, hyphens, underscores, and periods + * Element names must start with a letter or underscore + * @param name XML element name + * @return correct XML element name + */ + 
private static String correctXML(String name) { + name = name.replaceAll("[^A-Z0-9a-z\\-_\\.]|^([^A-Za-z_])", "_$1"); + return name; + } + +} \ No newline at end of file diff --git a/src/main/java/org/searsia/engine/Resource.java b/src/main/java/org/searsia/engine/Resource.java index dd5e657..db76d28 100644 --- a/src/main/java/org/searsia/engine/Resource.java +++ b/src/main/java/org/searsia/engine/Resource.java @@ -1,5 +1,5 @@ /* - * Copyright 2016 Searsia + * Copyright 2016-2017 Searsia * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -18,31 +18,35 @@ import java.io.BufferedReader; import java.io.DataOutputStream; +import java.io.File; +import java.io.FileInputStream; import java.io.IOException; import java.io.InputStreamReader; -import java.io.UnsupportedEncodingException; import java.net.HttpURLConnection; +import java.net.URLConnection; import java.net.URL; import java.net.URLEncoder; -import java.security.MessageDigest; +import java.text.DateFormat; +import java.text.ParseException; +import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.Date; import java.util.HashMap; import java.util.Iterator; import java.util.LinkedHashMap; import java.util.List; +import java.util.Locale; import java.util.Map; +import java.util.Set; import javax.xml.xpath.XPath; import javax.xml.xpath.XPathConstants; import javax.xml.xpath.XPathExpressionException; import javax.xml.xpath.XPathFactory; -import org.apache.log4j.Logger; import org.json.JSONArray; import org.json.JSONException; import org.json.JSONObject; -import org.json.XML; import org.jsoup.Jsoup; import org.w3c.dom.Document; import org.w3c.dom.Node; @@ -50,18 +54,25 @@ import org.searsia.Hit; import org.searsia.SearchResult; +/** + * A Searsia Resource: A wrapper for external search engines. It can read results from + * engines that produce results in: HTML, XML, JSON, and SEARSIA JSON. 
+ * + * @author Djoerd Hiemstra and Dolf Trieschnigg + */ public class Resource implements Comparable { public final static String defaultTestQuery = "searsia"; - private final static Logger LOGGER = Logger.getLogger(Resource.class.getName()); + public final static String goneErrorMessage = "Searsia Gone"; // For rate limiting: Default = 1000 queries per day private final static int defaultRATE = 1000; // unit: queries private final static int defaultPER = 86400000; // unit: miliseconds (86400000 miliseconds is one day) + private final static DateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss", Locale.ROOT); - // TODO: private static final Pattern queryPattern = Pattern.compile("\\{q\\??\\}"); + // TODO: private static final Pattern queryPattern = Pattern.compile("\\{searchTerms\??\\}"); - // data to be set by setters + // data to be set by JSON private String id = null; private String name = null; private String urlAPITemplate = null; @@ -69,36 +80,40 @@ public class Resource implements Comparable { private String urlSuggestTemplate = null; private String mimeType = null; private String postString = null; + private String signature = null; private String postQueryEncode = null; private String favicon = null; private String banner = null; private String itemXpath = null; private String testQuery = defaultTestQuery; private List extractors = new ArrayList<>(); - private Map headers = new LinkedHashMap<>(); + private Map headers = new LinkedHashMap<>(); private Map privateParameters = new LinkedHashMap<>(); private Float prior = null; private String rerank = null; private int rate = defaultRATE; + private boolean deleted = false; + + // internal data shared for health report + private String nextQuery = null; + private String lastMessage = null; + private double allowance = defaultRATE / 2; + private long lastUsed = new Date().getTime(); // Unix time + private long lastUsedOk = lastUsed; + private long lastUsedError = lastUsed; + private long 
lastUpdated = lastUsed; + private long upsince = lastUsed; + private int nrOfError = 0; + private int nrOfOk = 0; - // internal data not to be shared - private String nextQuery = null; - private double allowance = defaultRATE / 2; - private long lastCheck = new Date().getTime(); // Unix time - - - public Resource(String urlAPITemplate, String id) { + public Resource(String urlAPITemplate) { this.urlAPITemplate = urlAPITemplate; - this.id = id; + this.id = null; this.name = null; this.mimeType = SearchResult.SEARSIA_MIME_TYPE; this.testQuery = defaultTestQuery; } - public Resource(String urlAPITemplate) { - this(urlAPITemplate, getHashString(urlAPITemplate)); - } - public Resource(JSONObject jo) throws XPathExpressionException, JSONException { this.mimeType = SearchResult.SEARSIA_MIME_TYPE; this.testQuery = defaultTestQuery; @@ -107,6 +122,7 @@ public Resource(JSONObject jo) throws XPathExpressionException, JSONException { if (jo.has("mimetype")) this.mimeType = jo.getString("mimetype"); if (jo.has("post")) this.postString = jo.getString("post"); if (jo.has("postencode")) this.postQueryEncode = jo.getString("postencode"); + if (jo.has("signature")) this.signature = jo.getString("signature"); if (jo.has("name")) this.name = jo.getString("name"); if (jo.has("testquery")) this.testQuery = jo.getString("testquery"); if (jo.has("urltemplate")) this.urlUserTemplate = jo.getString("urltemplate"); @@ -115,6 +131,7 @@ public Resource(JSONObject jo) throws XPathExpressionException, JSONException { if (jo.has("rerank")) this.rerank = jo.getString("rerank"); if (jo.has("banner")) this.banner = jo.getString("banner"); if (jo.has("itempath")) this.itemXpath = jo.getString("itempath"); + if (jo.has("deleted")) this.deleted = jo.getBoolean("deleted"); if (jo.has("prior")) this.prior = new Float(jo.getDouble("prior")); if (jo.has("maxqueriesperday")) this.rate = jo.getInt("maxqueriesperday"); if (jo.has("extractors")) { @@ -133,26 +150,32 @@ public Resource(JSONObject jo) throws 
XPathExpressionException, JSONException { addHeader((String) key, (String) json.get(key)); } } - if (jo.has("parameters")) { - JSONObject json = (JSONObject) jo.get("parameters"); + if (jo.has("privateparameters")) { + JSONObject json = (JSONObject) jo.get("privateparameters"); Iterator keys = json.keys(); while (keys.hasNext()) { String key = (String) keys.next(); addPrivateParameter((String) key, (String) json.get(key)); } } - if (this.urlAPITemplate == null) { - throw new IllegalArgumentException("Missing API Template"); + if (this.urlAPITemplate != null && this.urlAPITemplate.startsWith("file")) { + throw new IllegalArgumentException("Illegal 'file' API Template"); } if (this.id == null) { throw new IllegalArgumentException("Missing Identifier"); } } + public void setUrlAPITemplate(String urlTemplate) { this.urlAPITemplate = urlTemplate; } - + + + /* + * Setters no longer used: Everything now via JSON Objects + * + public void setUrlUserTemplate(String urlTemplate) { this.urlUserTemplate = urlTemplate; } @@ -193,20 +216,6 @@ public void setItemXpath(String itemXpath) { this.itemXpath = itemXpath; } - public void addExtractor(TextExtractor ... e) { - for (TextExtractor ee: e) { - this.extractors.add(ee); - } - } - - public void addHeader(String key, String value) { - this.headers.put(key, value); - } - - public void addPrivateParameter(String key, String value) { - this.privateParameters.put(key, value); - } - public void setPrior(float prior) { this.prior = prior; } @@ -218,33 +227,90 @@ public void setRate(int maxQueriesPerDay) { public void setRerank(String rerank) { this.rerank = rerank; } +*/ + private void addExtractor(TextExtractor ... 
e) { + for (TextExtractor ee: e) { + this.extractors.add(ee); + } + } + private void addHeader(String key, String value) { + this.headers.put(key, value); + } - public void changeId(String id) { // BEWARE, only used in Main - this.id = id; - } + private void addPrivateParameter(String key, String value) { + this.privateParameters.put(key, value); + } + public void setLastUpdatedToNow() { + this.lastUpdated = new Date().getTime(); + } + + public void setLastUpdatedToDateString(String date) { + try { + this.lastUpdated = dateFormat.parse(date).getTime(); + } catch (ParseException e) { } + } + + public void setUpSinceToNow() { + this.upsince = new Date().getTime(); + } + + public void setUpSinceDateString(String date) { + try { + this.upsince = dateFormat.parse(date).getTime(); + } catch (ParseException e) { } + } + + + public Resource updateFromAPI() throws SearchException { + SearchResult result = searchWithoutQuery(); + if (result == null) { throw new SearchException("No results."); } + Resource resource = result.getResource(); + if (resource == null) { throw new SearchException("Object \"resource\" not found."); } + updateWith(resource); + return this; + } + public SearchResult randomSearch() throws SearchException { - if (nextQuery == null) { - nextQuery = this.testQuery; + if (this.nextQuery == null) { + this.nextQuery = this.testQuery; } - String thisQuery = nextQuery; - nextQuery = null; // so, nextQuery will be null in case of a searchexception - SearchResult result = search(thisQuery); - nextQuery = result.randomTerm(); + String thisQuery = this.nextQuery; + this.nextQuery = null; // so, nextQuery will be null in case of a searchexception + SearchResult result = search(thisQuery, null); + if (this.testQuery.equals(thisQuery) && result.getHits().isEmpty()) { + this.nrOfError += 1; + this.lastUsedError = new Date().getTime(); + this.lastMessage = "No results for test query: " + thisQuery; + throw new SearchException(this.lastMessage); + } else { + 
this.nextQuery = result.randomTerm(thisQuery); + } return result; } public SearchResult search(String query) throws SearchException { - return search(query, false); + return search(query, null, null); } - public SearchResult search(String query, boolean debug) throws SearchException { + public SearchResult search(String query, String debug) throws SearchException { + return search(query, debug, null); + } + + public SearchResult search(String query, String debug, Integer startPage) throws SearchException { + SearchResult result; try { - String url = fillTemplate(this.urlAPITemplate, URLEncoder.encode(query, "UTF-8")); + if (rateLimitReached()) { + throw new SearchException("Too many queries"); + } + if (this.urlAPITemplate == null) { + throw new SearchException("No API Template"); + } + String url = fillTemplate(this.urlAPITemplate, URLEncoder.encode(query, "UTF-8"), startPage); String postString = ""; String postQuery; if (this.postString != null && !this.postString.equals("")) { @@ -259,35 +325,41 @@ public SearchResult search(String query, boolean debug) throws SearchException { } else { postQuery = URLEncoder.encode(query, "UTF-8"); } - postString = fillTemplate(this.postString, postQuery); + postString = fillTemplate(this.postString, postQuery, startPage); } - String page = getCompleteWebPage(url, postString, this.headers); - SearchResult result; + String page = getCompletePage(url, postString, this.headers); if (this.mimeType != null && this.mimeType.equals(SearchResult.SEARSIA_MIME_TYPE)) { - result = searsiaSearch(page); + result = searsiaSearch(page, debug); } else { result = xpathSearch(url, page, debug); } if (this.rerank != null && query != null) { result.scoreReranking(query, this.rerank); } - result.setQuery(query); - return result; + if (!result.getHits().isEmpty()) { + this.nrOfOk += 1; // only success if at least one result + this.lastUsedOk = new Date().getTime(); + } } catch (Exception e) { // catch all, also runtime exceptions - throw 
createPrivateSearchException(e); - } + this.nrOfError += 1; + this.lastUsedError = new Date().getTime(); + SearchException se = createPrivateSearchException(e); + this.lastMessage = se.getMessage(); + throw se; + } + result.setQuery(query); + result.setResourceId(this.getId()); + return result; } - - public SearchResult search() throws SearchException { + public SearchResult searchWithoutQuery() throws SearchException { if (!this.mimeType.equals(SearchResult.SEARSIA_MIME_TYPE)) { throw new SearchException("Engine is not a searsia engine: " + this.id); } try { - String url = this.urlAPITemplate; - url = url.replaceAll("\\{[0-9A-Za-z\\-_]+\\?\\}", ""); // remove optional parameters - String page = getCompleteWebPage(url, this.postString, this.headers); - return searsiaSearch(page); + String url = fillTemplate(this.urlAPITemplate, ""); + String page = getCompletePage(url, this.postString, this.headers); + return searsiaSearch(page, null); } catch (Exception e) { // catch all, also runtime exceptions throw createPrivateSearchException(e); } @@ -296,65 +368,105 @@ public SearchResult search() throws SearchException { public Resource searchResource(String resourceid) throws SearchException { if (!this.mimeType.equals(SearchResult.SEARSIA_MIME_TYPE)) { - throw new SearchException("Resource is not a searsia engine: " + resourceid); + throw new SearchException("Resource is not a searsia engine: " + this.getId()); } - try { - Resource engine = null; - String url = this.urlAPITemplate.replaceAll("\\{r\\??\\}", URLEncoder.encode(resourceid, "UTF-8")); - url = url.replaceAll("\\{[0-9A-Za-z\\-_]+\\?\\}", ""); // remove optional parameters - String jsonPage = getCompleteWebPage(url, this.postString, this.headers); + Resource engine = null; + String url = this.urlAPITemplate; + String rid = this.getId(); + int lastIndex = url.lastIndexOf(rid); // replace last occurrence of resourceId + if (lastIndex < 0) { + throw new SearchException("No resources available"); + } + try { + String 
newRid = URLEncoder.encode(resourceid, "UTF-8"); + url = url.substring(0, lastIndex) + url.substring(lastIndex).replaceFirst(rid, newRid); + url = fillTemplate(url, "", null); + String jsonPage = getCompletePage(url, this.postString, this.headers); JSONObject json = new JSONObject(jsonPage); if (json.has("resource")) { engine = new Resource(json.getJSONObject("resource")); } - return engine; + } catch (IOException e) { + String message = e.getMessage(); + if (message != null && message.equals(goneErrorMessage)) { // TODO: not using error message like this?? + engine = deletedResource(resourceid); + } else { + throw createPrivateSearchException(e); + } } catch (Exception e) { - throw createPrivateSearchException(e); + throw createPrivateSearchException(e); } + return engine; + } + + private Resource deletedResource(String resourceid) throws SearchException { + Resource engine = null; + JSONObject json = new JSONObject(); + json.put("id", resourceid); + json.put("deleted", true); + try { + engine = new Resource(json); + } catch (XPathExpressionException e) { + throw new SearchException(e); + } + return engine; } - - private SearchResult searsiaSearch(String jsonPage) { + private SearchResult searsiaSearch(String jsonPage, String debug) throws XPathExpressionException, JSONException { SearchResult result = new SearchResult(); + if (debug != null && debug.equals("response")) { + result.setDebugOut(jsonPage); + } JSONObject json = new JSONObject(jsonPage); - JSONArray hits = json.getJSONArray("hits"); + JSONArray hits = new JSONArray(); + try { + hits = json.getJSONArray("hits"); + } catch (JSONException e) { } for (int i = 0; i < hits.length(); i += 1) { result.addHit(new Hit((JSONObject) hits.get(i))); } if (json.has("resource")) { - try { - Resource engine = new Resource(json.getJSONObject("resource")); - result.setResource(engine); - } catch (XPathExpressionException e) { - LOGGER.warn("Warning: " + e.getMessage()); - } + Resource engine = new 
Resource(json.getJSONObject("resource")); + result.setResource(engine); + } + if (json.has("searsia")) { + result.setVersion(json.getString("searsia")); } return result; } - private SearchResult xpathSearch(String url, String page, boolean debug) + private SearchResult xpathSearch(String url, String page, String debug) throws IOException, XPathExpressionException { - Document document; - if (this.mimeType != null && this.mimeType.equals("application/json")) { + Document document = null; + if (this.mimeType == null) { + throw new IOException("No MIME Type provided."); + } + if (this.mimeType.equals("application/json")) { document = parseDocumentJSON(page); - } else if (this.mimeType != null && this.mimeType.equals("application/x-javascript")) { + } else if (this.mimeType.equals("application/x-javascript")) { document = parseDocumentJavascript(page); - } else if (this.mimeType != null && this.mimeType.equals("application/xml")) { + } else if (this.mimeType.equals("application/xml")) { document = parseDocumentXML(page); - } else { + } else if (this.mimeType.equals("text/html")) { document = parseDocumentHTML(page, url); + } else { + throw new IOException("MIME Type not supported: " + this.mimeType); } if (document == null) { throw new IOException("Error parsing document. 
Wrong mimetype?"); } SearchResult result = new SearchResult(); - if (debug) { - result.setXmlOut(DOMBuilder.DOM2String(document)); + if (debug != null) { + if (debug.equals("xml")) { + result.setDebugOut(DOMBuilder.DOM2String(document)); + } else if (debug.equals("response")) { + result.setDebugOut(page); + } } XPathFactory xFactory = XPathFactory.newInstance(); XPath xpath = xFactory.newXPath(); - NodeList xmlNodeList = (NodeList) xpath.evaluate(itemXpath, document, XPathConstants.NODESET); + NodeList xmlNodeList = (NodeList) xpath.evaluate(this.itemXpath, document, XPathConstants.NODESET); for (int i = 0; i < xmlNodeList.getLength() && i < 30; i++) { Node item = xmlNodeList.item(i); result.addHit(extractHit(item)); @@ -362,19 +474,15 @@ private SearchResult xpathSearch(String url, String page, boolean debug) return result; } - private Hit extractHit(Node item) { + private Hit extractHit(Node item) throws XPathExpressionException { Hit hit = new Hit(); for(TextExtractor extractor: this.extractors) { - try { - extractor.extract(item, hit); - } catch (XPathExpressionException e) { - LOGGER.warn(e.getMessage()); // TODO: handle this gracefully :-) - } + extractor.extract(item, hit); } return hit; } - private Document parseDocumentHTML(String htmlString, String urlString) throws IOException { + private Document parseDocumentHTML(String htmlString, String urlString) { org.jsoup.nodes.Document jsoupDoc = Jsoup.parse(htmlString, urlString); return DOMBuilder.jsoup2DOM(jsoupDoc); } @@ -385,7 +493,7 @@ private Document parseDocumentHTML(String htmlString, String urlString) throws I * @return Document * @throws IOException */ - private Document parseDocumentJavascript(String scriptString) throws IOException { + private Document parseDocumentJavascript(String scriptString) { int nrOfCurly = 0; int first = -1; JSONArray array = new JSONArray(); @@ -398,7 +506,6 @@ private Document parseDocumentJavascript(String scriptString) throws IOException nrOfCurly -= 1; if (nrOfCurly == 
0) { String subString = scriptString.substring(first, i + 1); - subString = subString.replaceAll("\"([0-9][^\"]*)\":", "\"t$1\":"); // tags starting with a number are not well-formed XML try { array.put(new JSONObject(subString)); } catch (JSONException e) { } @@ -407,43 +514,54 @@ private Document parseDocumentJavascript(String scriptString) throws IOException } JSONObject object = new JSONObject(); object.put("list", array); - String xml = "" + XML.toString(object) + ""; - return DOMBuilder.string2DOM(xml); + return DOMBuilder.json2DOM(object); } - private Document parseDocumentJSON(String jsonString) throws IOException { - jsonString = jsonString.replaceAll("\"[^\"]*[/<>' =][^\"]*\":[ \n\r]*\"[^\"]*\",?", ""); // completely remove data with keys that have one of: /<>' = - jsonString = jsonString.replaceAll("\"([0-9][^\"]*)\"[ \n\r]*:", "\"t$1\":"); // tags starting with a number are not well-formed XML - jsonString = jsonString.replaceAll("\"content\":", "\"searsia_org_json_content\":"); // work around. org.json.XML is broken: https://github.com/stleary/JSON-java/issues/286 - if (jsonString.startsWith("[")) { // turn lists into objects + private Document parseDocumentJSON(String jsonString) { + if (jsonString.startsWith("[")) { // turn lists into objects jsonString = "{\"list\":" + jsonString + "}"; } - String xml = "" + XML.toString(new JSONObject(jsonString)) + ""; - xml = xml.replaceAll("searsia_org_json_content>", "content>"); // use a constant for 'searsia_org_json_content'? 
see 5 lines above - return DOMBuilder.string2DOM(xml); + return DOMBuilder.json2DOM(new JSONObject(jsonString)); } - private Document parseDocumentXML(String xmlString) throws IOException { + private Document parseDocumentXML(String xmlString) { return DOMBuilder.string2DOM(xmlString); } - private String fillTemplate(String template, String query) throws UnsupportedEncodingException { + private String fillTemplate(String template, String query) throws SearchException { + return fillTemplate(template, query, null); + } + + private String fillTemplate(String template, String query, Integer startPage) throws SearchException { String url = template; - for (String param: this.privateParameters.keySet()) { - url = url.replaceAll("\\{" + param + "\\??\\}", this.privateParameters.get(param)); + for (String param: getPrivateParameterKeys()) { + url = url.replaceAll("\\{" + param + "\\??\\}", getPrivateParameter(param)); + } + url = url.replaceAll("\\{searchTerms\\??\\}", query); // opensearch standard + url = url.replaceAll("\\{q\\??\\}", query); // old Searsia + if (startPage == null) { + startPage = this.getIndexOffset(); + url = url.replaceAll("\\{startPage\\}", startPage.toString()); + } else { + url = url.replaceAll("\\{startPage\\??\\}", startPage.toString()); } - url = url.replaceAll("\\{q\\??\\}", query); url = url.replaceAll("\\{[0-9A-Za-z\\-_]+\\?\\}", ""); // remove optional parameters if (url.matches(".*\\{[0-9A-Za-z\\-_]+\\}.*")) { - throw new UnsupportedEncodingException("Missing url parameter"); // TODO: better error + String param = url.substring(url.indexOf("{"), url.indexOf("}") + 1); + throw new SearchException("Missing url parameter " + param); } + String signature = getSignatureName(); + if (signature != null) { + url = Signatures.sign(url, signature, getSignatureKey()); + } return url; } private SearchException createPrivateSearchException(Exception e) { String message = e.toString(); - for (String param: this.privateParameters.keySet()) { - message = 
message.replaceAll(this.privateParameters.get(param), "{" + param + "}"); + message = message.replaceAll("java\\.[a-z]+\\.", ""); + for (String param: getPrivateParameterKeys()) { + message = message.replaceAll(getPrivateParameter(param), "{" + param + "}"); } return new SearchException(message); } @@ -454,8 +572,8 @@ private SearchException createPrivateSearchException(Exception e) { */ private boolean rateLimitReached() { Long now = new Date().getTime(); - Long timePassed = now - this.lastCheck; - this.lastCheck = now; + Long timePassed = now - this.lastUsed; + this.lastUsed = now; this.allowance += (((double) timePassed / defaultPER)) * this.rate; if (this.allowance > this.rate) { this.allowance = this.rate; @@ -468,49 +586,85 @@ private boolean rateLimitReached() { } } - // TODO refactor, waaay too big: - private String getCompleteWebPage(String urlString, String postString, Map headers) throws IOException { - if (rateLimitReached()) { - throw new IOException("Rate limited"); - } - URL url = new URL(urlString); - HttpURLConnection connection = (HttpURLConnection) url.openConnection(); - connection.setRequestProperty("User-Agent", "Searsia/0.4"); - connection.setRequestProperty("Accept", this.mimeType); //TODO: "*/*" + private URLConnection setConnectionProperties(URL url, Map headers) throws IOException { + URLConnection connection = url.openConnection(); + connection.setRequestProperty("User-Agent", "Searsia/1.0"); + connection.setRequestProperty("Accept", this.mimeType + "; q=1.0, */*; q=0.5"); connection.setRequestProperty("Accept-Language", "en-US,en;q=0.5"); // TODO: from browser? 
for (Map.Entry entry : headers.entrySet()) { String value = entry.getValue(); - for (String param: this.privateParameters.keySet()) { - value = value.replace("{" + param + "}", this.privateParameters.get(param)); - } - if (value.contains("{")) { - throw new IOException("Missing header parameter"); // TODO: better error + if (value.contains("{")) { + for (String param: getPrivateParameterKeys()) { + value = value.replace("{" + param + "}", getPrivateParameter(param)); + } + if (value.contains("{")) { + String param = value.substring(value.indexOf("{"), value.indexOf("}") + 1); + throw new IOException("Missing header parameter " + param); + } } connection.setRequestProperty(entry.getKey(), value); } connection.setReadTimeout(9000); connection.setConnectTimeout(9000); - connection.setInstanceFollowRedirects(true); + return connection; + } + + private String correctContentType(String contentType) { // TODO more charsets + if (contentType != null && contentType.toLowerCase().contains("charset=iso-8859-1")) { + contentType = "ISO-8859-1"; + } else { + contentType = "UTF-8"; + } + return contentType; + } + + private InputStreamReader httpConnect(URLConnection connection, String postString) throws IOException { + HttpURLConnection http = (HttpURLConnection) connection; + http.setInstanceFollowRedirects(true); if (postString != null && !postString.equals("")) { - connection.setRequestMethod("POST"); - connection.setRequestProperty("Content-Length", "" + Integer.toString(postString.getBytes().length)); - connection.setDoOutput(true); - DataOutputStream wr = new DataOutputStream(connection.getOutputStream()); + http.setRequestMethod("POST"); + http.setRequestProperty("Content-Length", "" + Integer.toString(postString.getBytes().length)); + http.setDoOutput(true); + DataOutputStream wr = new DataOutputStream(http.getOutputStream()); wr.writeBytes(postString); wr.flush(); wr.close(); } else { - connection.setRequestMethod("GET"); - connection.connect(); + 
http.setRequestMethod("GET"); + http.connect(); + } + int responseCode = http.getResponseCode(); + if (responseCode == 301) { // FollowRedirects did not work?! + throw new IOException("Moved permanently"); } - //int responseCode = connection.getResponseCode(); - BufferedReader in = null; + if (responseCode == 410) { // Gone: we will use this special error message elsewhere in this code. + throw new IOException(goneErrorMessage); + } + String contentType = correctContentType(http.getHeaderField("Content-Type")); + return new InputStreamReader(http.getInputStream(), contentType); + } + + private InputStreamReader fileConnect(URLConnection connection) throws IOException { + String fileName = connection.getURL().getFile(); + return new InputStreamReader(new FileInputStream(new File(fileName)), "UTF-8"); + } + + private String getCompletePage(String urlString, String postString, Map headers) throws IOException { + URL url = new URL(urlString); + URLConnection connection = setConnectionProperties(url, headers); + InputStreamReader reader; + if (url.getProtocol().equals("file")) { + reader = fileConnect(connection); + } else { + reader = httpConnect(connection, postString); + } + BufferedReader in = new BufferedReader(reader); StringBuilder page = new StringBuilder(); - in = new BufferedReader(new InputStreamReader(connection.getInputStream(), "UTF-8")); if (in != null) { String inputLine; while ((inputLine = in.readLine()) != null) { page.append(inputLine); + page.append("\n"); } in.close(); } @@ -522,14 +676,39 @@ public String getId() { return this.id; } - public String getMD5() { - return getHashString(this.urlAPITemplate); - } - public String getName() { return this.name; } + public String getSignature() { + return this.signature; + } + + public String getSignatureName() { + if (this.signature == null) { + return null; + } + int begin = this.signature.indexOf("("); + if (begin == -1) { + return this.signature; + } else { + return this.signature.substring(0, begin); + 
} + } + + public String getSignatureKey() { + int begin = this.signature.indexOf("("); + if (begin == -1) { + return null; + } else { + String key = this.signature.substring(begin + 1, this.signature.length() -1); + if (key.startsWith("{")) { + key = this.privateParameters.get(key.substring(1, key.length() - 1)); + } + return key; + } + } + public String getUserTemplate() { return this.urlUserTemplate; } @@ -573,7 +752,16 @@ public String getPostQueryEncode() { public String getItemXpath() { return this.itemXpath; } + + public String getPrivateParameter(String param) { + return this.privateParameters.get(param); + } + + public Set getPrivateParameterKeys() { + return this.privateParameters.keySet(); + } + public List getExtractors() { return this.extractors; } @@ -598,15 +786,119 @@ public int getRate() { return this.rate; } + public boolean isDeleted() { + return this.deleted; + } + + public int getAllowance() { + long timePassed = new Date().getTime() - this.lastUsed; + double currentAllowance = this.allowance + (((double) timePassed / defaultPER)) * this.rate; + if (currentAllowance > this.rate) { + return this.rate; + } + return (int) currentAllowance; + } + + private float getExactPrior() { + float prior = 0.0f; + if (this.prior != null) { + prior = this.prior; + } + return prior; + } + public float getPrior() { - if (this.prior == null) { - return 0.0f; - } else { - return this.prior; - } + float prior = 0.0f; + if (this.prior != null) { + prior = this.prior; + } + prior += this.nrOfOk * 0.00001f; // add a tiny amount of success... + prior -= this.nrOfError * 0.00001f; + return prior; + } + + public int getNrOfErrors() { + return this.nrOfError; + } + + public int getNrOfSuccess() { + return this.nrOfOk; + } + + private long secondsAgo(long last) { + long now = new Date().getTime(); + long ago = 1 + (now - last) / 1000; + if (ago < 0 || ago > 8640000l) { // 100 days... 
+ ago = 8640000l; + } + return ago; + } + + public String getLastError() { + return this.lastMessage; + } + + public String getLastUsedString() { + return dateFormat.format(new Date(this.lastUsed)); + } + + public String getLastSuccessDate() { + return dateFormat.format(new Date(this.lastUsedOk)); + } + + public String getLastErrorDate() { + return dateFormat.format(new Date(this.lastUsedError)); + } + + public String getLastUpdatedString() { + return dateFormat.format(new Date(this.lastUpdated)); } - public float score(String query) { + public String getUpSinceString() { + return dateFormat.format(new Date(this.upsince)); + } + + public long getLastUpdatedSecondsAgo() { + return secondsAgo(this.lastUpdated); + } + + public Long getLastUsedSecondsAgo() { + return secondsAgo(this.lastUsed); + } + + public int getIndexOffset() { + return 1; // TODO: indexOffSet of opensearch url template syntax + } + + public boolean isHealthy() { + return this.lastUsedOk >= this.lastUsedError || this.nrOfError == 0; + } + + + public Resource getLocalResource() { + JSONObject json = new JSONObject(); + Resource result = null; + json.put("id", this.getId()); + json.put("mimetype", SearchResult.SEARSIA_MIME_TYPE); + String value = this.getName(); + if (value != null) { json.put("name", value); } + json.put("name", this.getName()); + value = this.getBanner(); + if (value != null) { json.put("banner", value); } + value = this.getFavicon(); + if (value != null) { json.put("favicon", value); } + value = this.getSuggestTemplate(); + if (value != null) { json.put("suggesttemplate", value); } + value = this.getTestQuery(); + if (value != null) { json.put("testquery", value); } + try { + result = new Resource(json); + } catch (XPathExpressionException e) { } + return result; + } + + + public float score(String query) { float score = 0.0f; Map nameTerm = new HashMap(); String name = getName(); @@ -620,51 +912,183 @@ public float score(String query) { score += 2.0f; // some arbitrary number } } 
- } + } return score; } - - public JSONObject toJson() { - JSONObject engine = new JSONObject(); - if (id != null) engine.put("id", id); - if (name != null) engine.put("name", name); - if (urlUserTemplate != null) engine.put("urltemplate", urlUserTemplate); - if (favicon != null) engine.put("favicon", favicon); - if (banner != null) engine.put("banner", banner); - if (urlAPITemplate != null) engine.put("apitemplate", urlAPITemplate); - if (urlSuggestTemplate != null) engine.put("suggesttemplate", urlSuggestTemplate); - if (mimeType != null) engine.put("mimetype", mimeType); - if (rerank != null) engine.put("rerank", rerank); - if (postString != null) engine.put("post", postString); - if (postQueryEncode != null) engine.put("postencode", postQueryEncode); - if (testQuery != null) engine.put("testquery", testQuery); - if (prior != null) engine.put("prior", prior); - if (rate != defaultRATE) engine.put("maxqueriesperday", rate); - if (itemXpath != null) engine.put("itempath", itemXpath); - if (extractors != null && extractors.size() > 0) { - JSONObject json = new JSONObject(); - for (TextExtractor e: extractors) { - json.put(e.getField(), e.getPath()); - } - engine.put("extractors", json); - } - if (headers != null && headers.size() > 0) { - JSONObject json = new JSONObject(); - for (String header: headers.keySet()) { - json.put(header, headers.get(header)); - } - engine.put("headers", json); + + public Resource deepcopy() { + try { + return new Resource(this.toJson()); + } catch (XPathExpressionException | JSONException e) { + throw new RuntimeException(e.getMessage()); } - return engine; } + /** + * Update resource + * @param e2 + */ + public void updateWith(Resource e2) { // TODO: bad idea in multi-threaded app!? 
+ setLastUpdatedToNow(); + if (!equals(e2)) { + if (this.id != null && !this.id.equals(e2.id)) throw new RuntimeException("Cannot update resource ID."); + setUpSinceToNow(); + this.nrOfOk = 0; + this.nrOfError = 0; + this.lastMessage = null; + this.id = e2.id; + this.deleted = e2.deleted; + this.name = e2.name; + this.signature = e2.signature; + this.urlUserTemplate = e2.urlUserTemplate; + this.favicon = e2.favicon; + this.banner = e2.banner; + this.urlAPITemplate = e2.urlAPITemplate; + this.urlSuggestTemplate = e2.urlSuggestTemplate; + if (e2.mimeType == null) { this.mimeType = SearchResult.SEARSIA_MIME_TYPE; } + else { this.mimeType = e2.mimeType; } + this.rerank = e2.rerank; + this.postString = e2.postString; + this.postQueryEncode = e2.postQueryEncode; + if (e2.testQuery == null) { this.testQuery = defaultTestQuery; } else { this.testQuery = e2.testQuery; } + this.prior = e2.prior; + this.rate = e2.rate; + this.itemXpath = e2.itemXpath; + this.extractors = e2.extractors; + this.headers = e2.headers; + this.privateParameters = e2.privateParameters; + } + } + + public void updateAllowance(Resource e2) { + if (this.id != null && !this.id.equals(e2.id)) throw new RuntimeException("Cannot update resource ID."); + this.allowance = e2.allowance; + } + + + public JSONObject toJson() { + return toJsonEngine(); + } + + public JSONObject toJsonEngine() { + JSONObject engine = new JSONObject(); + if (id != null) engine.put("id", id); + if (deleted) { + engine.put("deleted", true); + } else { + if (name != null) engine.put("name", name); + if (signature != null) engine.put("signature", signature); + if (urlUserTemplate != null) engine.put("urltemplate", urlUserTemplate); + if (favicon != null) engine.put("favicon", favicon); + if (banner != null) engine.put("banner", banner); + if (urlAPITemplate != null) engine.put("apitemplate", urlAPITemplate); + if (urlSuggestTemplate != null) engine.put("suggesttemplate", urlSuggestTemplate); + if (mimeType != null) 
engine.put("mimetype", mimeType); + if (rerank != null) engine.put("rerank", rerank); + if (postString != null) engine.put("post", postString); + if (postQueryEncode != null) engine.put("postencode", postQueryEncode); + if (testQuery != null) engine.put("testquery", testQuery); + if (prior != null) engine.put("prior", prior); + if (rate != defaultRATE) engine.put("maxqueriesperday", rate); + if (itemXpath != null) engine.put("itempath", itemXpath); + if (extractors != null && extractors.size() > 0) { + JSONObject json = new JSONObject(); + for (TextExtractor e: extractors) { + json.put(e.getField(), e.getPath()); + } + engine.put("extractors", json); + } + if (headers != null && headers.size() > 0) { + JSONObject json = new JSONObject(); + for (String header: headers.keySet()) { + json.put(header, headers.get(header)); + } + engine.put("headers", json); + } + } + return engine; + } + + + public JSONObject toJsonEngineDontShare() { + JSONObject engine = new JSONObject(); + if (id != null) engine.put("id", id); + if (deleted) { + engine.put("deleted", true); + } else { + if (name != null) engine.put("name", name); + if (urlUserTemplate != null) engine.put("urltemplate", urlUserTemplate); + if (favicon != null) engine.put("favicon", favicon); + if (banner != null) engine.put("banner", banner); + if (mimeType != null && !mimeType.equals(SearchResult.SEARSIA_MIME_TYPE)) + engine.put("mimetype", mimeType); + if (rerank != null) engine.put("rerank", rerank); + if (rate != defaultRATE) engine.put("maxqueriesperday", rate); + } + return engine; + } + + + public JSONObject toJsonHealth() { + JSONObject health = new JSONObject(); + health.put("dayallowance", getAllowance()); + health.put("requestsok", this.nrOfOk); + health.put("requestserr", this.nrOfError); + health.put("lastsuccess", getLastSuccessDate()); + health.put("lasterror", getLastErrorDate()); + health.put("lastupdated", getLastUpdatedString()); + health.put("upsince", getUpSinceString()); + if (this.lastMessage 
!= null) health.put("lastmessage", this.lastMessage); + return health; + } + + + /** + * Only used at startup when reading resources from disk + * @param health + * @throws ParseException + */ + public void updateHealth(JSONObject health) throws ParseException { + Integer num = health.getInt("requestsok"); + if (num != null) this.nrOfOk = num; + num = health.getInt("requestserr"); + if (num != null) this.nrOfError = num; + this.lastUsedOk = dateFormat.parse(health.getString("lastsuccess")).getTime(); + this.lastUsedError = dateFormat.parse(health.getString("lasterror")).getTime(); + this.lastUpdated = dateFormat.parse(health.getString("lastupdated")).getTime(); + this.upsince = dateFormat.parse(health.getString("upsince")).getTime(); + if (health.has("lastmessage")) this.lastMessage = health.getString("lastmessage"); + } + + + @Override + public int compareTo(Resource e2) { + Float score1 = getPrior(); + Float score2 = e2.getPrior(); + int compare = score1.compareTo(score2); + if (compare != 0) { + return compare; + } else { + String rid1 = getId(); // we need a full ordering + String rid2 = e2.getId(); + if (rid1 != null && rid2 != null) { + return rid1.compareTo(rid2); + } else { + return 0; + } + } + } + + @Override public boolean equals(Object o) { // TODO: AARGH, can't this be done simpler? if (o == null) return false; Resource e = (Resource) o; if (!stringEquals(this.getId(), e.getId())) return false; + if (this.isDeleted() != e.isDeleted()) return false; if (!stringEquals(this.getName(), e.getName())) return false; + if (!stringEquals(this.getSignature(), e.getSignature())) return false; if (!stringEquals(this.getMimeType(), e.getMimeType())) return false; if (!stringEquals(this.getRerank(), e.getRerank())) return false; if (!stringEquals(this.getFavicon(), e.getFavicon())) return false; @@ -677,20 +1101,13 @@ public boolean equals(Object o) { // TODO: AARGH, can't this be done simpler? 
if (!stringEquals(this.getUserTemplate(), e.getUserTemplate())) return false; if (!stringEquals(this.getSuggestTemplate(), e.getSuggestTemplate())) return false; if (this.getRate() != e.getRate()) return false; - if (Math.abs(this.getPrior() - e.getPrior()) > 0.0001f) return false; + if (Math.abs(this.getExactPrior() - e.getExactPrior()) > 0.001f) return false; if (!listEquals(this.getExtractors(), e.getExtractors())) return false; if (!mapEquals(this.getHeaders(), e.getHeaders())) return false; return true; } - @Override - public int compareTo(Resource e2) { - Float score1 = getPrior(); - Float score2 = e2.getPrior(); - return score1.compareTo(score2); - } - private boolean listEquals(List a, List b) { if (a == null && b == null) return true; @@ -722,25 +1139,4 @@ private boolean stringEquals(String a, String b) { return a.equals(b); } - // for 'random' ids, if not provided - private static String getHashString(String inputString) { - MessageDigest md; - byte[] hash; - try { - md = MessageDigest.getInstance("MD5"); - } catch (java.security.NoSuchAlgorithmException e) { - throw new RuntimeException(e); - } - try { - hash = md.digest(inputString.getBytes("UTF-8")); - } catch (java.io.UnsupportedEncodingException e) { - throw new RuntimeException(e); - } - StringBuilder sb = new StringBuilder(); - for(byte b : hash){ - sb.append(String.format("%02x", b & 0xff)); - } - return sb.toString(); - } - } diff --git a/src/main/java/org/searsia/engine/SearchException.java b/src/main/java/org/searsia/engine/SearchException.java index b3a3006..2430777 100644 --- a/src/main/java/org/searsia/engine/SearchException.java +++ b/src/main/java/org/searsia/engine/SearchException.java @@ -16,7 +16,9 @@ package org.searsia.engine; - +/** + * A Searsia Search Exception + */ public class SearchException extends Exception { private static final long serialVersionUID = -7429746644586456271L; @@ -29,4 +31,17 @@ public SearchException(String message) { super(message); } + @Override + public 
String getMessage() { + String message = super.getMessage(); + message = message.replaceAll("^[A-Za-z\\.]*\\.", ""); // removes Java package names + message = message.replaceAll(":? ?https?:[^ ]+", ""); // removes URLs (which may contain API keys) + return message; + } + + @Override + public String getLocalizedMessage() { // misusing Localization for full error message + return super.getMessage(); + } + } diff --git a/src/main/java/org/searsia/engine/Signatures.java b/src/main/java/org/searsia/engine/Signatures.java new file mode 100644 index 0000000..a911c07 --- /dev/null +++ b/src/main/java/org/searsia/engine/Signatures.java @@ -0,0 +1,127 @@ +package org.searsia.engine; + +import java.io.UnsupportedEncodingException; +import java.net.URL; +import java.net.URLEncoder; +import java.text.DateFormat; +import java.text.SimpleDateFormat; +import java.util.Base64; +import java.util.Calendar; +import java.util.HashMap; +import java.util.Iterator; +import java.util.Map; +import java.util.SortedMap; +import java.util.TimeZone; +import java.util.TreeMap; + +import javax.crypto.Mac; +import javax.crypto.spec.SecretKeySpec; + + +public class Signatures { + + private static final String UTF8_CHARSET = "UTF-8"; + private static final String HMAC_SHA256_ALGORITHM = "HmacSHA256"; + + /** + * Signing a web request using Amazon's HmacSHA256. 
+ * Example code from: + * http://docs.aws.amazon.com/AWSECommerceService/latest/DG/AuthJavaSampleSig2.html + * For more information: + * https://tools.ietf.org/html/rfc2104 + */ + public static String sign(String urlString, String algorithm, String secretKey) { + if (algorithm != null && !algorithm.equals(HMAC_SHA256_ALGORITHM)) { + throw new RuntimeException("Unsupported signature: " + algorithm); + } + if (secretKey == null) { + throw new RuntimeException("Signature key not found."); + } + Mac mac = null; + URL url = null; + try { + byte[] secretyKeyBytes = secretKey.getBytes(UTF8_CHARSET); + SecretKeySpec secretKeySpec = new SecretKeySpec(secretyKeyBytes, HMAC_SHA256_ALGORITHM); + mac = Mac.getInstance(HMAC_SHA256_ALGORITHM); + mac.init(secretKeySpec); + url = new URL(urlString); + } catch (Exception e) { // UnsupportedEncodingException, NoSuchAlgorithmException, InvalidKeyException, MalformedURLException + throw new RuntimeException(e); + } + String protocol = url.getProtocol(); + String host = url.getHost(); + String path = url.getPath(); + String query = url.getQuery(); + + Map params = new HashMap(); + for (String pair: query.split("&")) { + String[] keyValue = pair.split("="); + params.put(keyValue[0], keyValue[1]); + } + params.putIfAbsent("Timestamp", timestamp()); + + SortedMap sortedParamMap = new TreeMap(params); + String canonicalQS = canonicalize(sortedParamMap); + String toSign = "GET\n" + host + "\n" + path + "\n" + canonicalQS; + String hmac = hmac(mac, toSign); + String sig = percentEncodeRfc3986(hmac); + return protocol + "://" + host + path + "?" 
+ canonicalQS + "&Signature=" + sig; + } + + private static String timestamp() { + String timestamp = null; + Calendar cal = Calendar.getInstance(); + DateFormat dfm = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'"); + dfm.setTimeZone(TimeZone.getTimeZone("GMT")); + timestamp = dfm.format(cal.getTime()); + return timestamp; + } + + private static String hmac(Mac mac, String stringToSign) { + String signature = null; + byte[] data; + byte[] rawHmac; + try { + data = stringToSign.getBytes(UTF8_CHARSET); + rawHmac = mac.doFinal(data); + Base64.Encoder encoder = Base64.getEncoder(); + signature = new String(encoder.encode(rawHmac)); + } catch (UnsupportedEncodingException e) { + throw new RuntimeException(e); + } + return signature; + } + + private static String canonicalize(SortedMap sortedParamMap) { + if (sortedParamMap.isEmpty()) { + return ""; + } + StringBuffer buffer = new StringBuffer(); + Iterator> iter = sortedParamMap.entrySet().iterator(); + while (iter.hasNext()) { + Map.Entry kvpair = iter.next(); + buffer.append(percentEncodeRfc3986(kvpair.getKey())); + buffer.append("="); + buffer.append(percentEncodeRfc3986(kvpair.getValue())); + if (iter.hasNext()) { + buffer.append("&"); + } + } + String canonical = buffer.toString(); + return canonical; + } + + private static String percentEncodeRfc3986(String s) { + String out; + try { + out = URLEncoder.encode(s, UTF8_CHARSET) + .replace("+", "%20") + .replace("*", "%2A") + .replace("%7E", "~"); + } catch (UnsupportedEncodingException e) { + throw new RuntimeException(e); + } + return out; + } + +} diff --git a/src/main/java/org/searsia/engine/TextExtractor.java b/src/main/java/org/searsia/engine/TextExtractor.java index c57a9bd..8bad627 100644 --- a/src/main/java/org/searsia/engine/TextExtractor.java +++ b/src/main/java/org/searsia/engine/TextExtractor.java @@ -27,13 +27,18 @@ import org.searsia.Hit; +/** + * Manage XPath queries and extract the hit fields. 
+ * + * @author Dolf Trieschnigg + * @author Djoerd Hiemstra + */ public class TextExtractor { private String field; private String xpath; private XPathExpression compiledXpath; - - private boolean trim = true; + public TextExtractor(String field, String xpath) throws XPathExpressionException { this.field = field; @@ -45,8 +50,14 @@ public TextExtractor(String field, String xpath) throws XPathExpressionException } + /** + * Modifies hit by adding result for the text extractor + * @param item An XML context element + * @param hit An updated hit + * @throws XPathExpressionException + */ public void extract(Node item, Hit hit) throws XPathExpressionException { - String resultString = ""; + String resultString = ""; // TODO: StringBuilder try { NodeList nodeList = (NodeList) this.compiledXpath.evaluate(item, XPathConstants.NODESET); if (nodeList != null) { @@ -66,32 +77,30 @@ public void extract(Node item, Hit hit) throws XPathExpressionException { } } - /** - * processes the match found with the XPath - * - * By default, uses the trim attribute to indicate whether the match should be trimmed - * Note: the string can be null - * - * @param s - * @return - */ private String processMatch(String s) { s = s.replaceAll("(?i)]*>||||", ""); // No HTML, please: spans removed - s = s.replaceAll("<[^>]+>", " "); // all other tags replaced by a space - if (trim) { - s = s.trim(); - } + s = s.replaceAll("<[^>]+>|\ufffd", " "); // all other tags or unicode replace character replaced by a space + s = s.trim(); // TODO multiple spaces, \\s ? 
return s; } + /** + * Get the field for the text extractor + * @return field + */ public String getField() { return field; } + /** + * Get the XPath query for the text extractor + * @return XPath query + */ public String getPath() { return xpath; } + @Override public boolean equals(Object o) { TextExtractor e = (TextExtractor) o; if (!getField().equals(e.getField())) return false; diff --git a/src/main/java/org/searsia/index/ResourceIndex.java b/src/main/java/org/searsia/index/ResourceIndex.java index 000704d..d7506fb 100644 --- a/src/main/java/org/searsia/index/ResourceIndex.java +++ b/src/main/java/org/searsia/index/ResourceIndex.java @@ -20,11 +20,14 @@ import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; +import java.text.ParseException; import java.util.LinkedHashMap; import java.util.Map; import java.util.Random; -import java.util.logging.Logger; +import javax.xml.xpath.XPathExpressionException; + +import org.apache.log4j.Logger; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; @@ -44,7 +47,6 @@ import org.apache.lucene.util.Version; import org.json.JSONException; import org.json.JSONObject; - import org.searsia.engine.Resource; /** @@ -55,19 +57,21 @@ public class ResourceIndex { private final static Logger LOGGER = Logger.getLogger(ResourceIndex.class.getName()); + private final static Version version = Version.LUCENE_4_10_4; private final static int MAX_SOURCE_CACHE = 10000; // TODO: breaks if we discover more than 10000 sources private Map engines = new LinkedHashMap(); private Random random = new Random(); private Resource mother = null; - private Resource me = null; + private Resource me = null; private Path meFile = null; - private Path indexDir = null; + private Path indexDir = null; private IndexWriter writer = null; + private String lastFlushed = null; /** - * Reads resources from index (if they exist) + * Creates index or 
reads resources from index (if it exist) * @param path path where the Searsia index resides * @param filename index file name * @throws IOException @@ -76,10 +80,15 @@ public ResourceIndex(String path, String filename) throws IOException { this.meFile = Paths.get(path, filename + ".json"); this.indexDir = Paths.get(path, filename + "_sources"); if (meFile.toFile().exists()) { - this.me = readMyselfFile(meFile); + try { + this.me = readMyselfFile(meFile); + } catch (IOException e) { + LOGGER.warn("Myself not found: " + e.getMessage()); + meFile.toFile().delete(); + } } if (this.indexDir.toFile().exists()) { - readResourceIndex(); + readResourceIndex(); } else { this.indexDir.toFile().mkdir(); } @@ -104,6 +113,8 @@ private Resource readMyselfFile(Path meFile) throws IOException { me = new Resource(json); } catch (javax.xml.xpath.XPathExpressionException e) { throw new IOException(e); + } catch (JSONException e) { + throw new IOException(e); } return me; } @@ -116,7 +127,7 @@ private void readResourceIndex() throws IOException { reader = DirectoryReader.open(dir); } catch (org.apache.lucene.index.IndexNotFoundException e) { - LOGGER.warning("No resources in index."); + LOGGER.warn("No resources in index."); return; } try { @@ -124,19 +135,28 @@ private void readResourceIndex() throws IOException { ScoreDoc[] hits = searcher.search(new MatchAllDocsQuery(), MAX_SOURCE_CACHE).scoreDocs; for (ScoreDoc hit: hits) { Document doc = searcher.doc(hit.doc); - JSONObject json = new JSONObject(doc.get("json")); - Resource engine = new Resource(json); - this.engines.put(engine.getId(), engine); + try{ + JSONObject json = new JSONObject(doc.get("json")); + Resource engine = new Resource((JSONObject) json.get("resource")); + if (json.has("health")) { + engine.updateHealth((JSONObject) json.get("health")); + String lastUpdated = engine.getLastUpdatedString(); + if (this.lastFlushed == null || this.lastFlushed.compareTo(lastUpdated) < 0) { + this.lastFlushed = lastUpdated; + } + } + 
this.engines.put(engine.getId(), engine); + } catch (XPathExpressionException | JSONException | ParseException e) { + LOGGER.warn("Garbled index: " + e.getLocalizedMessage()); + } } - } catch (javax.xml.xpath.XPathExpressionException e) { + } catch (IOException e) { throw new IOException(e); - } catch (JSONException e) { - throw new IOException(e); - } - finally { + } finally { reader.close(); } } + private void initResourceIndex() throws IOException { Directory dir = FSDirectory.open(indexDir.toFile()); @@ -163,39 +183,25 @@ public Resource getMyself() { return this.me; } - - private boolean exists(Resource engine) { - for (Resource e: this.engines.values()) - if (e.equals(engine)) - return true; - return false; - } - - - private void updateResourceIndex(String id, Resource engine) throws IOException { - Document doc = new Document(); - if (id != null) { - JSONObject json = engine.toJson(); - json.put("parameters", engine.getJsonPrivateParameters()); // we need to remember those - doc.add(new StringField("id", id, Field.Store.YES)); // unique identifier - doc.add(new StoredField("json", json.toString())); - this.writer.updateDocument(new Term("id", id), doc); - } - this.writer.commit(); - } - - + /** + * Delete resource from index (not used, instead use resource.deleted) + * @param id + * @throws IOException + */ public void delete(String id) throws IOException { Resource engine = get(id); if (engine == null) { - throw new IOException("Resouce '" + id + "' not found"); + throw new IOException("Resource '" + id + "' not found"); } this.engines.remove(id); this.writer.deleteDocuments(new Term("id", id)); this.writer.commit(); } - - + + /** + * Adds resource to index or update it. 
+ * @param engine + */ public void put(Resource engine) { if (this.mother != null && engine.getId().equals(this.mother.getId())) { throw new RuntimeException("Mother id conflict: " + engine.getId()); @@ -203,48 +209,69 @@ public void put(Resource engine) { if (this.me != null && engine.getId().equals(this.me.getId())) { throw new RuntimeException("Local id conflict: " + engine.getId()); } - if (!exists(engine)) { - try { - // TODO: keepPrivateParameters(engine); do not overwrite own parameters, ugh - updateResourceIndex(engine.getId(), engine); - } catch (IOException e) { - LOGGER.warning("Update of resource " + engine.getId() + " failed"); - // TODO Oh crap, what to do? - } + Resource old = get(engine.getId()); + if (old == null) { + this.engines.put(engine.getId(), engine); + } else { + old.updateWith(engine); } - engines.put(engine.getId(), engine); } + /** + * Checks existence of resource + * @param id + * @return + */ public boolean containsKey(String id) { return this.engines.containsKey(id); } + /** + * Get resource + * @param id + * @return + */ public Resource get(String id) { return this.engines.get(id); } + /** + * Get a random resource. If it is not there, return the mother. + * @return + */ public Resource getRandom() { Object[] keys = this.engines.keySet().toArray(); if (keys.length > 0) { int nr = random.nextInt(keys.length); - return this.engines.get(keys[nr]); - } else { - return null; - } + int i = nr + 1; + Resource engine = this.engines.get(keys[nr]); + while (engine.isDeleted() && i != nr) { // if deleted, pick next + if (i >= keys.length) { i = 0; } + engine = this.engines.get(keys[i]); + i += 1; + } + if (!engine.isDeleted()) { + return engine; + } + } + return getMother(); } // Efficiency can be gained here? 
- public Map topValues(String queryString, int max) { - Float[] topScores = new Float[max]; + public Map topValuesNotDeleted(String queryString, int max) { + float[] topScores = new float[max]; Resource[] topEngines = new Resource[max]; int size = 0; float lastScore = -99.0f; + String lastId = ""; for (Resource engine: this.engines.values()) { - float score = engine.score(queryString) + engine.getPrior(); // TODO: add bias ? - if (size < max || score > lastScore) { + if (engine.isDeleted()) { continue; } + float score = engine.score(queryString) + engine.getPrior(); + String id = engine.getId(); + if (size < max || (score > lastScore || (score == lastScore && id.compareTo(lastId) > 0))) { if (size < max) size++; int index = size - 1; - while(index > 0 && topScores[index - 1] < score) { + while(index > 0 && (topScores[index - 1] < score || (topScores[index - 1] == score && id.compareTo(topEngines[index - 1].getId()) > 0))) { topScores[index] = topScores[index - 1]; topEngines[index] = topEngines[index - 1]; index -= 1; @@ -252,6 +279,7 @@ public Map topValues(String queryString, int max) { topScores[index] = score; topEngines[index] = engine; lastScore = topScores[size - 1]; + lastId = topEngines[size - 1].getId(); } } Map result = new LinkedHashMap(); @@ -262,26 +290,32 @@ public Map topValues(String queryString, int max) { } public void putMother(Resource mother) { - this.mother = mother; + mother.setLastUpdatedToNow(); + if (this.mother == null) { + this.mother = mother; + } else { + this.mother.updateWith(mother); + } } - public void putMyself(Resource engine) { - if (get(engine.getId()) != null) { - throw new RuntimeException("The server id '" + engine.getId() + "' already exists."); + public void putMyself(Resource me) { + if (get(me.getId()) != null) { + throw new RuntimeException("The server id '" + me.getId() + "' already exists."); } + me.setLastUpdatedToNow(); try { - writeMyselfFile(engine); + writeMyselfFile(me); } catch (IOException e) { - 
LOGGER.warning("Could not write index file"); + LOGGER.error("Could not write resource index file"); } - this.me = engine; + this.me = me; } public float maxPrior() { float max = 0.0f; for (Resource e: this.engines.values()) { if (e.getPrior() > max) { - max = e.getPrior(); + max = e.getPrior(); } } return max; @@ -296,10 +330,88 @@ public void dump() { } } + private Document luceneDocument(Resource engine) { + Document doc = new Document(); + String id = engine.getId(); + JSONObject json = new JSONObject(); + JSONObject resourceJson = engine.toJsonEngine(); + resourceJson.put("privateparameters", engine.getJsonPrivateParameters()); // we need to remember those + JSONObject healthJson = engine.toJsonHealth(); + json.put("resource", resourceJson); + json.put("health", healthJson); + json.put("searsia", "v1"); + doc.add(new StringField("id", id, Field.Store.YES)); // unique identifier + doc.add(new StoredField("json", json.toString())); + return doc; + } + + /** + * Flush the index updates to disk + */ + public void flush() { + try { + String lastDate = this.lastFlushed; + for (Resource engine: this.engines.values()) { + String lastUpdated = engine.getLastUpdatedString(); + if (this.lastFlushed == null || this.lastFlushed.compareTo(lastUpdated) < 0) { + if (lastDate == null || lastDate.compareTo(lastUpdated) < 0) { + lastDate = lastUpdated; + } + this.writer.updateDocument(new Term("id", engine.getId()), luceneDocument(engine)); + } + } + if (this.lastFlushed == null || this.lastFlushed.compareTo(lastDate) < 0) { + this.writer.commit(); + this.lastFlushed = lastDate; + LOGGER.info("Flushed resources to disk."); + } + } catch (Exception e) { + LOGGER.warn("Flushing resource index failed: " + e); + } + } + + /** + * Close the index + * @throws IOException + */ public void close() throws IOException { + this.flush(); this.writer.close(); this.mother = null; this.me = null; } + + public JSONObject toJsonHealth() { + String lastMessage = null; + int countOk = 0, + 
countError = 0; + for (Resource engine: this.engines.values()) { + if (engine.isDeleted()) { continue; } + String error = engine.getLastError(); + if (engine.isHealthy()) { + countOk += 1; + } else { + countError += 1; + lastMessage = engine.getId() + ": " + error; + } + if (countError == 0 && lastMessage == null && error != null) { + lastMessage = engine.getId() + ": " + error; // last error of any engine + } + } + if (this.mother.isHealthy()) { + countOk += 1; + } else { + countError += 1; + lastMessage = this.mother.getId() + " (mother): " + this.mother.getLastError(); + } + JSONObject stats = new JSONObject(); + stats.put("enginesok", countOk); + stats.put("engineserr", countError); + if (lastMessage != null) { + stats.put("lastmessage", lastMessage); + } + return stats; + } + } diff --git a/src/main/java/org/searsia/index/SearchResultIndex.java b/src/main/java/org/searsia/index/SearchResultIndex.java index b48f624..ba6af89 100644 --- a/src/main/java/org/searsia/index/SearchResultIndex.java +++ b/src/main/java/org/searsia/index/SearchResultIndex.java @@ -1,7 +1,24 @@ +/* + * Copyright 2016-2017 Searsia + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + package org.searsia.index; import java.io.File; import java.io.IOException; +import java.util.Iterator; import java.util.concurrent.ArrayBlockingQueue; import org.apache.log4j.Logger; @@ -25,9 +42,9 @@ import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.TopScoreDocCollector; +import org.apache.lucene.search.similarities.BM25Similarity; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.util.Version; - import org.searsia.Hit; import org.searsia.SearchResult; @@ -85,6 +102,7 @@ private void openWriter() throws IOException { private void openReader() throws IOException { this.hitsReader = DirectoryReader.open(FSDirectory.open(this.hitsDirectory)); this.hitsSearcher = new IndexSearcher(this.hitsReader); + this.hitsSearcher.setSimilarity(new BM25Similarity(0.0f, 0.0f)); // simple idf scoring //searcher.setSimilarity(new BM25Similarity(1.2f, 0.75f)); // k1, b //searcher.setSimilarity(new LMDirichletSimilarity(200f)); // mu //searcher.setSimilarity(new LMJelinekMercerSimilarity(0.5f)); // lambda @@ -126,11 +144,12 @@ private void storeSearchResult(SearchResult result) throws IOException { } public void offer(SearchResult result) { + // assert(result.getQuery() != null && result.getResourceId() != null); this.queue.offer(result); } public SearchResult search (String queryString) throws IOException { - return search(queryString, 40); + return search(queryString, 80); } public SearchResult search (String queryString, int hitsPerPage) throws IOException { @@ -159,7 +178,12 @@ public SearchResult search (String queryString, int hitsPerPage) throws IOExcept return result; } - + /** + * Get Hit by Lucene id. 
Used for tests only + * @param hitId + * @return hit + * @throws IOException + */ protected Hit getHit(String hitId) throws IOException { Term term = new Term("id", hitId); Query query = new TermQuery(term); @@ -176,29 +200,48 @@ protected Hit getHit(String hitId) throws IOException { } } + /** + * Searches the queue for a cached result + * TODO: Is this thread safe in case of a concurrent cash flush? See: + * https://docs.oracle.com/javase/8/docs/api/java/util/concurrent/package-summary.html#Weakly + * @param query + * @return search result page + */ + public SearchResult cacheSearch(String query, String resourceId) { + if (query != null && resourceId != null) { // TODO: make more efficient with initial HashMap check query+id + Iterator iterator = this.queue.iterator(); + while (iterator.hasNext()) { + SearchResult result = iterator.next(); + if (query.equals(result.getQuery()) && resourceId.equals(result.getResourceId())) { + return result; + } + } + } + return null; + } /** * Flushes the queue with updates to disk * @throws IOException */ public void flush() throws IOException { - while (queue.size() > 0) { - SearchResult result = queue.poll(); + while (this.queue.size() > 0) { + SearchResult result = this.queue.poll(); storeSearchResult(result); - } + } this.hitsWriter.commit(); closeReader(); - LOGGER.info("{\"message\":\"Flushed cache to index.\"}"); + LOGGER.info("Flushed cache to index."); } /** * Checks if the queue is full (its size is larger than 'limit') * If so, it flushes the updates to disk. - * @return true is queue was flushed. + * @return true if queue was flushed. 
* @throws IOException */ - public boolean check() throws IOException { - boolean full = queue.size() > limit; + public boolean checkFlush() throws IOException { + boolean full = this.queue.size() > limit; if (full) { flush(); } diff --git a/src/main/java/org/searsia/web/OpenSearch.java b/src/main/java/org/searsia/web/OpenSearch.java index f461a50..b628327 100644 --- a/src/main/java/org/searsia/web/OpenSearch.java +++ b/src/main/java/org/searsia/web/OpenSearch.java @@ -20,9 +20,11 @@ import javax.ws.rs.GET; import javax.ws.rs.Path; +import javax.ws.rs.PathParam; import javax.ws.rs.Produces; import javax.ws.rs.core.Response; +import org.searsia.engine.Resource; import org.searsia.index.ResourceIndex; /** @@ -31,41 +33,36 @@ * @author hiemstra * */ -@Path("opensearch.xml") +@Path("opensearch") public class OpenSearch { private ResourceIndex engines; + private boolean dontshare; - public OpenSearch(ResourceIndex engines) throws IOException { - this.engines = engines; + public OpenSearch(ResourceIndex engines, boolean dontshare) throws IOException { + this.engines = engines; + this.dontshare = dontshare; } - @GET + @GET @Path("{resourceid}") @Produces("application/opensearchdescription+xml; charset=utf-8") - public Response get() { - String response = "\n"; - String shortName = engines.getMyself().getName(); - String favicon = engines.getMyself().getFavicon(); - String userTemplate = engines.getMyself().getUserTemplate(); - String suggestTemplate = engines.getMyself().getSuggestTemplate(); - String apiTemplate = engines.getMyself().getAPITemplate(); - String testQuery = engines.getMyself().getTestQuery(); - if (shortName == null) shortName = "Searsia"; - response += "\n"; - response += " " + xmlEncode(shortName) + "\n"; - response += " Search the web with " + xmlEncode(shortName) + "\n"; - response += " \n"; - if (userTemplate != null) response += " \n"; - if (suggestTemplate != null) response += " \n"; - if (testQuery != null) response += " \n"; - if (favicon != null) 
response += " " + xmlEncode(favicon) + "\n"; - response += " UTF-8\n"; - response += " UTF-8\n"; - response += "\n"; - return Response.ok(response).build(); + public Response get(@PathParam("resourceid") String resourceid) { + resourceid = resourceid.replaceAll("\\.xml$", ""); + Resource engine = null; + if (resourceid.equals(engines.getMyself().getId())) { + engine = engines.getMyself(); + } else { + engine = engines.get(resourceid); + } + if (engine != null) { + String xmlString = engineXML(engine); + return Response.ok(xmlString).build(); + } else { + return SearsiaApplication.responseError(404, "Not found: " + resourceid); + } } - + private String xmlEncode(String text) { text = text.replaceAll("<", "<"); text = text.replaceAll(">", ">"); @@ -73,9 +70,37 @@ private String xmlEncode(String text) { } private String templateEncode(String url) { - url = url.replaceAll("\\{q", "{searchTerms"); - url = url.replaceAll("\\{r", "{searsia:resourceId"); + url = url.replaceAll("\\{q", "{searchTerms"); // backwards compatible with Searsia v0.x return xmlEncode(url); } + private String engineXML(Resource engine) { + String response = "\n"; + String shortName = engine.getName(); + String favicon = engine.getFavicon(); + String userTemplate = engine.getUserTemplate(); + String suggestTemplate = engine.getSuggestTemplate(); + String apiTemplate = engine.getAPITemplate(); + String mimeType = engine.getMimeType(); + String postString = engine.getPostString(); + String testQuery = engine.getTestQuery(); + String method = "GET"; + if (postString != null) method = "POST"; + if (shortName == null) shortName = "Searsia"; + response += "\n"; + response += " " + xmlEncode(shortName) + "\n"; + response += " Search the web with " + xmlEncode(shortName) + "\n"; + if(!dontshare && apiTemplate != null) { // TODO: own api or foward API? 
+ response += " \n"; + } + if (userTemplate != null) response += " \n"; + if (suggestTemplate != null) response += " \n"; + if (testQuery != null) response += " \n"; + if (favicon != null) response += " " + xmlEncode(favicon) + "\n"; + response += " UTF-8\n"; + response += " UTF-8\n"; + response += "\n"; + return response; + } + } diff --git a/src/main/java/org/searsia/web/Redirect.java b/src/main/java/org/searsia/web/Redirect.java new file mode 100644 index 0000000..475b830 --- /dev/null +++ b/src/main/java/org/searsia/web/Redirect.java @@ -0,0 +1,42 @@ +package org.searsia.web; + +import java.io.IOException; + +import javax.ws.rs.GET; +import javax.ws.rs.Path; +import javax.ws.rs.Produces; +import javax.ws.rs.core.Response; + +import org.searsia.SearchResult; + + +@Path("searsia") +public class Redirect { + + String id; + + public Redirect(String id) throws IOException { + this.id = id; + } + + @GET + @Produces(SearchResult.SEARSIA_MIME_ENCODING) + public Response notFound() { + return SearsiaApplication.responseError(404, "Not found"); + } + + /** + * Redirect, not used because it does not always behave well in + * case web servers do a simple rewrite of URLs. + * @return + */ + public Response redirect() { + return Response + .status(301) + .entity("") + .header("Access-Control-Allow-Origin", "*") + .header("Location", this.id + ".json") + .build(); + } + +} \ No newline at end of file diff --git a/src/main/java/org/searsia/web/Search.java b/src/main/java/org/searsia/web/Search.java index 8ce2487..ca9dc65 100644 --- a/src/main/java/org/searsia/web/Search.java +++ b/src/main/java/org/searsia/web/Search.java @@ -1,5 +1,5 @@ /* - * Copyright 2016 Searsia + * Copyright 2016-2017 Searsia * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -20,18 +20,19 @@ import java.text.DateFormat; import java.text.SimpleDateFormat; import java.util.Date; +import java.util.Locale; import javax.ws.rs.GET; import javax.ws.rs.OPTIONS; import javax.ws.rs.Path; import javax.ws.rs.Produces; +import javax.ws.rs.PathParam; import javax.ws.rs.QueryParam; import javax.ws.rs.core.Response; -import org.apache.log4j.Logger; import org.json.JSONObject; - import org.searsia.SearchResult; +import org.searsia.SearsiaOptions; import org.searsia.index.SearchResultIndex; import org.searsia.index.ResourceIndex; import org.searsia.engine.Resource; @@ -42,120 +43,179 @@ * * @author Dolf Trieschnigg and Djoerd Hiemstra */ -@Path("search") + +@Path("searsia") public class Search { - private final static Logger LOGGER = Logger.getLogger(Search.class); + private final static org.apache.log4j.Logger LOGGER = org.apache.log4j.Logger.getLogger(Search.class); + private final static DateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss", Locale.ROOT); + private final static String startTime = dateFormat.format(new Date()); private ResourceIndex engines; private SearchResultIndex index; + private boolean health; + private boolean shared; + private long nrOfQueriesOk = 0; + private long nrOfQueriesError = 0; - public Search(SearchResultIndex index, ResourceIndex engines) throws IOException { - this.engines = engines; - this.index = index; - } - - private static final DateFormat df = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss.S"); - - private void logQuery(String resourceid, String query) { - JSONObject r = new JSONObject(); - r.put("time", df.format(new Date())); - if (resourceid != null) r.put("resourceid", resourceid); - r.put("query", query); - LOGGER.info(r.toString()); - } - private void logWarning(String message) { - JSONObject r = new JSONObject(); - r.put("time", df.format(new Date())); - r.put("warning", message); - LOGGER.warn(r.toString()); + public Search(SearchResultIndex index, ResourceIndex engines, SearsiaOptions 
options) throws IOException { + this.engines = engines; + this.index = index; + this.health = !options.isNoHealthReport(); + this.shared = !options.isNotShared(); } - - @OPTIONS + + @OPTIONS @Path("{resourceid}") public Response options() { return Response.status(Response.Status.NO_CONTENT) .header("Access-Control-Allow-Origin", "*") .header("Access-Control-Allow-Methods", "GET") .build(); } + + // TODO: gives 406 not acceptable whith "Accept: application/json" - @GET + @GET @Path("{resourceid}") @Produces(SearchResult.SEARSIA_MIME_ENCODING) - public Response query(@QueryParam("r") String resourceid, @QueryParam("q") String query) { - // TODO: also log the outcome of the query - logQuery(resourceid, query); - - Resource me, engine, mother; - SearchResult result; - JSONObject json; - me = engines.getMyself(); - mother = engines.getMother(); - if (resourceid != null && resourceid.trim().length() > 0 && !resourceid.equals(me.getId())) { - engine = engines.get(resourceid); - if (engine == null) { // unknown? 
ask your mother - if (mother != null) { - try { - engine = mother.searchResource(resourceid); - } catch (SearchException e) { - String message = "Resource not found: @" + resourceid; - logWarning(message); - return SearsiaApplication.responseError(404, message); - } - } - if (engine == null) { - String message = "Unknown resource identifier: @" + resourceid; - logWarning(message); - return SearsiaApplication.responseError(404, message); - } - engines.put(engine); - } - if (query != null && query.trim().length() > 0) { - try { - result = engine.search(query); - result.removeResourceRank(); // only trust your mother - json = result.toJson(); // first json for response, so - result.addQueryResourceRankDate(engine.getId()); // response will not have query + resource - index.offer(result); // maybe do this AFTER the http response is sent: https://jersey.java.net/documentation/latest/async.html (11.1.1) - json.put("resource", engine.toJson()); - return SearsiaApplication.responseOk(json); - } catch (Exception e) { - String message = "Resource @" + resourceid + " unavailable: " + e.getMessage(); - logWarning(message); - return SearsiaApplication.responseError(503, message); - } - } else { - json = new JSONObject().put("resource", engine.toJson()); - return SearsiaApplication.responseOk(json); - } + public Response query(@PathParam("resourceid") String resourceid, + @QueryParam("q") String searchTerms, + @QueryParam("resources") String countResources, + @QueryParam("page") String startPage) { + resourceid = resourceid.replaceAll("\\.json$", ""); + Resource me = engines.getMyself(); + if (!resourceid.equals(me.getId())) { + return getRemoteResults(resourceid, searchTerms); } else { - if (query != null && query.trim().length() > 0) { - try { - result = index.search(query); - } catch (IOException e) { - String message = "Service unavailable: " + e.getMessage(); - logWarning(message); - return SearsiaApplication.responseError(503, message); - } - if (result.getHits().isEmpty() 
&& mother != null) { // empty? ask mother! - try { - result = mother.search(query); - index.offer(result); // really trust mother - } catch (SearchException e) { - String message = "Mother not available"; - logWarning(message); - } - } else { // own results? Do resource ranking. - result.scoreResourceSelection(query, engines); - } - } else { // no query? Return empty results - result = new SearchResult(); - result.scoreResourceSelection(query, engines); - } - json = result.toJson(); - json.put("resource", engines.getMyself().toJson()); - return SearsiaApplication.responseOk(json); + Integer max = 10, start = 0; + if (countResources != null) { + try { + max = Integer.parseInt(countResources); + } catch (NumberFormatException e) { + max = 10; + } + if (max > 200) { max = 200; } // FedWeb14 has about 150 + if (max < 1) { max = 1; } + } + if (startPage != null) { + try { + start = Integer.parseInt(startPage); + start = (start - me.getIndexOffset()) * max; // openSearch standard default starts at 1 + } catch (NumberFormatException e) { + start = 0; + } + if (start < 0) { start = 0; } + } + return getLocalResults(searchTerms, max, start); } } - + + private Response getRemoteResults(String resourceid, String query) { + Resource engine = engines.get(resourceid); + Resource mother = engines.getMother(); + JSONObject json = null; + if (engine == null || engine.getLastUpdatedSecondsAgo() > 9600) { // unknown or really old? 
ask your mother + if (mother != null) { // TODO: option for 9600 and similar value (7200) in Main + try { + Resource newEngine = mother.searchResource(resourceid); + engine = newEngine; + engines.put(engine); + } catch (SearchException e) { + if (engine != null) { + LOGGER.warn("Not found at mother: " + resourceid); + } + } + } + if (engine == null) { + String message = "Not found: " + resourceid; + LOGGER.warn(message); + return SearsiaApplication.responseError(404, message); + } + } + if (engine.isDeleted()) { + String message = "Gone: " + resourceid; + LOGGER.warn(message); + return SearsiaApplication.responseError(410, message); + } + if (query != null && query.trim().length() > 0) { + SearchResult result = index.cacheSearch(query, engine.getId()); + if (result != null) { + boolean censorQueryResourceId = true; + json = result.toJson(censorQueryResourceId); + LOGGER.info("Cache " + resourceid + ": " + query); + } else { + try { + result = engine.search(query); + result.removeResource(); // only trust your mother + json = result.toJson(); // first json for response, so + result.addResourceDate(engine.getId()); // response will not have resource id + date + index.offer(result); // maybe do this AFTER the http response is sent: https://jersey.java.net/documentation/latest/async.html (11.1.1) + LOGGER.info("Query " + resourceid + ": " + query); + } catch (Exception e) { + String message = "Resource " + resourceid + " unavailable: " + e.getMessage(); + LOGGER.warn(message); + return SearsiaApplication.responseError(503, message); + } + } + } else { + json = new JSONObject(); + if (this.health) { + json.put("health", engine.toJsonHealth()); + } + LOGGER.info("Resource " + resourceid + "."); + } + if (this.shared) { + json.put("resource", engine.toJson()); + } else { + json.put("resource", engine.toJsonEngineDontShare()); + } + return SearsiaApplication.responseOk(json); + } + + private Response getLocalResults(String query, int max, int start) { + JSONObject json = 
null, healthJson = null; + Resource mother = engines.getMother(); + Resource me = engines.getMyself(); + SearchResult result = null; + if (query != null && query.trim().length() > 0) { + try { + result = index.search(query); + } catch (Exception e) { + String message = "Service unavailable: " + e.getMessage(); + LOGGER.warn(message); + this.nrOfQueriesError += 1; + return SearsiaApplication.responseError(503, message); + } + this.nrOfQueriesOk += 1; + if (result.getHits().isEmpty() && mother != null) { // empty? ask mother! + try { + result = mother.search(query); + index.offer(result); // really trust mother + } catch (SearchException e) { + LOGGER.warn("Mother not available"); + } catch (Exception e) { + LOGGER.warn(e); + } + } + result.scoreResourceSelection(query, engines, max, start); + LOGGER.info("Local: " + query); + } else { // no query: create a 'resource only' result, plus health report + result = new SearchResult(); + result.scoreResourceSelection(null, engines, max, start); + if (this.health) { + healthJson = engines.toJsonHealth(); + healthJson.put("requestsok", this.nrOfQueriesOk); + healthJson.put("requestserr", this.nrOfQueriesError); + healthJson.put("upsince", startTime); + } + LOGGER.info("Local."); + } + json = result.toJson(); + json.put("resource", me.toJson()); + if (healthJson != null) { + json.put("health", healthJson); + } + return SearsiaApplication.responseOk(json); + } + } diff --git a/src/main/java/org/searsia/web/SearsiaApplication.java b/src/main/java/org/searsia/web/SearsiaApplication.java index 1356edf..eac13d0 100644 --- a/src/main/java/org/searsia/web/SearsiaApplication.java +++ b/src/main/java/org/searsia/web/SearsiaApplication.java @@ -17,13 +17,12 @@ package org.searsia.web; import java.io.IOException; -import java.util.logging.Level; -import java.util.logging.Logger; import javax.ws.rs.core.Response; import org.glassfish.jersey.server.ResourceConfig; import org.json.JSONObject; +import org.searsia.SearsiaOptions; import 
org.searsia.index.SearchResultIndex; import org.searsia.index.ResourceIndex; @@ -34,7 +33,7 @@ */ public class SearsiaApplication extends ResourceConfig { - public static final String VERSION = "v0.4.1"; + public static final String VERSION = "v1.0.2"; protected static Response responseOk(JSONObject json) { json.put("searsia", VERSION); @@ -66,12 +65,14 @@ protected static Response jsonResponse(int status, JSONObject json) { .build(); } - public SearsiaApplication(SearchResultIndex index, ResourceIndex engines, Boolean openWide) throws IOException { + public SearsiaApplication(SearchResultIndex index, + ResourceIndex engines, + SearsiaOptions options) throws IOException { super(); - Logger.getLogger("org.glassfish.grizzly").setLevel(Level.WARNING); - register(new Search(index, engines)); - register(new Update(engines, openWide)); - register(new OpenSearch(engines)); + java.util.logging.Logger.getLogger("").setLevel(java.util.logging.Level.WARNING); + register(new Search(index, engines, options)); + register(new OpenSearch(engines, options.isNotShared())); + register(new Redirect(engines.getMyself().getId())); } } diff --git a/src/main/java/org/searsia/web/Update.java b/src/main/java/org/searsia/web/Update.java deleted file mode 100644 index 0753455..0000000 --- a/src/main/java/org/searsia/web/Update.java +++ /dev/null @@ -1,185 +0,0 @@ -/* - * Copyright 2016 Searsia - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.searsia.web; - -import java.util.List; - -import javax.ws.rs.DELETE; -import javax.ws.rs.OPTIONS; -import javax.ws.rs.PUT; -import javax.ws.rs.Path; -import javax.ws.rs.PathParam; -import javax.ws.rs.Produces; -import javax.ws.rs.core.Context; -import javax.ws.rs.core.HttpHeaders; -import javax.ws.rs.core.Response; - -import org.json.JSONObject; -import org.searsia.Hit; -import org.searsia.SearchResult; -import org.searsia.index.ResourceIndex; -import org.searsia.engine.Resource; - -/** - * Enables on-line updates, only if --open set in the options. - * - * @author Dolf Trieschnigg and Djoerd Hiemstra - */ -@Path("update") -public class Update { - - private ResourceIndex engines; - private Boolean wideOpen; - - - public Update(ResourceIndex engines, Boolean wideOpen) { - this.engines = engines; - this.wideOpen = wideOpen; - } - - - private JSONObject getJSONResource(String postString, HttpHeaders headers) { - JSONObject jsonResource = null; - String contentType = headers.getHeaderString("Content-Type").toLowerCase(); - if (contentType.equals(SearchResult.SEARSIA_MIME_ENCODING)) { - JSONObject jsonInput = new JSONObject(postString); - jsonResource = jsonInput.getJSONObject("resource"); - } else { - throw new RuntimeException("Content-type not implemented"); - } - return jsonResource; - } - - - @OPTIONS - @Path("{id}") - public Response options() { - return Response.status(Response.Status.NO_CONTENT) - .header("Access-Control-Allow-Origin", "*") - .header("Access-Control-Allow-Methods", "DELETE, PUT") - .header("Access-Control-Allow-Headers", "Content-Type") - .build(); - } - - /** - * Updates the engines database with a new resource. 
Test with: - * curl -X PUT -H 'Content-Type: application/searsia+json; charset=UTF-8' http://localhost:16842/searsia/update/2 -d '{"resource":{"id":"2", "apitemplate":"https://search.utwente.nl/searsia/suggestions.php?q={q}", "testquery":"osiris"}}' - * - * @param id engine identifier - * @param headers http headers - * @param putString data - * @return search results for the test query if the update is successful - */ - @PUT // - @Path("{id}") - @Produces(SearchResult.SEARSIA_MIME_ENCODING) - public Response put(@PathParam("id") String id, @Context HttpHeaders headers, String putString) { - if (!this.wideOpen) { - return SearsiaApplication.responseError(401, "Unauthorized. Run Searsia server with --open"); - } - Resource engine = null; - try { - JSONObject jsonResource = getJSONResource(putString, headers); - if (!id.equals(jsonResource.get("id"))) { - return SearsiaApplication.responseError(400, "Conflicting id's"); - } - engine = new Resource(jsonResource); - } catch (Exception e) { - return SearsiaApplication.responseError(400, e.getMessage()); - } - SearchResult result = null; - updateEngine(engine); - try { - result = engine.search(engine.getTestQuery(), true); // debug = true - } catch (Exception e) { - return SearsiaApplication.responseError(503, "Resource unavailable: " + e.getMessage()); - } - - JSONObject jsonOutput = result.toJson(); - jsonOutput.put("resource", engine.toJson()); - jsonOutput.put("debug", result.getXmlOut()); - List hits = result.getHits(); - if (result == null || hits.size() == 0) { - jsonOutput.put("error", "No results for test query: '" + engine.getTestQuery() + "'" ); - return SearsiaApplication.jsonResponse(405, jsonOutput); - //return SearsiaApplication.responseError(405, "No results for test query: '" + engine.getTestQuery() + "'" ); - } else { - for (Hit hit: hits) { - if (hit.getTitle() == null) { - jsonOutput.put("error", "Search result without title for query: '" + engine.getTestQuery() + "'"); - return 
SearsiaApplication.jsonResponse(405, jsonOutput); - } - break; // check only first - } - } - try { - engines.put(engine); - } catch (Exception e) { - return SearsiaApplication.responseError(400, e.getMessage()); - } - return SearsiaApplication.responseOk(jsonOutput); - } - - /** - * If Searsia engine, get several values. Will change the value of 'engine' - * @param engine - */ - private void updateEngine(Resource engine) { - if (engine.getMimeType().equals(SearchResult.SEARSIA_MIME_TYPE)) { - SearchResult result = null; - Resource resource = null; - try { - result = engine.search(); - resource = result.getResource(); - if (resource != null) { - engine.setUrlAPITemplate(resource.getAPITemplate()); - if (engine.getName() == null) { engine.setName(resource.getName()); } - if (engine.getBanner() == null) { engine.setBanner(resource.getBanner()); } - if (engine.getFavicon() == null) { engine.setFavicon(resource.getFavicon()); } - if (engine.getRerank() == null) { engine.setRerank(resource.getRerank()); } - if (engine.getTestQuery().equals(Resource.defaultTestQuery)) { engine.setTestQuery(resource.getTestQuery()); } // awkward if the user typed 'searsia' - } - } catch (Exception e) { - // nothing - } - } - } - - /** - * Deletes the engine with resource id: id. 
Test with: - * curl -X DELETE http://localhost:16842/searsia/update/2 - * - * @param id engine identifier - * @return only searsia version if successful - */ - @DELETE - @Path("{id}") - @Produces(SearchResult.SEARSIA_MIME_ENCODING) - public Response delete(@PathParam("id") String id) { - if (!this.wideOpen) { - return SearsiaApplication.responseError(401, "Unauthorized"); - } - JSONObject jsonOutput = new JSONObject(); - try { - engines.delete(id); - } catch (Exception e) { - return SearsiaApplication.responseError(400, e.getMessage()); - } - return SearsiaApplication.responseOk(jsonOutput); - } - -} diff --git a/src/test/java/org/searsia/MainTest.java b/src/test/java/org/searsia/MainTest.java index e13a785..c8b1a7b 100644 --- a/src/test/java/org/searsia/MainTest.java +++ b/src/test/java/org/searsia/MainTest.java @@ -9,7 +9,9 @@ public class MainTest { @Test public void test() { - String[] args = {"--path=target/index-test/", "--log=4", "--exit", "--quiet"}; + String[] args = {"--path=target/index-test/", + "--mother=http://searsia.org/searsia/wiki/index.json", + "--log=4", "--test=json", "--quiet"}; Main.main(args); Assert.assertTrue(true); // happy if we get here! 
} diff --git a/src/test/java/org/searsia/SearchResultTest.java b/src/test/java/org/searsia/SearchResultTest.java index 507311b..f460123 100644 --- a/src/test/java/org/searsia/SearchResultTest.java +++ b/src/test/java/org/searsia/SearchResultTest.java @@ -2,14 +2,13 @@ import org.junit.Assert; import org.junit.Test; - import org.searsia.Hit; import org.searsia.SearchResult; public class SearchResultTest { @Test - public void test1() { + public void testSimple() { SearchResult sr = new SearchResult(); Hit h = new Hit(); h.put("title", "boo"); @@ -18,24 +17,50 @@ public void test1() { } @Test - public void test2() { + public void testEmpty() { SearchResult sr = new SearchResult(); Assert.assertEquals("{\"hits\":[]}", sr.toJson().toString()); } @Test - public void test3() { + public void testSampleAndRerank() { SearchResult sr = new SearchResult(); Hit h = new Hit("The ultimate test", "Oh yeah", "http://searsia.org", "http://searsia.org/images/search.png"); sr.addHit(h); - String term = sr.randomTerm(); - String terms = h.toIndexVersion(); - Assert.assertTrue(terms.contains(term)); - Assert.assertTrue(sr.getHits().size() > 0); + String terms = h.toIndexVersion().toLowerCase(); + h = new Hit("Another test", "yeah", "http://searsia.org/test.html", + "http://searsia.org/images/search.png"); + sr.addHit(h); + terms += " " + h.toIndexVersion().toLowerCase(); + String notThis = "test"; + String term = sr.randomTerm(notThis); + Assert.assertFalse("Same random term", term.equals(notThis)); + Assert.assertTrue("Index contains random term: " + term, terms.contains(term)); + Assert.assertEquals("Total nr of hits", 2, sr.getHits().size()); + sr.scoreReranking("test", "or"); + Assert.assertEquals("Nr of hits after reranking", 2, sr.getHits().size()); sr.scoreReranking("doesnotmatch", "or"); - Assert.assertTrue(sr.getHits().size() == 0); + Assert.assertEquals("Query matches zero results", 0, sr.getHits().size()); } - + @Test + public void testCast() { + float score = 0.1f; + Hit h1 
= new Hit(); + h1.put("score", score); + Assert.assertEquals(score, h1.getScore(), 0.0001f); + Hit h2 = new Hit(); + h2.put("score", Float.toString(score)); + Assert.assertEquals(score, h2.getScore(), 0.0001f); + Hit h3 = new Hit(); + h3.put("score", "wrong means zero"); + Assert.assertEquals(0.0f, h3.getScore(), 0.0001f); + Hit h4 = new Hit("{\"title\":\"boo\",\"score\":1.0}"); + Assert.assertEquals(1.0f, h4.getScore(), 0.0001f); + Hit h5 = new Hit("{\"title\":\"boo\",\"score\":1}"); + Assert.assertEquals(1.0f, h5.getScore(), 0.0001f); + Hit h6 = new Hit("{\"title\":\"boo\",\"score\":9.7E-4}"); + Assert.assertTrue(h6.getScore() > 0.0f); + } } diff --git a/src/test/java/org/searsia/engine/DOMBuilderTest.java b/src/test/java/org/searsia/engine/DOMBuilderTest.java new file mode 100644 index 0000000..3820a3d --- /dev/null +++ b/src/test/java/org/searsia/engine/DOMBuilderTest.java @@ -0,0 +1,62 @@ +package org.searsia.engine; + +import java.io.BufferedReader; +import java.io.FileReader; +import java.io.IOException; + +import org.json.JSONObject; +import org.jsoup.Jsoup; +import org.junit.Test; +import org.w3c.dom.Document; + +import org.searsia.engine.DOMBuilder; + +public class DOMBuilderTest { + + private static String readFile(String fileName) throws IOException { + String s, result = ""; + BufferedReader reader = new BufferedReader(new FileReader("src/test/resources/" + fileName)); + try { + while ((s = reader.readLine()) != null) { + result += s; + } + } + finally { + reader.close(); + } + return result; + } + + @Test + public void testJsonFileIfExists() { + String jsonString = null; + try { + jsonString = readFile("test.json"); + } catch (IOException e) { } + if (jsonString != null) { + if (jsonString.startsWith("[")) { + jsonString = "{\"list\":" + jsonString + "}"; + } + JSONObject json = new JSONObject(jsonString); + Document doc = DOMBuilder.json2DOM(json); + String xml = DOMBuilder.DOM2String(doc); + System.out.println(xml); + } + } + + @Test + public void 
testHtmlFileIfExists() { + String htmlString = null; + try { + htmlString = readFile("test.html"); + } catch (IOException e) { } + if (htmlString != null) { + org.jsoup.nodes.Document jsoup = Jsoup.parse(htmlString); + Document doc = DOMBuilder.jsoup2DOM(jsoup); + String xml = DOMBuilder.DOM2String(doc); + System.out.println(xml); + } + } + + +} diff --git a/src/test/java/org/searsia/engine/ResourceTest.java b/src/test/java/org/searsia/engine/ResourceTest.java index c3b0bc3..df14e73 100644 --- a/src/test/java/org/searsia/engine/ResourceTest.java +++ b/src/test/java/org/searsia/engine/ResourceTest.java @@ -1,13 +1,12 @@ package org.searsia.engine; +import org.apache.log4j.Level; +import org.apache.log4j.Logger; import org.json.JSONObject; import org.junit.Assert; import org.junit.BeforeClass; import org.junit.Test; -import java.util.logging.Level; -import java.util.logging.Logger; - import javax.xml.xpath.XPathExpressionException; import org.searsia.SearchResult; @@ -16,82 +15,15 @@ public class ResourceTest { private static final String SECRET_API_KEY = "a7235cdsf43d3a2dfgeda"; - - private Resource htmlSearch() throws XPathExpressionException { - Resource hiemstra = new Resource("http://wwwhome.cs.utwente.nl/~hiemstra/?s={q}&api={apikey}&p={p?}","hiemstra"); - hiemstra.setUrlUserTemplate(hiemstra.getAPITemplate()); - hiemstra.addPrivateParameter("apikey", SECRET_API_KEY); - hiemstra.addHeader("User-Agent", "Test/1.0"); - hiemstra.setPrior(0.3f); - hiemstra.setRate(133); - hiemstra.setMimeType("text/html"); - hiemstra.setFavicon("http://wwwhome.cs.utwente.nl/~hiemstra/images/ut.ico"); - hiemstra.setItemXpath("//div[@class='post']"); - hiemstra.addExtractor( - new TextExtractor("title", "./h3"), - new TextExtractor("description", "./h3/following-sibling::text()"), - new TextExtractor("url", "./h3/a/@href") - ); - return hiemstra; - } - - private Resource postSearch() throws XPathExpressionException { - Resource hiemstra = new 
Resource("http://wwwhome.cs.utwente.nl/~hiemstra/","hiemstrapost"); - hiemstra.setPostString("os={q}"); - hiemstra.setPostQueryEncode("application/x-www-form-urlencoded"); - hiemstra.setMimeType("application/xml"); - hiemstra.setItemXpath("//item"); - hiemstra.addExtractor( - new TextExtractor("title", "./title"), - new TextExtractor("description", "./description"), - new TextExtractor("url", "./link") - ); - return hiemstra; - } - - private Resource searsiaSearch() throws XPathExpressionException { - return new Resource("http://searsia.org/searsia/wiki-{q?}-{r?}.json"); - } - - private Resource xmlSearch() throws XPathExpressionException, SearchException { - Resource wiki = new Resource("http://searsia.org/searsia/wiki-{q?}-{r?}.json"); - Resource wikifull = wiki.searchResource("wikifull"); - return wikifull; - } - - private Resource jsonSearch() throws XPathExpressionException { - Resource wiki = new Resource("http://searsia.org/searsia/wiki-{q?}-wikifull.json"); - wiki.setMimeType("application/json"); - wiki.setItemXpath("//hits"); - wiki.addExtractor( - new TextExtractor("title", "./title"), - new TextExtractor("description", "./description"), - new TextExtractor("url", "./url"), - new TextExtractor("content", "./content") - ); - return wiki; - } - - private Resource javascriptSearch() throws XPathExpressionException { - Resource wikifull = new Resource("http://searsia.org/searsia/wiki-{q}-wikifull.js"); - wikifull.setMimeType("application/x-javascript"); - wikifull.setItemXpath("//hits"); - wikifull.addExtractor( - new TextExtractor("title", "./title"), - new TextExtractor("description", "./description"), - new TextExtractor("url", "./url") - ); - return wikifull; - } - + @BeforeClass public static void setUp() { - Logger.getLogger("").setLevel(Level.WARNING); + Logger.getLogger("").setLevel(Level.WARN); } @Test public void testSearchSearsia() throws XPathExpressionException, SearchException { - Resource se = searsiaSearch(); + Resource se = new 
Resource("file:src/test/resources/index.json").updateFromAPI(); String query = "informat"; SearchResult result = se.search(query); Assert.assertEquals(query, result.getQuery()); @@ -100,16 +32,15 @@ public void testSearchSearsia() throws XPathExpressionException, SearchException @Test public void testSearchHtml() throws XPathExpressionException, SearchException { - Resource se = htmlSearch(); - SearchResult result = se.search("dolf trieschnigg", true); + Resource se = new Resource("file:src/test/resources/hiemstra.json").updateFromAPI(); + SearchResult result = se.search("dolf trieschnigg", "xml"); Assert.assertEquals("text/html", se.getMimeType()); Assert.assertEquals(10, result.getHits().size()); - // TODO text nodes are glued together. } @Test public void testSearchPost() throws XPathExpressionException, SearchException { - Resource se = postSearch(); + Resource se = new Resource("file:src/test/resources/hiemstrapost.json").updateFromAPI(); SearchResult result = se.search("dolf trieschnigg"); Assert.assertEquals("application/xml", se.getMimeType()); Assert.assertEquals(10, result.getHits().size()); @@ -117,52 +48,69 @@ public void testSearchPost() throws XPathExpressionException, SearchException { @Test public void testSearchXml() throws XPathExpressionException, SearchException { - Resource se = xmlSearch(); - SearchResult result = se.search("informat"); - Assert.assertEquals("application/xml", se.getMimeType()); + Resource se1 = new Resource("http://searsia.org/searsia/wiki/index{searchTerms}.json").updateFromAPI(); + Resource se2 = se1.searchResource("wikifull1"); + SearchResult result = se2.search("informat"); + Assert.assertEquals("application/xml", se2.getMimeType()); Assert.assertEquals(10, result.getHits().size()); } @Test public void testSearchXml2() throws XPathExpressionException, SearchException { - Resource se = htmlSearch(); - se.setMimeType("application/xml"); - se.setRerank(null); + Resource se = new 
Resource("file:src/test/resources/hiemstraxml.json").updateFromAPI(); long startTime = System.currentTimeMillis(); SearchResult result = se.search("test"); + Assert.assertEquals("application/xml", se.getMimeType()); Assert.assertEquals(10, result.getHits().size()); Assert.assertFalse("Parser timed out", System.currentTimeMillis() - startTime > 10000); } + @Test + public void testSearchXml3() throws XPathExpressionException, SearchException { + Resource se1 = new Resource("http://searsia.org/searsia/wiki/cse1{searchTerms}.json").updateFromAPI(); + SearchResult result = se1.search("life"); + Assert.assertEquals("application/xml", se1.getMimeType()); + Assert.assertEquals(10, result.getHits().size()); + } + @Test public void testSearchJson() throws XPathExpressionException, SearchException { - Resource se = jsonSearch(); - Boolean debug = true; + Resource se = new Resource("file:src/test/resources/searsia.json").updateFromAPI(); + String debug = "xml"; SearchResult result = se.search("informat", debug); - Assert.assertNotNull(result.getXmlOut()); + Assert.assertNotNull(result.getDebugOut()); Assert.assertEquals("application/json", se.getMimeType()); - Assert.assertEquals(10, result.getHits().size()); + Assert.assertTrue("Result size 10 or more", result.getHits().size() >= 10); } @Test public void testSearchJson2() throws XPathExpressionException, SearchException { - Resource se = jsonSearch(); + Resource se = new Resource("http://searsia.org/searsia/wiki/wikifull1{searchTerms}.json"); SearchResult result = se.search("json"); Assert.assertEquals(1, result.getHits().size()); Assert.assertEquals("extra content", result.getHits().get(0).getString("content")); } @Test - public void testSearchJson3() throws XPathExpressionException, SearchException { - Resource se = jsonSearch(); + public void testSearchJsonStrangeKeys() throws XPathExpressionException, SearchException { + Resource se = new Resource("http://searsia.org/searsia/wiki/wikifull1{searchTerms}.json"); 
SearchResult result = se.search("strange keys"); Assert.assertEquals(1, result.getHits().size()); } - @Test + @Test + public void testSearchJsonHtmlAndlinks() throws XPathExpressionException, SearchException { + Resource se = new Resource("http://searsia.org/searsia/wiki/wikifull1{searchTerms}.json"); + SearchResult result = se.search("html and links"); + Assert.assertEquals(2, result.getHits().size()); + Assert.assertEquals("Another test for Searsia", result.getHits().get(0).getTitle()); + Assert.assertEquals("mailto:info@searsia.org", result.getHits().get(1).getString("url")); // TODO getUrl instead of getString + } + + @Test public void testSearchJavascript() throws XPathExpressionException, SearchException { - Resource se = javascriptSearch(); - Boolean debug = true; + Resource se = new Resource("file:src/test/resources/javascript.json").updateFromAPI(); + String debug = "xml"; SearchResult result = se.search("informat", debug); Assert.assertEquals("application/x-javascript", se.getMimeType()); Assert.assertEquals(10, result.getHits().size()); @@ -170,40 +118,60 @@ public void testSearchJavascript() throws XPathExpressionException, SearchExcept @Test public void testSearchSearsiaEmpty() throws XPathExpressionException, SearchException { - Resource se = searsiaSearch(); - SearchResult result = se.search(); + Resource se = new Resource("http://searsia.org/searsia/wiki/index{searchTerms}.json").updateFromAPI(); + SearchResult result = se.searchWithoutQuery(); Assert.assertTrue(result.getHits().size() > 0); } @Test public void testSearchResource() throws XPathExpressionException, SearchException { - Resource se = searsiaSearch(); - Resource engine = se.searchResource("wikifull"); + Resource se = new Resource("file:src/test/resources/index.json").updateFromAPI(); + Resource engine = se.searchResource("wikifull1"); Assert.assertTrue(engine != null); } + @Test + public void testSearchNoResource1() throws XPathExpressionException, SearchException { + Resource se = new 
Resource("file:src/test/resources/hiemstra.json").updateFromAPI(); + Boolean exception = false; + try { + se.searchResource("wikifull1"); + } catch (SearchException e) { + exception = true; + } + Assert.assertTrue("Non-Searsia engine throws exception", exception); + } + + @Test + public void testSearchNoResource2() throws XPathExpressionException, SearchException { + Resource se = new Resource("file:src/test/resources/randomid.json").updateFromAPI(); + Boolean exception = false; + try { + se.searchResource("wikifull1"); + } catch (SearchException e) { + exception = true; + } + Assert.assertTrue("No resources exception", exception); + } + @Test - public void testSearchError() throws XPathExpressionException { - Resource se = htmlSearch(); - se.setUrlAPITemplate("http://wwwhome.cs.utwente.nl/~hiemstra/WRONG/?s={q}&api={apikey}&p={p?}"); + public void testSearchError() throws XPathExpressionException, SearchException { + Resource se = new Resource("file:src/test/resources/wrong.json").updateFromAPI(); String message = null; + String apiKey = se.getPrivateParameter("apikey"); try { se.search("test"); } catch (SearchException e) { message = e.getMessage(); } - Assert.assertNotNull(message); - Assert.assertFalse("error message reveals secret", message.contains(SECRET_API_KEY)); + Assert.assertNotNull("Error message", message); + Assert.assertNotNull("API key", apiKey); + Assert.assertFalse("Error message reveals secret", message.contains(apiKey)); } @Test - public void testJsonRoundtrip() throws XPathExpressionException { - Resource se1 = htmlSearch(); - se1.setPostString("POST"); - se1.setPostQueryEncode("application/x-www-form-urlencoded"); - se1.setRerank("lm"); - se1.setBanner("me.png"); - se1.setUrlSuggestTemplate("http://whatever"); + public void testJsonRoundtrip() throws XPathExpressionException, SearchException { + Resource se1 = new Resource("file:src/test/resources/hiemstracrazy.json").updateFromAPI(); JSONObject json = se1.toJson(); Resource se2 = new 
Resource(json); Assert.assertEquals("id", se1.getId(), se2.getId()); @@ -225,18 +193,25 @@ public void testJsonRoundtrip() throws XPathExpressionException { Assert.assertEquals("postencode", se1.getPostQueryEncode(), se2.getPostQueryEncode()); Assert.assertFalse("secret revealed", json.toString().contains(SECRET_API_KEY)); } + + @Test + public void testJsonPrivateParameter() throws XPathExpressionException { + JSONObject json = new JSONObject("{\"id\":\"test\", \"privateparameters\":{\"apikey\":\"secret\"}}"); + Resource se = new Resource(json); + Assert.assertEquals("private parameters", se.getPrivateParameter("apikey"), "secret"); + } @Test - public void equalEngines1() throws XPathExpressionException { - Resource se1 = htmlSearch(); + public void equalEngines1() throws XPathExpressionException, SearchException { + Resource se1 = new Resource("file:src/test/resources/hiemstra.json").updateFromAPI(); JSONObject json = se1.toJson(); Resource se2 = new Resource(json); Assert.assertTrue("Equals big engine", se1.equals(se2)); } @Test - public void equalEngines2() throws XPathExpressionException { - Resource se1 = searsiaSearch(); + public void equalEngines2() throws XPathExpressionException, SearchException { + Resource se1 = new Resource("file:src/test/resources/index.json").updateFromAPI(); JSONObject json = se1.toJson(); Resource se2 = new Resource(json); Assert.assertTrue("Truely Equals small engine", se1.equals(se2)); diff --git a/src/test/java/org/searsia/engine/SignaturesTest.java b/src/test/java/org/searsia/engine/SignaturesTest.java new file mode 100644 index 0000000..89f3dda --- /dev/null +++ b/src/test/java/org/searsia/engine/SignaturesTest.java @@ -0,0 +1,26 @@ +package org.searsia.engine; + +import org.junit.Assert; +import org.junit.Test; + +import org.searsia.engine.Signatures; + +public class SignaturesTest { + + /** + * Signing a web request using Amazon's HMAC-SHA256. 
+ * Example string from: + * http://docs.aws.amazon.com/AWSECommerceService/latest/DG/rest-signature.html + * For more information: + * https://tools.ietf.org/html/rfc2104 + */ + @Test + public void testAmazonHMACSHA256() { + String secretKey = "1234567890"; + String requestUrl = "http://webservices.amazon.com/onca/xml?Service=AWSECommerceService&AWSAccessKeyId=AKIAIOSFODNN7EXAMPLE&AssociateTag=mytag-20&Operation=ItemLookup&ItemId=0679722769&ResponseGroup=Images,ItemAttributes,Offers,Reviews&Version=2013-08-01&Timestamp=2014-08-18T12:00:00Z"; + String targetUrl = "http://webservices.amazon.com/onca/xml?AWSAccessKeyId=AKIAIOSFODNN7EXAMPLE&AssociateTag=mytag-20&ItemId=0679722769&Operation=ItemLookup&ResponseGroup=Images%2CItemAttributes%2COffers%2CReviews&Service=AWSECommerceService&Timestamp=2014-08-18T12%3A00%3A00Z&Version=2013-08-01&Signature=j7bZM0LXZ9eXeZruTqWm2DIvDYVUU3wxPPpp%2BiXxzQc%3D"; + String signedUrl = Signatures.sign(requestUrl, "HmacSHA256", secretKey); + Assert.assertEquals("Signed request", targetUrl, signedUrl); + } + +} diff --git a/src/test/java/org/searsia/index/TestResourceIndex.java b/src/test/java/org/searsia/index/TestResourceIndex.java index d99ffdc..52cd597 100644 --- a/src/test/java/org/searsia/index/TestResourceIndex.java +++ b/src/test/java/org/searsia/index/TestResourceIndex.java @@ -2,60 +2,71 @@ import java.io.IOException; +import javax.xml.xpath.XPathExpressionException; + +import org.apache.log4j.Logger; +import org.apache.log4j.varia.NullAppender; +import org.json.JSONException; +import org.json.JSONObject; import org.junit.Assert; import org.junit.AfterClass; import org.junit.BeforeClass; import org.junit.Test; - import org.searsia.index.ResourceIndex; import org.searsia.engine.Resource; public class TestResourceIndex { + private static final Logger LOGGER = Logger.getLogger("org.searsia"); private static final String PATH = "target/index-test"; private static final String INDEX = "test"; private static ResourceIndex engines; 
@BeforeClass public static void setUp() throws Exception { + LOGGER.removeAllAppenders(); + LOGGER.addAppender(new NullAppender()); engines = new ResourceIndex(PATH, INDEX); Resource engine = searsia(); engines.putMother(engine); } @AfterClass - public static void lastThing() throws IOException { + public static void lastThing() throws IOException, XPathExpressionException, JSONException { engines.put(newby()); checkFiles(); } - private static Resource utwente() { - Resource e = new Resource("http://utwente.nl/search?q={q}", "567"); - e.setName("UT"); - return e; + private static Resource utwente() throws XPathExpressionException, JSONException { + JSONObject json = new JSONObject( + "{\"apitemplate\":\"http://utwente.nl/search?q={searchTerms}\",\"id\":\"567\",\"name\":\"UT\"}" + ); + return new Resource(json); } - private static Resource searsia() { - Resource e = new Resource("http://searsia.com/?q={q}", "1234"); - e.addPrivateParameter("api", "topsecret"); - return e; + private static Resource searsia() throws XPathExpressionException, JSONException { + JSONObject json = new JSONObject( + "{\"apitemplate\":\"http://searsia.com/?q={searchTerms}\",\"id\":\"1234\",\"privateparameters\":{\"api\":\"topsecret\"}}" + ); + return new Resource(json); } - private static Resource newby() { - Resource e = new Resource("http://new.com/?q={q}"); - e.changeId("890"); - e.addPrivateParameter("apikey", "secret"); - return e; + private static Resource newby() throws XPathExpressionException, JSONException { + JSONObject json = new JSONObject( + "{\"apitemplate\":\"http://new.com/?q={searchTerms}\",\"id\":\"new\",\"privateparameters\":{\"apikey\":\"secret\"}}" + ); + return new Resource(json); } - private static Resource me() { - Resource e = new Resource("http://me.org"); - e.setName("Me"); - return e; + private static Resource me() throws XPathExpressionException, JSONException { + JSONObject json = new JSONObject( + 
"{\"apitemplate\":\"http://me.org\",\"id\":\"me\",\"name\":\"Me\"}" + ); + return new Resource(json); } - public static void checkFiles() throws IOException { + public static void checkFiles() throws IOException, XPathExpressionException, JSONException { Resource e1 = me(); Resource e2 = engines.getMyself(); Assert.assertTrue("Trying to retrieve me", e1.equals(e2)); @@ -65,12 +76,12 @@ public static void checkFiles() throws IOException { Assert.assertTrue("No private parameters expected", e4.getJsonPrivateParameters() == null); Resource e6 = engines.get(newby().getId()); Assert.assertTrue("Private parameters expected", e6.getJsonPrivateParameters() != null); - Assert.assertTrue("Top 1", engines.topValues("anything", 1).size() == 1); - Assert.assertTrue("Top 2", engines.topValues(null, 2).size() == 2); + Assert.assertTrue("Top 1", engines.topValuesNotDeleted("anything", 1).size() == 1); + Assert.assertTrue("Top 2", engines.topValuesNotDeleted(null, 2).size() == 2); } @Test - public void addResource() { + public void addResource() throws XPathExpressionException, JSONException { Resource e1 = utwente(); engines.put(e1); Resource e2 = engines.get(e1.getId()); @@ -78,7 +89,7 @@ public void addResource() { } @Test - public void addMe() { + public void addMe() throws XPathExpressionException, JSONException { Resource e1 = me(); engines.putMyself(e1); Resource e2 = engines.getMyself(); @@ -86,7 +97,7 @@ public void addMe() { } @Test - public void getMother() { + public void getMother() throws XPathExpressionException, JSONException { Resource e1 = searsia(); Resource e2 = engines.getMother(); Assert.assertTrue("Mother", e1.equals(e2)); diff --git a/src/test/java/org/searsia/index/TestSearchResultIndex.java b/src/test/java/org/searsia/index/TestSearchResultIndex.java index 1207809..8e9ec36 100644 --- a/src/test/java/org/searsia/index/TestSearchResultIndex.java +++ b/src/test/java/org/searsia/index/TestSearchResultIndex.java @@ -30,12 +30,13 @@ public class 
TestSearchResultIndex { public static void setUp() throws Exception { LOGGER.removeAllAppenders(); LOGGER.addAppender(new NullAppender()); - index = new SearchResultIndex(PATH, INDEX, 2); + index = new SearchResultIndex(PATH, INDEX, 10); SearchResult result = readFile("exampleSearchResult.json"); index.offer(result); index.flush(); } - + + private static SearchResult readFile(String fileString) throws IOException { SearchResult result = new SearchResult(); String s, jsonString = ""; // TODO: Does the following file name work in Windows? @@ -48,10 +49,16 @@ private static SearchResult readFile(String fileString) throws IOException { finally { reader.close(); } - JSONArray hits = (new JSONObject(jsonString).getJSONArray("hits")); + JSONObject json = new JSONObject(jsonString); + JSONArray hits = json.getJSONArray("hits"); for(int i = 0; i < hits.length(); i++) { result.addHit(new Hit(hits.getJSONObject(i))); } + JSONObject resource = json.getJSONObject("resource"); + result.setResourceId(resource.getString("id")); + if (json.has("query")) { + result.setQuery(json.getString("query")); + } return result; } @@ -72,11 +79,15 @@ public void testSearch1() throws Exception { public void testSearch2() throws Exception { SearchResult result = readFile("exampleSearchResult.json"); index.offer(result); - index.flush(); String query = "dolf"; + String resourceId = result.getResourceId(); + SearchResult result2 = index.cacheSearch(query, resourceId); + Assert.assertEquals(query, result2.getQuery()); + Assert.assertEquals("Cache result size", 10, result2.getHits().size()); + index.flush(); result = index.search(query); Assert.assertEquals(query, result.getQuery()); - Assert.assertEquals(1, result.getHits().size()); + Assert.assertEquals("Index result size", 1, result.getHits().size()); } @Test @@ -85,7 +96,7 @@ public void testSearch3() throws Exception { Assert.assertEquals(6, result.getHits().size()); } - @Test + @Test // test hit lookup (not used currently) public void 
testSearch4() throws Exception { SearchResult result = readFile("exampleSearchResult.json"); Hit hit1 = result.getHits().get(0); @@ -93,6 +104,21 @@ public void testSearch4() throws Exception { Assert.assertEquals(hit1.getTitle(), hit2.getTitle()); } + @Test // test the cache + public void testSearch5() throws Exception { + SearchResult result = readFile("exampleSearchResult.json"); + String query = "information"; + result.setQuery(query); + String resourceId = result.getResourceId(); + index.offer(result); + result = index.cacheSearch(query, resourceId); + Assert.assertEquals(10, result.getHits().size()); + result = index.cacheSearch(query, "nothing"); + Assert.assertTrue(result == null); + result = index.cacheSearch("nope", resourceId); + Assert.assertTrue(result == null); + } + /** * Can also be used from the command line to test an existing index * @param args query diff --git a/src/test/java/org/searsia/web/SearchTest.java b/src/test/java/org/searsia/web/SearchTest.java index b7a7270..e01d667 100644 --- a/src/test/java/org/searsia/web/SearchTest.java +++ b/src/test/java/org/searsia/web/SearchTest.java @@ -3,51 +3,76 @@ import java.io.IOException; import javax.ws.rs.core.Response; +import javax.xml.xpath.XPathExpressionException; +import org.apache.log4j.Appender; +import org.apache.log4j.ConsoleAppender; +import org.apache.log4j.Level; import org.apache.log4j.Logger; +import org.apache.log4j.PatternLayout; import org.apache.log4j.varia.NullAppender; import org.json.JSONArray; +import org.json.JSONException; import org.json.JSONObject; import org.junit.AfterClass; import org.junit.Assert; import org.junit.BeforeClass; import org.junit.Test; - +import org.searsia.SearsiaOptions; import org.searsia.index.SearchResultIndex; import org.searsia.index.ResourceIndex; import org.searsia.web.Search; import org.searsia.engine.Resource; public class SearchTest { + + private static boolean letsLog = false; private static final Logger LOGGER = 
Logger.getLogger("org.searsia"); private static final String PATH = "target/index-test"; private static final String INDEX = "test2"; private static SearchResultIndex index; private static ResourceIndex engines; - - - private static Resource utwente() { - return new Resource("https://search.utwente.nl/searsia/search.php?q={q?}&r={r?}", "utwente"); + private static SearsiaOptions options; + + + private static Resource wiki() throws XPathExpressionException, JSONException { + return new Resource(new JSONObject("{\"apitemplate\":\"http://searsia.org/searsia/wiki/wiki{searchTerms}.json\", \"id\":\"wiki\"}")); } - private static Resource wrong() { - return new Resource("http://searsia.com/doesnotexist?q={q}", "wrong"); + private static Resource wrong() throws XPathExpressionException, JSONException { + return new Resource(new JSONObject("{\"apitemplate\":\"http://reallyreallydoesnotexist.com/wrong?q={searchTerms}\", \"id\":\"wrong\"}")); } - private static Resource me() { - return new Resource("http://me.org?q={q}"); + private static Resource ok() throws XPathExpressionException, JSONException { + return new Resource(new JSONObject("{\"apitemplate\":\"http://searsia.org/searsia/wiki/wikifull1{searchTerms}.json\", \"id\":\"wikifull1\"}")); } + private static Resource okDeleted() throws XPathExpressionException, JSONException { + return new Resource(new JSONObject("{\"deleted\":true, \"id\":\"wikifull1\"}")); + } + private static Resource me() throws XPathExpressionException, JSONException { + return new Resource(new JSONObject("{\"apitemplate\":\"http://me.org?q={searchTerms}\", \"id\":\"wiki\"}")); + } + @BeforeClass public static void setUp() throws Exception { + Appender appender = null; LOGGER.removeAllAppenders(); - LOGGER.addAppender(new NullAppender()); // thou shall not log - index = new SearchResultIndex(PATH, INDEX, 2); + if (letsLog) { + appender = new ConsoleAppender(new PatternLayout("%m%n"), ConsoleAppender.SYSTEM_ERR); + } else { + appender = new 
NullAppender(); // thou shall not log + } + LOGGER.addAppender(appender); + LOGGER.setLevel(Level.ALL); + index = new SearchResultIndex(PATH, INDEX, 10); engines = new ResourceIndex(PATH, INDEX); - engines.putMother(utwente()); - engines.put(wrong()); + options = new SearsiaOptions(); + engines.putMother(wiki()); + engines.put(wrong()); + engines.put(ok()); engines.putMyself(me()); } @@ -58,20 +83,20 @@ public static void lastThing() throws IOException { @Test // returns 'my' resource description public void test() throws IOException { - Search search = new Search(index, engines); - Response response = search.query("", ""); + Search search = new Search(index, engines, options); + Response response = search.query("wiki.json", "", null, null); int status = response.getStatus(); String entity = (String) response.getEntity(); JSONObject json = new JSONObject(entity); JSONObject resource = (JSONObject) json.get("resource"); Assert.assertEquals(200, status); - Assert.assertEquals("708addc213e3daf4b9742883d18d0c45", resource.get("id")); + Assert.assertEquals("wiki", resource.get("id")); } @Test // returns local search results for 'searsia' public void testQuery() throws IOException { - Search search = new Search(index, engines); - Response response = search.query("", "searsia search for noobs"); + Search search = new Search(index, engines, options); + Response response = search.query("wiki.json", "searsia search for noobs", null, null); int status = response.getStatus(); String entity = (String) response.getEntity(); JSONObject json = new JSONObject(entity); @@ -85,42 +110,85 @@ public void testQuery() throws IOException { } } Assert.assertEquals(200, status); - Assert.assertTrue(hits.length() > 0); + Assert.assertTrue(hits.length() == 3); Assert.assertEquals("http://searsia.org", url); + Assert.assertNotNull(json.get("resource")); } @Test // returns local resource 'wrong' - public void testResource() throws IOException { - Search search = new Search(index, engines); - 
Response response = search.query("wrong", ""); + public void testResource() throws IOException, XPathExpressionException, JSONException { + Search search = new Search(index, engines, options); + Response response = search.query("wrong.json", "", null, null); int status = response.getStatus(); String entity = (String) response.getEntity(); JSONObject json = new JSONObject(entity); JSONObject resource = (JSONObject) json.get("resource"); Assert.assertEquals(200, status); + Assert.assertTrue(json.has("health")); Assert.assertEquals(wrong().getAPITemplate(), resource.get("apitemplate")); } - @Test // returns resource 'youtube' (from mother) + @Test // returns local resource 'wrong' without apitemplate and health + public void testResourceNoSharing() throws IOException, XPathExpressionException, JSONException { + String[] args = {"-d", "-n", "-m=http://searsia.org/searsia/wiki/wiki{searchTerms}.json"}; + SearsiaOptions newOptions = new SearsiaOptions(args); + Search search = new Search(index, engines, newOptions); + Response response = search.query("wrong.json", "", null, null); + String entity = (String) response.getEntity(); + JSONObject json = new JSONObject(entity); + JSONObject resource = (JSONObject) json.get("resource"); + Assert.assertFalse(json.has("health")); + Assert.assertFalse(resource.has("apitemplate")); + } + + @Test // returns resource 'wikididyoumean' (from mother) public void testResourceUnknown() throws IOException { - Search search = new Search(index, engines); - Response response = search.query("youtube", ""); + Search search = new Search(index, engines, options); + Response response = search.query("wikididyoumean.json", "", null, null); int status = response.getStatus(); String entity = (String) response.getEntity(); JSONObject json = new JSONObject(entity); JSONObject resource = (JSONObject) json.get("resource"); Assert.assertEquals(200, status); - Assert.assertEquals("Youtube", resource.get("name")); + Assert.assertEquals("Did you mean:", 
resource.get("name")); } @Test // returns results for the engine 'wrong' (which does not exist) public void testError() throws IOException { - Search search = new Search(index, engines); - Response response = search.query("wrong", "testquery"); + Search search = new Search(index, engines, options); + Response response = search.query("wrong.json", "testquery", null, null); int status = response.getStatus(); Assert.assertEquals(503, status); } + @Test // returns results for the engine 'wikifull1' + public void testOk() throws IOException, XPathExpressionException, JSONException { + Search search = new Search(index, engines, options); + Response response = search.query("wikifull1.json", "informat", null, null); + int status = response.getStatus(); + String entity = (String) response.getEntity(); + JSONObject json = new JSONObject(entity); + Assert.assertEquals(200, status); + Assert.assertNotNull(json.get("hits")); + Assert.assertNotNull(json.get("resource")); + LOGGER.trace("Query result: " + json); + + response = search.query("wikifull1.json", "informat", null, null); + status = response.getStatus(); + entity = (String) response.getEntity(); + json = new JSONObject(entity); + Assert.assertEquals(200, status); + Assert.assertNotNull(json.get("hits")); + Assert.assertNotNull(json.get("resource")); + LOGGER.trace("Cache result: " + json); + + engines.put(okDeleted()); + response = search.query("wikifull1.json", "informat", null, null); + status = response.getStatus(); + entity = (String) response.getEntity(); + json = new JSONObject(entity); + Assert.assertEquals(410, status); + LOGGER.trace("No result: " + json); + } - } diff --git a/src/test/resources/exampleSearchResult.json b/src/test/resources/exampleSearchResult.json index 3a051e8..92088ba 100644 --- a/src/test/resources/exampleSearchResult.json +++ b/src/test/resources/exampleSearchResult.json @@ -53,8 +53,9 @@ ], "resource": { "id": "nl.utwente.hiemstra", - "urltemplate": 
"http:\/\/wwwhome.cs.utwente.nl\/~hiemstra\/?s={q}", + "urltemplate": "http:\/\/wwwhome.cs.utwente.nl\/~hiemstra\/?s={searchTerms}", "favicon": "http:\/\/wwwhome.cs.utwente.nl\/~hiemstra\/images\/ut.ico", "name": "Djoerd Hiemstra" - } + }, + "query": "dolf" } diff --git a/src/test/resources/hiemstra.json b/src/test/resources/hiemstra.json new file mode 100644 index 0000000..064fec1 --- /dev/null +++ b/src/test/resources/hiemstra.json @@ -0,0 +1,24 @@ +{ + "resource": { + "id": "hiemstra", + "apitemplate": "http://wwwhome.cs.utwente.nl/~hiemstra/?s={searchTerms}&api={apikey}&paged={startPage?}", + "extractors": { + "description": "./h3/following-sibling::text()", + "title": "./h3", + "url": "./h3/a/@href" + }, + "favicon": "http://wwwhome.cs.utwente.nl/~hiemstra/images/ut.ico", + "headers": { + "User-Agent": "Test/1.0" + }, + "itempath": "//div[@class='post']", + "maxqueriesperday": 133, + "mimetype": "text/html", + "prior": 0.3, + "privateparameters": { + "apikey": "SECRET!!" + }, + "testquery": "searsia", + "urltemplate": "http://wwwhome.cs.utwente.nl/~hiemstra/?s={searchTerms}&api={apikey}&paged={startPage?}" + } +} diff --git a/src/test/resources/hiemstracrazy.json b/src/test/resources/hiemstracrazy.json new file mode 100644 index 0000000..85706d5 --- /dev/null +++ b/src/test/resources/hiemstracrazy.json @@ -0,0 +1,29 @@ +{ + "resource": { + "id": "hiemstrapost", + "apitemplate": "http://wwwhome.cs.utwente.nl/~hiemstra/?s={searchTerms}&api={apikey}&paged={startPage?}", + "extractors": { + "description": "./h3/following-sibling::text()", + "title": "./h3", + "url": "./h3/a/@href" + }, + "post": "POST", + "postencode": "application/x-www-form-urlencoded", + "rerank": "lm", + "banner": "me.png", + "suggesttemplate": "http://wwwhome.cs.utwente.nl/~hiemstra/keywords.php?q=d&limit=10", + "favicon": "http://wwwhome.cs.utwente.nl/~hiemstra/images/ut.ico", + "headers": { + "User-Agent": "Test/1.0" + }, + "itempath": "//div[@class='post']", + "maxqueriesperday": 133, + 
"mimetype": "text/html", + "prior": 0.3, + "privateparameters": { + "apikey": "SECRET!!" + }, + "testquery": "searsia", + "urltemplate": "http://wwwhome.cs.utwente.nl/~hiemstra/?s={searchTerms}&api={apikey}&paged={startPage?}" + } +} diff --git a/src/test/resources/hiemstrapost.json b/src/test/resources/hiemstrapost.json new file mode 100644 index 0000000..63a01ae --- /dev/null +++ b/src/test/resources/hiemstrapost.json @@ -0,0 +1,22 @@ +{ + "resource": { + "id": "hiemstra", + "apitemplate": "http://wwwhome.cs.utwente.nl/~hiemstra/", + "post": "s={searchTerms}&api={apikey}&paged={startPage?}", + "extractors": { + "description": "./h3/following-sibling::text()", + "title": "./h3", + "url": "./h3/a/@href" + }, + "favicon": "http://wwwhome.cs.utwente.nl/~hiemstra/images/ut.ico", + "itempath": "//div[@class='post']", + "maxqueriesperday": 133, + "mimetype": "application/xml", + "prior": 0.3, + "privateparameters": { + "apikey": "SECRET!!" + }, + "testquery": "searsia", + "urltemplate": "http://wwwhome.cs.utwente.nl/~hiemstra/?s={searchTerms}&api={apikey}&paged={startPage?}" + } +} diff --git a/src/test/resources/hiemstraxml.json b/src/test/resources/hiemstraxml.json new file mode 100644 index 0000000..c1ad8cd --- /dev/null +++ b/src/test/resources/hiemstraxml.json @@ -0,0 +1,17 @@ +{ + "resource": { + "id": "hiemstraxml", + "apitemplate": "http://wwwhome.cs.utwente.nl/~hiemstra/?s={searchTerms}&paged={startPage?}", + "extractors": { + "description": "./h3/following-sibling::text()", + "title": "./h3", + "url": "./h3/a/@href" + }, + "favicon": "http://wwwhome.cs.utwente.nl/~hiemstra/images/ut.ico", + "itempath": "//div[@class='post']", + "mimetype": "application/xml", + "prior": 0.3, + "testquery": "searsia", + "urltemplate": "http://wwwhome.cs.utwente.nl/~hiemstra/?s={searchTerms}&api={apikey}&paged={startPage?}" + } +} diff --git a/src/test/resources/index.json b/src/test/resources/index.json new file mode 100644 index 0000000..11bd1a9 --- /dev/null +++ 
b/src/test/resources/index.json @@ -0,0 +1,6 @@ +{ + "resource": { + "apitemplate": "http://searsia.org/searsia/wiki/index{searchTerms}.json", + "id": "index" + } +} diff --git a/src/test/resources/javascript.json b/src/test/resources/javascript.json new file mode 100644 index 0000000..54eb1c4 --- /dev/null +++ b/src/test/resources/javascript.json @@ -0,0 +1,14 @@ +{ + "resource": { + "id": "javascript", + "apitemplate": "http://searsia.org/searsia/wiki/wikifull1{searchTerms}.js", + "itempath": "//hits", + "extractors": { + "description": "./description", + "title": "./title", + "url": "./url" + }, + "mimetype": "application/x-javascript", + "name": "Wiki Full 1" + } +} diff --git a/src/test/resources/randomid.json b/src/test/resources/randomid.json new file mode 100644 index 0000000..5380e4e --- /dev/null +++ b/src/test/resources/randomid.json @@ -0,0 +1,8 @@ +{ + "resource": { + "apitemplate": "http://searsia.org/searsia/wiki/wikididyoumean{searchTerms?}.json", + "id": "randomid", + "mimetype": "application/searsia+json", + "testquery": "searsia" + } +} diff --git a/src/test/resources/searsia.json b/src/test/resources/searsia.json new file mode 100644 index 0000000..9ac05f6 --- /dev/null +++ b/src/test/resources/searsia.json @@ -0,0 +1,16 @@ +{ + "resource": { + "id": "searsia", + "apitemplate": "http://searsia.org/searsia/search.json", + "itempath": "//hits", + "extractors": { + "description": "./description", + "title": "./title", + "url": "./url" + }, + "favicon": "http://searsia.org/images/searsia.png", + "mimetype": "application/json", + "name": "Searsia", + "urltemplate": "http://searsia.org/searsia/search.json" + } +} diff --git a/src/test/resources/wrong.json b/src/test/resources/wrong.json new file mode 100644 index 0000000..4120cf4 --- /dev/null +++ b/src/test/resources/wrong.json @@ -0,0 +1,24 @@ +{ + "resource": { + "id": "hiemstra", + "apitemplate": "http://wwwhome.cs.utwente.nl/~hiemstra/WRONG?s={searchTerms}&api={apikey}&paged={startPage?}", + 
"extractors": { + "description": "./h3/following-sibling::text()", + "title": "./h3", + "url": "./h3/a/@href" + }, + "favicon": "http://wwwhome.cs.utwente.nl/~hiemstra/images/ut.ico", + "headers": { + "User-Agent": "Test/1.0" + }, + "itempath": "//div[@class='post']", + "maxqueriesperday": 133, + "mimetype": "text/html", + "prior": 0.3, + "privateparameters": { + "apikey": "SECRET!!" + }, + "testquery": "searsia", + "urltemplate": "http://wwwhome.cs.utwente.nl/~hiemstra/?s={searchTerms}&api={apikey}&paged={startPage?}" + } +}