Skip to content

Commit

Permalink
Filter out noise from HAR file
Browse files Browse the repository at this point in the history
  • Loading branch information
cccs-shellyw committed Dec 16, 2024
1 parent 337a6d8 commit 31e5406
Show file tree
Hide file tree
Showing 3 changed files with 108 additions and 63 deletions.
34 changes: 19 additions & 15 deletions src/ca/gc/cyber/kangooroo/KangoorooStandaloneRunner.java
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
import ca.gc.cyber.kangooroo.report.KangoorooURLReport;
import ca.gc.cyber.kangooroo.utils.io.net.http.HarUtils;
import ca.gc.cyber.kangooroo.utils.io.net.url.URLRedirection;
import ca.gc.cyber.kangooroo.utils.log.MessageLog;

import java.io.File;
import java.io.FileInputStream;
Expand All @@ -18,6 +17,7 @@
import java.util.List;
import java.util.Map;
import java.util.Set;

import java.util.stream.Collectors;

import org.apache.commons.cli.CommandLine;
Expand All @@ -29,12 +29,10 @@
import org.apache.commons.cli.ParseException;
import org.apache.commons.codec.digest.DigestUtils;
import org.apache.commons.io.FileUtils;
import org.apache.commons.lang3.tuple.Pair;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.yaml.snakeyaml.Yaml;

import com.browserup.harreader.HarReader;
import com.browserup.harreader.model.HarEntry;
import com.browserup.harreader.model.HarHeader;
import com.google.gson.Gson;
Expand All @@ -60,11 +58,21 @@ public static Logger getLogger() {
return log;
}


public static KangoorooURLReport generateKangoorooReport(KangoorooResult result, Long processTime,
URL url, URLType urlType, String windowSize, String userAgent, boolean sanitizeSession,
URL url, URLType urlType, String windowSize, String userAgent, boolean sanitizeSession,
List<String> messageLog) throws IOException {

// There is a bug in BrowserUp Proxy: when an upstream proxy is used, the HAR
// always contains a connection error entry with status code 0.
// As an (admittedly ugly) workaround for now, we remove the response entries
// that have status code 0.

HarUtils.removeResponseEntries(result.getHar(), 0);

// The Chrome browser makes noisy requests that are unrelated to the URL being fetched.
// We filter these requests out of the HAR file.
HarUtils.removeRequestUrlEntries(result.getHar(), "https://accounts.google.com/ListAccounts");

KangoorooURLReport kangoorooReport = null;

if (sanitizeSession) {
Expand All @@ -83,7 +91,6 @@ public static KangoorooURLReport generateKangoorooReport(KangoorooResult result,
}

Map<String, String> engineInfo = Map.of("engineName", ENGINE_NAME, "engineVersion", engineVersion);


kangoorooReport.setExperiment(engineInfo,
(result.isConnectionSuccess() && result.isFetchSuccess()) ? "SUCCESS" : "FAIL", messageLog,
Expand Down Expand Up @@ -111,8 +118,8 @@ public static KangoorooURLReport generateKangoorooReport(KangoorooResult result,
url.getHost(), result.getInitial().getServerIPAddress());

KangoorooURL actualUrl = null;
if (result.getUrl() != null){

if (result.getUrl() != null) {
actualUrl = new KangoorooURL(result.getUrl().toExternalForm(),
DigestUtils.md5Hex(result.getUrl().toExternalForm()),
result.getUrl().getHost(), lastHop.getServerIPAddress());
Expand All @@ -122,7 +129,6 @@ public static KangoorooURLReport generateKangoorooReport(KangoorooResult result,

kangoorooReport.setSummary(fetchResult, requestedUrl, actualUrl, urlRedirects, headers);


}

return kangoorooReport;
Expand All @@ -144,13 +150,12 @@ private static void runKangooroo(boolean useSandbox, boolean useCaptchaSolver, b
long start = System.currentTimeMillis();
KangoorooResult res = browser.get(crawlUrl, windowSize, userAgent);
long processingTime = System.currentTimeMillis() - start;

browser.browserShutdown();

createKangoorooOutput(urlOutputDir, configuration, crawlUrl, res, processingTime, saveOriginalHar, urlType,
sanitizeSession, browser.getMessageLog().getMessagesAsList(), simpleResult);


}

private static void createKangoorooOutput(File urlOutputDir, KangoorooRunnerConf configuration, URL crawlUrl,
Expand All @@ -173,7 +178,7 @@ private static void createKangoorooOutput(File urlOutputDir, KangoorooRunnerConf
if (simpleResult) {
log.info("Remove HAR info from result.json.");
report.setReport(null);
}
}

File resultFile = new File(urlOutputDir, "results.json");
FileUtils.writeStringToFile(resultFile, GSON.toJson(report), java.nio.charset.StandardCharsets.UTF_8);
Expand Down Expand Up @@ -224,7 +229,8 @@ public static void main(String[] args) throws Throwable {
Option notSaveHarOption = new Option("nsh", "not-save-har", false,
"Do NOT save original HAR as separate file.");
Option modulesOption = new Option("mods", "modules", true, "Use modules");
Option simpleResultOption = new Option("sr", "simple-result", false, "Simplified result.json by removing har entries.");
Option simpleResultOption = new Option("sr", "simple-result", false,
"Simplified result.json by removing har entries.");

options.addOption(notSanitizeSessionOption);
options.addOption(notSaveFilesOption);
Expand Down Expand Up @@ -328,7 +334,6 @@ public static void main(String[] args) throws Throwable {
log.info("To save original har file: " + saveOriginalHar);
log.info("To save all files: " + saveFiles);


boolean useSandbox = !params.hasOption("no-sandbox");
boolean useCaptchaSolver = enabledModules.contains("captcha");

Expand Down Expand Up @@ -406,7 +411,6 @@ public static void main(String[] args) throws Throwable {
}
}


try {
runKangooroo(useSandbox, useCaptchaSolver, saveFiles, saveOriginalHar, sanitizeSession, urlOutputDir,
urlTempDir, configuration, crawlUrl, windowSize, userAgent, urlType, simplifyResult);
Expand Down
8 changes: 8 additions & 0 deletions src/ca/gc/cyber/kangooroo/browser/KangoorooChromeBrowser.java
Original file line number Diff line number Diff line change
Expand Up @@ -61,9 +61,15 @@
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;


import io.netty.handler.codec.http.HttpRequest;
import io.netty.handler.codec.http.HttpResponse;
import com.browserup.bup.util.HttpMessageContents;
import com.browserup.bup.util.HttpMessageInfo;
import com.browserup.bup.BrowserUpProxy;
import com.browserup.bup.BrowserUpProxyServer;
import com.browserup.bup.client.ClientUtil;
import com.browserup.bup.filters.RequestFilter;
import com.browserup.bup.proxy.CaptureType;
import com.browserup.harreader.model.Har;
import com.browserup.harreader.model.HarEntry;
Expand Down Expand Up @@ -247,6 +253,7 @@ protected Pair<Har, URL> processResult(Har har, String userAgent, RemoteWebDrive
*/
private Har get(WebDriver driver, String url, File downloadFolder) {
BrowserUpProxy proxy = getPROXY();

proxy.newHar();
try {
driver.manage().timeouts().pageLoadTimeout(DEFAULT_PAGE_LOAD_TIMEOUT, TimeUnit.SECONDS);
Expand Down Expand Up @@ -672,6 +679,7 @@ public final void checkEnvironment() {

// STEP 2. Make sure Chromium is using MergedFont to display any fonts
BrowserUpProxyServer proxy = new BrowserUpProxyServer();

proxy.start();
RemoteWebDriver driver = createDriver("", "", new File("tmp"));
driver.get("file:///" + Paths.get("etc", "unicode-test.html").toAbsolutePath());
Expand Down
Loading

0 comments on commit 31e5406

Please sign in to comment.