Merge pull request #317 from jekh/remove-htpmime-dependency

jekh · jekh · commit 865c0d55320a · 2015-09-07T17:20:32.000-07:00
Remove Apache httpmime dependency from LP module
diff --git a/browsermob-core-littleproxy/pom.xml b/browsermob-core-littleproxy/pom.xml
@@ -68,6 +68,10 @@
                     <groupId>javax.servlet</groupId>
                     <artifactId>servlet-api</artifactId>
                 </exclusion>
+                <exclusion>
+                    <groupId>org.apache.httpcomponents</groupId>
+                    <artifactId>httpmime</artifactId>
+                </exclusion>
                 <!-- Due to usage in LegacyProxyServer and BrowserMobProxyServer, this dependency needs to be given "provided" scope.
                      It is not used by the BMP LittleProxy implementation itself. -->
                 <exclusion>
diff --git a/browsermob-core-littleproxy/src/main/java/net/lightbody/bmp/filters/HarCaptureFilter.java b/browsermob-core-littleproxy/src/main/java/net/lightbody/bmp/filters/HarCaptureFilter.java
@@ -21,6 +21,7 @@
 import net.lightbody.bmp.core.har.HarPostDataParam;
 import net.lightbody.bmp.core.har.HarRequest;
 import net.lightbody.bmp.core.har.HarResponse;
+import net.lightbody.bmp.exception.UnsupportedCharsetException;
 import net.lightbody.bmp.filters.support.HttpConnectTiming;
 import net.lightbody.bmp.filters.util.HarCaptureUtil;
 import net.lightbody.bmp.proxy.CaptureType;
@@ -421,9 +422,22 @@ protected void captureRequestContent(HttpRequest httpRequest, byte[] fullMessage
             urlEncoded = false;
         }
 
+        Charset charset;
+        try {
+             charset = BrowserMobHttpUtil.readCharsetInContentTypeHeader(contentType);
+        } catch (UnsupportedCharsetException e) {
+            log.warn("Found unsupported character set in Content-Type header '{}' in HTTP request to {}. Content will not be captured in HAR.", contentType, httpRequest.getUri(), e);
+            return;
+        }
+
+        if (charset == null) {
+            // no charset specified, so use the default -- but log a message since this might not decode the data correctly
+            charset = BrowserMobHttpUtil.DEFAULT_HTTP_CHARSET;
+            log.debug("No charset specified; using charset {} to decode contents to {}", charset, httpRequest.getUri());
+        }
+
         if (urlEncoded) {
-            String textContents = BrowserMobHttpUtil.getContentAsString(fullMessage, contentType, originalRequest);
-            Charset charset = BrowserMobHttpUtil.deriveCharsetFromContentTypeHeader(contentType);
+            String textContents = BrowserMobHttpUtil.getContentAsString(fullMessage, charset);
 
             QueryStringDecoder queryStringDecoder = new QueryStringDecoder(textContents, charset, false);
 
@@ -440,7 +454,7 @@ protected void captureRequestContent(HttpRequest httpRequest, byte[] fullMessage
             //TODO: implement capture of files and multipart form data
 
             // not URL encoded, so let's grab the body of the POST and capture that
-            String postBody = BrowserMobHttpUtil.getContentAsString(fullMessage, contentType, originalRequest);
+            String postBody = BrowserMobHttpUtil.getContentAsString(fullMessage, charset);
             harEntry.getRequest().getPostData().setText(postBody);
         }
     }
@@ -451,7 +465,7 @@ protected void captureResponseContent(HttpResponse httpResponse, byte[] fullMess
 
         String contentType = HttpHeaders.getHeader(httpResponse, HttpHeaders.Names.CONTENT_TYPE);
         if (contentType == null) {
-            log.warn("No content type specified in response. Content will be treated as {}", BrowserMobHttpUtil.UNKNOWN_CONTENT_TYPE);
+            log.warn("No content type specified in response from {}. Content will be treated as {}", originalRequest.getUri(), BrowserMobHttpUtil.UNKNOWN_CONTENT_TYPE);
             contentType = BrowserMobHttpUtil.UNKNOWN_CONTENT_TYPE;
         }
 
@@ -461,8 +475,22 @@ protected void captureResponseContent(HttpResponse httpResponse, byte[] fullMess
             forceBinary = true;
         }
 
+        Charset charset;
+        try {
+            charset = BrowserMobHttpUtil.readCharsetInContentTypeHeader(contentType);
+        } catch (UnsupportedCharsetException e) {
+            log.warn("Found unsupported character set in Content-Type header '{}' in HTTP response from {}. Content will not be captured in HAR.", contentType, originalRequest.getUri(), e);
+            return;
+        }
+
+        if (charset == null) {
+            // no charset specified, so use the default -- but log a message since this might not decode the data correctly
+            charset = BrowserMobHttpUtil.DEFAULT_HTTP_CHARSET;
+            log.debug("No charset specified; using charset {} to decode contents from {}", charset, originalRequest.getUri());
+        }
+
         if (!forceBinary && BrowserMobHttpUtil.hasTextualContent(contentType)) {
-            String text = BrowserMobHttpUtil.getContentAsString(fullMessage, contentType, originalRequest);
+            String text = BrowserMobHttpUtil.getContentAsString(fullMessage, charset);
             harEntry.getResponse().getContent().setText(text);
         } else if (dataToCapture.contains(CaptureType.RESPONSE_BINARY_CONTENT)) {
             harEntry.getResponse().getContent().setText(DatatypeConverter.printBase64Binary(fullMessage));
diff --git a/browsermob-core-littleproxy/src/test/groovy/net/lightbody/bmp/util/BrowserMobHttpUtilTest.groovy b/browsermob-core-littleproxy/src/test/groovy/net/lightbody/bmp/util/BrowserMobHttpUtilTest.groovy
@@ -2,7 +2,12 @@ package net.lightbody.bmp.util
 
 import org.junit.Test
 
+import java.nio.charset.Charset
+
 import static org.junit.Assert.assertEquals
+import static org.junit.Assert.assertFalse
+import static org.junit.Assert.assertNull
+import static org.junit.Assert.assertTrue
 
 class BrowserMobHttpUtilTest {
     @Test
@@ -22,7 +27,7 @@ class BrowserMobHttpUtilTest {
 
         uriToResource.each {uri, expectedResource ->
             String parsedResource = BrowserMobHttpUtil.getPathFromUri(uri)
-            assertEquals("Parsed resource from URL did not match expected resource", expectedResource, parsedResource)
+            assertEquals("Parsed resource from URL did not match expected resource for URL: " + uri, expectedResource, parsedResource)
         }
     }
 
@@ -41,7 +46,63 @@ class BrowserMobHttpUtilTest {
 
         uriToHostAndPort.each {uri, expectedHostAndPort ->
             String parsedHostAndPort = BrowserMobHttpUtil.getHostAndPortFromUri(uri)
-            assertEquals("Parsed host and port from URL did not match expected host and port", expectedHostAndPort, parsedHostAndPort)
+            assertEquals("Parsed host and port from URL did not match expected host and port for URL: " + uri, expectedHostAndPort, parsedHostAndPort)
+        }
+    }
+
+    /**
+     * Verifies BrowserMobHttpUtil.readCharsetInContentTypeHeader: headers with a supported charset yield that charset,
+     * headers without a charset (or that are malformed, empty or null) yield null, and a header naming an unsupported
+     * charset raises the project's checked UnsupportedCharsetException.
+     */
+    @Test
+    void testReadCharsetInContentTypeHeader() {
+        Map<String, Charset> contentTypeHeaderAndCharset = [
+                'text/html; charset=UTF-8' : Charset.forName('UTF-8'),
+                'text/html; charset=US-ASCII' : Charset.forName('US-ASCII'),
+                'text/html' : null,
+                'application/json;charset=utf-8' : Charset.forName('UTF-8'),
+                'text/*; charset=US-ASCII' : Charset.forName('US-ASCII'),
+                'unknown-type/something-incredible' : null,
+                'unknown-type/something-incredible;charset=UTF-8' : Charset.forName('UTF-8'),
+                '1234 & extremely malformed!' : null,
+                '1234 & extremely malformed!;charset=UTF-8' : null, // malformed content-types result in unparseable charsets
+                '' : null,
+        ]
+
+        contentTypeHeaderAndCharset.each {contentTypeHeader, expectedCharset ->
+            Charset derivedCharset = BrowserMobHttpUtil.readCharsetInContentTypeHeader(contentTypeHeader)
+            assertEquals("Charset derived from parsed content type header did not match expected charset for content type header: " + contentTypeHeader, expectedCharset, derivedCharset)
+        }
+
+        Charset derivedCharset = BrowserMobHttpUtil.readCharsetInContentTypeHeader(null)
+        assertNull("Expected null Content-Type header to return a null charset", derivedCharset)
+
+        // The exception type is fully qualified and bound to a variable on purpose: a catch clause holding a single
+        // bare identifier is read by Groovy as an untyped catch whose *variable* has that name, which would accept
+        // any Exception and let this check pass for the wrong reason.
+        boolean threwException = false
+        try {
+            BrowserMobHttpUtil.readCharsetInContentTypeHeader('text/html; charset=FUTURE_CHARSET')
+        } catch (net.lightbody.bmp.exception.UnsupportedCharsetException ignored) {
+            threwException = true
         }
+
+        assertTrue('Expected an UnsupportedCharsetException to occur when parsing the content type header text/html; charset=FUTURE_CHARSET', threwException)
+    }
+
+    /**
+     * Checks BrowserMobHttpUtil.hasTextualContent against a table of Content-Type header values, then against null.
+     */
+    @Test
+    void testHasTextualContent() {
+        // Content-Type header value -> whether it is expected to be reported as textual content
+        Map<String, Boolean> expectedTextFlagByHeader = [
+                'text/html' : true,
+                'text/*' : true,
+                'application/x-javascript' : true,
+                'application/javascript' : true,
+                'application/xml' : true,
+                'application/xhtml+xml' : true,
+                'application/xhtml+xml; charset=UTF-8' : true,
+                'application/octet-stream' : false,
+                '': false,
+        ]
+
+        for (Map.Entry<String, Boolean> expectation : expectedTextFlagByHeader.entrySet()) {
+            String header = expectation.key
+            boolean reportedAsText = BrowserMobHttpUtil.hasTextualContent(header)
+            assertEquals("hasTextualContent did not return expected value for content type header: " + header, expectation.value, reportedAsText)
+        }
+
+        // a null header is exercised on its own, outside the table above
+        boolean nullHeaderReportedAsText = BrowserMobHttpUtil.hasTextualContent(null)
+        assertFalse("Expected hasTextualContent to return false for null content type", nullHeaderReportedAsText)
+    }
 }
diff --git a/browsermob-core/src/main/java/net/lightbody/bmp/exception/UnsupportedCharsetException.java b/browsermob-core/src/main/java/net/lightbody/bmp/exception/UnsupportedCharsetException.java
@@ -0,0 +1,23 @@
+package net.lightbody.bmp.exception;
+
+/**
+ * A checked exception wrapper for {@link java.nio.charset.UnsupportedCharsetException}. This exception is checked to prevent
+ * situations where an unsupported character set in e.g. a Content-Type header causes the proxy to fail completely, rather
+ * than fall back to some suitable default behavior, such as not parsing the text contents of a message.
+ * <p>
+ * The wrapped exception is stored as the cause of this exception and must not be null: the constructor throws an
+ * {@link IllegalArgumentException} when it is given a null argument.
+ */
+public class UnsupportedCharsetException extends Exception {
+    public UnsupportedCharsetException(java.nio.charset.UnsupportedCharsetException e) {
+        super(e);
+        // super(e) has already run at this point, so a null cause is only rejected after the superclass was initialised with it
+        if (e == null) {
+            throw new IllegalArgumentException("net.lightbody.bmp.exception.UnsupportedCharsetException must be initialized with a non-null instance of java.nio.charset.UnsupportedCharsetException");
+        }
+    }
+
+    /**
+     * @return the underlying {@link java.nio.charset.UnsupportedCharsetException} that this exception wraps, i.e. the
+     *         cause passed to the constructor, cast back to its original type.
+     */
+    public java.nio.charset.UnsupportedCharsetException getUnsupportedCharsetExceptionCause() {
+        return (java.nio.charset.UnsupportedCharsetException) this.getCause();
+    }
+}
diff --git a/browsermob-core/src/main/java/net/lightbody/bmp/proxy/CaptureType.java b/browsermob-core/src/main/java/net/lightbody/bmp/proxy/CaptureType.java
@@ -19,8 +19,7 @@ public enum CaptureType {
 
     /**
      * Non-binary HTTP request content, such as post data or other text-based request payload.
-     * FIXME: link to binary content-types
-     * See ${@link TBD} for a list of Content-Types that
+     * See {@link net.lightbody.bmp.util.BrowserMobHttpUtil#hasTextualContent(String)} for a list of Content-Types that
      * are considered non-binary.
      *
      */
@@ -43,8 +42,7 @@ public enum CaptureType {
 
     /**
      * Non-binary HTTP response content (typically, HTTP body content).
-     * FIXME: link to binary content-types
-     * See ${@link TBD} for a list of Content-Types that
+     * See {@link net.lightbody.bmp.util.BrowserMobHttpUtil#hasTextualContent(String)} for a list of Content-Types that
      * are considered non-binary.
      */
     RESPONSE_CONTENT,
diff --git a/browsermob-core/src/main/java/net/lightbody/bmp/util/BrowserMobHttpUtil.java b/browsermob-core/src/main/java/net/lightbody/bmp/util/BrowserMobHttpUtil.java
@@ -1,12 +1,13 @@
 package net.lightbody.bmp.util;
 
 import com.google.common.net.HostAndPort;
+import com.google.common.net.MediaType;
 import io.netty.buffer.ByteBuf;
 import io.netty.handler.codec.http.HttpHeaders;
 import io.netty.handler.codec.http.HttpRequest;
 import io.netty.handler.codec.http.HttpResponse;
 import net.lightbody.bmp.exception.DecompressionException;
-import org.apache.http.entity.ContentType;
+import net.lightbody.bmp.exception.UnsupportedCharsetException;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -18,6 +19,7 @@
 import java.nio.charset.Charset;
 import java.nio.charset.StandardCharsets;
 import java.util.List;
+import java.util.Locale;
 import java.util.Map;
 import java.util.zip.GZIPInputStream;
 import java.util.zip.InflaterInputStream;
@@ -149,62 +151,47 @@ public static byte[] extractReadableBytes(ByteBuf content) {
     }
 
     /**
-     * Converts the byte array into a String based on the charset specified in the contentTypeHeader. If no
-     * charset is specified in the contentTypeHeader, this method uses default (see {@link #DEFAULT_HTTP_CHARSET}). The httpRequest is used
-     * only for logging purposes if the contentTypeHeader does not contain a charset.
+     * Converts the byte array into a String based on the specified charset. The charset cannot be null.
+     * No default charset is applied here; a caller that has no charset must choose one itself (for example {@link #DEFAULT_HTTP_CHARSET}).
      *
      * @param content bytes to convert to a String
-     * @param contentTypeHeader request's content type header
-     * @param httpRequest HTTP request responsible for this content (used for logging purposes only)
+     * @param charset the character set of the content
      * @return String containing the converted content
+     * @throws IllegalArgumentException if charset is null
      */
-    public static String getContentAsString(byte[] content, String contentTypeHeader, HttpRequest httpRequest) {
-        Charset charset = readCharsetInContentTypeHeader(contentTypeHeader);
+    public static String getContentAsString(byte[] content, Charset charset) {
         if (charset == null) {
-            // no charset specified, so use the default -- but log a message since this might not encode the data correctly
-            charset = DEFAULT_HTTP_CHARSET;
-            if (httpRequest != null) {
-                log.debug("No charset specified; using charset {} to decode contents to/from {}", charset, httpRequest.getUri());
-            } else {
-                log.debug("No charset specified; using charset {} to decode contents", charset);
-            }
+            throw new IllegalArgumentException("Charset cannot be null");
         }
 
         return new String(content, charset);
     }
 
     /**
-     * Derives the charset from the Content-Type header. Unlike {@link #readCharsetInContentTypeHeader}, if contentTypeHeader is null or
-     * does not specify a charset, this method will return the ISO-8859-1 charset.
-     *
-     * @param contentTypeHeader the Content-Type header string; can be null or empty
-     * @return the character set indicated in the contentTypeHeader, or ISO-8859-1 if none is specified or no contentTypeHeader is specified
-     */
-    public static Charset deriveCharsetFromContentTypeHeader(String contentTypeHeader) {
-        Charset charset = readCharsetInContentTypeHeader(contentTypeHeader);
-        if (charset == null) {
-            return DEFAULT_HTTP_CHARSET;
-        }
-
-        return charset;
-    }
-
-    /**
-     * Reads the charset directly from the Content-Type header string. If the Content-Type header does not contain a charset, or if the header
-     * is null or empty, this method returns null. See also {@link #deriveCharsetFromContentTypeHeader(String)}.
+     * Reads the charset directly from the Content-Type header string. If the Content-Type header does not contain a charset,
+     * is malformed or unparsable (including a syntactically illegal charset name or conflicting charset parameters),
+     * or if the header is null or empty, this method returns null.
      *
      * @param contentTypeHeader the Content-Type header string; can be null or empty
-     * @return the character set indicated in the contentTypeHeader, or null if the charset is not present
+     * @return the character set indicated in the contentTypeHeader, or null if the charset is not present or is not parsable
+     * @throws UnsupportedCharsetException if there is a charset specified in the content-type header, but it is not supported on this platform
      */
-    public static Charset readCharsetInContentTypeHeader(String contentTypeHeader) {
+    public static Charset readCharsetInContentTypeHeader(String contentTypeHeader) throws UnsupportedCharsetException {
         if (contentTypeHeader == null || contentTypeHeader.isEmpty()) {
-            return DEFAULT_HTTP_CHARSET;
+            return null;
         }
 
-        //FIXME: remove dependency on HttpCore's ContentType
-        ContentType contentTypeCharset = ContentType.parse(contentTypeHeader);
+        MediaType mediaType;
+        try {
+            mediaType = MediaType.parse(contentTypeHeader);
+        } catch (IllegalArgumentException e) {
+            log.info("Unable to parse Content-Type header: {}. Content-Type header will be ignored.", contentTypeHeader, e);
+            return null;
+        }
 
-        return contentTypeCharset.getCharset();
+        try {
+            return mediaType.charset().orNull();
+        } catch (java.nio.charset.UnsupportedCharsetException e) {
+            // a well-formed charset name that this JVM does not provide: surface it as the checked wrapper so callers must handle it
+            throw new UnsupportedCharsetException(e);
+        } catch (java.nio.charset.IllegalCharsetNameException | IllegalStateException e) {
+            // charset() also rejects syntactically illegal charset names and conflicting charset parameters; treat both
+            // like any other unparsable header instead of letting an unchecked exception escape to the caller
+            log.info("Unable to read charset from Content-Type header: {}. Charset will be ignored.", contentTypeHeader, e);
+            return null;
+        }
     }
 
     /**
@@ -284,11 +271,15 @@ public static boolean startsWithHttpOrHttps(String uri) {
             return false;
         }
 
-        if (uri.startsWith("http://") || uri.startsWith("https://")) {
-            return true;
-        } else {
-            return false;
-        }
+        // the scheme is case insensitive, according to RFC 7230, section 2.7.3:
+        /*
+            The scheme and host
+            are case-insensitive and normally provided in lowercase; all other
+            components are compared in a case-sensitive manner.
+        */
+        String lowercaseUri = uri.toLowerCase(Locale.US);
+
+        return lowercaseUri.startsWith("http://") || lowercaseUri.startsWith("https://");
     }
 
     /**
diff --git a/browsermob-core/src/main/java/net/lightbody/bmp/util/HttpMessageContents.java b/browsermob-core/src/main/java/net/lightbody/bmp/util/HttpMessageContents.java
diff --git a/browsermob-core/src/main/java/net/lightbody/bmp/util/HttpObjectUtil.java b/browsermob-core/src/main/java/net/lightbody/bmp/util/HttpObjectUtil.java
diff --git a/browsermob-rest/src/main/java/net/lightbody/bmp/proxy/bricks/ProxyResource.java b/browsermob-rest/src/main/java/net/lightbody/bmp/proxy/bricks/ProxyResource.java