@@ -179,6 +179,7 @@ public void handleTagOpen(TagNode tag) {
179179 attrName = attrName .toLowerCase (Locale .ROOT );
180180 if (globalHrefAttributes .contains (attrName )) {
181181 attrValue = decodeCharEnt (attrValue );
182+ attrValue = trimDataUrl (attrValue );
182183 data .addHref (PATH ,makePath (name ,attrName ),"url" ,attrValue );
183184 }
184185 }
@@ -382,24 +383,36 @@ private static void addBasicHrefs(HTMLMetaData data, TagNode node, String... att
382383 String val = node .getAttribute (attr );
383384 if (val != null ) {
384385 val = decodeCharEnt (val );
386+ val = trimDataUrl (val );
385387 data .addHref (PATH ,makePath (node .getTagName (),attr ),"url" ,val );
386388 }
387389 }
388390 }
389391
390392 private static ArrayList <String > getAttrList (TagNode node , String ... attrs ) {
391393 ArrayList <String > l = new ArrayList <String >();
394+ boolean isOgImage = false ;
392395 for (String attr : attrs ) {
393396 String val = node .getAttribute (attr );
394397 if (val != null ) {
395398 val = decodeCharEnt (val );
396399 l .add (attr );
397400 l .add (val );
401+ if (attr .equals ("property" ) && val .equals ("og:image" )) {
402+ isOgImage = true ;
403+ }
398404 }
399405 }
400406 if (l .size () == 0 ) {
401407 return null ;
402408 }
409+ if (isOgImage ) {
410+ // trim data: URLs in og:image metadata
411+ int content = l .indexOf ("content" );
412+ if (content > -1 && (content % 2 ) == 0 ) {
413+ l .set (content + 1 , trimDataUrl (l .get (content + 1 )));
414+ }
415+ }
403416 return l ;
404417 }
405418
@@ -409,6 +422,7 @@ private static ArrayList<String> getAttrListUrl(TagNode node,
409422 ArrayList <String > l = null ;
410423 if (url != null ) {
411424 url = decodeCharEnt (url );
425+ url = trimDataUrl (url );
412426 l = new ArrayList <String >();
413427 l .add (PATH );
414428 l .add (makePath (node .getTagName (),urlAttr ));
@@ -442,6 +456,7 @@ private static void addHrefsOnclick(HTMLMetaData data, TagNode node) {
442456 for (Pattern pattern : jsOnClickUrlPatterns ) {
443457 String url = patternJSExtract (pattern , onclick );
444458 if (url != null ) {
459+ url = trimDataUrl (url );
445460 data .addHref (PATH , path , "url" , url );
446461 }
447462 }
@@ -483,6 +498,7 @@ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs
483498 if (url != null ) {
484499 // got data:
485500 url = decodeCharEnt (url );
501+ url = trimDataUrl (url );
486502 l .add (PATH );
487503 l .add (makePath ("A" ,"href" ));
488504 l .add ("url" );
@@ -520,6 +536,7 @@ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs
520536 String url = node .getAttribute ("href" );
521537 if (url != null ) {
522538 url = decodeCharEnt (url );
539+ url = trimDataUrl (url );
523540 ArrayList <String > l = new ArrayList <String >();
524541 l .add (PATH );
525542 l .add (makePath ("AREA" ,"href" ));
@@ -583,6 +600,7 @@ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs
583600 String url = node .getAttribute ("action" );
584601 if (url != null ) {
585602 url = decodeCharEnt (url );
603+ url = trimDataUrl (url );
586604 // got data:
587605 l .add (PATH );
588606 l .add (makePath ("FORM" ,"action" ));
@@ -728,7 +746,8 @@ private void patternCSSExtract(HTMLMetaData data, Pattern pattern, String conten
728746 String url = m .group (1 );
729747 url = cssUrlTrimPattern .matcher (url ).replaceAll ("" );
730748 if (!url .isEmpty ()) {
731- data .addHref ("path" ,"STYLE/#text" ,"href" , url );
749+ url = trimDataUrl (url );
750+ data .addHref ("path" , "STYLE/#text" , "href" , url );
732751 }
733752 }
734753 }
@@ -757,4 +776,36 @@ public static String decodeCharEnt(String text, boolean inAttribute) {
757776 return text ;
758777 }
759778 }
779+
780+ /**
781+ * Trim data from
782+ * <a href="https://www.rfc-editor.org/rfc/rfc2397#section-2">data URLs</a>.
783+ *
784+ * Any data (after the comma) is trimmed from a data URL. If no comma is
785+ * found within the first 128 characters of the URL, the URL is trimmed to
786+ * 128 characters.
787+ *
788+ * @param url
789+ * URL to be trimmed
790+ * @return
791+ */
792+ public static String trimDataUrl (String url ) {
793+ if (url .startsWith ("data:" )) {
794+ int posComma = url .indexOf (',' , 5 );
795+ if (posComma == -1 ) {
796+ // no comma, trim to 128 characters if necessary
797+ if (url .length () > 128 ) {
798+ return url .substring (0 , 128 );
799+ }
800+ return url ;
801+ } else if (posComma > 128 ) {
802+ return url .substring (0 , 128 );
803+ } else if (posComma == 6 ) {
804+ return "data:," ;
805+ } else if (posComma > 6 ) {
806+ return url .substring (0 , posComma + 1 );
807+ }
808+ }
809+ return url ;
810+ }
760811}
0 commit comments