File tree 3 files changed +27
-15
lines changed
java/org/metafacture/html
test/java/org/metafacture/html
3 files changed +27
-15
lines changed Original file line number Diff line number Diff line change 30
30
import org .metafacture .framework .helpers .DefaultObjectPipe ;
31
31
32
32
/**
33
- * Extracts the first script from an HTML document
33
+ * Extracts the specified element from an HTML document
34
34
*
35
35
* @author Fabian Steeg
36
36
*/
37
- @ Description ("Extracts the first script from an HTML document" )
37
+ @ Description ("Extracts the specified element from an HTML document" )
38
38
@ In (Reader .class )
39
39
@ Out (String .class )
40
- @ FluxCommand ("extract-script" )
41
- public class ScriptExtractor extends DefaultObjectPipe <Reader , ObjectReceiver <String >> {
40
+ @ FluxCommand ("extract-element" )
41
+ public class ElementExtractor extends DefaultObjectPipe <Reader , ObjectReceiver <String >> {
42
+ private String selector ;
43
+
44
+ /**
45
+ * @param selector The CSS-style jsoup selector, see https://jsoup.org/cookbook/extracting-data/selector-syntax
46
+ */
47
+ public ElementExtractor (final String selector ) {
48
+ this .selector = selector ;
49
+ }
50
+
42
51
@ Override
43
52
public void process (final Reader reader ) {
44
53
try {
45
54
Document document = Jsoup .parse (IOUtils .toString (reader ));
46
- Element firstScript = document .select ("script" ).first ();
47
- getReceiver ().process (firstScript .data ());
55
+ Element firstElement = document .select (selector ).first ();
56
+ getReceiver ().process (firstElement .data ());
48
57
} catch (IOException e ) {
49
58
e .printStackTrace ();
50
59
}
Original file line number Diff line number Diff line change 14
14
# limitations under the License.
15
15
#
16
16
decode-html org.metafacture.html.HtmlDecoder
17
- extract-script org.metafacture.html.ScriptExtractor
17
+ extract-element org.metafacture.html.ElementExtractor
Original file line number Diff line number Diff line change 28
28
import org .mockito .MockitoAnnotations ;
29
29
30
30
/**
31
- * Tests for {@link ScriptExtractor }.
31
+ * Tests for {@link ElementExtractor }.
32
32
*
33
33
* @author Fabian Steeg
34
34
*
35
35
*/
36
- public final class ScriptExtractorTest {
36
+ public final class ElementExtractorTest {
37
37
38
- private static final StringReader IN = new StringReader ("<html><script>{\" code\" :\" yo\" }" );
38
+ private static final StringReader IN = new StringReader ("<html>"
39
+ + "<script data-test='site-head-data'>{\" code\" :\" hey\" }</script>"
40
+ + "<script data-test='model-linked-data'>{\" code\" :\" yo\" }" );
41
+
39
42
private static final String OUT = "{\" code\" :\" yo\" }" ;
40
43
41
- private ScriptExtractor scriptExtractor ;
44
+ private ElementExtractor elementExtractor ;
42
45
43
46
@ Mock
44
47
private ObjectReceiver <String > receiver ;
45
48
46
49
@ Before
47
50
public void setup () {
48
51
MockitoAnnotations .initMocks (this );
49
- scriptExtractor = new ScriptExtractor ( );
50
- scriptExtractor .setReceiver (receiver );
52
+ elementExtractor = new ElementExtractor ( "script[data-test=model-linked-data]" );
53
+ elementExtractor .setReceiver (receiver );
51
54
}
52
55
53
56
@ Test
54
57
public void testShouldProcessRecordsFollowedbySeparator () {
55
- scriptExtractor .process (IN );
58
+ elementExtractor .process (IN );
56
59
verify (receiver ).process (OUT );
57
60
verifyNoMoreInteractions (receiver );
58
61
}
59
62
60
63
@ After
61
64
public void cleanup () {
62
- scriptExtractor .closeStream ();
65
+ elementExtractor .closeStream ();
63
66
}
64
67
}
You can’t perform that action at this time.
0 commit comments