@@ -9,20 +9,24 @@ HTML::Parser - HTML parser class
9
9
# SYNOPSIS
10
10
11
11
``` perl
12
+ use strict;
13
+ use warnings;
12
14
use HTML::Parser ();
13
15
14
16
# Create parser object
15
- $p = HTML::Parser-> new( api_version => 3,
16
- start_h => [\&start, " tagname, attr" ],
17
- end_h => [\&end, " tagname" ],
18
- marked_sections => 1,
19
- );
17
+ my $p = HTML::Parser-> new(
18
+ api_version => 3,
19
+ start_h => [\&start, " tagname, attr" ],
20
+ end_h => [\&end, " tagname" ],
21
+ marked_sections => 1,
22
+ );
20
23
21
24
# Parse document text chunk by chunk
22
25
$p -> parse($chunk1 );
23
26
$p -> parse($chunk2 );
24
- # ...
25
- $p -> eof ; # signal end of document
27
+ # ...
28
+ # signal end of document
29
+ $p -> eof ;
26
30
27
31
# Parse directly from file
28
32
$p -> parse_file(" foo.html" );
@@ -83,26 +87,33 @@ The following method is used to construct a new `HTML::Parser` object:
83
87
Examples:
84
88
85
89
``` perl
86
- $p = HTML::Parser-> new(api_version => 3,
87
- text_h => [ sub {...}, " dtext" ]);
90
+ $p = HTML::Parser-> new(
91
+ api_version => 3,
92
+ text_h => [ sub {...}, " dtext" ]
93
+ );
88
94
```
89
95
90
96
This creates a new parser object with a text event handler subroutine
91
97
that receives the original text with general entities decoded.
92
98
93
99
```perl
94
- $p = HTML::Parser->new(api_version => 3,
95
- start_h => [ 'my_start', "self,tokens" ]);
100
+ $p = HTML::Parser->new(
101
+ api_version => 3,
102
+ start_h => [ 'my_start', "self,tokens" ]
103
+ );
96
104
```
97
105
98
106
This creates a new parser object with a start event handler method
99
107
that receives `$p` and the tokens array.
100
108
101
109
```perl
102
- $p = HTML::Parser->new(api_version => 3,
103
- handlers => { text => [\@ array, "event,text"],
104
- comment => [\@ array, "event,text"],
105
- });
110
+ $p = HTML::Parser->new(
111
+ api_version => 3,
112
+ handlers => {
113
+ text => [\@ array, "event,text"],
114
+ comment => [\@ array, "event,text"],
115
+ }
116
+ );
106
117
```
107
118
108
119
This creates a new parser object that stores the event type and the
@@ -133,12 +144,12 @@ to the `HTML::Parser` object:
133
144
134
145
```perl
135
146
while (1) {
136
- my $chunk = &$code_ref ();
137
- if (!defined($chunk ) || !length($chunk )) {
138
- $p ->eof;
139
- return $p ;
140
- }
141
- $p ->parse($chunk ) || return undef;
147
+ my $chunk = &$code_ref ();
148
+ if (!defined($chunk ) || !length($chunk )) {
149
+ $p ->eof;
150
+ return $p ;
151
+ }
152
+ $p ->parse($chunk ) || return undef;
142
153
}
143
154
```
144
155
@@ -214,16 +225,16 @@ Methods that can be used to get and/or set parser options are:
214
225
- $p ->case\_ sensitive
215
226
- $p ->case\_ sensitive( $bool )
216
227
217
- By default, tagnames and attribute names are down-cased. Enabling this
228
+ By default, tag names and attribute names are down-cased. Enabling this
218
229
attribute leaves them as found in the HTML source document.
219
230
220
231
- $p ->closing\_ plaintext
221
232
- $p ->closing\_ plaintext( $bool )
222
233
223
- By default, " plaintext" element can never be closed. Everything up to
234
+ By default, the `plaintext` element can never be closed. Everything up to
224
235
the end of the document is parsed in CDATA mode. This historical
225
236
behaviour is what at least MSIE does. Enabling this attribute makes
226
- closing " & lt ; /plaintext> " tag effective and the parsing process will resume
237
+ the closing `</plaintext>` tag effective and the parsing process will resume
227
238
after seeing this tag. This emulates early gecko-based browsers.
228
239
229
240
- $p ->empty\_ element\_ tags
@@ -405,8 +416,8 @@ method is used to set up handlers for different events:
405
416
$p ->handler(start => "start", 'self, attr, attrseq, text' );
406
417
```
407
418
408
- This causes the " start" method of object $p to be called for ' start' events.
409
- The callback signature is $p -> start(\\ %attr , \\ @attr \_seq , $text ).
419
+ This causes the " start" method of object ` $p ` to be called for ' start' events.
420
+ The callback signature is ` $p ->start(\% attr, \@ attr_seq , $text )` .
410
421
411
422
```perl
412
423
$p ->handler(start => \& start, 'attr, attrseq, text' );
@@ -857,24 +868,28 @@ $p->handler(start => "start", "self, tagname, attr, attrseq, text");
857
868
$p ->handler(end => " end" , " self, tagname, text" );
858
869
$p ->handler(text => " text" , " self, text, is_cdata" );
859
870
$p ->handler(process => " process" , " self, token0, text" );
860
- $p ->handler(comment =>
861
- sub {
862
- my($self , $tokens ) = @_ ;
863
- for (@$tokens ) {$self ->comment($_ );}},
864
- " self, tokens" );
865
- $p ->handler(declaration =>
866
- sub {
867
- my $self = shift;
868
- $self ->declaration(substr($_ [0], 2, -1));},
869
- " self, text" );
871
+ $p ->handler(
872
+ comment => sub {
873
+ my($self , $tokens ) = @_ ;
874
+ for (@$tokens ) {$self ->comment($_ );}
875
+ },
876
+ " self, tokens"
877
+ );
878
+ $p ->handler(
879
+ declaration => sub {
880
+ my $self = shift;
881
+ $self ->declaration(substr($_ [0], 2, -1));
882
+ },
883
+ " self, text"
884
+ );
870
885
```
871
886
872
887
Setting up these handlers can also be requested with the " api\_version =>
873
888
2" constructor option.
874
889
875
890
# SUBCLASSING
876
891
877
- The `HTML::Parser` class is subclassable . Parser objects are plain
892
+ The `HTML::Parser` class can be subclassed. Parser objects are plain
878
893
hashes and `HTML::Parser` reserves only hash keys that start with
879
894
" \_hparser" . The parser state can be set up by invoking the init()
880
895
method, which takes the same arguments as new().
@@ -887,19 +902,20 @@ does nothing and a default handler that will print out anything else:
887
902
888
903
```perl
889
904
use HTML::Parser;
890
- HTML::Parser->new(default_h => [sub { print shift }, 'text'],
891
- comment_h => [" " ],
892
- )->parse_file(shift || die) || die $! ;
905
+ HTML::Parser->new(
906
+ default_h => [sub { print shift }, 'text'],
907
+ comment_h => [" " ],
908
+ )->parse_file(shift || die) || die $! ;
893
909
```
894
910
895
911
An alternative implementation is:
896
912
897
913
```perl
898
914
use HTML::Parser;
899
- HTML::Parser->new(end_document_h => [sub { print shift },
900
- 'skipped_text'],
901
- comment_h => [" " ],
902
- )->parse_file(shift || die) || die $! ;
915
+ HTML::Parser->new(
916
+ end_document_h => [sub { print shift }, 'skipped_text'],
917
+ comment_h => [" " ],
918
+ )->parse_file(shift || die) || die $! ;
903
919
```
904
920
905
921
This will in most cases be much more efficient since only a single
@@ -914,17 +930,20 @@ parsing as soon as the title end tag is seen:
914
930
```perl
915
931
use HTML::Parser ();
916
932
917
- sub start_handler
918
- {
933
+ sub start_handler {
919
934
return if shift ne " title" ;
920
935
my $self = shift;
921
936
$self ->handler(text => sub { print shift }, " dtext" );
922
- $self ->handler(end => sub { shift->eof if shift eq " title" ; },
923
- " tagname,self" );
937
+ $self ->handler(
938
+ end => sub {
939
+ shift->eof if shift eq " title" ;
940
+ },
941
+ " tagname,self"
942
+ );
924
943
}
925
944
926
945
my $p = HTML::Parser->new(api_version => 3);
927
- $p ->handler( start => \& start_handler, " tagname,self" );
946
+ $p ->handler(start => \& start_handler, " tagname,self" );
928
947
$p ->parse_file(shift || die) || die $! ;
929
948
print " \n" ;
930
949
```
@@ -962,7 +981,7 @@ respectively.
962
981
NET tags, e.g. " code/.../" are not recognized. This is SGML
963
982
shorthand for "<code>...</code>".
964
983
965
- Unclosed start or end tags, e.g. "<tt<b>...</b</tt>" are not
984
+ Incomplete start or end tags, e.g. "<tt<b>...</b</tt>" are not
966
985
recognized.
967
986
968
987
# DIAGNOSTICS
@@ -1070,23 +1089,23 @@ in this listing is the same as used in [perldiag](https://metacpan.org/pod/perld
1070
1089
1071
1090
The alternative solution is to enable the `utf8_mode` and not decode before
1072
1091
passing strings to $p ->parse(). The parser can process raw undecoded UTF-8
1073
- sanely if the `utf8_mode` is enabled, or if the " attr" , " @attr " or " dtext"
1092
+ sanely if the `utf8_mode` is enabled, or if the ` attr`, ` @attr ` or ` dtext`
1074
1093
argspecs are avoided.
1075
1094
1076
- - Parsing string decoded with wrong endianness
1095
+ - Parsing string decoded with wrong endianness
1077
1096
1078
1097
(W) The first character in the document is U+FFFE. This is not a
1079
- legal Unicode character but a byte swapped BOM. The result of parsing
1098
+ legal Unicode character but a byte-swapped `BOM`. The result of parsing
1080
1099
will likely be garbage.
1081
1100
1082
1101
- Parsing of undecoded UTF-32
1083
1102
1084
- (W) The parser found the Unicode UTF-32 BOM signature at the start
1103
+ (W) The parser found the Unicode UTF-32 ` BOM` signature at the start
1085
1104
of the document. The result of parsing will likely be garbage.
1086
1105
1087
1106
- Parsing of undecoded UTF-16
1088
1107
1089
- (W) The parser found the Unicode UTF-16 BOM signature at the start of
1108
+ (W) The parser found the Unicode UTF-16 ` BOM` signature at the start of
1090
1109
the document. The result of parsing will likely be garbage.
1091
1110
1092
1111
# SEE ALSO
0 commit comments