Move Parser.pm to lib/HTML. Clean up some of the spacing in examples and fix misspellings

genio · genio · commit 72fa60e5a59b · 2020-08-30T15:36:32.000-04:00
diff --git a/Changes b/Changes
@@ -3,6 +3,11 @@ Change history for HTML-Parser
 {{$NEXT}}
   * Cleanup the prereqs a bit
   * Mark HTML::Filter as deprecated as the docs point out
+  * Move Parser.pm into the lib directory with the others. This will help
+    with everything from auto version bumps after releases, to scanning for
+    prerequisites and spelling errors.
+  * Fix a few spelling errors in the POD for HTML::Parser
+  * Clean up the spacing in many examples in HTML::Parser
 
 3.74      2020-08-30
   * Fix the order of date and version in this change log. (Thanks, haarg)
diff --git a/META.json b/META.json
@@ -1,5 +1,5 @@
 {
-   "abstract" : "Filter HTML text through the parser",
+   "abstract" : "HTML parser class",
    "author" : [
       "Gisle Aas <gaas@cpan.org>"
    ],
@@ -107,6 +107,10 @@
          "file" : "lib/HTML/LinkExtor.pm",
          "version" : "3.75"
       },
+      "HTML::Parser" : {
+         "file" : "lib/HTML/Parser.pm",
+         "version" : "3.75"
+      },
       "HTML::PullParser" : {
          "file" : "lib/HTML/PullParser.pm",
          "version" : "3.75"
diff --git a/Makefile.PL b/Makefile.PL
@@ -8,7 +8,7 @@ use warnings;
 use ExtUtils::MakeMaker;
 
 my %WriteMakefileArgs = (
-  "ABSTRACT" => "Filter HTML text through the parser",
+  "ABSTRACT" => "HTML parser class",
   "AUTHOR" => "Gisle Aas <gaas\@cpan.org>",
   "CONFIGURE_REQUIRES" => {
     "ExtUtils::MakeMaker" => "6.52"
diff --git a/README.md b/README.md
@@ -9,20 +9,24 @@ HTML::Parser - HTML parser class
 # SYNOPSIS
 
 ```perl
+use strict;
+use warnings;
 use HTML::Parser ();
 
 # Create parser object
-$p = HTML::Parser->new( api_version => 3,
-                        start_h => [\&start, "tagname, attr"],
-                        end_h   => [\&end,   "tagname"],
-                        marked_sections => 1,
-                      );
+my $p = HTML::Parser->new(
+  api_version => 3,
+  start_h => [\&start, "tagname, attr"],
+  end_h   => [\&end,   "tagname"],
+  marked_sections => 1,
+);
 
 # Parse document text chunk by chunk
 $p->parse($chunk1);
 $p->parse($chunk2);
-#...
-$p->eof;                 # signal end of document
+# ...
+# signal end of document
+$p->eof;
 
 # Parse directly from file
 $p->parse_file("foo.html");
@@ -83,26 +87,33 @@ The following method is used to construct a new `HTML::Parser` object:
     Examples:
 
     ```perl
-    $p = HTML::Parser->new(api_version => 3,
-                           text_h => [ sub {...}, "dtext" ]);
+    $p = HTML::Parser->new(
+      api_version => 3,
+      text_h => [ sub {...}, "dtext" ]
+    );
     ```
 
     This creates a new parser object with a text event handler subroutine
     that receives the original text with general entities decoded.
 
     ```perl
-    $p = HTML::Parser->new(api_version => 3,
-                           start_h => [ 'my_start', "self,tokens" ]);
+    $p = HTML::Parser->new(
+      api_version => 3,
+      start_h => [ 'my_start', "self,tokens" ]
+    );
     ```
 
     This creates a new parser object with a start event handler method
     that receives the $p and the tokens array.
 
     ```perl
-    $p = HTML::Parser->new(api_version => 3,
-                           handlers => { text => [\@array, "event,text"],
-                                         comment => [\@array, "event,text"],
-                                       });
+    $p = HTML::Parser->new(
+      api_version => 3,
+      handlers => {
+        text => [\@array, "event,text"],
+        comment => [\@array, "event,text"],
+      }
+    );
     ```
 
     This creates a new parser object that stores the event type and the
@@ -133,12 +144,12 @@ to the `HTML::Parser` object:
 
     ```perl
     while (1) {
-       my $chunk = &$code_ref();
-       if (!defined($chunk) || !length($chunk)) {
-           $p->eof;
-           return $p;
-       }
-       $p->parse($chunk) || return undef;
+      my $chunk = &$code_ref();
+      if (!defined($chunk) || !length($chunk)) {
+        $p->eof;
+        return $p;
+      }
+      $p->parse($chunk) || return undef;
     }
     ```
 
@@ -214,16 +225,16 @@ Methods that can be used to get and/or set parser options are:
 - $p->case\_sensitive
 - $p->case\_sensitive( $bool )
 
-    By default, tagnames and attribute names are down-cased.  Enabling this
+    By default, tag names and attribute names are down-cased.  Enabling this
     attribute leaves them as found in the HTML source document.
 
 - $p->closing\_plaintext
 - $p->closing\_plaintext( $bool )
 
-    By default, "plaintext" element can never be closed. Everything up to
+    By default, the `plaintext` element can never be closed. Everything up to
     the end of the document is parsed in CDATA mode.  This historical
     behaviour is what at least MSIE does.  Enabling this attribute makes
-    closing "&lt;/plaintext>" tag effective and the parsing process will resume
+    a closing `</plaintext>` tag effective and the parsing process will resume
     after seeing this tag.  This emulates early gecko-based browsers.
 
 - $p->empty\_element\_tags
@@ -405,8 +416,8 @@ method is used to set up handlers for different events:
     $p->handler(start =>  "start", 'self, attr, attrseq, text' );
     ```
 
-    This causes the "start" method of object $p to be called for 'start' events.
-    The callback signature is $p->start(\\%attr, \\@attr\_seq, $text).
+    This causes the "start" method of object `$p` to be called for 'start' events.
+    The callback signature is `$p->start(\%attr, \@attr_seq, $text)`.
 
     ```perl
     $p->handler(start =>  \&start, 'attr, attrseq, text' );
@@ -857,24 +868,28 @@ $p->handler(start   => "start",   "self, tagname, attr, attrseq, text");
 $p->handler(end     => "end",     "self, tagname, text");
 $p->handler(text    => "text",    "self, text, is_cdata");
 $p->handler(process => "process", "self, token0, text");
-$p->handler(comment =>
-          sub {
-              my($self, $tokens) = @_;
-              for (@$tokens) {$self->comment($_);}},
-          "self, tokens");
-$p->handler(declaration =>
-          sub {
-              my $self = shift;
-              $self->declaration(substr($_[0], 2, -1));},
-          "self, text");
+$p->handler(
+  comment => sub {
+    my($self, $tokens) = @_;
+    for (@$tokens) {$self->comment($_);}
+  },
+  "self, tokens"
+);
+$p->handler(
+  declaration => sub {
+    my $self = shift;
+    $self->declaration(substr($_[0], 2, -1));
+  },
+  "self, text"
+);
 ```
 
 Setting up these handlers can also be requested with the "api\_version =>
 2" constructor option.
 
 # SUBCLASSING
 
-The `HTML::Parser` class is subclassable.  Parser objects are plain
+The `HTML::Parser` class can be subclassed.  Parser objects are plain
 hashes and `HTML::Parser` reserves only hash keys that start with
 "\_hparser".  The parser state can be set up by invoking the init()
 method, which takes the same arguments as new().
@@ -887,19 +902,20 @@ does nothing and a default handler that will print out anything else:
 
 ```perl
 use HTML::Parser;
-HTML::Parser->new(default_h => [sub { print shift }, 'text'],
-                  comment_h => [""],
-                 )->parse_file(shift || die) || die $!;
+HTML::Parser->new(
+  default_h => [sub { print shift }, 'text'],
+  comment_h => [""],
+)->parse_file(shift || die) || die $!;
 ```
 
 An alternative implementation is:
 
 ```perl
 use HTML::Parser;
-HTML::Parser->new(end_document_h => [sub { print shift },
-                                     'skipped_text'],
-                  comment_h      => [""],
-                 )->parse_file(shift || die) || die $!;
+HTML::Parser->new(
+  end_document_h => [sub { print shift }, 'skipped_text'],
+  comment_h      => [""],
+)->parse_file(shift || die) || die $!;
 ```
 
 This will in most cases be much more efficient since only a single
@@ -914,17 +930,20 @@ parsing as soon as the title end tag is seen:
 ```perl
 use HTML::Parser ();
 
-sub start_handler
-{
+sub start_handler {
   return if shift ne "title";
   my $self = shift;
   $self->handler(text => sub { print shift }, "dtext");
-  $self->handler(end  => sub { shift->eof if shift eq "title"; },
-                         "tagname,self");
+  $self->handler(
+    end  => sub {
+      shift->eof if shift eq "title";
+    },
+    "tagname,self"
+  );
 }
 
 my $p = HTML::Parser->new(api_version => 3);
-$p->handler( start => \&start_handler, "tagname,self");
+$p->handler(start => \&start_handler, "tagname,self");
 $p->parse_file(shift || die) || die $!;
 print "\n";
 ```
@@ -962,7 +981,7 @@ respectively.
 NET tags, e.g. "code/.../" are not recognized.  This is SGML
 shorthand for "&lt;code>...&lt;/code>".
 
-Unclosed start or end tags, e.g. "&lt;tt&lt;b>...&lt;/b&lt;/tt>" are not
+Incomplete start or end tags, e.g. "&lt;tt&lt;b>...&lt;/b&lt;/tt>" are not
 recognized.
 
 # DIAGNOSTICS
@@ -1070,23 +1089,23 @@ in this listing is the same as used in [perldiag](https://metacpan.org/pod/perld
 
     The alternative solution is to enable the `utf8_mode` and not decode before
     passing strings to $p->parse().  The parser can process raw undecoded UTF-8
-    sanely if the `utf8_mode` is enabled, or if the "attr", "@attr" or "dtext"
+    sanely if the `utf8_mode` is enabled, or if the `attr`, `@attr` or `dtext`
     argspecs are avoided.
 
-- Parsing string decoded with wrong endianness
+- Parsing string decoded with wrong endianness
 
     (W) The first character in the document is U+FFFE.  This is not a
-    legal Unicode character but a byte swapped BOM.  The result of parsing
+    legal Unicode character but a byte swapped `BOM`.  The result of parsing
     will likely be garbage.
 
 - Parsing of undecoded UTF-32
 
-    (W) The parser found the Unicode UTF-32 BOM signature at the start
+    (W) The parser found the Unicode UTF-32 `BOM` signature at the start
     of the document.  The result of parsing will likely be garbage.
 
 - Parsing of undecoded UTF-16
 
-    (W) The parser found the Unicode UTF-16 BOM signature at the start of
+    (W) The parser found the Unicode UTF-16 `BOM` signature at the start of
     the document.  The result of parsing will likely be garbage.
 
 # SEE ALSO
diff --git a/dist.ini b/dist.ini
@@ -7,7 +7,7 @@ copyright_year   = 1996
 
 [ReadmeAnyFromPod / Markdown_Readme]
 type = gfm
-source_filename = Parser.pm
+source_filename = lib/HTML/Parser.pm
 filename = README.md
 location = root
 
@@ -54,7 +54,7 @@ badges = github_actions/windows
 [Test::Kwalitee]
 skiptest = no_symlinks
 [Test::Version]
-filename_match = qr/^Parser\.pm$/
+filename_match = qr/Parser\.pm$/
 [Test::Pod::Coverage::Configurable]
 trustme = HTML::Entities => qr/^(?:UNICODE_SUPPORT|decode|encode|encode_numeric|encode_numerically|num_entity)$/
 trustme = HTML::Filter => qr/^(?:output)$/
@@ -73,3 +73,10 @@ stopword = undecoded
 stopword = IMG
 stopword = textified
 stopword = Textification
+stopword = argspecs
+stopword = Attr
+stopword = Attrseq
+stopword = Dtext
+stopword = Tokenpos
+stopword = Unterminated
+stopword = CDATA
diff --git a/lib/HTML/Parser.pm b/lib/HTML/Parser.pm