prepare v1.11.0 (#631)

adbar · web-flow · commit 60647e5eeff7 · 2024-06-27T15:59:56.000+02:00
* prepare v1.11.0

* update docstrings and docs

* explain change

* add last fix
diff --git a/HISTORY.md b/HISTORY.md
@@ -1,6 +1,29 @@
 ## History / Changelog
 
 
+### 1.11.0
+
+Breaking change:
+- metadata now skipped by default (#613), to trigger inclusion in all output formats:
+   - `with_metadata=True` (Python)
+   - `--with-metadata` (CLI)
+
+Extraction:
+- add HTML as output format (#614)
+- better and faster baseline extraction (#619)
+- better handling of HTML/XML elements (#628)
+- XPath rules added with @felipehertzer (#540)
+- fix: avoid faulty readability_lxml content (#635)
+
+Evaluation:
+- new scripts and data with @LydiaKoerber (#606, #615)
+- additional data with @swetepete (#197)
+
+Maintenance:
+- docs extended and updated, added page on deduplication (#618)
+- review code, add tests and types in some of the submodules (#620, #623, #624, #625)
+
+
 ### 1.10.0
 
 Breaking changes:
diff --git a/README.md b/README.md
@@ -60,11 +60,10 @@ search engine optimization, and information security).
    - Optional elements: comments, links, images, tables
 
 - Multiple output formats:
-   - Text
-   - Markdown (with formatting)
-   - CSV (with metadata)
-   - JSON (with metadata)
-   - XML or [XML-TEI](https://tei-c.org/) (with metadata, text formatting and page structure)
+   - TXT and Markdown
+   - CSV
+   - JSON
+   - HTML, XML and [XML-TEI](https://tei-c.org/)
 
 - Optional add-ons:
    - Language detection on extracted content
diff --git a/docs/index.rst b/docs/index.rst
@@ -60,11 +60,10 @@ Features
    - Formatting and structure: paragraphs, titles, lists, quotes, code, line breaks, in-line text formatting
    - Optional elements: comments, links, images, tables
 - Multiple output formats:
-   - Text
-   - Markdown (with formatting)
-   - CSV (with metadata)
-   - JSON (with metadata)
-   - XML or `XML-TEI <https://tei-c.org/>`_ (with metadata, text formatting and page structure)
+   - TXT and Markdown
+   - CSV
+   - JSON
+   - HTML, XML and `XML-TEI <https://tei-c.org/>`_
 - Optional add-ons:
    - Language detection on extracted content
    - Graphical user interface (GUI)
diff --git a/docs/usage-cli.rst b/docs/usage-cli.rst
@@ -283,8 +283,8 @@ For all usage instructions see ``trafilatura -h``:
                    [--no-tables] [--only-with-metadata]
                    [--target-language TARGET_LANGUAGE] [--deduplicate]
                    [--config-file CONFIG_FILE] [--precision] [--recall]
-                   [-out {txt,csv,json,markdown,xml,xmltei} | --csv | --json |
-                   --markdown | --xml | --xmltei]
+                   [-out {txt,csv,html,json,markdown,xml,xmltei} | --csv | --html |
+                   --json | --markdown | --xml | --xmltei]
                    [--validate-tei] [-v] [--version]
 
 
@@ -343,7 +343,7 @@ Extraction:
   --no-comments         don't output any comments
   --no-tables           don't output any table elements
   --only-with-metadata  only output those documents with title, URL and date
-                        (for formats supporting metadata)
+  --with-metadata       extract and add metadata to the output
   --target-language TARGET_LANGUAGE
                         select a target language (ISO 639-1 codes)
   --deduplicate         filter out duplicate documents and sections
@@ -360,8 +360,9 @@ Format:
 
 .. code-block:: bash
 
-  -out {txt,csv,json,markdown,xml,xmltei}, --output-format {txt,csv,json,markdown,xml,xmltei} determine output format
+  -out {txt,csv,html,json,markdown,xml,xmltei}, --output-format {txt,csv,html,json,markdown,xml,xmltei}
   --csv                 shorthand for CSV output
+  --html                shorthand for HTML output
   --json                shorthand for JSON output
   --markdown            shorthand for MD output
   --xml                 shorthand for XML output
diff --git a/docs/usage-python.rst b/docs/usage-python.rst
@@ -33,7 +33,7 @@ For the basics see `quickstart documentation page <quickstart.html>`_.
 
 Default output is set to TXT (bare text) without metadata.
 
-The following formats are available: bare text, Markdown (from version 1.9 onwards), CSV, JSON, XML, and XML following the guidelines of the Text Encoding Initiative (TEI).
+The following formats are available: bare text, Markdown (from version 1.9 onwards), HTML (from version 1.11 onwards), CSV, JSON, XML, and XML following the guidelines of the Text Encoding Initiative (TEI).
 
 
 .. hint::
diff --git a/trafilatura/__init__.py b/trafilatura/__init__.py
@@ -9,7 +9,7 @@
 __author__ = 'Adrien Barbaresi and contributors'
 __license__ = "Apache-2.0"
 __copyright__ = 'Copyright 2019-2024, Adrien Barbaresi'
-__version__ = '1.10.0'
+__version__ = '1.11.0'
 
 
 import logging
diff --git a/trafilatura/cli.py b/trafilatura/cli.py
@@ -138,11 +138,11 @@ def add_args(parser):
                         help="don't output any table elements",
                         action="store_false")  # false = no tables
     group4.add_argument("--only-with-metadata",
-                        help="only output those documents with title, URL and date (for formats supporting metadata)",
+                        help="only output those documents with title, URL and date",
                         action="store_true")
     group4.add_argument("--with-metadata",
-                        help=argparse.SUPPRESS,
-                        action="store_true")   # will be deprecated
+                        help="extract and add metadata to the output",
+                        action="store_true")
     group4.add_argument("--target-language",
                         help="select a target language (ISO 639-1 codes)",
                         type=str)
diff --git a/trafilatura/core.py b/trafilatura/core.py
@@ -89,7 +89,7 @@ def bare_extraction(filecontent, url=None, no_fallback=False,  # fast=False,
         include_comments: Extract comments along with the main text.
         output_format: Define an output format, Python being the default
             and the interest of this internal function.
-            Other values: "csv", "json", "markdown", "txt", "xml", and "xmltei".
+            Other values: "csv", "html", "json", "markdown", "txt", "xml", and "xmltei".
         target_language: Define a language to discard invalid documents (ISO 639-1 format).
         include_tables: Take into account information within the HTML <table> element.
         include_images: Take images into account (experimental).
@@ -98,7 +98,7 @@ def bare_extraction(filecontent, url=None, no_fallback=False,  # fast=False,
         include_links: Keep links along with their targets (experimental).
         deduplicate: Remove duplicate segments and documents.
         date_extraction_params: Provide extraction parameters to htmldate as dict().
-        with_metadata: Extract metadata fields and add them to the output (available soon).
+        with_metadata: Extract metadata fields and add them to the output.
         only_with_metadata: Only keep documents featuring all essential metadata
             (date, title, url).
         max_tree_size: Discard documents with too many elements.
@@ -278,7 +278,7 @@ def extract(filecontent, url=None, record_id=None, no_fallback=False,
         favor_recall: when unsure, prefer more text.
         include_comments: Extract comments along with the main text.
         output_format: Define an output format:
-            "csv", "json", "markdown", "txt", "xml", and "xmltei".
+            "csv", "html", "json", "markdown", "txt", "xml", and "xmltei".
         tei_validation: Validate the XML-TEI output with respect to the TEI standard.
         target_language: Define a language to discard invalid documents (ISO 639-1 format).
         include_tables: Take into account information within the HTML <table> element.
@@ -288,7 +288,7 @@ def extract(filecontent, url=None, record_id=None, no_fallback=False,
         include_links: Keep links along with their targets (experimental).
         deduplicate: Remove duplicate segments and documents.
         date_extraction_params: Provide extraction parameters to htmldate as dict().
-        with_metadata: Extract metadata fields and add them to the output (available soon).
+        with_metadata: Extract metadata fields and add them to the output.
         only_with_metadata: Only keep documents featuring all essential metadata
             (date, title, url).
         max_tree_size: Discard documents with too many elements.