Merge pull request #707 from IBM/html2parquet

Added Trafilatura parameters for heading, table, image extraction
IBM · Oct 16, 2024 · 5ca6ea8 · 5ca6ea8
2 parents 7b0ff94 + 8d26bb3
commit 5ca6ea8
Show file tree

Hide file tree

Showing 8 changed files with 260 additions and 30 deletions.
diff --git a/transforms/language/html2parquet/python/README.md b/transforms/language/html2parquet/python/README.md
@@ -10,19 +10,148 @@ The output format will contain the following columns
 
 ```jsonc
 {
-	"title": "string"             // the member filename
-	"document": "string"          // the base of the source archive
-	"contents": "string"          // the content of the HTML
+	"title": "string",             // the member filename
+	"document": "string",          // the base of the source archive
+	"contents": "string",          // the content of the HTML
     "document_id": "string",      // the document id, a hash of `contents`
     "size": "string",             // the size of `contents`
     "date_acquired": "date",      // the date when the transform was executing
 }
 ```
 ## Parameters
-The transform can be initialized with the following parameters.
 
-| Parameter  | Default  | Description  |
-|------------|----------|--------------|
-| `output_format`         | `markdown`        | The output type for the `contents` column. Valid types are `markdown` and `text`. |
+The table below provides the parameters that users can adjust to control the behavior of the extraction:
+
+| Parameter         | Default    | Description                                                                 |
+|-------------------|------------|-----------------------------------------------------------------------------|
+| `output_format`    | `markdown` | Specifies the format of the extracted content. Options: `markdown`, `txt`.  |
+| `favor_precision`  | `True`     | Prefers less content but more accurate extraction. Options: `True`, `False`. |
+| `favor_recall`     | `True`     | Extracts more content when uncertain. Options: `True`, `False`.              |
+
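+The table below lists the extraction options that are always enabled to ensure a comprehensive extraction process. They are currently fixed to `True` inside the transform and cannot be changed through the configuration or the CLI: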
+
+| Parameter           | Default   | Description                                                                 |
+|---------------------|-----------|-----------------------------------------------------------------------------|
+| `include_tables`     | `True`    | Extracts content from HTML `<table>` elements.                               |
+| `include_images`     | `True`    | Extracts image references (experimental feature).                            |
+| `include_links`      | `True`    | Extracts hyperlinks from the HTML content.                                   |
+| `include_formatting` | `True`    | Preserves basic HTML formatting (e.g., bold, italic) in the extracted content.|
+
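+*Note: If both `favor_precision` and `favor_recall` are set to `True` (which is the default configuration), `favor_recall` takes precedence. Set `favor_recall=False` to have `favor_precision` take effect.*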
+
+- To set the output format to plain text, use `output_format='txt'`.
+- To prioritize extracting more content over accuracy, set `favor_recall=True` and `favor_precision=False`.
+- When invoking the CLI, use the following syntax for these parameters: `--html2parquet_<parameter_name>`. For example: `--html2parquet_output_format='markdown'`.
+
+## Example
+
+### Sample HTML 
+```html
+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>Sample HTML File</title>
+</head>
+<body>
+    <h1>Welcome to My Sample HTML Page</h1>
+    <h2>Overview</h2>
+    <p>This page contains various HTML components to demonstrate structure and formatting.</p>
+    <p>This paragraph contains <a href="https://example.com">a link to Example.com</a>.</p>
+
+    <h2>Sample Image</h2>
+    <img src="https://via.placeholder.com/300" alt="Placeholder Image" />
+
+    <h2>Key Features</h2>
+    <ul>
+        <li>Easy to use</li>
+        <li>Highly customizable</li>
+        <li>Supports multiple components</li>
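+    </ul>
+
+    <h2>Getting Started</h2>
+    <ul>
+        <li>Download the HTML file</li>
+        <li>Open it in your browser</li>
+        <li>Explore the content</li>
+    </ul>
+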
+    <h2>Sample Data Table</h2>
+    <table border="1">
+        <tr>
+            <th>Name</th>
+            <th>Age</th>
+            <th>City</th>
+        </tr>
+        <tr>
+            <td>Alice</td>
+            <td>30</td>
+            <td>New York</td>
+        </tr>
+        <tr>
+            <td>Bob</td>
+            <td>25</td>
+            <td>Los Angeles</td>
+        </tr>
+        <tr>
+            <td>Charlie</td>
+            <td>35</td>
+            <td>Chicago</td>
+        </tr>
+    </table>
+
+    <h2>Contact Us</h2>
+    <form action="/submit" method="POST">
+        <label for="name">Name:</label>
+        <input type="text" id="name" name="name" required><br><br>
+        <label for="email">Email:</label>
+        <input type="email" id="email" name="email" required><br><br>
+        <input type="submit" value="Submit">
+    </form>
+</body>
+</html>
+```
+
+### Sample Output (Using Default Parameters)
+
+```markdown
+
+# Welcome to My Sample HTML Page
+
+## Overview
+
+This page contains various HTML components to demonstrate structure and formatting.
+
+This paragraph contains [a link to Example.com](https://example.com).
+
+## Sample Image
+
+
+## Key Features
+
+- Easy to use
+- Highly customizable
+- Supports multiple components
+
+## Getting Started
+
+- Download the HTML file
+- Open it in your browser
+- Explore the content
+
+## Sample Data Table
+
+Name |
+Age |
+City |
+
+Alice |
+30 |
+New York |
+
+Bob |
+25 |
+Los Angeles |
+
+Charlie |
+35 |
+Chicago |
+
+
+## Contact Us
+```
 
-When invoking the CLI, the parameters must be set as `--html2parquet_<name>`, e.g. `--html2parquet_output_format='markdown'`.
diff --git a/transforms/language/html2parquet/python/src/html2parquet_transform.py b/transforms/language/html2parquet/python/src/html2parquet_transform.py
@@ -27,19 +27,52 @@ def __init__(self, config: dict[str, Any]):
         super().__init__(config)
 
         self.output_format = config.get(html2parquet_output_format_key, html2parquet_output_format.MARKDOWN)
+        self.favor_precision = config.get(html2parquet_favor_precision_key, html2parquet_favor_precision.TRUE)
+        self.favor_recall = config.get(html2parquet_favor_recall_key, html2parquet_favor_recall.TRUE)
+
         if not isinstance(self.output_format, html2parquet_output_format):
             self.output_format = html2parquet_output_format[self.output_format]  
 
+        if not isinstance(self.favor_precision, html2parquet_favor_precision):
+            self.favor_precision = html2parquet_favor_precision[self.favor_precision]
+
+        if not isinstance(self.favor_recall, html2parquet_favor_recall):
+            self.favor_recall = html2parquet_favor_recall[self.favor_recall]  
+
     def _convert_html2parquet(self, member_filename:str, file_name:str, content_bytes: bytes) -> dict:
+
         title = member_filename if member_filename else TransformUtils.get_file_basename(file_name)
 
+        output_format_value = str(self.output_format)
+        if output_format_value not in ["markdown", "txt"]:
+            raise RuntimeError(f"Unknown output_format {self.output_format}.")
+
+        if self.favor_precision == html2parquet_favor_precision.TRUE:
+            favor_precision_value = True
+        elif self.favor_precision == html2parquet_favor_precision.FALSE:
+            favor_precision_value = False
+        else: 
+            raise RuntimeError(f"Unknown favor_precision {self.favor_precision}.")
+
+        if self.favor_recall == html2parquet_favor_recall.TRUE:
+            favor_recall_value = True
+        elif self.favor_recall == html2parquet_favor_recall.FALSE:
+            favor_recall_value = False
+        else: 
+            raise RuntimeError(f"Unknown favor_recall {self.favor_recall}.")
+
+
         # Use Trafilatura library
-        if self.output_format == html2parquet_output_format.MARKDOWN:
-            content_string = trafilatura.extract(content_bytes, output_format="markdown")
-        elif self.output_format == html2parquet_output_format.TEXT:
-            content_string = trafilatura.extract(content_bytes)
-        else:
-            raise RuntimeError(f"Uknown output_format {self.output_format}.")
+        content_string = trafilatura.extract(
+            content_bytes,
+            output_format=output_format_value,
+            include_tables=True,
+            include_images=True,
+            include_links=True,
+            include_formatting=True,
+            favor_precision=favor_precision_value,
+            favor_recall=favor_recall_value
+        )
 
 
         if content_string is None:
@@ -116,16 +149,38 @@ def transform_binary(self, file_name: str, byte_array: bytes) -> tuple[list[tupl
 short_name = "html2parquet"
 cli_prefix = f"{short_name}_"
 html2parquet_output_format_key = f"output_format"
+html2parquet_favor_precision_key = f"favor_precision"
+html2parquet_favor_recall_key = f"favor_recall"
+
 
 class html2parquet_output_format(str, enum.Enum):
     MARKDOWN = "markdown"
-    TEXT = "text"
+    TEXT = "txt"
+
+    def __str__(self):
+        return str(self.value)
+class html2parquet_favor_precision(str, enum.Enum):
+    TRUE = True
+    FALSE = False
+
+    def __str__(self):
+        return str(self.value)
+
+class html2parquet_favor_recall(str, enum.Enum):
+    TRUE = True
+    FALSE = False
 
     def __str__(self):
         return str(self.value)
 
 html2parquet_output_format_default = html2parquet_output_format.MARKDOWN
+html2parquet_favor_precision_default = html2parquet_favor_precision.TRUE
+html2parquet_favor_recall_default = html2parquet_favor_recall.TRUE
+
+
 html2parquet_output_format_cli_param = f"{cli_prefix}{html2parquet_output_format_key}"
+html2parquet_favor_precision_cli_param = f"{cli_prefix}{html2parquet_favor_precision_key}"
+html2parquet_favor_recall_cli_param = f"{cli_prefix}{html2parquet_favor_recall_key}"
 
 
 class Html2ParquetTransformConfiguration(TransformConfiguration):
@@ -134,14 +189,33 @@ def __init__(self):
             name=short_name,
             transform_class=Html2ParquetTransform,
         )
+
     def add_input_params(self, parser: ArgumentParser) -> None:
         parser.add_argument(
             f"--{html2parquet_output_format_cli_param}",
             type=html2parquet_output_format,
             choices=list(html2parquet_output_format),
             help="Output format for the contents column.",
-            default=html2parquet_output_format.MARKDOWN,
-        ) 
+            default=html2parquet_output_format.MARKDOWN
+        )
+
+        parser.add_argument(
+            f"--{html2parquet_favor_precision_cli_param}",
+            type=html2parquet_favor_precision,
+            choices=list(html2parquet_favor_precision),
+            help="Prefers less content but more accurate extraction.",
+            default=html2parquet_favor_precision.TRUE
+        )
+
+        parser.add_argument(
+            f"--{html2parquet_favor_recall_cli_param}",
+            type=html2parquet_favor_recall,
+            choices=list(html2parquet_favor_recall),
+            help="Extracts more content when uncertain.",
+            default=html2parquet_favor_recall.TRUE
+        )
+
+
 
     def apply_input_params(self, args: Namespace) -> bool:
         captured = CLIArgumentProvider.capture_parameters(args, cli_prefix, False)

diff --git a/transforms/language/html2parquet/python/test-data/expected/html_zip.parquet b/transforms/language/html2parquet/python/test-data/expected/html_zip.parquet
diff --git a/transforms/language/html2parquet/python/test-data/expected/metadata.json b/transforms/language/html2parquet/python/test-data/expected/metadata.json
@@ -5,30 +5,41 @@
     "job name": "html2parquet",
     "job type": "pure python",
     "job id": "job_id",
-    "start_time": "2024-08-29 16:51:41",
-    "end_time": "2024-08-29 16:51:41",
+    "start_time": "2024-10-15 11:06:44",
+    "end_time": "2024-10-15 11:06:44",
     "status": "success"
   },
   "code": null,
   "job_input_params": {
+    "output_format": "markdown",
+    "favor_precision": "True",
+    "favor_recall": "True",
     "checkpointing": false,
     "max_files": -1,
     "random_samples": -1,
     "files_to_use": [
       ".html",
       ".zip"
-    ]
+    ],
+    "num_processors": 0
+  },
+  "execution_stats": {
+    "cpus": 106.4,
+    "gpus": 0,
+    "memory": 19.17,
+    "object_store": 0,
+    "execution time, min": 0.003
   },
   "job_output_stats": {
     "source_files": 2,
     "source_size": 460391,
     "result_files": 2,
-    "result_size": 13508,
-    "processing_time": 0.09080028533935547,
+    "result_size": 16898,
+    "processing_time": 0.182,
     "nrows": 3
   },
   "source": {
-    "name": "/Users/sungeunan/Desktop/temp/data-prep-kit/transforms/universal/html2parquet/python/test-data/input",
+    "name": "/Users/sungeunan/Desktop/temp/data-prep-kit/transforms/language/html2parquet/python/test-data/input",
     "type": "path"
   },
   "target": {

diff --git a/transforms/language/html2parquet/python/test-data/expected/test1.parquet b/transforms/language/html2parquet/python/test-data/expected/test1.parquet
diff --git a/transforms/language/html2parquet/ray/test-data/expected/html_zip.parquet b/transforms/language/html2parquet/ray/test-data/expected/html_zip.parquet
diff --git a/transforms/language/html2parquet/ray/test-data/expected/metadata.json b/transforms/language/html2parquet/ray/test-data/expected/metadata.json
@@ -3,32 +3,48 @@
   "job details": {
     "job category": "preprocessing",
     "job name": "html2parquet",
-    "job type": "pure python",
+    "job type": "ray",
     "job id": "job_id",
-    "start_time": "2024-08-29 16:51:41",
-    "end_time": "2024-08-29 16:51:41",
+    "start_time": "2024-10-15 11:13:06",
+    "end_time": "2024-10-15 11:13:07",
     "status": "success"
   },
   "code": null,
   "job_input_params": {
+    "output_format": "markdown",
+    "favor_precision": "True",
+    "favor_recall": "True",
     "checkpointing": false,
     "max_files": -1,
     "random_samples": -1,
     "files_to_use": [
       ".html",
       ".zip"
-    ]
+    ],
+    "number of workers": 1,
+    "worker options": {
+      "num_cpus": 0.8,
+      "max_restarts": -1
+    },
+    "actor creation delay": 0
+  },
+  "execution_stats": {
+    "cpus": 10,
+    "gpus": 0,
+    "memory": 38.15873718261719,
+    "object_store": 2.0,
+    "execution time, min": 0.008651264508565267
   },
   "job_output_stats": {
     "source_files": 2,
     "source_size": 460391,
     "result_files": 2,
-    "result_size": 13508,
-    "processing_time": 0.09080028533935547,
+    "result_size": 16898,
+    "processing_time": 0.16381311416625977,
     "nrows": 3
   },
   "source": {
-    "name": "/Users/sungeunan/Desktop/temp/data-prep-kit/transforms/universal/html2parquet/python/test-data/input",
+    "name": "/Users/sungeunan/Desktop/temp/data-prep-kit/transforms/language/html2parquet/ray/test-data/input",
     "type": "path"
   },
   "target": {

diff --git a/transforms/language/html2parquet/ray/test-data/expected/test1.parquet b/transforms/language/html2parquet/ray/test-data/expected/test1.parquet