From 188742d66b10cc254d2bbb636a0b7f6380de2832 Mon Sep 17 00:00:00 2001 From: Jeremy Roman Date: Mon, 23 Oct 2023 01:52:36 -0400 Subject: [PATCH] Add a Rust implementation of the Perl preprocessing logic This implementation uses a full HTML parser (Servo's, html5ever) and operates on a DOM. It ends up being somewhat more verbose than Perl, but hopefully also more extensible/maintainable. For now, this is hidden behind the PROCESS_WITH_RUST environment variable. That environment variable is used by default for the Docker path, but not otherwise. Subsequent work will remove the Perl implementations entirely. --- .dockerignore | 3 + .gitignore | 6 + Cargo.lock | 682 +++++++++++++++++++++++++++++++++++ Cargo.toml | 16 + Dockerfile | 9 + build.sh | 22 +- ci-build/Dockerfile | 8 + ci-build/inside-container.sh | 2 +- src/annotate_attributes.rs | 471 ++++++++++++++++++++++++ src/boilerplate.rs | 238 ++++++++++++ src/dom_utils.rs | 400 ++++++++++++++++++++ src/interface_index.rs | 412 +++++++++++++++++++++ src/io_utils.rs | 57 +++ src/main.rs | 77 ++++ src/parser.rs | 107 ++++++ src/represents.rs | 152 ++++++++ src/tag_omission.rs | 312 ++++++++++++++++ 17 files changed, 2968 insertions(+), 6 deletions(-) create mode 100644 Cargo.lock create mode 100644 Cargo.toml create mode 100644 src/annotate_attributes.rs create mode 100644 src/boilerplate.rs create mode 100644 src/dom_utils.rs create mode 100644 src/interface_index.rs create mode 100644 src/io_utils.rs create mode 100644 src/main.rs create mode 100644 src/parser.rs create mode 100644 src/represents.rs create mode 100644 src/tag_omission.rs diff --git a/.dockerignore b/.dockerignore index 7f80c3e4..216553b6 100644 --- a/.dockerignore +++ b/.dockerignore @@ -4,3 +4,6 @@ !*.pl !build.sh !lint.sh +!Cargo.lock +!Cargo.toml +!src diff --git a/.gitignore b/.gitignore index 923fda16..5bbd5ad1 100644 --- a/.gitignore +++ b/.gitignore @@ -4,3 +4,9 @@ html/ output/ mdn/.id-list mdn/developer.mozilla.org/ +highlighter/ + + +# Added 
by cargo + +/target diff --git a/Cargo.lock b/Cargo.lock new file mode 100644 index 00000000..3e3aca45 --- /dev/null +++ b/Cargo.lock @@ -0,0 +1,682 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 3 + +[[package]] +name = "aho-corasick" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "43f6cb1bf222025340178f382c426f13757b2960e89779dfcb319c32542a5a41" +dependencies = [ + "memchr", +] + +[[package]] +name = "autocfg" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" + +[[package]] +name = "bitflags" +version = "1.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" + +[[package]] +name = "bytes" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "89b2fd2a0dcf38d7971e2194b6b6eebab45ae01067456a7fd93d5547a61b70be" + +[[package]] +name = "cc" +version = "1.0.79" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "50d30906286121d95be3d479533b458f87493b30a4b5f79a607db8f5d11aa91f" + +[[package]] +name = "cfg-if" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" + +[[package]] +name = "errno" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4bcfec3a70f97c962c307b2d2c56e358cf1d00b558d74262b5f929ee8cc7e73a" +dependencies = [ + "errno-dragonfly", + "libc", + "windows-sys", +] + +[[package]] +name = "errno-dragonfly" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aa68f1b12764fab894d2755d2518754e71b4fd80ecfb822714a1206c2aab39bf" +dependencies = [ + "cc", + "libc", +] + +[[package]] 
+name = "fastrand" +version = "1.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e51093e27b0797c359783294ca4f0a911c270184cb10f85783b118614a1501be" +dependencies = [ + "instant", +] + +[[package]] +name = "futf" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df420e2e84819663797d1ec6544b13c5be84629e7bb00dc960d6917db2987843" +dependencies = [ + "mac", + "new_debug_unreachable", +] + +[[package]] +name = "getrandom" +version = "0.2.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "be4136b2a15dd319360be1c07d9933517ccf0be8f16bf62a3bee4f0d618df427" +dependencies = [ + "cfg-if", + "libc", + "wasi", +] + +[[package]] +name = "hermit-abi" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ee512640fe35acbfb4bb779db6f0d80704c2cacfa2e39b601ef3e3f47d1ae4c7" +dependencies = [ + "libc", +] + +[[package]] +name = "hermit-abi" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fed44880c466736ef9a5c5b5facefb5ed0785676d0c02d612db14e54f0d84286" + +[[package]] +name = "html-build" +version = "0.0.0" +dependencies = [ + "html5ever", + "markup5ever_rcdom", + "regex", + "tempfile", + "tokio", +] + +[[package]] +name = "html5ever" +version = "0.26.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bea68cab48b8459f17cf1c944c67ddc572d272d9f2b274140f223ecb1da4a3b7" +dependencies = [ + "log", + "mac", + "markup5ever", + "proc-macro2", + "quote", + "syn 1.0.109", +] + +[[package]] +name = "instant" +version = "0.1.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a5bbe824c507c5da5956355e86a746d82e0e1464f65d862cc5e71da70e94b2c" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "io-lifetimes" +version = "1.0.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"eae7b9aee968036d54dce06cebaefd919e4472e753296daccd6d344e3e2df0c2" +dependencies = [ + "hermit-abi 0.3.1", + "libc", + "windows-sys", +] + +[[package]] +name = "libc" +version = "0.2.146" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f92be4933c13fd498862a9e02a3055f8a8d9c039ce33db97306fd5a6caa7f29b" + +[[package]] +name = "linux-raw-sys" +version = "0.3.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ef53942eb7bf7ff43a617b3e2c1c4a5ecf5944a7c1bc12d7ee39bbb15e5c1519" + +[[package]] +name = "lock_api" +version = "0.4.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c1cc9717a20b1bb222f333e6a92fd32f7d8a18ddc5a3191a11af45dcbf4dcd16" +dependencies = [ + "autocfg", + "scopeguard", +] + +[[package]] +name = "log" +version = "0.4.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b06a4cde4c0f271a446782e3eff8de789548ce57dbc8eca9292c27f4a42004b4" + +[[package]] +name = "mac" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4" + +[[package]] +name = "markup5ever" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a2629bb1404f3d34c2e921f21fd34ba00b206124c81f65c50b43b6aaefeb016" +dependencies = [ + "log", + "phf", + "phf_codegen", + "string_cache", + "string_cache_codegen", + "tendril", +] + +[[package]] +name = "markup5ever_rcdom" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9521dd6750f8e80ee6c53d65e2e4656d7de37064f3a7a5d2d11d05df93839c2" +dependencies = [ + "html5ever", + "markup5ever", + "tendril", + "xml5ever", +] + +[[package]] +name = "memchr" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d" + +[[package]] +name = "mio" 
+version = "0.8.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "927a765cd3fc26206e66b296465fa9d3e5ab003e651c1b3c060e7956d96b19d2" +dependencies = [ + "libc", + "wasi", + "windows-sys", +] + +[[package]] +name = "new_debug_unreachable" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e4a24736216ec316047a1fc4252e27dabb04218aa4a3f37c6e7ddbf1f9782b54" + +[[package]] +name = "num_cpus" +version = "1.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0fac9e2da13b5eb447a6ce3d392f23a29d8694bff781bf03a16cd9ac8697593b" +dependencies = [ + "hermit-abi 0.2.6", + "libc", +] + +[[package]] +name = "once_cell" +version = "1.18.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dd8b5dd2ae5ed71462c540258bedcb51965123ad7e7ccf4b9a8cafaa4a63576d" + +[[package]] +name = "parking_lot" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3742b2c103b9f06bc9fff0a37ff4912935851bee6d36f3c02bcc755bcfec228f" +dependencies = [ + "lock_api", + "parking_lot_core", +] + +[[package]] +name = "parking_lot_core" +version = "0.9.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93f00c865fe7cabf650081affecd3871070f26767e7b2070a3ffae14c654b447" +dependencies = [ + "cfg-if", + "libc", + "redox_syscall", + "smallvec", + "windows-targets", +] + +[[package]] +name = "phf" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fabbf1ead8a5bcbc20f5f8b939ee3f5b0f6f281b6ad3468b84656b658b455259" +dependencies = [ + "phf_shared", +] + +[[package]] +name = "phf_codegen" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4fb1c3a8bc4dd4e5cfce29b44ffc14bedd2ee294559a294e2a4d4c9e9a6a13cd" +dependencies = [ + "phf_generator", + "phf_shared", +] + +[[package]] +name = "phf_generator" +version = "0.10.0" 
+source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5d5285893bb5eb82e6aaf5d59ee909a06a16737a8970984dd7746ba9283498d6" +dependencies = [ + "phf_shared", + "rand", +] + +[[package]] +name = "phf_shared" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6796ad771acdc0123d2a88dc428b5e38ef24456743ddb1744ed628f9815c096" +dependencies = [ + "siphasher", +] + +[[package]] +name = "pin-project-lite" +version = "0.2.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e0a7ae3ac2f1173085d398531c705756c94a4c56843785df85a60c1a0afac116" + +[[package]] +name = "ppv-lite86" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" + +[[package]] +name = "precomputed-hash" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "925383efa346730478fb4838dbe9137d2a47675ad789c546d150a6e1dd4ab31c" + +[[package]] +name = "proc-macro2" +version = "1.0.60" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dec2b086b7a862cf4de201096214fa870344cf922b2b30c167badb3af3195406" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "quote" +version = "1.0.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b9ab9c7eadfd8df19006f1cf1a4aed13540ed5cbc047010ece5826e10825488" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "rand" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" +dependencies = [ + "libc", + "rand_chacha", + "rand_core", +] + +[[package]] +name = "rand_chacha" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" +dependencies = [ + 
"ppv-lite86", + "rand_core", +] + +[[package]] +name = "rand_core" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" +dependencies = [ + "getrandom", +] + +[[package]] +name = "redox_syscall" +version = "0.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "567664f262709473930a4bf9e51bf2ebf3348f2e748ccc50dea20646858f8f29" +dependencies = [ + "bitflags", +] + +[[package]] +name = "regex" +version = "1.8.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d0ab3ca65655bb1e41f2a8c8cd662eb4fb035e67c3f78da1d61dffe89d07300f" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] + +[[package]] +name = "regex-syntax" +version = "0.7.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "436b050e76ed2903236f032a59761c1eb99e1b0aead2c257922771dab1fc8c78" + +[[package]] +name = "rustix" +version = "0.37.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b96e891d04aa506a6d1f318d2771bcb1c7dfda84e126660ace067c9b474bb2c0" +dependencies = [ + "bitflags", + "errno", + "io-lifetimes", + "libc", + "linux-raw-sys", + "windows-sys", +] + +[[package]] +name = "scopeguard" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" + +[[package]] +name = "serde" +version = "1.0.164" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9e8c8cf938e98f769bc164923b06dce91cea1751522f46f8466461af04c9027d" + +[[package]] +name = "signal-hook-registry" +version = "1.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d8229b473baa5980ac72ef434c4415e70c4b5e71b423043adb4ba059f89c99a1" +dependencies = [ + "libc", +] + +[[package]] +name = "siphasher" +version = "0.3.10" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "7bd3e3206899af3f8b12af284fafc038cc1dc2b41d1b89dd17297221c5d225de" + +[[package]] +name = "smallvec" +version = "1.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a507befe795404456341dfab10cef66ead4c041f62b8b11bbb92bffe5d0953e0" + +[[package]] +name = "socket2" +version = "0.4.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "64a4a911eed85daf18834cfaa86a79b7d266ff93ff5ba14005426219480ed662" +dependencies = [ + "libc", + "winapi", +] + +[[package]] +name = "string_cache" +version = "0.8.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f91138e76242f575eb1d3b38b4f1362f10d3a43f47d182a5b359af488a02293b" +dependencies = [ + "new_debug_unreachable", + "once_cell", + "parking_lot", + "phf_shared", + "precomputed-hash", + "serde", +] + +[[package]] +name = "string_cache_codegen" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6bb30289b722be4ff74a408c3cc27edeaad656e06cb1fe8fa9231fa59c728988" +dependencies = [ + "phf_generator", + "phf_shared", + "proc-macro2", + "quote", +] + +[[package]] +name = "syn" +version = "1.0.109" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "syn" +version = "2.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32d41677bcbe24c20c52e7c70b0d8db04134c5d1066bf98662e2871ad200ea3e" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "tempfile" +version = "3.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "31c0432476357e58790aaa47a8efb0c5138f137343f3b5f23bd36a27e3b0a6d6" +dependencies = [ + "autocfg", + "cfg-if", + "fastrand", + "redox_syscall", + 
"rustix", + "windows-sys", +] + +[[package]] +name = "tendril" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d24a120c5fc464a3458240ee02c299ebcb9d67b5249c8848b09d639dca8d7bb0" +dependencies = [ + "futf", + "mac", + "utf-8", +] + +[[package]] +name = "tokio" +version = "1.28.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94d7b1cfd2aa4011f2de74c2c4c63665e27a71006b0a192dcd2710272e73dfa2" +dependencies = [ + "autocfg", + "bytes", + "libc", + "mio", + "num_cpus", + "parking_lot", + "pin-project-lite", + "signal-hook-registry", + "socket2", + "tokio-macros", + "windows-sys", +] + +[[package]] +name = "tokio-macros" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "630bdcf245f78637c13ec01ffae6187cca34625e8c63150d424b59e55af2675e" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.18", +] + +[[package]] +name = "unicode-ident" +version = "1.0.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b15811caf2415fb889178633e7724bad2509101cde276048e013b9def5e51fa0" + +[[package]] +name = "utf-8" +version = "0.7.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9" + +[[package]] +name = "wasi" +version = "0.11.0+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" + +[[package]] +name = "winapi" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" +dependencies = [ + "winapi-i686-pc-windows-gnu", + "winapi-x86_64-pc-windows-gnu", +] + +[[package]] +name = "winapi-i686-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" + +[[package]] +name = "winapi-x86_64-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" + +[[package]] +name = "windows-sys" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9" +dependencies = [ + "windows-targets", +] + +[[package]] +name = "windows-targets" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7b1eb6f0cd7c80c79759c929114ef071b87354ce476d9d94271031c0497adfd5" +dependencies = [ + "windows_aarch64_gnullvm", + "windows_aarch64_msvc", + "windows_i686_gnu", + "windows_i686_msvc", + "windows_x86_64_gnu", + "windows_x86_64_gnullvm", + "windows_x86_64_msvc", +] + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91ae572e1b79dba883e0d315474df7305d12f569b400fcf90581b06062f7e1bc" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b2ef27e0d7bdfcfc7b868b317c1d32c641a6fe4629c171b8928c7b08d98d7cf3" + +[[package]] +name = "windows_i686_gnu" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "622a1962a7db830d6fd0a69683c80a18fda201879f0f447f065a3b7467daa241" + +[[package]] +name = "windows_i686_msvc" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4542c6e364ce21bf45d69fdd2a8e455fa38d316158cfd43b3ac1c5b1b19f8e00" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ca2b8a661f7628cbd23440e50b05d705db3686f894fc9580820623656af974b1" + +[[package]] 
+name = "windows_x86_64_gnullvm" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7896dbc1f41e08872e9d5e8f8baa8fdd2677f29468c4e156210174edc7f7b953" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a515f5799fe4961cb532f983ce2b23082366b898e52ffbce459c86f67c8378a" + +[[package]] +name = "xml5ever" +version = "0.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4034e1d05af98b51ad7214527730626f019682d797ba38b51689212118d8e650" +dependencies = [ + "log", + "mac", + "markup5ever", +] diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 00000000..ba472022 --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,16 @@ +[package] +name = "html-build" +version = "0.0.0" +publish = false +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +tokio = { version = "1", features = ["full"] } +html5ever = "0.26.0" +markup5ever_rcdom = "0.2.0" +regex = "1" + +[dev-dependencies] +tempfile = "3" diff --git a/Dockerfile b/Dockerfile index 54b555bb..5dd646b6 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,8 +1,16 @@ +FROM rust:1.73-slim as builder +WORKDIR /whatwg/html-build +COPY Cargo.lock Cargo.toml ./ +COPY src ./src/ +RUN cargo install --path . + FROM debian:stable-slim RUN apt-get update && \ apt-get install --yes --no-install-recommends ca-certificates curl git python3 python3-pip pipx && \ rm -rf /var/lib/apt/lists/* +COPY --from=builder /usr/local/cargo/bin/html-build /bin/html-build + COPY --from=ghcr.io/whatwg/wattsi:latest /whatwg/wattsi/bin/wattsi /bin/wattsi ENV PIPX_HOME /opt/pipx @@ -12,4 +20,5 @@ RUN pipx install bs-highlighter COPY . 
/whatwg/html-build/ ENV SKIP_BUILD_UPDATE_CHECK true +ENV PROCESS_WITH_RUST true ENTRYPOINT ["bash", "/whatwg/html-build/build.sh"] diff --git a/build.sh b/build.sh index 4ee9e3b2..a0f5a96e 100755 --- a/build.sh +++ b/build.sh @@ -32,6 +32,7 @@ HTML_CACHE=${HTML_CACHE:-$DIR/.cache} HTML_TEMP=${HTML_TEMP:-$DIR/.temp} HTML_OUTPUT=${HTML_OUTPUT:-$DIR/output} HTML_GIT_CLONE_OPTIONS=${HTML_GIT_CLONE_OPTIONS:-"--depth=2"} +PROCESS_WITH_RUST=${PROCESS_WITH_RUST:-false} # These are used by child scripts, and so we export them export HTML_CACHE @@ -529,13 +530,24 @@ function processSource { BUILD_TYPE="$2" cp -p entities/out/entities.inc "$HTML_CACHE" cp -p entities/out/entities-dtd.url "$HTML_CACHE" - if $VERBOSE; then - perl .pre-process-main.pl --verbose < "$HTML_SOURCE/$SOURCE_LOCATION" > "$HTML_TEMP/source-expanded-1" + if [[ $PROCESS_WITH_RUST == "true" ]]; then + if hash html-build 2>/dev/null; then + html-build <"$HTML_SOURCE/$SOURCE_LOCATION" >"$HTML_TEMP/source-whatwg-complete" + else + CARGO_ARGS=() + $VERBOSE && CARGO_ARGS+=( --verbose ) + $QUIET && CARGO_ARGS+=( --quiet ) + cargo run "${CARGO_ARGS[@]}" --release <"$HTML_SOURCE/$SOURCE_LOCATION" >"$HTML_TEMP/source-whatwg-complete" + fi else - perl .pre-process-main.pl < "$HTML_SOURCE/$SOURCE_LOCATION" > "$HTML_TEMP/source-expanded-1" + if $VERBOSE; then + perl .pre-process-main.pl --verbose < "$HTML_SOURCE/$SOURCE_LOCATION" > "$HTML_TEMP/source-expanded-1" + else + perl .pre-process-main.pl < "$HTML_SOURCE/$SOURCE_LOCATION" > "$HTML_TEMP/source-expanded-1" + fi + perl .pre-process-annotate-attributes.pl < "$HTML_TEMP/source-expanded-1" > "$HTML_TEMP/source-expanded-2" # this one could be merged + perl .pre-process-tag-omission.pl < "$HTML_TEMP/source-expanded-2" | perl .pre-process-index-generator.pl > "$HTML_TEMP/source-whatwg-complete" # this one could be merged fi - perl .pre-process-annotate-attributes.pl < "$HTML_TEMP/source-expanded-1" > "$HTML_TEMP/source-expanded-2" # this one could be merged - perl 
.pre-process-tag-omission.pl < "$HTML_TEMP/source-expanded-2" | perl .pre-process-index-generator.pl > "$HTML_TEMP/source-whatwg-complete" # this one could be merged runWattsi "$HTML_TEMP/source-whatwg-complete" "$HTML_TEMP/wattsi-output" "$HIGHLIGHT_SERVER_URL" if [[ $WATTSI_RESULT == "0" ]]; then diff --git a/ci-build/Dockerfile b/ci-build/Dockerfile index 9866366e..4312d778 100644 --- a/ci-build/Dockerfile +++ b/ci-build/Dockerfile @@ -1,5 +1,11 @@ # This Dockerfile is just used to run on Travis CI in an environment that can easily and repeatedly # install our build dependencies. +FROM rust:1.73-slim as builder +WORKDIR /whatwg/html-build +COPY Cargo.lock Cargo.toml ./ +COPY src ./src/ +RUN cargo install --path . + FROM debian:stable RUN apt-get update && \ @@ -17,6 +23,8 @@ RUN apt-get update && \ # - Prince # - fonts, for when Prince renders to PDF +COPY --from=builder /usr/local/cargo/bin/html-build /bin/html-build + COPY --from=ghcr.io/whatwg/wattsi:latest /whatwg/wattsi/bin/wattsi /bin/wattsi ENV PIPX_HOME /opt/pipx diff --git a/ci-build/inside-container.sh b/ci-build/inside-container.sh index efdd85ed..292da8b4 100644 --- a/ci-build/inside-container.sh +++ b/ci-build/inside-container.sh @@ -6,7 +6,7 @@ cd "$(dirname "$0")/../.." PDF_SERVE_PORT=8080 -SKIP_BUILD_UPDATE_CHECK=true ./html-build/build.sh +PROCESS_WITH_RUST=true SKIP_BUILD_UPDATE_CHECK=true ./html-build/build.sh echo "" echo "Running conformance checker..." diff --git a/src/annotate_attributes.rs b/src/annotate_attributes.rs new file mode 100644 index 00000000..00720847 --- /dev/null +++ b/src/annotate_attributes.rs @@ -0,0 +1,471 @@ +//! Augments the content attribute list for each element with a description found in the Attributes table. 
+ +use std::collections::HashMap; +use std::io; +use std::rc::Rc; + +use html5ever::tendril::StrTendril; +use html5ever::{local_name, namespace_url, ns, LocalName, QualName}; +use markup5ever_rcdom::{Handle, NodeData}; + +use crate::dom_utils::{self, NodeHandleExt}; +use crate::parser; + +#[derive(Debug, Default)] +struct Descriptions { + /// The default description, as a list of nodes. + default: Vec, + + /// The variant description, if any, as an unparsed string. + variant: Option, +} + +#[derive(Debug)] +struct Edit { + /// Handle on the <dd> element which is to be filled in. + dd: Handle, + + /// The data-x attribute which must be described. + key: StrTendril, + + /// Whether this location has requested the variant/alternate description. + wants_variant_description: bool, + + /// Whether this is described as having "special semantics" and so must be + /// formatted differently. + has_special_semantics: bool, +} + +pub struct Processor { + /// Map from attribute key (e.g., attr-elem-someattribute) to the + /// descriptions found in the Attributes table. + attributes: HashMap, + + /// List of <dd> nodes in Content attributes sections that need to be filled in. + edits: Vec, +} + +impl Processor { + pub fn new() -> Self { + Processor { + attributes: HashMap::new(), + edits: Vec::new(), + } + } + + pub fn visit(&mut self, node: &Handle) { + // We're looking for a <table id="attributes-1"> (which is under the Attributes heading). + if node.is_html_element(&local_name!("table")) && node.has_id("attributes-1") { + self.index_attribute_table(node); + } + + // We're looking for the following: + // <dl class="element">
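+ // ... + // <dt><span data-x="concept-element-attributes">Content attributes</span>: + // <dd><span>Global attributes</span> + // <dd><code data-x="attr-a-href">href</code> + // <dd><code data-x="attr-a-someattribute">someattribute</code>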
+ // ... + fn is_content_attribute_dt(dt: &Handle) -> bool { + if !dt.is_html_element(&local_name!("dt")) { + return false; + } + match dt.parent_node() { + Some(p) if p.is_html_element(&local_name!("dl")) && p.has_class("element") => (), + _ => return false, + } + let data_x = QualName::new(None, ns!(), LocalName::from("data-x")); + dt.any_child(|c| c.attribute_is(&data_x, "concept-element-attributes")) + } + if is_content_attribute_dt(node) { + self.index_attribute_list(node); + } + } + + fn index_attribute_table(&mut self, table: &Handle) { + let tbody = match table + .children + .borrow() + .iter() + .find(|n| n.is_html_element(&local_name!("tbody"))) + { + Some(tbody) => tbody.clone(), + None => return, + }; + for row in tbody + .children + .borrow() + .iter() + .filter(|c| c.is_html_element(&local_name!("tr"))) + { + // Each row is expected to have this structure: + // <tr> + // <th> <code>someattribute</code> + // <td> <code data-x="attr-a-someattribute">a</code>; <code data-x="attr-b-someattribute">b</code>; ... + // <td> Description of how someattribute applies to a, b, etc. <!-- or: Alternative description --> + // <td> Description of the valid values + // And we want to extract the descriptions so that we can later insert them + // alongside the definitions of attr-a-someattribute, etc. + let row_children = row.children.borrow(); + let mut tds = row_children + .iter() + .filter(|c| c.is_html_element(&local_name!("td"))); + let (keys_td, description_td) = match (tds.next(), tds.next()) { + (Some(a), Some(b)) => (a, b), + _ => continue, + }; + + // These will be strings like "attr-input-maxlength", which identify particular element-attribute pairs. + let data_x = QualName::new(None, ns!(), LocalName::from("data-x")); + for attr_key in keys_td + .children + .borrow() + .iter() + .filter_map(|c| c.get_attribute(&data_x).filter(|v| !v.is_empty())) + { + // Find the <!-- or: --> comment, if one exists, and extract its contents. + let description = description_td.children.borrow(); + let mut variant_comment = None; + let mut variant_str = None; + for node in description.iter() { + if let NodeData::Comment { ref contents } = node.data { + if contents.trim().starts_with("or:") { + variant_comment = Some(node); + variant_str = Some(StrTendril::from(contents.trim()[3..].trim_start())); + } + } + } + + // Store the (already parsed) ordinary description. If a variant + // comment exists, omit it and instead store its unparsed + // string.
+ let descriptions = Descriptions { + default: description_td + .children + .borrow() + .iter() + .filter(|c| variant_comment.map_or(true, |vc| !Rc::ptr_eq(c, vc))) + .map(|c| c.deep_clone()) + .collect(), + variant: variant_str, + }; + let existing = self.attributes.entry(attr_key).or_default(); + if existing.default.is_empty() { + existing.default = descriptions.default; + } else if !descriptions.default.is_empty() { + if let NodeData::Text { ref contents } = existing.default.last().unwrap().data { + let mut borrow = contents.borrow_mut(); + if let Some(last_non_ws) = borrow.rfind(|c: char| !c.is_ascii_whitespace()) + { + let to_remove = borrow.len32() - (last_non_ws as u32) - 1; + borrow.pop_back(to_remove); + } + } + existing.default.push(Handle::create_text_node("; ")); + existing.default.extend(descriptions.default.into_iter()); + } + if existing.variant.is_none() { + existing.variant = descriptions.variant; + } else if descriptions.variant.is_some() { + let existing_variant = existing.variant.as_mut().unwrap(); + existing_variant.push_slice("; "); + existing_variant.push_tendril(&descriptions.variant.unwrap()); + } + } + } + } + + fn index_attribute_list(&mut self, dt: &Handle) { + // If a <dd> contains <!-- no-annotate -->, it is not annotated. + // If it contains <!-- variant -->, the description found in a <!-- or: --> comment is used instead. + // If it mentions "special semantics", it is joined with a colon rather than an em dash. + let data_x = QualName::new(None, ns!(), LocalName::from("data-x")); + let parent = dt.parent_node().unwrap(); + let children = parent.children.borrow(); + self.edits.extend( + children + .iter() + .skip_while(|n| !Rc::ptr_eq(n, dt)) + .skip(1) + .filter(|n| n.is_element()) + .take_while(|e| e.is_html_element(&local_name!("dd"))) + .filter_map(|dd| { + let mut can_annotate = true; + let mut wants_variant_description = false; + let mut has_special_semantics = false; + let mut key = None; + dom_utils::scan_dom(dd, &mut |n| match &n.data { + NodeData::Comment { ref contents } if contents.trim() == "no-annotate" => { + can_annotate = false; + } + NodeData::Comment { ref contents } if contents.trim() == "variant" => { + wants_variant_description = true; + } + NodeData::Text { ref contents } + if contents.borrow().contains("has special semantics") => + { + has_special_semantics = true; + } + NodeData::Element { .. } => { + if key.is_none() { + key = n.get_attribute(&data_x); + } + } + _ => (), + }); + match (can_annotate, key) { + (true, Some(key)) => Some(Edit { + dd: dd.clone(), + key, + wants_variant_description, + has_special_semantics, + }), + _ => None, + } + }), + ); + } + + pub async fn apply(self) -> io::Result<()> { + let em_dash = StrTendril::from(" \u{2014} "); + + for Edit { + dd, + key, + wants_variant_description, + has_special_semantics, + } in self.edits + { + // Find the requested description to insert at this point. + let descriptions = match self.attributes.get(&key) { + Some(descriptions) => descriptions, + None => continue, + }; + let mut description: Vec = match descriptions { + Descriptions { + variant: Some(ref variant), + .. + } if wants_variant_description => { + parser::parse_fragment_async(variant[..].as_bytes(), &dd).await?
+ } + _ if wants_variant_description => { + return Err(io::Error::new( + io::ErrorKind::InvalidData, + format!( + "Attribute {key} wants variant description, but no was found" + ), + )) + } + Descriptions { ref default, .. } => { + default.iter().map(|n| n.deep_clone()).collect() + } + }; + + let mut dd_children = dd.children.borrow_mut(); + if has_special_semantics { + // Replace the trailing period with a separating colon. + if let Some(NodeData::Text { contents }) = dd_children.last_mut().map(|n| &n.data) { + let mut text = contents.borrow_mut(); + *text = StrTendril::from( + text.trim_end_matches(|c: char| c.is_ascii_whitespace() || c == '.'), + ); + text.push_slice(": "); + } + } else { + // Insert an em dash. + description.insert(0, Handle::create_text_node(em_dash.clone())); + } + + // Insert the description. + for child in description.iter_mut() { + child.parent.set(Some(Rc::downgrade(&dd))); + } + dd_children.extend(description); + } + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::parser::{parse_document_async, tests::serialize_for_test}; + + #[tokio::test] + async fn test_simple() -> io::Result<()> { + // This is a simple document with enough stuff in it. Elements are shown + // before and after the attributes table, to demonstrate that this is + // not sensitive to which order they occur in (i.e., these could be + // reordered in the HTML spec). + let document = parse_document_async( + r#" +

The a element

+
+
Categories +
Flow content +
Content attributes +
href +
+

Attributes

+ +
hrefa; areaDestination of the hyperlink +
+

The area element

+
+
Categories +
Flow content +
Content attributes +
href +
+ "#.trim().as_bytes()).await?; + let mut proc = Processor::new(); + dom_utils::scan_dom(&document, &mut |h| proc.visit(h)); + proc.apply().await?; + assert_eq!( + serialize_for_test(&[document]), + r#" +

The a element

+
+
Categories +
Flow content +
Content attributes +
href + — Destination of the hyperlink +
+

Attributes

+ +
hrefa; areaDestination of the hyperlink +
+

The area element

+
+
Categories +
Flow content +
Content attributes +
href + — Destination of the hyperlink +
+ "#.trim() + ); + Ok(()) + } + + #[tokio::test] + async fn test_variant() -> io::Result<()> { + // This checks that and work correctly. + // i.e., the variant description is used where requested + let document = parse_document_async( + r#" +

The a element

+
+
Content attributes +
href +
+

Attributes

+ +
hrefa; areaDestination of the hyperlink +
+

The area element

+
+
Content attributes +
href +
+ "#.trim().as_bytes()).await?; + let mut proc = Processor::new(); + dom_utils::scan_dom(&document, &mut |h| proc.visit(h)); + proc.apply().await?; + assert_eq!( + serialize_for_test(&[document]), + r#" +

The a element

+
+
Content attributes +
href + — Destination of the hyperlink +
+

Attributes

+ +
hrefa; areaDestination of the hyperlink +
+

The area element

+
+
Content attributes +
href + — click on shapes!
+ "#.trim() + ); + Ok(()) + } + + #[tokio::test] + async fn test_special_semantics() -> io::Result<()> { + // Checks that the special rules for using : instead of an em dash work. + let document = parse_document_async( + r#" +

The a element

+
+
Content attributes +
Also, the name attribute has special semantics on this element. +
+

Attributes

+ +
nameaAnchor name +
+ "#.trim().as_bytes()).await?; + let mut proc = Processor::new(); + dom_utils::scan_dom(&document, &mut |h| proc.visit(h)); + proc.apply().await?; + assert_eq!( + serialize_for_test(&[document]), + r#" +

The a element

+
+
Content attributes +
Also, the name attribute has special semantics on this element: Anchor name +
+

Attributes

+ +
nameaAnchor name +
+ "#.trim() + ); + Ok(()) + } + + #[tokio::test] + async fn test_special_semantics_multiple() -> io::Result<()> { + // Checks that the special rules for joining any special semantics with a ; work. + let document = parse_document_async( + r#" +

The a element

+
+
Content attributes +
Also, the name attribute has special semantics on this element. +
+

Attributes

+ +
nameaAnchor name +
nameaName of the anchor +
+ "#.trim().as_bytes()).await?; + let mut proc = Processor::new(); + dom_utils::scan_dom(&document, &mut |h| proc.visit(h)); + proc.apply().await?; + assert_eq!( + serialize_for_test(&[document]), + r#" +

The a element

+
+
Content attributes +
Also, the name attribute has special semantics on this element: Anchor name; Name of the anchor +
+

Attributes

+ +
nameaAnchor name +
nameaName of the anchor +
+ "#.trim() + ); + Ok(()) + } +} diff --git a/src/boilerplate.rs b/src/boilerplate.rs new file mode 100644 index 00000000..faab19f2 --- /dev/null +++ b/src/boilerplate.rs @@ -0,0 +1,238 @@ +//! Replaces comments. +//! These can either be comment nodes (in which case the resulting fragment will +//! be inserted), or the complete value of an element's attribute (in which case +//! the text will become the attribute value). + +use std::io; +use std::path::{Path, PathBuf}; + +use html5ever::tendril::{self, SendTendril}; +use html5ever::{local_name, Attribute, LocalName, QualName}; +use markup5ever_rcdom::{Handle, NodeData}; +use tokio::fs::File; +use tokio::task::JoinHandle; + +use crate::dom_utils::NodeHandleExt; +use crate::io_utils::{async_error, is_safe_path, read_to_str_tendril}; +use crate::parser; + +type SendStrTendril = SendTendril; + +enum Edit { + ReplaceHTML(Handle, JoinHandle>), + ReplaceAttr(Handle, QualName, JoinHandle>), + ReplaceText(Handle, JoinHandle>), +} + +pub struct Processor { + /// Path to look for boilerplate files. + path: PathBuf, + + /// Path to look for example files. + example_path: PathBuf, + + /// Changes to be made in the apply step. + edits: Vec, +} + +impl Processor { + pub fn new(path: impl Into, example_path: impl Into) -> Self { + Self { + path: path.into(), + example_path: example_path.into(), + edits: vec![], + } + } + + /// Should be called for each node in the document. + /// Identifies replacements which will be needed, and starts the necessary + /// I/O. + pub fn visit(&mut self, node: &Handle) { + match node.data { + // BOILERPLATE comments will need to be replaced with their + // corresponding HTML, parsed. Open the file so that we can do so on + // demand. 
+            NodeData::Comment { ref contents } if contents.starts_with("BOILERPLATE ") => {
+                // Everything after the 12-byte "BOILERPLATE " prefix names the file.
+                let path = Path::new(contents[12..].trim());
+                let file = if is_safe_path(path) {
+                    tokio::spawn(File::open(self.path.join(path)))
+                } else {
+                    // Queue the error instead of failing here; it is returned
+                    // when `apply` awaits this edit.
+                    async_error(io::Error::new(
+                        io::ErrorKind::PermissionDenied,
+                        format!(
+                            "cannot traverse to a parent directory in {}",
+                            path.display()
+                        ),
+                    ))
+                };
+                self.edits.push(Edit::ReplaceHTML(node.clone(), file));
+            }
+            // Pseudo-comments can also appear in element attributes. These are
+            // not parsed as HTML, so we simply want to read them into memory so
+            // they can be replaced.
+            NodeData::Element { ref attrs, .. } => {
+                for Attribute {
+                    ref name,
+                    ref value,
+                } in attrs.borrow().iter()
+                {
+                    // Only an attribute whose *entire* value is the pseudo-comment
+                    // qualifies. The slice below strips the 16-byte opening
+                    // "<!--BOILERPLATE " and the 3-byte closing "-->".
+                    if value.starts_with("<!--BOILERPLATE ") && value.ends_with("-->") {
+                        let path = Path::new(value[16..value.len() - 3].trim());
+                        let file_contents = if is_safe_path(path) {
+                            read_to_str_tendril(self.path.join(path))
+                        } else {
+                            async_error(io::Error::new(
+                                io::ErrorKind::PermissionDenied,
+                                format!(
+                                    "cannot traverse to a parent directory in {}",
+                                    path.display()
+                                ),
+                            ))
+                        };
+                        self.edits.push(Edit::ReplaceAttr(
+                            node.clone(),
+                            name.clone(),
+                            file_contents,
+                        ));
+                    }
+                }
+            }
+            // <pre> and <pre><code> which contain EXAMPLE also need to be
+            // replaced, but as plain text. These are loaded from the "examples"
+            // directory instead.
+            NodeData::Text { ref contents } => {
+                let borrowed_contents = contents.borrow();
+                let text = borrowed_contents.trim();
+                if !text.starts_with("EXAMPLE ") {
+                    return;
+                }
+                const PRE: LocalName = local_name!("pre");
+                const CODE: LocalName = local_name!("code");
+                // The text must sit directly in a <pre>, or in a <code> whose
+                // own parent is a <pre>.
+                let has_suitable_parent = node.parent_node().map_or(false, |p| {
+                    p.is_html_element(&PRE)
+                        || (p.is_html_element(&CODE)
+                            && p.parent_node().map_or(false, |p2| p2.is_html_element(&PRE)))
+                });
+                if has_suitable_parent {
+                    // Everything after the 8-byte "EXAMPLE " prefix names the file.
+                    let path = Path::new(text[8..].trim());
+                    let file_contents = if is_safe_path(path) {
+                        read_to_str_tendril(self.example_path.join(path))
+                    } else {
+                        async_error(io::Error::new(
+                            io::ErrorKind::PermissionDenied,
+                            format!(
+                                "cannot traverse to a parent directory in {}",
+                                path.display()
+                            ),
+                        ))
+                    };
+                    self.edits
+                        .push(Edit::ReplaceText(node.clone(), file_contents));
+                }
+            }
+            _ => (),
+        }
+    }
+
+    /// Carries out every queued replacement, in the order the edits were
+    /// recorded, awaiting each edit's pending I/O as it is reached.
+    pub async fn apply(self) -> io::Result<()> {
+        for edit in self.edits {
+            match edit {
+                Edit::ReplaceHTML(comment, pending_file) => {
+                    // Fragment parsing is context-sensitive, so the comment's
+                    // parent is needed as the parsing context. A comment with
+                    // no parent is left untouched.
+                    if let Some(context) = comment.parent_node() {
+                        let file: File = pending_file.await??;
+                        let fragment = parser::parse_fragment_async(file, &context).await?;
+                        comment.replace_with(fragment);
+                    }
+                }
+                Edit::ReplaceAttr(element, attr_name, pending_value) => {
+                    let value = pending_value.await??;
+                    element.set_attribute(&attr_name, value.into());
+                }
+                Edit::ReplaceText(text_node, pending_text) => {
+                    if let NodeData::Text { ref contents } = text_node.data {
+                        contents.replace(pending_text.await??.into());
+                    } else {
+                        panic!("not text");
+                    }
+                }
+            }
+        }
+        Ok(())
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::dom_utils;
+    use crate::parser::{parse_document_async, tests::serialize_for_test};
+    use tempfile::TempDir;
+
+    #[tokio::test]
+    async fn test_replace_boilerplate_comment() -> io::Result<()> {
+        let boilerplate_dir = TempDir::new()?;
+        tokio::fs::write(
+            boilerplate_dir.path().join("languages"),
+            "
enEnglish", + ) + .await?; + let document = + parse_document_async("".as_bytes()).await?; + let mut proc = Processor::new(boilerplate_dir.path(), Path::new(".")); + dom_utils::scan_dom(&document, &mut |h| proc.visit(h)); + proc.apply().await?; + assert_eq!( + serialize_for_test(&[document]), + "
enEnglish
"); + Ok(()) + } + + #[tokio::test] + async fn test_replace_boilerplate_attribute() -> io::Result<()> { + let boilerplate_dir = TempDir::new()?; + tokio::fs::write( + boilerplate_dir.path().join("data.url"), + "data:text/html,Hello, world!", + ) + .await?; + let document = + parse_document_async("\">hello".as_bytes()) + .await?; + let mut proc = Processor::new(boilerplate_dir.path(), Path::new(".")); + dom_utils::scan_dom(&document, &mut |h| proc.visit(h)); + proc.apply().await?; + assert_eq!( + serialize_for_test(&[document]), + "hello"); + Ok(()) + } + + #[tokio::test] + async fn test_replace_example() -> io::Result<()> { + let example_dir = TempDir::new()?; + tokio::fs::write(example_dir.path().join("ex1"), "first").await?; + tokio::fs::write(example_dir.path().join("ex2"), "second").await?; + tokio::fs::write(example_dir.path().join("ignored"), "bad").await?; + let document = + parse_document_async("
EXAMPLE ex1
\nEXAMPLE ex2  

EXAMPLE ignored

".as_bytes()) + .await?; + let mut proc = Processor::new(Path::new("."), example_dir.path()); + dom_utils::scan_dom(&document, &mut |h| proc.visit(h)); + proc.apply().await?; + assert_eq!( + serialize_for_test(&[document]), + "
first
second

EXAMPLE ignored

" ); + Ok(()) + } + + #[tokio::test] + async fn test_errors_unsafe_paths() -> io::Result<()> { + let bad_path_examples = [ + "", + "
\">
", + "
EXAMPLE ../foo
", + ]; + for example in bad_path_examples { + let document = parse_document_async(example.as_bytes()).await?; + let mut proc = Processor::new(Path::new("."), Path::new(".")); + dom_utils::scan_dom(&document, &mut |h| proc.visit(h)); + let result = proc.apply().await; + assert!(matches!(result, Err(e) if e.kind() == io::ErrorKind::PermissionDenied)); + } + Ok(()) + } +} diff --git a/src/dom_utils.rs b/src/dom_utils.rs new file mode 100644 index 00000000..34b07aeb --- /dev/null +++ b/src/dom_utils.rs @@ -0,0 +1,400 @@ +use std::cell::RefCell; +use std::rc::Rc; + +use html5ever::tendril::StrTendril; +use html5ever::{local_name, namespace_url, ns, Attribute, LocalName, QualName}; +use markup5ever_rcdom::{Handle, Node, NodeData}; + +/// Extensions to the DOM interface to make manipulation more ergonimc. +pub trait NodeHandleExt { + /// Returns a handle to the parent node, if there is one. + fn parent_node(&self) -> Option + where + Self: Sized; + + /// Gets an attribute on the element, or None if absent or not an element. + fn get_attribute(&self, name: &QualName) -> Option; + + /// Returns whether the node has the named attribute. + fn has_attribute(&self, name: &QualName) -> bool { + self.get_attribute(name).is_some() + } + + /// Returns true if the attribute exists and the predicate matches it. + fn attribute_matches(&self, name: &QualName, f: impl Fn(&str) -> bool) -> bool { + self.get_attribute(name).map_or(false, |v| f(&v)) + } + + /// Returns true if the attribute exists and has the value mentioned. + fn attribute_is(&self, name: &QualName, expected: &str) -> bool { + self.get_attribute(name).as_deref() == Some(expected) + } + + /// Sets an attribute on the element. Must be an element. + fn set_attribute(&self, name: &QualName, value: StrTendril); + + /// Returns true if the node is an element. + fn is_element(&self) -> bool; + + /// Returns true if the node is an HTML element with the given tag name. 
+ fn is_html_element(&self, tag_name: &LocalName) -> bool; + + /// Returns true if the node is an element with the given class. + fn has_class(&self, class: &str) -> bool; + + /// Returns true if the node is an element with the given ID. + fn has_id(&self, id: &str) -> bool { + const ID: QualName = QualName { + prefix: None, + ns: ns!(), + local: local_name!("id"), + }; + self.attribute_is(&ID, id) + } + + /// If this is a text node, returns its text. + fn node_text(&self) -> Option; + + /// Concatenate the text of the node and its descendants. + fn text_content(&self) -> StrTendril; + + /// True if any child matches the predicate. + fn any_child(&self, f: impl Fn(&Self) -> bool) -> bool; + + /// Appends children (without checking node type). + fn append_children(&self, children: impl Iterator); + + /// Same, but just one. + fn append_child(&self, child: Self) + where + Self: Sized, + { + self.append_children(std::iter::once(child)) + } + + /// Inserts children before the specified child. + fn insert_children_before(&self, existing: &Self, new: impl Iterator); + + /// Same, but just one. + fn insert_child(&self, existing: &Self, new: Self) + where + Self: Sized, + { + self.insert_children_before(existing, std::iter::once(new)) + } + + /// Removes the node from its parent and replaces it with the nodes provided. + /// Does nothing if the node has no parent. + fn replace_with(&self, replacements: Vec) + where + Self: Sized; + + /// Clones the node and its entire subtree (including template contents). + fn deep_clone(&self) -> Self; + + /// Create a new element, with the given children. + fn create_element(name: LocalName) -> ElementBuilder + where + Self: Sized; + + /// Create a new text node. + fn create_text_node(text: impl Into) -> Self + where + Self: Sized; +} + +/// Convenience helper for constructing nodes. 
Use like: +/// Handle::create_element(local_name!("a")) +/// .attribute(&local_name!("href"), "/") +/// .text("Home") +/// .build() +pub struct ElementBuilder { + element: T, +} + +impl ElementBuilder { + pub fn attribute(self, name: &LocalName, value: impl Into) -> Self { + self.element + .set_attribute(&QualName::new(None, ns!(), name.clone()), value.into()); + self + } + + pub fn children(self, children: impl Iterator) -> Self { + self.element.append_children(children); + self + } + + pub fn child(self, child: T) -> Self { + self.children(std::iter::once(child)) + } + + pub fn text(self, text: impl Into) -> Self { + self.child(::create_text_node(text)) + } + + pub fn build(self) -> T { + self.element + } +} + +/// Recursively visits every DOM node (preorder). Template contents are visited +/// after children, but there are seldom both. +pub fn scan_dom(handle: &Handle, f: &mut F) { + f(handle); + + for child in handle.children.borrow().iter() { + scan_dom(child, f); + } + + if let NodeData::Element { + template_contents: ref tc, + .. + } = handle.data + { + if let Some(ref tc_handle) = *tc.borrow() { + scan_dom(tc_handle, f); + } + } +} + +/// Given a
element, find the corresponding
elements. +/// +/// This is more subtle than you might immediately think, because there can be +/// multiple
listing various terms with one or more common
+/// definitions. We need to find the
in the child list, and then skip it +/// and any other
, before providing the
that follow. +pub fn dt_descriptions(dt: &Handle) -> Vec { + assert!(dt.is_html_element(&local_name!("dt"))); + if let Some(ref dl) = dt + .parent_node() + .filter(|n| n.is_html_element(&local_name!("dl"))) + { + dl.children + .borrow() + .iter() + .filter(|n| n.is_element()) + .skip_while(|n| !Rc::ptr_eq(n, dt)) + .skip_while(|n| n.is_html_element(&local_name!("dt"))) + .take_while(|n| n.is_html_element(&local_name!("dd"))) + .cloned() + .collect() + } else { + Vec::new() + } +} + +/// Returns the heading level (from 1 to 6) that the

through

declares, or None for all other nodes. +pub fn heading_level(node: &Handle) -> Option { + let local = match node.data { + NodeData::Element { ref name, .. } if name.ns == ns!(html) => &name.local, + _ => return None, + }; + match *local { + local_name!("h1") => Some(1), + local_name!("h2") => Some(2), + local_name!("h3") => Some(3), + local_name!("h4") => Some(4), + local_name!("h5") => Some(5), + local_name!("h6") => Some(6), + _ => None, + } +} + +impl NodeHandleExt for Handle { + fn parent_node(&self) -> Option { + let weak_parent = self.parent.take()?; + let parent = weak_parent.upgrade().expect("dangling parent"); + self.parent.set(Some(weak_parent)); + Some(parent) + } + + fn get_attribute(&self, name: &QualName) -> Option { + let attrs = match self.data { + NodeData::Element { ref attrs, .. } => attrs.borrow(), + _ => return None, + }; + attrs + .iter() + .find(|a| &a.name == name) + .map(|a| a.value.clone()) + } + + fn set_attribute(&self, name: &QualName, value: StrTendril) { + let mut attrs = match self.data { + NodeData::Element { ref attrs, .. } => attrs.borrow_mut(), + _ => panic!("not an element"), + }; + if let Some(attr) = attrs.iter_mut().find(|a| &a.name == name) { + attr.value = value; + } else { + attrs.push(Attribute { + name: name.clone(), + value, + }); + } + } + + fn is_element(&self) -> bool { + matches!(&self.data, NodeData::Element { .. }) + } + + fn is_html_element(&self, tag_name: &LocalName) -> bool { + match &self.data { + NodeData::Element { + name: + QualName { + ns: ns!(html), + ref local, + .. + }, + .. 
+ } => local == tag_name, + _ => false, + } + } + + fn has_class(&self, class: &str) -> bool { + const CLASS: QualName = QualName { + prefix: None, + ns: ns!(), + local: local_name!("class"), + }; + self.get_attribute(&CLASS) + .map_or(false, |v| v.split_ascii_whitespace().any(|c| c == class)) + } + + fn node_text(&self) -> Option { + match &self.data { + NodeData::Text { ref contents } => Some(contents.borrow().clone()), + _ => None, + } + } + + fn text_content(&self) -> StrTendril { + let mut text = StrTendril::new(); + scan_dom(self, &mut |n| { + if let NodeData::Text { ref contents } = &n.data { + text.push_tendril(&contents.borrow()); + } + }); + text + } + + fn any_child(&self, f: impl Fn(&Handle) -> bool) -> bool { + self.children.borrow().iter().any(f) + } + + fn append_children(&self, children: impl Iterator) { + self.children.borrow_mut().extend(children.inspect(|c| { + let old_parent = c.parent.replace(Some(Rc::downgrade(self))); + assert!(old_parent.is_none()); + })); + } + + fn insert_children_before(&self, existing: &Handle, new: impl Iterator) { + let mut children = self.children.borrow_mut(); + let i = children + .iter() + .position(|c| Rc::ptr_eq(c, existing)) + .expect("corrupt child list"); + children.splice( + i..i, + new.inspect(|c| { + let old_parent = c.parent.replace(Some(Rc::downgrade(self))); + assert!(old_parent.is_none()); + }), + ); + } + + fn replace_with(&self, replacements: Vec) { + let parent = match self.parent.take() { + Some(n) => n.upgrade().expect("dangling parent"), + _ => return, + }; + for new_child in replacements.iter() { + new_child.parent.replace(Some(Rc::downgrade(&parent))); + } + let mut children = parent.children.borrow_mut(); + let i = children + .iter() + .position(|c| Rc::ptr_eq(c, self)) + .expect("corrupt child list"); + children.splice(i..=i, replacements); + self.parent.take(); + } + + fn deep_clone(&self) -> Handle { + use NodeData::*; + let new_node_data = match &self.data { + Document => Document, + Doctype 
{ + name, + public_id, + system_id, + } => Doctype { + name: name.clone(), + public_id: public_id.clone(), + system_id: system_id.clone(), + }, + Text { contents } => Text { + contents: contents.clone(), + }, + Comment { contents } => Comment { + contents: contents.clone(), + }, + Element { + name, + attrs, + template_contents, + mathml_annotation_xml_integration_point, + } => Element { + name: name.clone(), + attrs: attrs.clone(), + template_contents: RefCell::new( + template_contents + .borrow() + .as_ref() + .map(|tc| tc.deep_clone()), + ), + mathml_annotation_xml_integration_point: *mathml_annotation_xml_integration_point, + }, + ProcessingInstruction { target, contents } => ProcessingInstruction { + target: target.clone(), + contents: contents.clone(), + }, + }; + let node = Node::new(new_node_data); + let mut children = node.children.borrow_mut(); + *children = self + .children + .borrow() + .iter() + .map(|c| c.deep_clone()) + .collect(); + for child in children.iter_mut() { + let old_parent = child.parent.replace(Some(Rc::downgrade(&node))); + assert!(old_parent.is_none()); + } + drop(children); + node + } + + fn create_element(name: LocalName) -> ElementBuilder { + let new_node_data = NodeData::Element { + name: QualName::new(None, ns!(html), name), + attrs: RefCell::new(Vec::new()), + template_contents: RefCell::new(None), + mathml_annotation_xml_integration_point: false, + }; + ElementBuilder { + element: Node::new(new_node_data), + } + } + + fn create_text_node(text: impl Into) -> Handle { + let new_node_data = NodeData::Text { + contents: RefCell::new(text.into()), + }; + Node::new(new_node_data) + } +} diff --git a/src/interface_index.rs b/src/interface_index.rs new file mode 100644 index 00000000..7039fd9a --- /dev/null +++ b/src/interface_index.rs @@ -0,0 +1,412 @@ +//! Generates an index of WebIDL interfaces. +//! This index is inserted where "INSERT INTERFACES HERE" appears. 
+ +use std::collections::BTreeMap; +use std::io; + +use html5ever::tendril::StrTendril; +use html5ever::{local_name, namespace_url, ns, QualName}; +use markup5ever_rcdom::Handle; + +use crate::dom_utils::NodeHandleExt; + +#[derive(Default, Debug)] +struct InterfaceInfo { + /// Number of times the interface definition was seen. Should be one. + /// We store other numbers for convenience in error handling and reporting. + seen: u32, + + /// The IDs of the partial interfaces, in the order they appear in the document. + partials: Vec, + + /// Set to true if a partial is missing its ID. + has_partial_with_no_id: bool, +} + +pub struct Processor { + /// The interfaces encountered, keyed and sorted by name. + interfaces: BTreeMap, + + /// The text nodes which contains the text "INSERT INTERFACES HERE". + marker_nodes: Vec, +} + +/// The string which marks where the index belongs. Ideally this would be a node +/// and not plain text. +const MARKER: &str = "INSERT INTERFACES HERE"; + +impl Processor { + pub fn new() -> Self { + Processor { + interfaces: BTreeMap::new(), + marker_nodes: Vec::new(), + } + } + + pub fn visit(&mut self, node: &Handle) { + const ID: QualName = QualName { + prefix: None, + ns: ns!(), + local: local_name!("id"), + }; + // We're looking for inside a
+        // <pre> -- that is, <code class="idl"> elements whose parent is a <pre> -- to find
+        // potential interfaces defined there.
+        //
+        // One surprise here -- there is an "interface Example" that is not defined
+        // according to Wattsi. It yells about this not being defined, and the
+        // prior Perl preprocessing actually requires the <pre> have no
+        // attributes.
+        let is_idl_code = node.is_html_element(&local_name!("code")) && node.has_class("idl");
+        // Evaluated lazily, so the parent is only inspected for IDL <code> elements.
+        let in_plain_pre = || {
+            node.parent_node().map_or(false, |pre| {
+                pre.is_html_element(&local_name!("pre")) && !pre.has_class("extract")
+            })
+        };
+        if is_idl_code && in_plain_pre() {
+            let children = node.children.borrow();
+            // Look at each adjacent pair of children: a text node ending in the
+            // `interface ` keyword, followed by the element naming the interface.
+            for pair in children.windows(2) {
+                let (keyword, definition) = (&pair[0], &pair[1]);
+                // "partial interface " also ends with "interface ", so it has to
+                // be tested first.
+                let is_partial = match keyword.node_text() {
+                    Some(t) if t.ends_with("partial interface ") => true,
+                    Some(t) if t.ends_with("interface ") => false,
+                    _ => continue,
+                };
+                // These definitions must appear as a <span>, <dfn> or <a> element.
+                let is_definition_element = definition.is_html_element(&local_name!("span"))
+                    || definition.is_html_element(&local_name!("dfn"))
+                    || definition.is_html_element(&local_name!("a"));
+                if !is_definition_element {
+                    continue;
+                }
+                let info = self
+                    .interfaces
+                    .entry(definition.text_content())
+                    .or_default();
+                if !is_partial {
+                    info.seen += 1;
+                } else if let Some(id) = definition.get_attribute(&ID) {
+                    info.partials.push(id);
+                } else {
+                    info.has_partial_with_no_id = true;
+                }
+            }
+        }
+
+        // Independently of the above, remember any text node containing the marker.
+        if let Some(text) = node.node_text() {
+            if text.contains(MARKER) {
+                self.marker_nodes.push(node.clone());
+            }
+        }
+    }
+
+    /// Replaces the marker with the generated interface index.
+    ///
+    /// The text node containing the marker is split around the marker, and a
+    /// `<ul class="brief">` is inserted between the two halves. The list has
+    /// one `<li>` per interface recorded by `visit`, in the map's key order,
+    /// each followed by links to that interface's partial definitions.
+    ///
+    /// # Errors
+    ///
+    /// Returns an `io::ErrorKind::InvalidData` error unless exactly one text
+    /// node contains the marker, or if any interface was defined more than
+    /// once.
+    pub fn apply(self) -> io::Result<()> {
+        // It is likely an author error to not include anywhere to insert an
+        // interface index, and just as likely an error to ask for it more
+        // than once, so exactly one marker is required.
+        match self.marker_nodes.len() {
+            0 => {
+                return Err(io::Error::new(
+                    io::ErrorKind::InvalidData,
+                    format!("Marker {MARKER:?} not found."),
+                ));
+            }
+            1 => (),
+            count => {
+                return Err(io::Error::new(
+                    io::ErrorKind::InvalidData,
+                    format!("{MARKER:?} found {} times, expected just one.", count),
+                ));
+            }
+        }
+
+        /// Builds `<a href="#id">text</a>`.
+        fn make_link(id: &str, text: &str) -> Handle {
+            Handle::create_element(local_name!("a"))
+                .attribute(&local_name!("href"), format!("#{id}"))
+                .text(text)
+                .build()
+        }
+
+        // Tendril offsets are u32, so convert the marker length once up front.
+        let marker_len = u32::try_from(MARKER.len()).expect("marker length fits in u32");
+
+        for marker in self.marker_nodes {
+            // We need to find where the marker appears in the text so that we
+            // can split it into two text nodes.
+            let text = marker.node_text().expect("should still be a text node");
+            let position = match text.find(MARKER) {
+                None => {
+                    return Err(io::Error::new(
+                        io::ErrorKind::InvalidData,
+                        format!("Marker {MARKER:?} not found (but was during first pass)."),
+                    ));
+                }
+                Some(p) => u32::try_from(p).expect("offset within a tendril fits in u32"),
+            };
+            let end_position = position + marker_len;
+            let before = text.subtendril(0, position);
+            let after = text.subtendril(end_position, text.len32() - end_position);
+
+            // Then, we need to construct a list of interfaces and their partial interfaces.
+            let mut ul =
+                Handle::create_element(local_name!("ul")).attribute(&local_name!("class"), "brief");
+            for (name, info) in &self.interfaces {
+                if info.seen > 1 {
+                    return Err(io::Error::new(
+                        io::ErrorKind::InvalidData,
+                        format!("Interface {name} defined {} times.", info.seen),
+                    ));
+                }
+                let mut li = Handle::create_element(local_name!("li")).child(
+                    Handle::create_element(local_name!("code"))
+                        .text(name.clone())
+                        .build(),
+                );
+                // A single partial is linked as "partial"; several are linked
+                // as "partial 1", "2", "3", ...
+                match &info.partials[..] {
+                    [] => (),
+                    [sole_partial] => {
+                        li = li.text(", ").child(make_link(sole_partial, "partial"));
+                    }
+                    [first, rest @ ..] => {
+                        li = li.text(", ").child(make_link(first, "partial 1"));
+                        for (i, p) in rest.iter().enumerate() {
+                            li = li.text(" ").child(make_link(p, &(i + 2).to_string()));
+                        }
+                    }
+                }
+                ul = ul.child(li.build());
+            }
+
+            // Finally, we replace the marker's text node with the text before
+            // the marker, the list, and the text after the marker.
+            marker.replace_with(vec![
+                Handle::create_text_node(before),
+                ul.build(),
+                Handle::create_text_node(after),
+            ]);
+        }
+        Ok(())
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::dom_utils;
+    use crate::parser::{parse_document_async, tests::serialize_for_test};
+
+    #[tokio::test]
+    async fn test_two_interfaces_in_one_block() -> io::Result<()> {
+        let document = parse_document_async(
+            r#"
+

+interface HTMLMarqueeElement { ... }
+interface HTMLBlinkElement { ... }
+
+INSERT INTERFACES HERE + "# + .trim() + .as_bytes(), + ) + .await?; + let mut proc = Processor::new(); + dom_utils::scan_dom(&document, &mut |h| proc.visit(h)); + proc.apply()?; + assert_eq!( + serialize_for_test(&[document]), + r#" +

+interface HTMLMarqueeElement { ... }
+interface HTMLBlinkElement { ... }
+
+
  • HTMLBlinkElement
  • HTMLMarqueeElement
+ "#.trim()); + Ok(()) + } + + #[tokio::test] + async fn test_two_interfaces_in_separate_blocks() -> io::Result<()> { + let document = parse_document_async( + r#" +

+interface HTMLMarqueeElement { ... }
+
+

+interface HTMLBlinkElement { ... }
+
+INSERT INTERFACES HERE + "# + .trim() + .as_bytes(), + ) + .await?; + let mut proc = Processor::new(); + dom_utils::scan_dom(&document, &mut |h| proc.visit(h)); + proc.apply()?; + assert_eq!( + serialize_for_test(&[document]), + r#" +

+interface HTMLMarqueeElement { ... }
+
+

+interface HTMLBlinkElement { ... }
+
+
  • HTMLBlinkElement
  • HTMLMarqueeElement
+ "#.trim()); + Ok(()) + } + + #[tokio::test] + async fn interface_with_partial() -> io::Result<()> { + let document = parse_document_async( + r#" +

+interface HTMLMarqueeElement { ... }
+
+

+partial interface HTMLMarqueeElement { ... }
+
+INSERT INTERFACES HERE + "# + .trim() + .as_bytes(), + ) + .await?; + let mut proc = Processor::new(); + dom_utils::scan_dom(&document, &mut |h| proc.visit(h)); + proc.apply()?; + assert_eq!( + serialize_for_test(&[document]), + r##" +

+interface HTMLMarqueeElement { ... }
+
+

+partial interface HTMLMarqueeElement { ... }
+
+
+ "##.trim()); + Ok(()) + } + + #[tokio::test] + async fn interface_with_two_partials() -> io::Result<()> { + let document = parse_document_async( + r#" +

+interface HTMLMarqueeElement { ... }
+partial interface HTMLMarqueeElement { ... }
+partial interface HTMLMarqueeElement { ... }
+
+INSERT INTERFACES HERE + "# + .trim() + .as_bytes(), + ) + .await?; + let mut proc = Processor::new(); + dom_utils::scan_dom(&document, &mut |h| proc.visit(h)); + proc.apply()?; + assert_eq!( + serialize_for_test(&[document]), + r##" +

+interface HTMLMarqueeElement { ... }
+partial interface HTMLMarqueeElement { ... }
+partial interface HTMLMarqueeElement { ... }
+
+ + "##.trim()); + Ok(()) + } + + #[tokio::test] + async fn only_partials() -> io::Result<()> { + let document = parse_document_async( + r#" +

+partial interface HTMLMarqueeElement { ... }
+partial interface HTMLMarqueeElement { ... }
+
+INSERT INTERFACES HERE + "# + .trim() + .as_bytes(), + ) + .await?; + let mut proc = Processor::new(); + dom_utils::scan_dom(&document, &mut |h| proc.visit(h)); + proc.apply()?; + assert_eq!( + serialize_for_test(&[document]), + r##" +

+partial interface HTMLMarqueeElement { ... }
+partial interface HTMLMarqueeElement { ... }
+
+ + "##.trim()); + Ok(()) + } + + #[tokio::test] + async fn marker_before() -> io::Result<()> { + let document = parse_document_async( + r#" +INSERT INTERFACES HERE +

+interface HTMLMarqueeElement { ... }
+
+ "# + .trim() + .as_bytes(), + ) + .await?; + let mut proc = Processor::new(); + dom_utils::scan_dom(&document, &mut |h| proc.visit(h)); + proc.apply()?; + assert_eq!( + serialize_for_test(&[document]), + r##" +
  • HTMLMarqueeElement
+

+interface HTMLMarqueeElement { ... }
+
+ "## + .trim() + ); + Ok(()) + } + + #[tokio::test] + async fn no_marker() -> io::Result<()> { + let document = parse_document_async("".as_bytes()).await?; + let mut proc = Processor::new(); + dom_utils::scan_dom(&document, &mut |h| proc.visit(h)); + let result = proc.apply(); + assert!(matches!(result, Err(e) if e.kind() == io::ErrorKind::InvalidData)); + Ok(()) + } + + #[tokio::test] + async fn duplicate_marker() -> io::Result<()> { + let document = parse_document_async( + "
INSERT INTERFACES HERE
INSERT INTERFACES HERE
".as_bytes(), + ) + .await?; + let mut proc = Processor::new(); + dom_utils::scan_dom(&document, &mut |h| proc.visit(h)); + let result = proc.apply(); + assert!(matches!(result, Err(e) if e.kind() == io::ErrorKind::InvalidData)); + Ok(()) + } + + #[tokio::test] + async fn duplicate_dfn() -> io::Result<()> { + let document = parse_document_async( + r#" +

+interface HTMLMarqueeElement { ... }
+interface HTMLMarqueeElement { ... }
+
+ "# + .as_bytes(), + ) + .await?; + let mut proc = Processor::new(); + dom_utils::scan_dom(&document, &mut |h| proc.visit(h)); + let result = proc.apply(); + assert!(matches!(result, Err(e) if e.kind() == io::ErrorKind::InvalidData)); + Ok(()) + } +} diff --git a/src/io_utils.rs b/src/io_utils.rs new file mode 100644 index 00000000..d6a329d2 --- /dev/null +++ b/src/io_utils.rs @@ -0,0 +1,57 @@ +//! Miscellaneous utilities for I/O. + +use std::io; +use std::path::Path; + +use html5ever::tendril::{self, SendTendril, StrTendril}; +use tokio::task::JoinHandle; + +type SendStrTendril = SendTendril; + +/// Check that a path is safe to open, even if the source is potentially untrusted. +pub fn is_safe_path(path: impl AsRef) -> bool { + use std::path::Component; + path.as_ref() + .components() + .all(|c| matches!(c, Component::Normal(_) | Component::CurDir)) +} + +/// In a spawned task, read to a string, then move it to a tendril. +pub fn read_to_str_tendril(path: impl AsRef) -> JoinHandle> { + let path = path.as_ref().to_owned(); + tokio::spawn(async move { + let string = tokio::fs::read_to_string(path).await?; + Ok(StrTendril::from(string).into_send()) + }) +} + +/// Creates a `JoinHandle` for an error. Useful when an operation will fail, but +/// it's more convenient to handle later on. 
+pub fn async_error(err: io::Error) -> JoinHandle> { + tokio::spawn(async move { Err(err) }) +} + +#[cfg(test)] +mod tests { + use super::*; + use tempfile::TempDir; + + #[test] + fn test_is_safe_path() { + assert!(is_safe_path("a.txt")); + assert!(is_safe_path("a/b.txt")); + assert!(is_safe_path("a/b/c/./d.txt")); + assert!(!is_safe_path("../parent.txt")); + assert!(!is_safe_path("/etc/passwd")); + } + + #[tokio::test] + async fn test_read_to_str_tendril() -> io::Result<()> { + let temp_dir = TempDir::new()?; + let file_path = temp_dir.path().join("a.txt"); + tokio::fs::write(&file_path, "Hello, world!").await?; + let send_tendril = read_to_str_tendril(&file_path).await??; + assert_eq!(StrTendril::from(send_tendril).as_ref(), "Hello, world!"); + Ok(()) + } +} diff --git a/src/main.rs b/src/main.rs new file mode 100644 index 00000000..975d24ef --- /dev/null +++ b/src/main.rs @@ -0,0 +1,77 @@ +use html5ever::serialize::{serialize, SerializeOpts}; +use std::borrow::Cow; +use std::default::Default; +use std::env; +use std::ffi::OsStr; +use std::io::{self, BufWriter}; +use std::path::{Path, PathBuf}; + +use markup5ever_rcdom::SerializableHandle; + +mod annotate_attributes; +mod boilerplate; +mod dom_utils; +mod interface_index; +mod io_utils; +mod parser; +mod represents; +mod tag_omission; + +#[tokio::main] +async fn main() -> io::Result<()> { + // Since we're using Rc in the DOM implementation, we must ensure that tasks + // which act on it are confined to this thread. + + // Find the paths we need. + let cache_dir = path_from_env("HTML_CACHE", ".cache"); + let source_dir = path_from_env("HTML_SOURCE", "../html"); + + // Because parsing can jump around the tree a little, it's most reasonable + // to just parse the whole document before doing any processing. Even for + // the HTML standard, this doesn't take too long. 
+ let document = parser::parse_document_async(tokio::io::stdin()).await?; + + let mut boilerplate = boilerplate::Processor::new(cache_dir.clone(), source_dir.join("demos")); + let mut represents = represents::Processor::new(); + let mut annotate_attributes = annotate_attributes::Processor::new(); + let mut tag_omission = tag_omission::Processor::new(); + let mut interface_index = interface_index::Processor::new(); + + // We do exactly one pass to identify the changes that need to be made. + dom_utils::scan_dom(&document, &mut |h| { + boilerplate.visit(h); + represents.visit(h); + annotate_attributes.visit(h); + tag_omission.visit(h); + interface_index.visit(h); + }); + + // And then we apply all of the changes. These different processors mostly + // apply quite local changes, so hopefully we never have to deal with + // conflicts between them. + boilerplate.apply().await?; + represents.apply()?; + annotate_attributes.apply().await?; + tag_omission.apply()?; + interface_index.apply()?; + + // Finally, we write the result to standard out. + let serializable: SerializableHandle = document.into(); + serialize( + &mut BufWriter::with_capacity(128 * 1024, io::stdout()), + &serializable, + SerializeOpts::default(), + )?; + Ok(()) +} + +fn path_from_env<'a, V, D>(var: &V, default: &'a D) -> Cow<'a, Path> +where + V: AsRef + ?Sized, + D: AsRef + ?Sized, +{ + match env::var_os(var) { + Some(p) => PathBuf::from(p).into(), + None => default.as_ref().into(), + } +} diff --git a/src/parser.rs b/src/parser.rs new file mode 100644 index 00000000..a10de56d --- /dev/null +++ b/src/parser.rs @@ -0,0 +1,107 @@ +//! This module provides some mild integration between the html5ever parser and async I/O. 
+ +use std::io; + +use html5ever::driver::{self, Parser}; +use html5ever::tendril::{ByteTendril, TendrilSink}; +use markup5ever_rcdom::{Handle, RcDom}; +use tokio::io::{AsyncRead, AsyncReadExt}; + +async fn parse_internal_async( + parser: Parser, + mut r: R, +) -> io::Result { + let mut tendril_sink = parser.from_utf8(); + + // This draws on the structure of the sync tendril read_from. + // https://docs.rs/tendril/latest/tendril/stream/trait.TendrilSink.html#method.read_from + const BUFFER_SIZE: u32 = 128 * 1024; + 'read: loop { + let mut tendril = ByteTendril::new(); + unsafe { + tendril.push_uninitialized(BUFFER_SIZE); + } + loop { + match r.read(&mut tendril).await { + Ok(0) => break 'read, + Ok(n) => { + tendril.pop_back(BUFFER_SIZE - n as u32); + tendril_sink.process(tendril); + break; + } + Err(ref e) if e.kind() == io::ErrorKind::Interrupted => {} + Err(e) => Err(e)?, + } + } + } + let dom = tendril_sink.finish(); + Ok(dom.document) +} + +pub async fn parse_fragment_async( + r: R, + context: &Handle, +) -> io::Result> { + let parser = driver::parse_fragment_for_element( + RcDom::default(), + Default::default(), + context.clone(), + None, + ); + let document = parse_internal_async(parser, r).await?; + let mut new_children = document.children.take()[0].children.take(); + for new_child in new_children.iter_mut() { + new_child.parent.take(); + } + Ok(new_children) +} + +pub async fn parse_document_async(r: R) -> io::Result { + let parser = driver::parse_document(RcDom::default(), Default::default()); + parse_internal_async(parser, r).await +} + +#[cfg(test)] +pub(crate) mod tests { + use super::*; + use crate::dom_utils::NodeHandleExt; + use html5ever::serialize::{SerializeOpts, TraversalScope}; + use html5ever::{local_name, serialize}; + use markup5ever_rcdom::{NodeData, SerializableHandle}; + + pub(crate) fn serialize_for_test(nodes: &[Handle]) -> String { + let mut output = vec![]; + for node in nodes { + let traversal_scope = match node.data { + 
NodeData::Document => TraversalScope::ChildrenOnly(None), + _ => TraversalScope::IncludeNode, + }; + serialize( + &mut output, + &SerializableHandle::from(node.clone()), + SerializeOpts { + traversal_scope, + ..Default::default() + }, + ) + .unwrap(); + } + String::from_utf8(output).unwrap() + } + + #[tokio::test] + async fn test_fragment_respects_context() -> io::Result<()> { + // Checks that we have the appropriate insertion mode for the element + // we're in. This is important because of the special rules + // surrounding, e.g., tables. If you change this to use the body as context, + // no element at all is emitted. + let document = parse_document_async("".as_bytes()).await?; + let body = document.children.borrow()[1].children.borrow()[1].clone(); + assert!(body.is_html_element(&local_name!("body"))); + let table = body.children.borrow()[0].clone(); + assert!(table.is_html_element(&local_name!("table"))); + let children = parse_fragment_async("".as_bytes(), &table).await?; + assert_eq!(serialize_for_test(&children), ""); + Ok(()) + } +} diff --git a/src/represents.rs b/src/represents.rs new file mode 100644 index 00000000..ebb0474d --- /dev/null +++ b/src/represents.rs @@ -0,0 +1,152 @@ +//! Replaces comments with the HTML which appears in a +//! paragraph of the form: +//!

<p>The <code>tagname</code> element <span>represents</span> ...</p>

+ +use std::collections::HashMap; +use std::io; +use std::rc::Rc; + +use crate::dom_utils::NodeHandleExt; +use html5ever::local_name; +use html5ever::tendril::StrTendril; +use markup5ever_rcdom::{Handle, NodeData}; + +pub struct Processor { + /// Map from tag name (as found in the paragraph) to the <span> which + /// contains the text "represents". + represents: HashMap, + + /// List of <!--REPRESENTS tagname--> comments to be replaced, and what tag name + /// they correspond to. + placeholders: Vec<(Handle, StrTendril)>, +} + +/// Walks from the text node "represents" and finds the tag name and the +/// span that marks where the description begins, or returns None if that +/// cannot be found. +fn find_tag_name(represents_text: &Handle) -> Option<(StrTendril, Handle)> { + let span = represents_text + .parent_node() + .filter(|p| p.is_html_element(&local_name!("span")))?; + let p = span + .parent_node() + .filter(|p| p.is_html_element(&local_name!("p")))?; + let children = p.children.borrow(); + match &children[..] { + [a, b, c, d, ..] + if a.node_text().as_deref().map(|x| x.trim()) == Some("The") + && b.is_html_element(&local_name!("code")) + && c.node_text().as_deref().map(|x| x.trim()) == Some("element") + && Rc::ptr_eq(d, &span) => + { + Some((b.text_content(), span)) + } + _ => None, + } +} + +impl Processor { + pub fn new() -> Self { + Self { + represents: HashMap::new(), + placeholders: Vec::new(), + } + } + + /// Should be called for each node in the document. 
Records when it sees a + /// represents and which element it is defining + pub fn visit(&mut self, node: &Handle) { + match node.data { + NodeData::Text { ref contents } if contents.borrow().as_ref() == "represents" => { + if let Some((tag, span)) = find_tag_name(node) { + self.represents.insert(tag, span); + } + } + NodeData::Comment { ref contents } if contents.starts_with("REPRESENTS ") => { + self.placeholders + .push((node.clone(), contents.subtendril(11, contents.len32() - 11))); + } + _ => (), + } + } + + pub fn apply(self) -> io::Result<()> { + for (placeholder, ref tag) in self.placeholders { + let span = match self.represents.get(tag) { + Some(span) => span, + None => { + return Err(io::Error::new( + io::ErrorKind::InvalidData, + format!(" refers to unknown tag", tag), + )); + } + }; + let parent = match span.parent_node() { + Some(p) => p, + None => continue, + }; + let replacements = parent + .children + .borrow() + .iter() + .skip_while(|s| !Rc::ptr_eq(s, span)) + .skip(1) + .enumerate() + .map(|(index, sibling)| { + let clone = sibling.deep_clone(); + // Capitalize the first letter of the first node (which is expected to be text). + if let (0, NodeData::Text { ref contents }) = (index, &clone.data) { + contents.replace_with(|text| capitalize(text.trim_start())); + } + clone + }) + .collect(); + placeholder.replace_with(replacements); + } + Ok(()) + } +} + +fn capitalize(text: &str) -> StrTendril { + let mut chars = text.chars(); + match chars.next() { + Some(c) => { + let mut capitalized = StrTendril::from_char(c.to_ascii_uppercase()); + capitalized.push_slice(chars.as_str()); + capitalized + } + None => StrTendril::new(), + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::dom_utils; + use crate::parser::{parse_document_async, tests::serialize_for_test}; + + #[tokio::test] + async fn test_represents() -> io::Result<()> { + // Uses can occur either before or after. + let document = parse_document_async("

The chair element represents a seat\nat a table.

".as_bytes()).await?; + let mut proc = Processor::new(); + dom_utils::scan_dom(&document, &mut |h| proc.visit(h)); + proc.apply()?; + assert_eq!( + serialize_for_test(&[document]), + "

A seat\nat a table.

The chair element represents a seat\nat a table.

A seat\nat a table.

" + ); + Ok(()) + } + + #[tokio::test] + async fn test_represents_undefined() -> io::Result<()> { + // Uses can occur either before or after. + let document = parse_document_async("

The chair element represents a seat\nat a table.

".as_bytes()).await?; + let mut proc = Processor::new(); + dom_utils::scan_dom(&document, &mut |h| proc.visit(h)); + let result = proc.apply(); + assert!(matches!(result, Err(e) if e.kind() == io::ErrorKind::InvalidData)); + Ok(()) + } +} diff --git a/src/tag_omission.rs b/src/tag_omission.rs new file mode 100644 index 00000000..edbb80a8 --- /dev/null +++ b/src/tag_omission.rs @@ -0,0 +1,312 @@ +//! Looks at the "Optional tags" and "Void elements" sections from the HTML +//! syntax spec and replicates that information into the descriptions of the +//! individual elements. + +use std::borrow::Borrow; +use std::collections::HashMap; +use std::io; + +use html5ever::tendril::StrTendril; +use html5ever::{local_name, namespace_url, ns, LocalName, QualName}; +use markup5ever_rcdom::{Handle, NodeData}; +use regex::Regex; + +use crate::dom_utils::{self, heading_level, NodeHandleExt}; + +#[derive(Default)] +struct ElementInfo { + /// Handles on any paragraphs in the "Optional tags" section which refer to the element. + optional_tags_info: Vec, + + /// Whether the element appears in the "Void elements" list. + is_void_element: bool, + + ///

into which this info must be added. + dl: Option, +} + +#[derive(Default)] +pub struct Processor { + /// The heading level of the "Optional tags" heading, if inside one. + in_optional_tags_heading: Option, + + /// Most recently seen . + most_recent_element_dfn: Option, + + /// Info about elements which have been referred to in these sections. + elements: HashMap, +} + +impl Processor { + pub fn new() -> Self { + Default::default() + } + + pub fn visit(&mut self, node: &Handle) { + // If the heading ends the "Optional tags" section, clear that state. + if let Some(optional_tag_heading_level) = self.in_optional_tags_heading { + match heading_level(node) { + Some(level) if level <= optional_tag_heading_level => { + self.in_optional_tags_heading = None; + } + _ => (), + } + } + + // If we encounter an "Optional tags" section, start observing relevant paragraphs. + // When one is encountered, possibly add it. + if let Some(level) = heading_level(node) { + if node.text_content().trim() == "Optional tags" { + self.in_optional_tags_heading = Some(level); + } + } else if self.in_optional_tags_heading.is_some() && node.is_html_element(&local_name!("p")) + { + self.maybe_record_optional_tags_paragraph(node); + } + + // If we encounter the Void elements section, look for the next dt. + if node.is_html_element(&local_name!("dfn")) + && node.text_content().trim() == "Void elements" + { + if let Some(dt) = node + .parent_node() + .filter(|n| n.is_html_element(&local_name!("dt"))) + { + for dd in dom_utils::dt_descriptions(&dt) { + dom_utils::scan_dom(&dd, &mut |n| { + if n.is_html_element(&local_name!("code")) { + let info = self.elements.entry(n.text_content()).or_default(); + info.is_void_element = true; + } + }); + } + } + } + + // If we see an element dfn, watch out for the upcoming
. + if node.is_html_element(&local_name!("dfn")) + && node.has_attribute(&QualName::new(None, ns!(), LocalName::from("element"))) + { + self.most_recent_element_dfn = Some(node.text_content()); + } + + // If we see a
, record that. + if node.is_html_element(&local_name!("dl")) && node.has_class("element") { + if let Some(elem) = std::mem::take(&mut self.most_recent_element_dfn) { + let info = self.elements.entry(elem).or_default(); + if info.dl.is_none() { + info.dl = Some(node.clone()); + } + } + } + } + + fn maybe_record_optional_tags_paragraph(&mut self, paragraph: &Handle) { + // The paragraph must have the structure "A(n) img element..." + let children = paragraph.children.borrow(); + let mut iter = children.iter().fuse(); + match (iter.next(), iter.next(), iter.next()) { + (Some(a), Some(b), Some(c)) + if a.node_text() + .map_or(false, |t| t.trim() == "A" || t.trim() == "An") + && b.is_html_element(&local_name!("code")) + && c.node_text() + .map_or(false, |t| t.trim().starts_with("element")) => + { + let info = self.elements.entry(b.text_content()).or_default(); + info.optional_tags_info.push(paragraph.clone()); + } + _ => (), + } + } + + pub fn apply(self) -> io::Result<()> { + let data_x = LocalName::from("data-x"); + let qual_data_x = QualName::new(None, ns!(), data_x.clone()); + let dt = Handle::create_element(local_name!("dt")) + .child( + Handle::create_element(local_name!("span")) + .attribute(&data_x, "concept-element-tag-omission") + .text("Tag omission in text/html") + .build(), + ) + .text(":") + .build(); + let void_dd = Handle::create_element(local_name!("dd")) + .text("No ") + .child( + Handle::create_element(local_name!("span")) + .attribute(&data_x, "syntax-end-tag") + .text("end tag") + .build(), + ) + .text(".") + .build(); + let default_dd = Handle::create_element(local_name!("dd")) + .text("Neither tag is omissible.") + .build(); + let may_re = Regex::new(r"\bmay\b").unwrap(); + + for info in self.elements.into_values() { + let dl = match info.dl { + Some(dl) => dl, + None => continue, + }; + + let mut to_insert = vec![dt.deep_clone()]; + if !info.optional_tags_info.is_empty() { + // Convert

to

, replacing "may" with "can". + for p in info.optional_tags_info { + let borrowed_children = p.children.borrow(); + let new_children = borrowed_children.iter().map(|n| { + let new_node = n.deep_clone(); + dom_utils::scan_dom(&new_node, &mut |c| { + if let NodeData::Text { ref contents } = c.data { + let mut text = contents.borrow_mut(); + *text = StrTendril::from(may_re.replace(&text, "can").borrow()); + } + }); + new_node + }); + let dd = Handle::create_element(local_name!("dd")) + .children(new_children) + .build(); + to_insert.push(dd); + } + } else if info.is_void_element { + to_insert.push(void_dd.deep_clone()); + } else { + to_insert.push(default_dd.deep_clone()); + } + to_insert.push(Handle::create_text_node("\n")); + + let dl_children = dl.children.borrow(); + let attributes_dt = if let Some(attributes_dt) = dl_children.iter().find(|child| { + child.is_html_element(&local_name!("dt")) + && child + .any_child(|c| c.attribute_is(&qual_data_x, "concept-element-attributes")) + }) { + attributes_dt.clone() + } else { + continue; + }; + drop(dl_children); + dl.insert_children_before(&attributes_dt, to_insert.into_iter()); + } + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::parser::{parse_document_async, tests::serialize_for_test}; + + #[tokio::test] + async fn test_simple() -> io::Result<()> { + let document = parse_document_async( + r#" +

Optional tags

+

A td element does very tdish things and may be very cellular.

+

An audio element is quite audible.

+

Another section

+

A body element is ignored because it's in another section. +

+
Void elements +
img and meta are void. +
input is too. +
Non-void elements +
html is interesting but not void. +
+

Elements

+

audio +

+
+
+

body +

+
+
+

html +

+
+
+

img +

+
+
+

input +

+
+
+

meta +

+
+
+

td +

+
+
+ "# + .trim() + .as_bytes(), + ) + .await?; + let mut proc = Processor::new(); + dom_utils::scan_dom(&document, &mut |h| proc.visit(h)); + proc.apply()?; + assert_eq!( + serialize_for_test(&[document]), + r#" +

Optional tags

+

A td element does very tdish things and may be very cellular.

+

An audio element is quite audible.

+

Another section

+

A body element is ignored because it's in another section. +

+
Void elements +
img and meta are void. +
input is too. +
Non-void elements +
html is interesting but not void. +
+

Elements

+

audio +

+
Tag omission in text/html:
An audio element is quite audible.
+
+
+

body +

+
Tag omission in text/html:
Neither tag is omissible.
+
+
+

html +

+
Tag omission in text/html:
Neither tag is omissible.
+
+
+

img +

+
Tag omission in text/html:
No end tag.
+
+
+

input +

+
Tag omission in text/html:
No end tag.
+
+
+

meta +

+
Tag omission in text/html:
No end tag.
+
+
+

td +

+
Tag omission in text/html:
A td element does very tdish things and can be very cellular.
+
+
+ "#.trim()); + Ok(()) + } +}