Skip to content

Commit

Permalink
feature: PDF HTML service (#397)
Browse files Browse the repository at this point in the history
  • Loading branch information
andiwand authored Jan 6, 2025
1 parent 6ec27f5 commit 79b2bae
Show file tree
Hide file tree
Showing 7 changed files with 175 additions and 14 deletions.
2 changes: 1 addition & 1 deletion conanfile.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ def requirements(self):
self.requires("uchardet/0.0.8")
self.requires("utfcpp/4.0.4")
if self.options.get_safe("with_pdf2htmlEX", False):
self.requires("pdf2htmlex/0.18.8.rc1-git-6f85c88-odr")
self.requires("pdf2htmlex/0.18.8.rc1-odr-pr1")
if self.options.get_safe("with_wvWare", False):
self.requires("wvware/1.2.9-odr")

Expand Down
2 changes: 2 additions & 0 deletions src/odr/html_service.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,11 @@ class HtmlFragment;
class HtmlResource;

// Kind tag for an HTML resource.
// NOTE(review): the per-value meanings below are inferred from the enumerator
// names only - confirm against the code that consumes this enum.
enum class HtmlResourceType {
html_fragment, // presumably a piece of HTML markup
css, // presumably a stylesheet
js, // presumably a script
image, // presumably an image
font, // presumably a font file
};

using HtmlResourceLocation = std::optional<std::string>;
Expand Down
176 changes: 166 additions & 10 deletions src/odr/internal/html/pdf2htmlex_wrapper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,21 +4,24 @@
#include <odr/file.hpp>
#include <odr/html.hpp>

#include <odr/internal/html/common.hpp>
#include <odr/internal/html/html_service.hpp>
#include <odr/internal/html/html_writer.hpp>
#include <odr/internal/pdf_poppler/poppler_pdf_file.hpp>
#include <odr/internal/util/stream_util.hpp>

#include <pdf2htmlEX/HTMLRenderer/HTMLRenderer.h>
#include <pdf2htmlEX/Param.h>

#include <poppler/GlobalParams.h>
#include <poppler/PDFDoc.h>

namespace odr::internal {
namespace odr::internal::html {

Html html::translate_poppler_pdf_file(const PopplerPdfFile &pdf_file,
const std::string &output_path,
const HtmlConfig &config) {
PDFDoc &pdf_doc = pdf_file.pdf_doc();
namespace {

pdf2htmlEX::Param create_params(PDFDoc &pdf_doc, const HtmlConfig &config,
const std::string &output_path) {
pdf2htmlEX::Param param;

// pages
Expand All @@ -30,7 +33,7 @@ Html html::translate_poppler_pdf_file(const PopplerPdfFile &pdf_file,
param.fit_width = 0;
param.fit_height = 0;
param.use_cropbox = 1;
param.desired_dpi = 144.0;
param.desired_dpi = 144;

// output
param.embed_css = 1;
Expand All @@ -40,16 +43,17 @@ Html html::translate_poppler_pdf_file(const PopplerPdfFile &pdf_file,
param.embed_outline = 1;
param.split_pages = 0;
param.dest_dir = output_path;
param.css_filename = "";
param.page_filename = "";
param.outline_filename = "";
param.css_filename = "style.css";
param.page_filename = "page%i.html";
param.outline_filename = "outline.html";
param.process_nontext = 1;
param.process_outline = 1;
param.process_annotation = 0;
param.process_form = 0;
param.printing = 1;
param.fallback = 0;
param.tmp_file_size_limit = -1;
param.delay_background = 0;

// font
param.embed_external_font = 0; // TODO 1
Expand Down Expand Up @@ -86,7 +90,7 @@ Html html::translate_poppler_pdf_file(const PopplerPdfFile &pdf_file,

// misc
param.clean_tmp = 1;
param.tmp_dir = "/tmp";
param.tmp_dir = output_path;
param.data_dir = config.pdf2htmlex_data_path;
param.poppler_data_dir = config.poppler_data_path;
param.debug = 0;
Expand All @@ -97,6 +101,158 @@ Html html::translate_poppler_pdf_file(const PopplerPdfFile &pdf_file,
param.input_filename = "";
param.output_filename = "document.html";

return param;
}

} // namespace

/// Background image of a single PDF page, exposed as an HTML resource.
///
/// The image file is produced on demand: `write_resource` asks the shared
/// pdf2htmlEX renderer to render the page only if the expected file does not
/// exist yet, and then streams the file to the caller.
class BackgroundImageResource : public HtmlResource {
public:
  /// Builds the file name of the background image of a page.
  ///
  /// @param page_number page number (the service passes 1-based numbers).
  /// @param format image format, used verbatim as the file extension.
  /// @return `bg<page number in lowercase hex>.<format>`, e.g. `bg1a.png`
  ///         for page 26.
  static std::string file_name(std::size_t page_number,
                               const std::string &format) {
    std::stringstream stream;
    // The page number must be written exactly once, in hexadecimal.
    // Previously the decimal number was emitted as well (page 10 became
    // `bg10a.<format>`), so the name could not match a `bg<hex>.<format>`
    // file on disk.
    // NOTE(review): `bg%x.%s` is the naming used by pdf2htmlEX background
    // renderers - confirm against the pdf2htmlEX version pinned in conan.
    stream << "bg" << std::hex << page_number << "." << format;
    return stream.str();
  }

  /// @param pdf_file PDF file the page belongs to.
  /// @param output_path directory in which the image file is expected.
  /// @param html_renderer renderer used to produce the image on demand.
  /// @param html_renderer_mutex mutex guarding every use of `html_renderer`.
  /// @param page_number page whose background this resource represents.
  /// @param format image format / file extension, see `file_name`.
  BackgroundImageResource(
      PopplerPdfFile pdf_file, std::string output_path,
      std::shared_ptr<pdf2htmlEX::HTMLRenderer> html_renderer,
      std::shared_ptr<std::mutex> html_renderer_mutex, int page_number,
      const std::string &format)
      // `output_path` is still intact here: the base class is initialised
      // before the member that moves from it.
      : HtmlResource(HtmlResourceType::image, file_name(page_number, format),
                     output_path + "/" + file_name(page_number, format),
                     odr::File(), false, false),
        m_pdf_file{std::move(pdf_file)}, m_output_path{std::move(output_path)},
        m_html_renderer{std::move(html_renderer)},
        m_html_renderer_mutex{std::move(html_renderer_mutex)},
        m_page_number{page_number} {}

  /// Streams the image file to `os`, rendering the page first if the file is
  /// not on disk yet.
  void write_resource(std::ostream &os) const override {
    PDFDoc &pdf_doc = m_pdf_file.pdf_doc();

    // Serialises concurrent requests for this resource so the page is
    // rendered at most once and the file is not read while being written.
    std::lock_guard lock(m_mutex);

    if (!std::filesystem::exists(path())) {
      // The renderer is shared between all page resources.
      std::lock_guard renderer_lock(*m_html_renderer_mutex);

      m_html_renderer->renderPage(&pdf_doc, m_page_number);
    }

    {
      // Images are binary data: open in binary mode so no newline translation
      // is applied on platforms that distinguish text and binary streams.
      std::ifstream in(path(), std::ios::binary);
      util::stream::pipe(in, os);
    }
  }

private:
  PopplerPdfFile m_pdf_file;
  std::string m_output_path;
  std::shared_ptr<pdf2htmlEX::HTMLRenderer> m_html_renderer;
  std::shared_ptr<std::mutex> m_html_renderer_mutex;
  int m_page_number;
  mutable std::mutex m_mutex;
};

/// HTML service for a PDF that was converted with pdf2htmlEX.
///
/// On construction one `BackgroundImageResource` per page is registered in
/// `m_resources`; `write_document` copies the already generated
/// `document.html` from the output directory to the writer.
class HtmlServiceImpl : public HtmlService {
public:
  /// @param pdf_file PDF file being served.
  /// @param output_path directory containing the pdf2htmlEX output.
  /// @param html_renderer renderer shared with the background resources.
  /// @param html_renderer_mutex mutex guarding `html_renderer`.
  /// @param html_renderer_param parameters the renderer was created with;
  ///        `bg_format` determines the background image file names.
  /// @param config HTML configuration forwarded to the base class.
  /// @param resource_locator resource locator forwarded to the base class.
  HtmlServiceImpl(PopplerPdfFile pdf_file, std::string output_path,
                  std::shared_ptr<pdf2htmlEX::HTMLRenderer> html_renderer,
                  std::shared_ptr<std::mutex> html_renderer_mutex,
                  std::shared_ptr<pdf2htmlEX::Param> html_renderer_param,
                  HtmlConfig config, HtmlResourceLocator resource_locator)
      : HtmlService(std::move(config), std::move(resource_locator), {}),
        m_pdf_file{std::move(pdf_file)}, m_output_path{std::move(output_path)},
        m_html_renderer{std::move(html_renderer)},
        m_html_renderer_mutex{std::move(html_renderer_mutex)},
        m_html_renderer_param{std::move(html_renderer_param)} {
    const auto &bg_format = m_html_renderer_param->bg_format;
    const int page_count = m_pdf_file.pdf_doc().getNumPages();

    // Pages are numbered starting at 1.
    for (int page = 1; page <= page_count; ++page) {
      auto background = std::make_shared<BackgroundImageResource>(
          m_pdf_file, m_output_path, m_html_renderer, m_html_renderer_mutex,
          page, bg_format);
      std::string location = BackgroundImageResource::file_name(page, bg_format);
      m_resources.emplace_back(std::move(background), std::move(location));
    }
  }

  /// Copies `<output_path>/document.html` to the writer's output stream.
  ///
  /// @return an empty resource list.
  /// NOTE(review): the background images collected in `m_resources` are not
  /// part of the returned list - confirm this is intended.
  HtmlResources write_document(HtmlWriter &out) const override {
    const std::string document_path = m_output_path + "/document.html";

    std::ifstream document(document_path);
    util::stream::pipe(document, out.out());

    return HtmlResources{};
  }

private:
  PopplerPdfFile m_pdf_file;
  std::string m_output_path;
  std::shared_ptr<pdf2htmlEX::HTMLRenderer> m_html_renderer;
  std::shared_ptr<std::mutex> m_html_renderer_mutex;
  std::shared_ptr<pdf2htmlEX::Param> m_html_renderer_param;

  HtmlResources m_resources;
};

} // namespace odr::internal::html

namespace odr::internal {

/// Converts `pdf_file` with pdf2htmlEX into `output_path` and wraps the
/// result in an HTML service.
///
/// Images are not embedded and background rendering is delayed, so page
/// backgrounds can be produced later on request.
///
/// @throws DocumentCopyProtectedException if the document does not permit
///         copying and the `no_drm` parameter is 0.
odr::HtmlService
html::create_poppler_pdf_service(const PopplerPdfFile &pdf_file,
                                 const std::string &output_path,
                                 const HtmlConfig &config) {
  PDFDoc &pdf_doc = pdf_file.pdf_doc();

  auto html_renderer_param = std::make_shared<pdf2htmlEX::Param>(
      create_params(pdf_doc, config, output_path));
  html_renderer_param->embed_image = 0;
  html_renderer_param->delay_background = 1;

  // Refuse copy-protected documents unless DRM checks are disabled.
  if (!pdf_doc.okToCopy() && html_renderer_param->no_drm == 0) {
    throw DocumentCopyProtectedException("");
  }

  // poppler expects `nullptr` rather than an empty string when no data
  // directory is configured.
  const auto &poppler_data_dir = html_renderer_param->poppler_data_dir;
  const char *poppler_data =
      poppler_data_dir.empty() ? nullptr : poppler_data_dir.c_str();
  globalParams = std::make_unique<GlobalParams>(poppler_data);

  // TODO not sure what the `progPath` is used for. it cannot be `nullptr`
  // TODO potentially just a cache dir?
  auto html_renderer = std::make_shared<pdf2htmlEX::HTMLRenderer>(
      config.fontforge_data_path.c_str(), *html_renderer_param);
  html_renderer->process(&pdf_doc);

  // NOTE(review): the global poppler parameters are released here, while the
  // renderer is kept alive for later `renderPage` calls - confirm those calls
  // do not need `globalParams`.
  globalParams.reset();

  // renderer is not thread safe
  // TODO check if this can be achieved in pdf2htmlEX
  auto html_renderer_mutex = std::make_shared<std::mutex>();

  HtmlResourceLocator resource_locator =
      local_resource_locator(output_path, config);

  auto service_impl = std::make_shared<HtmlServiceImpl>(
      pdf_file, output_path, std::move(html_renderer),
      std::move(html_renderer_mutex), std::move(html_renderer_param), config,
      resource_locator);
  return odr::HtmlService(std::move(service_impl));
}

Html html::translate_poppler_pdf_file(const PopplerPdfFile &pdf_file,
const std::string &output_path,
const HtmlConfig &config) {
PDFDoc &pdf_doc = pdf_file.pdf_doc();

pdf2htmlEX::Param param = create_params(pdf_doc, config, output_path);

if (!pdf_doc.okToCopy()) {
if (param.no_drm == 0) {
throw DocumentCopyProtectedException("");
Expand Down
4 changes: 3 additions & 1 deletion src/odr/internal/html/pdf2htmlex_wrapper.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,9 @@ class PopplerPdfFile;

namespace odr::internal::html {

HtmlService translate_document(const PopplerPdfFile &pdf_file);
/// Creates an HTML service for a poppler backed PDF file.
///
/// The document is processed with pdf2htmlEX using `output_path` as the
/// destination directory. Throws `DocumentCopyProtectedException` if the
/// document does not permit copying and DRM checks are not disabled.
odr::HtmlService create_poppler_pdf_service(const PopplerPdfFile &pdf_file,
const std::string &output_path,
const HtmlConfig &config);

Html translate_poppler_pdf_file(const PopplerPdfFile &pdf_file,
const std::string &output_path,
Expand Down
2 changes: 1 addition & 1 deletion test/data/reference-output/odr-private
2 changes: 1 addition & 1 deletion test/data/reference-output/odr-public
Submodule odr-public updated 41 files
+15 −13 output/docx/11KB.docx/document.html
+15 −13 output/docx/12KB.docx/document.html
+68 −66 output/docx/15-MB-docx-file-download.docx/document.html
+51 −49 output/docx/15KB.docx/document.html
+651 −649 output/docx/170KB.docx/document.html
+24 −22 output/docx/23KB.docx/document.html
+3,513 −3,511 output/docx/785KB.docx/document.html
+3 −1 output/docx/empty.docx/document.html
+3 −1 output/docx/encrypted.docx/document.html
+163 −161 output/docx/file-sample_100kB.docx/document.html
+175 −173 output/docx/file-sample_1MB.docx/document.html
+175 −173 output/docx/file-sample_500kB.docx/document.html
+61 −59 output/docx/sample-docx-file-for-testing.docx/document.html
+983 −981 output/docx/sample1.docx/document.html
+7 −5 output/docx/sample2.docx/document.html
+244 −242 output/docx/sample3.docx/document.html
+653 −651 output/docx/sample4.docx/document.html
+91 −89 output/docx/style-various-1.docx/document.html
+917 −915 output/docx/tabletest.docx/document.html
+3 −1 output/odm/test.odm/document.html
+1 −1 output/odp/image-1.odp/slide0.html
+1 −1 output/odp/image-2.odp/slide0.html
+2 −2 output/odp/image-2.odp/slide1.html
+2 −2 output/odp/image-2.odp/slide2.html
+2 −2 output/odp/image-2.odp/slide3.html
+20 −18 output/odt/XYZ-Rechnung.odt/document.html
+51 −49 output/odt/about.odt/document.html
+3 −1 output/odt/empty.odt/document.html
+175 −173 output/odt/file-sample_100kB.odt/document.html
+199 −197 output/odt/file-sample_1MB.odt/document.html
+199 −197 output/odt/file-sample_500kB.odt/document.html
+12 −10 output/odt/image-text-wrap.odt/document.html
+6 −4 output/odt/mixed-layout.odt/document.html
+99 −97 output/odt/style-various-1.odt/document.html
+50 −48 output/odt/table-span.odt/document.html
+17 −9 output/pdf/empty.pdf-poppler/document.html
+119 −105 output/pdf/style-various-1.pdf-poppler/document.html
+3 −1 output/txt/iso_8859-1.txt/text.html
+3 −1 output/txt/lorem ipsum.txt/text.html
+3 −1 output/txt/open_document_reader_h.txt/text.html
+3 −3 output/xlsx/sample.xlsx/sheet0.html
1 change: 1 addition & 0 deletions test/docker/compare_output_server.sh
Original file line number Diff line number Diff line change
Expand Up @@ -16,5 +16,6 @@ fi
docker run -ti \
-v $(pwd):/repo \
-p 8000:8000 \
--platform linux/amd64 \
ghcr.io/opendocument-app/odr_core_test \
python3 /repo/test/scripts/compare_output_server.py /repo/$REF /repo/$OBS --compare --driver $DRIVER --port 8000

0 comments on commit 79b2bae

Please sign in to comment.