diff --git a/CMakeLists.txt b/CMakeLists.txt index d804a522..bb414b75 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -108,6 +108,7 @@ set(ODR_SOURCE_FILES "src/odr/internal/html/html_writer.cpp" "src/odr/internal/html/image_file.cpp" "src/odr/internal/html/pdf_file.cpp" + "src/odr/internal/html/pdf_poppler_file.hpp" "src/odr/internal/html/text_file.cpp" "src/odr/internal/json/json_file.cpp" @@ -194,16 +195,32 @@ target_link_libraries(odr utf8::cpp ) -if(WITH_PDF2HTMLEX) - target_sources(odr PRIVATE "src/odr/internal/html/pdf2htmlEX_wrapper.cpp") +if (WITH_PDF2HTMLEX) find_package(pdf2htmlEX REQUIRED) - target_link_libraries(odr PRIVATE pdf2htmlex::pdf2htmlex) -endif(WITH_PDF2HTMLEX) -if(WITH_WVWARE) - target_sources(odr PRIVATE "src/odr/internal/html/wvWare_wrapper.cpp") + find_package(poppler REQUIRED) + target_sources(odr + PRIVATE + "src/odr/internal/html/pdf2htmlEX_wrapper.cpp" + "src/odr/internal/html/pdf_poppler_file.cpp" + "src/odr/internal/pdf_poppler/poppler_pdf_file.cpp" + ) + target_link_libraries(odr + PRIVATE + pdf2htmlex::pdf2htmlex + poppler::poppler + ) +endif () +if (WITH_WVWARE) find_package(wvware REQUIRED) - target_link_libraries(odr PRIVATE wvware::wvware) -endif(WITH_WVWARE) + target_sources(odr + PRIVATE + "src/odr/internal/html/wvWare_wrapper.cpp" + ) + target_link_libraries(odr + PRIVATE + wvware::wvware + ) +endif () if (EXISTS "${PROJECT_SOURCE_DIR}/.git") add_dependencies(odr check_git) diff --git a/conanfile.py b/conanfile.py index 16eab5f2..6b772b7a 100644 --- a/conanfile.py +++ b/conanfile.py @@ -44,7 +44,7 @@ def requirements(self): self.requires("uchardet/0.0.8") self.requires("utfcpp/4.0.4") if self.options.get_safe("with_pdf2htmlEX", False): - self.requires("pdf2htmlex/0.18.8.rc1-20240905-git") + self.requires("pdf2htmlex/0.18.8.rc1-20240905-git", transitive_headers=True, transitive_libs=True) if self.options.get_safe("with_wvWare", False): self.requires("wvware/1.2.9") diff --git a/src/odr/file.hpp b/src/odr/file.hpp index 
0b9f4823..f3abe05d 100644 --- a/src/odr/file.hpp +++ b/src/odr/file.hpp @@ -100,7 +100,7 @@ enum class FileLocation { /// @brief Collection of decoder engines. enum class DecoderEngine { odr, - pdf2html_ex, + poppler, wv_ware, }; diff --git a/src/odr/internal/html/pdf_file.hpp b/src/odr/internal/html/pdf_file.hpp index 6df0f55a..068b822a 100644 --- a/src/odr/internal/html/pdf_file.hpp +++ b/src/odr/internal/html/pdf_file.hpp @@ -1,5 +1,5 @@ -#ifndef ODR_INTERNAL_PDF_FILE_HPP -#define ODR_INTERNAL_PDF_FILE_HPP +#ifndef ODR_INTERNAL_HTML_PDF_FILE_HPP +#define ODR_INTERNAL_HTML_PDF_FILE_HPP #include @@ -17,4 +17,4 @@ Html translate_pdf_file(const PdfFile &pdf_file, const std::string &output_path, } -#endif // ODR_INTERNAL_PDF_FILE_HPP +#endif // ODR_INTERNAL_HTML_PDF_FILE_HPP diff --git a/src/odr/internal/html/pdf_poppler_file.cpp b/src/odr/internal/html/pdf_poppler_file.cpp new file mode 100644 index 00000000..23c2ef62 --- /dev/null +++ b/src/odr/internal/html/pdf_poppler_file.cpp @@ -0,0 +1,63 @@ +#include + +#include +#include +#include + +#include +#include +#include + +#include + +#include + +namespace odr::internal { + +/// Runs pdf2htmlEX with `output_path` as destination directory and `document.html` as output file name. +/// Throws WrongPassword or std::runtime_error when convert() raises one of the pdf2htmlEX exceptions handled below. +Html html::translate_pdf_poppler_file(const PopplerPdfFile &pdf_file, + const std::string &output_path, + const HtmlConfig &config) { + static const char *fontconfig_path = getenv("FONTCONFIG_PATH"); + if (nullptr == fontconfig_path) { + // Storage is allocated and after successful putenv, it will never be freed. + // This is the way of putenv.
+ char *storage = strdup("FONTCONFIG_PATH=" FONTCONFIG_PATH); + // A failed strdup yields nullptr, which must never be handed to putenv. + if (nullptr == storage || 0 != putenv(storage)) { + free(storage); + } + fontconfig_path = getenv("FONTCONFIG_PATH"); + } + + pdf2htmlEX::pdf2htmlEX pdf2htmlEX; + pdf2htmlEX.setDataDir(PDF2HTMLEX_DATA_DIR); + pdf2htmlEX.setPopplerDataDir(POPPLER_DATA_DIR); + + // NOTE(review): `pdf_file` is never read and no input is configured on + // `pdf2htmlEX` before convert() - presumably still to be wired up; confirm. + pdf2htmlEX.setDestinationDir(output_path); + auto output_file_name = "document.html"; + pdf2htmlEX.setOutputFilename(output_file_name); + + pdf2htmlEX.setDRM(false); + pdf2htmlEX.setProcessOutline(false); + pdf2htmlEX.setProcessAnnotation(true); + + try { + pdf2htmlEX.convert(); + } catch (const pdf2htmlEX::EncryptionPasswordException &) { + throw WrongPassword(); + } catch (const pdf2htmlEX::DocumentCopyProtectedException &) { + throw std::runtime_error("document is copy protected"); + } catch (const pdf2htmlEX::ConversionFailedException &e) { + throw std::runtime_error(std::string("conversion error ") + e.what()); + } + + return {FileType::portable_document_format, + config, + {{"document", output_path + "/" + output_file_name}}}; +} + +} // namespace odr::internal diff --git a/src/odr/internal/html/pdf_poppler_file.hpp b/src/odr/internal/html/pdf_poppler_file.hpp new file mode 100644 index 00000000..1bad1ac2 --- /dev/null +++ b/src/odr/internal/html/pdf_poppler_file.hpp @@ -0,0 +1,23 @@ +#ifndef ODR_INTERNAL_HTML_PDF_POPPLER_FILE_HPP +#define ODR_INTERNAL_HTML_PDF_POPPLER_FILE_HPP + +#include + +namespace odr { +// NOTE(review): this change defines PopplerPdfFile in +// odr::internal::poppler_pdf, not in odr - confirm the intended type. +class PopplerPdfFile; + +struct HtmlConfig; +class Html; +} // namespace odr + +namespace odr::internal::html { + +Html translate_pdf_poppler_file(const PopplerPdfFile &pdf_file, + const std::string &output_path, + const HtmlConfig &config); + +} + +#endif // ODR_INTERNAL_HTML_PDF_POPPLER_FILE_HPP diff --git a/src/odr/internal/pdf_poppler/poppler_pdf_file.cpp b/src/odr/internal/pdf_poppler/poppler_pdf_file.cpp new file mode 100644 index 00000000..462776e4 --- /dev/null +++ b/src/odr/internal/pdf_poppler/poppler_pdf_file.cpp @@ -0,0 +1,36 @@ +#include +
+#include + +#include +#include + +namespace odr::internal::poppler_pdf { + +PopplerPdfFile::PopplerPdfFile(std::shared_ptr file) + : m_file{std::move(file)} { + GooString file_path(m_file->disk_path()->string().c_str()); // `file` was moved from above, so the path is read through m_file + m_pdf_doc = std::unique_ptr(PDFDocFactory().createPDFDoc(file_path)); +} + +FileCategory PopplerPdfFile::file_category() const noexcept { + return FileCategory::document; +} + +std::shared_ptr PopplerPdfFile::file() const noexcept { + return m_file; +} + +FileType PopplerPdfFile::file_type() const noexcept { + return FileType::portable_document_format; +} + +FileMeta PopplerPdfFile::file_meta() const noexcept { return {}; } + +DecoderEngine PopplerPdfFile::decoder_engine() const noexcept { + return DecoderEngine::poppler; +} + +const PDFDoc &PopplerPdfFile::pdf_doc() const { return *m_pdf_doc; } + +} // namespace odr::internal::poppler_pdf diff --git a/src/odr/internal/pdf_poppler/poppler_pdf_file.hpp b/src/odr/internal/pdf_poppler/poppler_pdf_file.hpp new file mode 100644 index 00000000..06daa7ea --- /dev/null +++ b/src/odr/internal/pdf_poppler/poppler_pdf_file.hpp @@ -0,0 +1,34 @@ +#ifndef ODR_INTERNAL_POPPLER_PDF_FILE_HPP +#define ODR_INTERNAL_POPPLER_PDF_FILE_HPP + +#include + +class PDFDoc; + +namespace odr::internal::poppler_pdf { + +/// @brief Decoded PDF file that holds a poppler `PDFDoc`. +class PopplerPdfFile : public abstract::DecodedFile { +public: + /// @brief Creates the poppler document from the disk path of `file`. + /// NOTE(review): `disk_path()` is dereferenced unchecked - presumably the file must live on disk; confirm. + explicit PopplerPdfFile(std::shared_ptr file); + + [[nodiscard]] std::shared_ptr file() const noexcept final; + + [[nodiscard]] FileType file_type() const noexcept final; + [[nodiscard]] FileCategory file_category() const noexcept final; + [[nodiscard]] FileMeta file_meta() const noexcept final; + [[nodiscard]] DecoderEngine decoder_engine() const noexcept final; + + /// @brief Gives read access to the poppler document created by the constructor. + [[nodiscard]] const PDFDoc &pdf_doc() const; + +private: + std::shared_ptr m_file; + std::unique_ptr m_pdf_doc; +}; + +} // namespace odr::internal::poppler_pdf + +#endif // ODR_INTERNAL_POPPLER_PDF_FILE_HPP