From 69c6d11f72465fd9083da7ef5eb4a466c7a1c314 Mon Sep 17 00:00:00 2001 From: dhx Date: Fri, 20 Dec 2024 22:01:35 +0100 Subject: [PATCH] implement pdf content extraction Signed-off-by: dhx --- composer.json | 3 +- composer.lock | 133 +++++++++++++++++++++++++++- lib/Service/AssistantService.php | 8 ++ src/components/fields/TextInput.vue | 2 +- 4 files changed, 143 insertions(+), 3 deletions(-) diff --git a/composer.json b/composer.json index c094a9b0..07622dc2 100644 --- a/composer.json +++ b/composer.json @@ -15,7 +15,8 @@ "erusev/parsedown": "^1.7", "henck/rtf-to-html": "^1.2", "html2text/html2text": "^4.3", - "phpoffice/phpword": "^1.2" + "phpoffice/phpword": "^1.2", + "smalot/pdfparser": "^2.10" }, "scripts": { "lint": "find . -name \\*.php -not -path './vendor/*' -print0 | xargs -0 -n1 php -l", diff --git a/composer.lock b/composer.lock index d1dcfc64..93344313 100644 --- a/composer.lock +++ b/composer.lock @@ -4,7 +4,7 @@ "Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies", "This file is @generated automatically" ], - "content-hash": "0ca6f4f9023891c92d09312fec397405", + "content-hash": "685a9abb953b330d6dcc1822087f13cc", "packages": [ { "name": "erusev/parsedown", @@ -302,6 +302,137 @@ "source": "https://github.com/PHPOffice/PHPWord/tree/1.2.0" }, "time": "2023-11-30T11:22:23+00:00" + }, + { + "name": "smalot/pdfparser", + "version": "v2.11.0", + "source": { + "type": "git", + "url": "https://github.com/smalot/pdfparser.git", + "reference": "ac8e6678b0940e4b2ccd5caadd3fb18e68093be6" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/smalot/pdfparser/zipball/ac8e6678b0940e4b2ccd5caadd3fb18e68093be6", + "reference": "ac8e6678b0940e4b2ccd5caadd3fb18e68093be6", + "shasum": "" + }, + "require": { + "ext-iconv": "*", + "ext-zlib": "*", + "php": ">=7.1", + "symfony/polyfill-mbstring": "^1.18" + }, + "type": "library", + "autoload": { + "psr-0": { + "Smalot\\PdfParser\\": "src/" + } + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "LGPL-3.0" + ], + "authors": [ + { + "name": "Sebastien MALOT", + "email": "sebastien@malot.fr" + } + ], + "description": "Pdf parser library. Can read and extract information from pdf file.", + "homepage": "https://www.pdfparser.org", + "keywords": [ + "extract", + "parse", + "parser", + "pdf", + "text" + ], + "support": { + "issues": "https://github.com/smalot/pdfparser/issues", + "source": "https://github.com/smalot/pdfparser/tree/v2.11.0" + }, + "time": "2024-08-16T06:48:03+00:00" + }, + { + "name": "symfony/polyfill-mbstring", + "version": "v1.31.0", + "source": { + "type": "git", + "url": "https://github.com/symfony/polyfill-mbstring.git", + "reference": "85181ba99b2345b0ef10ce42ecac37612d9fd341" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/symfony/polyfill-mbstring/zipball/85181ba99b2345b0ef10ce42ecac37612d9fd341", + "reference": "85181ba99b2345b0ef10ce42ecac37612d9fd341", + "shasum": "" + }, + "require": { + "php": ">=7.2" + }, + "provide": { + "ext-mbstring": "*" + }, + "suggest": { + "ext-mbstring": "For best performance" + }, + "type": "library", + "extra": { + "thanks": { + "url": "https://github.com/symfony/polyfill", + "name": "symfony/polyfill" + } + }, + "autoload": { + "files": [ + "bootstrap.php" + ], + "psr-4": { + "Symfony\\Polyfill\\Mbstring\\": "" + } + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "MIT" + ], + "authors": [ + { + "name": "Nicolas Grekas", + "email": "p@tchwork.com" + }, + { + "name": "Symfony Community", + "homepage": "https://symfony.com/contributors" + } + ], + "description": "Symfony polyfill for the Mbstring extension", + "homepage": "https://symfony.com", + "keywords": [ + "compatibility", + "mbstring", + "polyfill", + "portable", + "shim" + ], + "support": { + "source": "https://github.com/symfony/polyfill-mbstring/tree/v1.31.0" + }, + "funding": [ + { + "url": "https://symfony.com/sponsor", + "type": "custom" + }, + { + "url": "https://github.com/fabpot", + "type": "github" + }, + { + "url": "https://tidelift.com/funding/github/packagist/symfony/symfony", + "type": "tidelift" + } + ], + "time": "2024-09-09T11:45:10+00:00" } ], "packages-dev": [ diff --git a/lib/Service/AssistantService.php b/lib/Service/AssistantService.php index 1710bdc5..c7406add 100644 --- a/lib/Service/AssistantService.php +++ b/lib/Service/AssistantService.php @@ -46,6 +46,7 @@ use RtfHtmlPhp\Document; use RtfHtmlPhp\Html\HtmlFormatter; use RuntimeException; +use Smalot\PdfParser\Parser; /** * @psalm-import-type AssistantTaskProcessingTaskType from ResponseDefinitions @@ -613,6 +614,13 @@ public function parseTextFromFile(string $userId, ?string $filePath = null, ?int $this->tempManager->clean(); break; } + case 'application/pdf': + { + $parser = new Parser(); + $pdf = $parser->parseContent($fileContent); + $text = $pdf->getText(); + break; + } } return $text; } diff --git a/src/components/fields/TextInput.vue b/src/components/fields/TextInput.vue index 3d7d1e2c..a3c533fb 100644 --- a/src/components/fields/TextInput.vue +++ b/src/components/fields/TextInput.vue @@ -60,7 +60,7 @@ const VALID_MIME_TYPES = [ 'application/msword', // doc 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', // docx 'application/vnd.oasis.opendocument.text', // odt - // 'application/pdf', // Not yet supported + 'application/pdf', // pdf ] const picker = (callback, target) => getFilePickerBuilder(t('assistant', 'Choose a text file'))