diff --git a/composer.json b/composer.json index c9d597aa..87a0ba91 100644 --- a/composer.json +++ b/composer.json @@ -13,6 +13,8 @@ "require": { "php": "^8.1", "erusev/parsedown": "^1.7", + "henck/rtf-to-html": "^1.2", + "html2text/html2text": "^4.3", "phpoffice/phpword": "^1.2" }, "scripts": { diff --git a/composer.lock b/composer.lock index 3e0bfb26..be314901 100644 --- a/composer.lock +++ b/composer.lock @@ -4,7 +4,7 @@ "Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies", "This file is @generated automatically" ], - "content-hash": "973a79ca7eed8906c0085fac94e38a3c", + "content-hash": "2cb47e073c2bf1c487c4e986bb98e2a0", "packages": [ { "name": "erusev/parsedown", @@ -56,6 +56,94 @@ }, "time": "2019-12-30T22:54:17+00:00" }, + { + "name": "henck/rtf-to-html", + "version": "1.2", + "source": { + "type": "git", + "url": "https://github.com/henck/rtf-html-php.git", + "reference": "ff984a1d44fa359c85c0cffa0d66e991bf4dc967" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/henck/rtf-html-php/zipball/ff984a1d44fa359c85c0cffa0d66e991bf4dc967", + "reference": "ff984a1d44fa359c85c0cffa0d66e991bf4dc967", + "shasum": "" + }, + "require": { + "ext-mbstring": "*" + }, + "require-dev": { + "phpunit/phpunit": "7" + }, + "type": "library", + "autoload": { + "psr-4": { + "RtfHtmlPhp\\": "src/" + } + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "GPL-2.0" + ], + "authors": [ + { + "name": "Alexander van Oostenrijk", + "email": "alex.vanoostenrijk@gmail.com" + } + ], + "description": "RTF to HTML converter in PHP", + "keywords": [ + "converter", + "rtf" + ], + "support": { + "issues": "https://github.com/henck/rtf-html-php/issues", + "source": "https://github.com/henck/rtf-html-php/tree/v1.2" + }, + "time": "2021-09-10T15:19:14+00:00" + }, + { + "name": "html2text/html2text", + "version": "4.3.1", + "source": { + "type": "git", + "url": "https://github.com/mtibben/html2text.git", + "reference": "61ad68e934066a6f8df29a3d23a6460536d0855c" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/mtibben/html2text/zipball/61ad68e934066a6f8df29a3d23a6460536d0855c", + "reference": "61ad68e934066a6f8df29a3d23a6460536d0855c", + "shasum": "" + }, + "require-dev": { + "phpunit/phpunit": "~4" + }, + "suggest": { + "ext-mbstring": "For best performance", + "symfony/polyfill-mbstring": "If you can't install ext-mbstring" + }, + "type": "library", + "autoload": { + "psr-4": { + "Html2Text\\": [ + "src/", + "test/" + ] + } + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "GPL-2.0-or-later" + ], + "description": "Converts HTML to formatted plain text", + "support": { + "issues": "https://github.com/mtibben/html2text/issues", + "source": "https://github.com/mtibben/html2text/tree/4.3.1" + }, + "time": "2020-04-16T23:44:31+00:00" + }, { "name": "phpoffice/math", "version": "0.1.0", @@ -222,16 +310,16 @@ "packages-dev": [ { "name": "adhocore/cli", - "version": "v1.6.2", + "version": "v1.7.1", "source": { "type": "git", "url": "https://github.com/adhocore/php-cli.git", - "reference": "34191315b0da20b9b4ecad783d91db992fa209a4" + "reference": "3fde60a838912e71c82ed0f48048685dc32dbc77" }, "dist": { "type": "zip", - "url": "https://api.github.com/repos/adhocore/php-cli/zipball/34191315b0da20b9b4ecad783d91db992fa209a4", - "reference": "34191315b0da20b9b4ecad783d91db992fa209a4", + "url": "https://api.github.com/repos/adhocore/php-cli/zipball/3fde60a838912e71c82ed0f48048685dc32dbc77", + "reference": "3fde60a838912e71c82ed0f48048685dc32dbc77", "shasum": "" }, "require": { @@ -276,7 +364,7 @@ ], "support": { "issues": "https://github.com/adhocore/php-cli/issues", - "source": "https://github.com/adhocore/php-cli/tree/v1.6.2" + "source": "https://github.com/adhocore/php-cli/tree/v1.7.1" }, "funding": [ { @@ -288,7 +376,7 @@ "type": "github" } ], - "time": "2024-01-22T22:37:23+00:00" + "time": "2024-03-28T08:30:12+00:00" }, { "name": "doctrine/instantiator", @@ -466,12 +554,12 @@ "source": { "type": "git", "url": "https://github.com/nextcloud-deps/ocp.git", - "reference": "b8ae36e3e98db74bafd3dc7360cc758572e939b0" + "reference": "c8c98085481bc485e101b0992f808bf82df705f7" }, "dist": { "type": "zip", - "url": "https://api.github.com/repos/nextcloud-deps/ocp/zipball/b8ae36e3e98db74bafd3dc7360cc758572e939b0", - "reference": "b8ae36e3e98db74bafd3dc7360cc758572e939b0", + "url": "https://api.github.com/repos/nextcloud-deps/ocp/zipball/c8c98085481bc485e101b0992f808bf82df705f7", + "reference": "c8c98085481bc485e101b0992f808bf82df705f7", "shasum": "" }, "require": { @@ -503,7 +591,7 @@ "issues": "https://github.com/nextcloud-deps/ocp/issues", "source": "https://github.com/nextcloud-deps/ocp/tree/master" }, - "time": "2024-04-03T00:32:10+00:00" + "time": "2024-04-15T07:27:17+00:00" }, { "name": "nextcloud/openapi-extractor", @@ -511,23 +599,23 @@ "source": { "type": "git", "url": "https://github.com/nextcloud/openapi-extractor.git", - "reference": "5379789c3207b81a5b71960c047cfbf50b5be0d6" + "reference": "50acc06715d9ac182e40d31d19aa3e4de517e1da" }, "dist": { "type": "zip", - "url": "https://api.github.com/repos/nextcloud/openapi-extractor/zipball/5379789c3207b81a5b71960c047cfbf50b5be0d6", - "reference": "5379789c3207b81a5b71960c047cfbf50b5be0d6", + "url": "https://api.github.com/repos/nextcloud/openapi-extractor/zipball/50acc06715d9ac182e40d31d19aa3e4de517e1da", + "reference": "50acc06715d9ac182e40d31d19aa3e4de517e1da", "shasum": "" }, "require": { - "adhocore/cli": "^v1.6", + "adhocore/cli": "^1.7", "ext-simplexml": "*", - "nikic/php-parser": "^4.16", + "nikic/php-parser": "^5.0", "php": "^8.1", - "phpstan/phpdoc-parser": "^1.23" + "phpstan/phpdoc-parser": "^1.28" }, "require-dev": { - "nextcloud/coding-standard": "^1.1" + "nextcloud/coding-standard": "^1.2" }, "default-branch": true, "bin": [ @@ -558,29 +646,31 @@ "source": "https://github.com/nextcloud/openapi-extractor/tree/main", "issues": "https://github.com/nextcloud/openapi-extractor/issues" }, - "time": "2024-03-28T15:07:16+00:00" + "time": "2024-04-15T07:08:14+00:00" }, { "name": "nikic/php-parser", - "version": "v4.19.1", + "version": "v5.0.2", "source": { "type": "git", "url": "https://github.com/nikic/PHP-Parser.git", - "reference": "4e1b88d21c69391150ace211e9eaf05810858d0b" + "reference": "139676794dc1e9231bf7bcd123cfc0c99182cb13" }, "dist": { "type": "zip", - "url": "https://api.github.com/repos/nikic/PHP-Parser/zipball/4e1b88d21c69391150ace211e9eaf05810858d0b", - "reference": "4e1b88d21c69391150ace211e9eaf05810858d0b", + "url": "https://api.github.com/repos/nikic/PHP-Parser/zipball/139676794dc1e9231bf7bcd123cfc0c99182cb13", + "reference": "139676794dc1e9231bf7bcd123cfc0c99182cb13", "shasum": "" }, "require": { + "ext-ctype": "*", + "ext-json": "*", "ext-tokenizer": "*", - "php": ">=7.1" + "php": ">=7.4" }, "require-dev": { "ircmaxell/php-yacc": "^0.0.7", - "phpunit/phpunit": "^6.5 || ^7.0 || ^8.0 || ^9.0" + "phpunit/phpunit": "^7.0 || ^8.0 || ^9.0" }, "bin": [ "bin/php-parse" @@ -588,7 +678,7 @@ "type": "library", "extra": { "branch-alias": { - "dev-master": "4.9-dev" + "dev-master": "5.0-dev" } }, "autoload": { @@ -612,9 +702,9 @@ ], "support": { "issues": "https://github.com/nikic/PHP-Parser/issues", - "source": "https://github.com/nikic/PHP-Parser/tree/v4.19.1" + "source": "https://github.com/nikic/PHP-Parser/tree/v5.0.2" }, - "time": "2024-03-17T08:10:35+00:00" + "time": "2024-03-05T20:51:40+00:00" }, { "name": "phar-io/manifest", @@ -736,16 +826,16 @@ }, { "name": "php-cs-fixer/shim", - "version": "v3.52.1", + "version": "v3.53.0", "source": { "type": "git", "url": "https://github.com/PHP-CS-Fixer/shim.git", - "reference": "baec5a6d4b24bad4c930d39fde34b2b0c1c8cd94" + "reference": "1b2fab8b7351ce1feb7cadec1f0db4b43056f735" }, "dist": { "type": "zip", - "url": "https://api.github.com/repos/PHP-CS-Fixer/shim/zipball/baec5a6d4b24bad4c930d39fde34b2b0c1c8cd94", - "reference": "baec5a6d4b24bad4c930d39fde34b2b0c1c8cd94", + "url": "https://api.github.com/repos/PHP-CS-Fixer/shim/zipball/1b2fab8b7351ce1feb7cadec1f0db4b43056f735", + "reference": "1b2fab8b7351ce1feb7cadec1f0db4b43056f735", "shasum": "" }, "require": { @@ -782,22 +872,22 @@ "description": "A tool to automatically fix PHP code style", "support": { "issues": "https://github.com/PHP-CS-Fixer/shim/issues", - "source": "https://github.com/PHP-CS-Fixer/shim/tree/v3.52.1" + "source": "https://github.com/PHP-CS-Fixer/shim/tree/v3.53.0" }, - "time": "2024-03-19T21:03:12+00:00" + "time": "2024-04-08T15:08:36+00:00" }, { "name": "phpstan/phpdoc-parser", - "version": "1.27.0", + "version": "1.28.0", "source": { "type": "git", "url": "https://github.com/phpstan/phpdoc-parser.git", - "reference": "86e4d5a4b036f8f0be1464522f4c6b584c452757" + "reference": "cd06d6b1a1b3c75b0b83f97577869fd85a3cd4fb" }, "dist": { "type": "zip", - "url": "https://api.github.com/repos/phpstan/phpdoc-parser/zipball/86e4d5a4b036f8f0be1464522f4c6b584c452757", - "reference": "86e4d5a4b036f8f0be1464522f4c6b584c452757", + "url": "https://api.github.com/repos/phpstan/phpdoc-parser/zipball/cd06d6b1a1b3c75b0b83f97577869fd85a3cd4fb", + "reference": "cd06d6b1a1b3c75b0b83f97577869fd85a3cd4fb", "shasum": "" }, "require": { @@ -829,9 +919,9 @@ "description": "PHPDoc parser with support for nullable, intersection and generic types", "support": { "issues": "https://github.com/phpstan/phpdoc-parser/issues", - "source": "https://github.com/phpstan/phpdoc-parser/tree/1.27.0" + "source": "https://github.com/phpstan/phpdoc-parser/tree/1.28.0" }, - "time": "2024-03-21T13:14:53+00:00" + "time": "2024-04-03T18:51:33+00:00" }, { "name": "phpunit/php-code-coverage", @@ -1154,16 +1244,16 @@ }, { "name": "phpunit/phpunit", - "version": "9.6.18", + "version": "9.6.19", "source": { "type": "git", "url": "https://github.com/sebastianbergmann/phpunit.git", - "reference": "32c2c2d6580b1d8ab3c10b1e9e4dc263cc69bb04" + "reference": "a1a54a473501ef4cdeaae4e06891674114d79db8" }, "dist": { "type": "zip", - "url": "https://api.github.com/repos/sebastianbergmann/phpunit/zipball/32c2c2d6580b1d8ab3c10b1e9e4dc263cc69bb04", - "reference": "32c2c2d6580b1d8ab3c10b1e9e4dc263cc69bb04", + "url": "https://api.github.com/repos/sebastianbergmann/phpunit/zipball/a1a54a473501ef4cdeaae4e06891674114d79db8", + "reference": "a1a54a473501ef4cdeaae4e06891674114d79db8", "shasum": "" }, "require": { @@ -1237,7 +1327,7 @@ "support": { "issues": "https://github.com/sebastianbergmann/phpunit/issues", "security": "https://github.com/sebastianbergmann/phpunit/security/policy", - "source": "https://github.com/sebastianbergmann/phpunit/tree/9.6.18" + "source": "https://github.com/sebastianbergmann/phpunit/tree/9.6.19" }, "funding": [ { @@ -1253,7 +1343,7 @@ "type": "tidelift" } ], - "time": "2024-03-21T12:07:32+00:00" + "time": "2024-04-05T04:35:58+00:00" }, { "name": "psalm/phar", diff --git a/lib/Service/AssistantService.php b/lib/Service/AssistantService.php index 833631fd..f4572d40 100644 --- a/lib/Service/AssistantService.php +++ b/lib/Service/AssistantService.php @@ -4,6 +4,7 @@ require_once __DIR__ . '/../../vendor/autoload.php'; +use Html2Text\Html2Text; use OC\SpeechToText\TranscriptionJob; use OCA\Assistant\AppInfo\Application; use OCA\Assistant\Db\MetaTask; @@ -34,6 +35,8 @@ use Psr\Container\ContainerInterface; use Psr\Container\NotFoundExceptionInterface; use Psr\Log\LoggerInterface; +use RtfHtmlPhp\Document; +use RtfHtmlPhp\Html\HtmlFormatter; use RuntimeException; /** @@ -401,7 +404,7 @@ public function parseTextFromFile(string $filePath, string $userId): string { try { if ($file instanceof File) { - $contents = $file->getContent(); + $fileContent = $file->getContent(); } else { throw new \Exception('Provided path does not point to a file.'); } @@ -415,21 +418,25 @@ public function parseTextFromFile(string $filePath, string $userId): string { default: case 'text/plain': { - $text = $contents; + $text = $fileContent; break; } case 'text/markdown': { $parser = new Parsedown(); - $text = $parser->text($contents); + $text = $parser->text($fileContent); // Remove HTML tags: $text = strip_tags($text); break; } + case 'text/rtf': + { + $text = $this->parseRtfDocument($fileContent); + break; + } case 'application/vnd.openxmlformats-officedocument.wordprocessingml.document': case 'application/msword': - case 'application/rtf': case 'application/vnd.oasis.opendocument.text': { // Store the file in a temp dir and provide a path for the doc parser to use @@ -438,7 +445,7 @@ public function parseTextFromFile(string $filePath, string $userId): string { if (!file_exists(dirname($tempFilePath))) { mkdir(dirname($tempFilePath), 0700, true); } - file_put_contents($tempFilePath, $contents); + file_put_contents($tempFilePath, $fileContent); $text = $this->parseDocument($tempFilePath, $mimeType); @@ -470,11 +477,14 @@ private function parseDocument(string $filePath, string $mimeType): string { $readerType = 'MsDoc'; break; } - case 'application/rtf': - { - $readerType = 'RTF'; - break; - } + // RTF parsing is buggy in phpoffice + /* + case 'text/rtf': + { + $readerType = 'RTF'; + break; + } + */ case 'application/vnd.oasis.opendocument.text': { $readerType = 'ODText'; @@ -502,4 +512,15 @@ private function parseDocument(string $filePath, string $mimeType): string { return $outText; } + + private function parseRtfDocument(string $content): string { + // henck/rtf-to-html + $document = new Document($content); + $formatter = new HtmlFormatter('UTF-8'); + $htmlText = $formatter->Format($document); + + // html2text/html2text + $html = new Html2Text($htmlText); + return $html->getText(); + } } diff --git a/src/components/AssistantFormInputs.vue b/src/components/AssistantFormInputs.vue index 1841457f..2dccc54f 100644 --- a/src/components/AssistantFormInputs.vue +++ b/src/components/AssistantFormInputs.vue @@ -112,6 +112,7 @@ import { getFilePickerBuilder, showError } from '@nextcloud/dialogs' import { generateOcsUrl } from '@nextcloud/router' const VALID_MIME_TYPES = [ + 'text/rtf', 'text/plain', 'text/markdown', 'application/msword', // doc