From 0d7c466ca08b268cbee94358167b5a8a2d1986a8 Mon Sep 17 00:00:00 2001 From: Trenton Holmes <797416+stumpylog@users.noreply.github.com> Date: Thu, 18 Jul 2024 08:00:39 -0700 Subject: [PATCH 01/11] Fixes version for recent fix --- CHANGELOG.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 13aadb7..d4be18e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Update development to use `hatch test` and `hatch fmt` ([#17](https://github.com/stumpylog/tika-client/pull/17)) - Included `mypy` typing in the linting checks +### Fixed + +- Typo in README codeblock by @Chaostheorie ([#19](https://github.com/stumpylog/tika-client/pull/19)) + ## [0.5.0] - 2023-11-07 ### Added @@ -28,10 +32,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - `pypa/gh-action-pypi-publish` updated to v1.8.10 - CI testing now uses the official Apache Tika image (minimal) instead of the paperless-ngx image -### Fixed - -- Typo in README codeblock by @Chaostheorie ([#19](https://github.com/stumpylog/tika-client/pull/19)) - ## [0.4.0] - 2023-07-27 ### Added From fc96a15c9b6d43c9b75196d2bf52a13979aa97f6 Mon Sep 17 00:00:00 2001 From: Trenton H <797416+stumpylog@users.noreply.github.com> Date: Thu, 8 Aug 2024 19:58:05 -0700 Subject: [PATCH 02/11] Fixes README referring to the incorrect license --- CHANGELOG.md | 6 ++++++ README.md | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d4be18e..4251f4b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,12 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 
+## [Unreleased] + +### Fixed + +- Fixed the README referring to the wrong license text + ## [0.6.0] - 2024-07-18 ### Changed diff --git a/README.md b/README.md index dd12e53..1e8df65 100644 --- a/README.md +++ b/README.md @@ -87,4 +87,4 @@ This library attempts to provide a simpler interface, minimal lines of code and ## License -`tika-client` is distributed under the terms of the [GPL-3.0-only](https://spdx.org/licenses/GPL-3.0-only.html) license. +`tika-client` is distributed under the terms of the [Mozilla Public License 2.0](https://spdx.org/licenses/MPL-2.0.html) license. From 5de24d69ef3fc494277ca7ad709ceffd09e30a14 Mon Sep 17 00:00:00 2001 From: Trenton H <797416+stumpylog@users.noreply.github.com> Date: Thu, 8 Aug 2024 19:58:16 -0700 Subject: [PATCH 03/11] Adds SPDX license header to all source files --- CHANGELOG.md | 4 ++++ src/tika_client/__about__.py | 3 +++ src/tika_client/__init__.py | 4 ++++ src/tika_client/_constants.py | 4 ++++ src/tika_client/_resource_meta.py | 4 ++++ src/tika_client/_resource_recursive.py | 4 ++++ src/tika_client/_resource_tika.py | 4 ++++ src/tika_client/_types.py | 4 ++++ src/tika_client/_utils.py | 4 ++++ src/tika_client/client.py | 4 ++++ src/tika_client/data_models.py | 3 +++ 11 files changed, 42 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4251f4b..24f3d6a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Added + +- SPDX license headers were added to source files + ### Fixed - Fixed the README referring to the wrong license text diff --git a/src/tika_client/__about__.py b/src/tika_client/__about__.py index 906d362..60cd4f7 100644 --- a/src/tika_client/__about__.py +++ b/src/tika_client/__about__.py @@ -1 +1,4 @@ +# SPDX-FileCopyrightText: 2023-present Trenton H +# +# SPDX-License-Identifier: MPL-2.0 __version__ = "0.6.0" diff --git a/src/tika_client/__init__.py b/src/tika_client/__init__.py 
index c58c0be..26ea4af 100644 --- a/src/tika_client/__init__.py +++ b/src/tika_client/__init__.py @@ -1,3 +1,7 @@ +# SPDX-FileCopyrightText: 2023-present Trenton H +# +# SPDX-License-Identifier: MPL-2.0 + from tika_client.client import TikaClient from tika_client.data_models import DublinCoreKey from tika_client.data_models import TikaKey diff --git a/src/tika_client/_constants.py b/src/tika_client/_constants.py index 6881726..02c32e5 100644 --- a/src/tika_client/_constants.py +++ b/src/tika_client/_constants.py @@ -1,3 +1,7 @@ +# SPDX-FileCopyrightText: 2023-present Trenton H +# +# SPDX-License-Identifier: MPL-2.0 + from typing import Final # Only compress content which is larger than this diff --git a/src/tika_client/_resource_meta.py b/src/tika_client/_resource_meta.py index 70081cb..861c06e 100644 --- a/src/tika_client/_resource_meta.py +++ b/src/tika_client/_resource_meta.py @@ -1,3 +1,7 @@ +# SPDX-FileCopyrightText: 2023-present Trenton H +# +# SPDX-License-Identifier: MPL-2.0 + from pathlib import Path from typing import Final diff --git a/src/tika_client/_resource_recursive.py b/src/tika_client/_resource_recursive.py index f04b46b..14a2bb3 100644 --- a/src/tika_client/_resource_recursive.py +++ b/src/tika_client/_resource_recursive.py @@ -1,3 +1,7 @@ +# SPDX-FileCopyrightText: 2023-present Trenton H +# +# SPDX-License-Identifier: MPL-2.0 + from __future__ import annotations import logging diff --git a/src/tika_client/_resource_tika.py b/src/tika_client/_resource_tika.py index 3e156b9..d1bff2b 100644 --- a/src/tika_client/_resource_tika.py +++ b/src/tika_client/_resource_tika.py @@ -1,3 +1,7 @@ +# SPDX-FileCopyrightText: 2023-present Trenton H +# +# SPDX-License-Identifier: MPL-2.0 + from pathlib import Path from typing import Final diff --git a/src/tika_client/_types.py b/src/tika_client/_types.py index e8f878a..1f0e5fe 100644 --- a/src/tika_client/_types.py +++ b/src/tika_client/_types.py @@ -1,3 +1,7 @@ +# SPDX-FileCopyrightText: 2023-present Trenton H +# 
+# SPDX-License-Identifier: MPL-2.0 + from __future__ import annotations import sys diff --git a/src/tika_client/_utils.py b/src/tika_client/_utils.py index 7b8b895..3f17c0c 100644 --- a/src/tika_client/_utils.py +++ b/src/tika_client/_utils.py @@ -1,3 +1,7 @@ +# SPDX-FileCopyrightText: 2023-present Trenton H +# +# SPDX-License-Identifier: MPL-2.0 + from __future__ import annotations import logging diff --git a/src/tika_client/client.py b/src/tika_client/client.py index 390196a..208cb4b 100644 --- a/src/tika_client/client.py +++ b/src/tika_client/client.py @@ -1,3 +1,7 @@ +# SPDX-FileCopyrightText: 2023-present Trenton H +# +# SPDX-License-Identifier: MPL-2.0 + from __future__ import annotations import logging diff --git a/src/tika_client/data_models.py b/src/tika_client/data_models.py index a39da4b..c85de10 100644 --- a/src/tika_client/data_models.py +++ b/src/tika_client/data_models.py @@ -1,3 +1,6 @@ +# SPDX-FileCopyrightText: 2023-present Trenton H +# +# SPDX-License-Identifier: MPL-2.0 from __future__ import annotations import logging From d71318c1e1621842858c550a6a874cb332470a81 Mon Sep 17 00:00:00 2001 From: Trenton H <797416+stumpylog@users.noreply.github.com> Date: Thu, 8 Aug 2024 19:58:22 -0700 Subject: [PATCH 04/11] Fixes the creation of unused loggers --- CHANGELOG.md | 1 + src/tika_client/_resource_recursive.py | 3 --- src/tika_client/_utils.py | 3 --- src/tika_client/data_models.py | 3 --- 4 files changed, 1 insertion(+), 9 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 24f3d6a..1032e36 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Fixed - Fixed the README referring to the wrong license text +- Fixed the creation of loggers for the library which were never utilized ## [0.6.0] - 2024-07-18 diff --git a/src/tika_client/_resource_recursive.py b/src/tika_client/_resource_recursive.py index 14a2bb3..e23a264 100644 --- 
a/src/tika_client/_resource_recursive.py +++ b/src/tika_client/_resource_recursive.py @@ -4,7 +4,6 @@ from __future__ import annotations -import logging from typing import TYPE_CHECKING from typing import Final @@ -18,8 +17,6 @@ from tika_client._types import MimeType from tika_client.data_models import TikaResponse -logger = logging.getLogger("tika-client.rmeta") - class _TikaRmetaBase(BaseResource): def _common_call( diff --git a/src/tika_client/_utils.py b/src/tika_client/_utils.py index 3f17c0c..83dc7cb 100644 --- a/src/tika_client/_utils.py +++ b/src/tika_client/_utils.py @@ -4,7 +4,6 @@ from __future__ import annotations -import logging import urllib.parse from typing import TYPE_CHECKING @@ -19,8 +18,6 @@ from tika_client._types import MimeType from tika_client._types import RequestContent -logger = logging.getLogger("tika-client.utils") - class BaseResource: def __init__(self, client: Client, *, compress: bool) -> None: diff --git a/src/tika_client/data_models.py b/src/tika_client/data_models.py index c85de10..b541e32 100644 --- a/src/tika_client/data_models.py +++ b/src/tika_client/data_models.py @@ -3,7 +3,6 @@ # SPDX-License-Identifier: MPL-2.0 from __future__ import annotations -import logging import re from datetime import datetime from datetime import timedelta @@ -12,8 +11,6 @@ # Based on https://cwiki.apache.org/confluence/display/TIKA/Metadata+Overview -logger = logging.getLogger("tika-client.data") - _TIME_RE = re.compile( r"(?P<year>\d{4})-" r"(?P<month>\d{2})-" From 8e0152845a4b4c9fcb5af425317108b6d93cbd78 Mon Sep 17 00:00:00 2001 From: Trenton H <797416+stumpylog@users.noreply.github.com> Date: Thu, 8 Aug 2024 19:58:29 -0700 Subject: [PATCH 05/11] Update development tool version restrictions --- pyproject.toml | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 3fa4443..4082016 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -62,7 +62,7 @@ installer = "uv"
[tool.hatch.envs.hatch-static-analysis] # https://hatch.pypa.io/latest/config/internal/static-analysis/ -dependencies = ["ruff ~= 0.4.8"] +dependencies = ["ruff ~= 0.5.6"] config-path = "none" [tool.hatch.envs.hatch-test] @@ -71,13 +71,13 @@ parallel = true randomize = true dependencies = [ "coverage-enable-subprocess == 1.0", - "coverage[toml] ~= 7.4", + "coverage[toml] ~= 7.6", "pytest < 8.0; python_version < '3.9'", - "pytest ~= 8.2; python_version >= '3.9'", - "pytest-mock ~= 3.12", + "pytest ~= 8.3; python_version >= '3.9'", + "pytest-mock ~= 3.14", "pytest-randomly ~= 3.15", "pytest-rerunfailures ~= 14.0", - "pytest-xdist[psutil] ~= 3.5", + "pytest-xdist[psutil] ~= 3.6", ] extra-dependencies = [ "pytest-sugar", @@ -113,7 +113,7 @@ python = ["3.8", "3.9", "3.10", "3.11", "3.12", "pypy3.8", "pypy3.9", "pypy3.10" [tool.hatch.envs.typing] detached = true dependencies = [ - "mypy ~= 1.10.0", + "mypy ~= 1.11.0", "httpx", ] @@ -127,7 +127,7 @@ run = [ template = "pre-commit" detached = true dependencies = [ - "pre-commit ~= 3.7.0", + "pre-commit ~= 3.8.0", ] [tool.hatch.envs.pre-commit.scripts] From e302d1513a9ca961af5a184f4af0dbf2f73fa921 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 8 Oct 2024 14:50:39 -0700 Subject: [PATCH 06/11] Bump pypa/gh-action-pypi-publish from 1.9.0 to 1.10.2 (#22) * Bump pypa/gh-action-pypi-publish from 1.9.0 to 1.10.2 Bumps [pypa/gh-action-pypi-publish](https://github.com/pypa/gh-action-pypi-publish) from 1.9.0 to 1.10.2. - [Release notes](https://github.com/pypa/gh-action-pypi-publish/releases) - [Commits](https://github.com/pypa/gh-action-pypi-publish/compare/v1.9.0...v1.10.2) --- updated-dependencies: - dependency-name: pypa/gh-action-pypi-publish dependency-type: direct:production update-type: version-update:semver-minor ... 
Signed-off-by: dependabot[bot] * Changelog note --------- Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Trenton Holmes --- .github/workflows/ci.yml | 2 +- CHANGELOG.md | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index b525a88..b841968 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -193,4 +193,4 @@ jobs: path: dist - name: Publish build to PyPI - uses: pypa/gh-action-pypi-publish@v1.9.0 + uses: pypa/gh-action-pypi-publish@v1.10.2 diff --git a/CHANGELOG.md b/CHANGELOG.md index 1032e36..afa73b4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -16,6 +16,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Fixed the README referring to the wrong license text - Fixed the creation of loggers for the library which were never utilized +### Changed + +- Bump pypa/gh-action-pypi-publish from 1.9.0 to 1.10.2 (by [@dependabot](https://github.com/apps/dependabot) in [#22](https://github.com/stumpylog/tika-client/pull/22)) + ## [0.6.0] - 2024-07-18 ### Changed From 8741c3bf019f8f4b9641a2efd324128eb7cb3bc9 Mon Sep 17 00:00:00 2001 From: Trenton H <797416+stumpylog@users.noreply.github.com> Date: Tue, 8 Oct 2024 14:57:53 -0700 Subject: [PATCH 07/11] Chore: Dependency updates (#23) * Upgrades dependencies and hook versions * Changelog note * Spelling fix --------- Co-authored-by: Trenton Holmes --- .docker/docker-compose.ci-test.yml | 2 +- .pre-commit-config.yaml | 17 +++- CHANGELOG.md | 1 + pyproject.toml | 135 +++++++++++++++-------------- tests/conftest.py | 4 +- 5 files changed, 89 insertions(+), 70 deletions(-) diff --git a/.docker/docker-compose.ci-test.yml b/.docker/docker-compose.ci-test.yml index c7b50d8..a273f14 100644 --- a/.docker/docker-compose.ci-test.yml +++ b/.docker/docker-compose.ci-test.yml @@ -1,6 +1,6 @@ # docker-compose file for running 
testing with Tika container # for a more end to end test of the Tika related functionality -# Can be used locally or by the CI to start the nessecary container with the +# Can be used locally or by the CI to start the necessary container with the # correct networking for the tests version: "3" diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 07d1065..e19aeb4 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -5,7 +5,7 @@ repos: # General hooks - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.6.0 + rev: v5.0.0 hooks: - id: check-docstring-first - id: check-json @@ -26,19 +26,28 @@ repos: - svg - id: check-case-conflict - id: detect-private-key - - repo: https://github.com/pre-commit/mirrors-prettier - rev: 'v3.1.0' + # See https://github.com/prettier/prettier/issues/15742 for the fork reason + - repo: https://github.com/rbubley/mirrors-prettier + rev: "v3.3.3" hooks: - id: prettier types_or: - javascript - ts - markdown + - repo: https://github.com/codespell-project/codespell + rev: v2.3.0 + hooks: + - id: codespell # Python hooks - repo: https://github.com/astral-sh/ruff-pre-commit - rev: 'v0.4.8' + rev: 'v0.6.9' hooks: # Run the linter. - id: ruff # Run the formatter. 
- id: ruff-format + - repo: https://github.com/tox-dev/pyproject-fmt + rev: "2.2.4" + hooks: + - id: pyproject-fmt diff --git a/CHANGELOG.md b/CHANGELOG.md index afa73b4..61412db 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,6 +19,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Changed - Bump pypa/gh-action-pypi-publish from 1.9.0 to 1.10.2 (by [@dependabot](https://github.com/apps/dependabot) in [#22](https://github.com/stumpylog/tika-client/pull/22)) +- Update `pre-commit` to 4.0.1 ([#23](https://github.com/stumpylog/tika-client/pull/23)) ## [0.6.0] - 2024-07-18 diff --git a/pyproject.toml b/pyproject.toml index 4082016..50621ae 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,29 +1,29 @@ # # Project Configuration # + [build-system] -requires = ["hatchling"] build-backend = "hatchling.build" +requires = [ "hatchling" ] + [project] name = "tika-client" -dynamic = ["version"] description = "A modern REST client for Apache Tika server" readme = "README.md" -requires-python = ">=3.8" +keywords = [ "api", "client", "html", "office", "pdf", "tika" ] license = "MPL-2.0" -keywords = ["api", "pdf", "html", "client", "office", "tika"] authors = [ { name = "Trenton H", email = "rda0128ou@mozmail.com" }, ] +requires-python = ">=3.8" classifiers = [ "Development Status :: 4 - Beta", + "Environment :: Web Environment", + "Intended Audience :: Developers", "License :: OSI Approved :: Mozilla Public License 2.0 (MPL 2.0)", "Operating System :: OS Independent", - "Intended Audience :: Developers", - "Environment :: Web Environment", "Programming Language :: Python", - "Programming Language :: Python :: 3", "Programming Language :: Python :: 3 :: Only", "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", @@ -33,28 +33,29 @@ classifiers = [ "Programming Language :: Python :: Implementation :: CPython", "Programming Language :: Python :: Implementation :: PyPy", ] +dynamic = [ "version" ] 
dependencies = [ - "httpx ~= 0.27; python_version >= '3.9'", - "httpx ~= 0.24; python_version < '3.9'", - "typing-extensions; python_version < '3.11'" + "httpx~=0.24; python_version<'3.9'", + "httpx~=0.27; python_version>='3.9'", + "typing-extensions; python_version<'3.11'", ] -[project.urls] -Documentation = "https://github.com/stumpylog/tika-rest-client#readme" -Issues = "https://github.com/stumpylog/tika-rest-client/issues" -Source = "https://github.com/stumpylog/tika-rest-client" -Changelog = "https://github.com/stumpylog/tika-rest-client/blob/main/CHANGELOG.md" +urls.Changelog = "https://github.com/stumpylog/tika-rest-client/blob/main/CHANGELOG.md" # # Hatch Configuration # +urls.Documentation = "https://github.com/stumpylog/tika-rest-client#readme" +urls.Issues = "https://github.com/stumpylog/tika-rest-client/issues" +urls.Source = "https://github.com/stumpylog/tika-rest-client" + [tool.hatch.version] path = "src/tika_client/__about__.py" [tool.hatch.build.targets.sdist] exclude = [ ".github", - ".docker" + ".docker", ] [tool.hatch.envs.default] @@ -62,7 +63,7 @@ installer = "uv" [tool.hatch.envs.hatch-static-analysis] # https://hatch.pypa.io/latest/config/internal/static-analysis/ -dependencies = ["ruff ~= 0.5.6"] +dependencies = [ "ruff ~= 0.6" ] config-path = "none" [tool.hatch.envs.hatch-test] @@ -81,31 +82,32 @@ dependencies = [ ] extra-dependencies = [ "pytest-sugar", - "pytest-httpx ~= 0.30; python_version >= '3.9'", + "pytest-httpx == 0.30.0; python_version >= '3.9'", "pytest-httpx ~= 0.22; python_version < '3.9'", "python-magic", ] -extra-args = ["--maxprocesses=8", "--pythonwarnings=all"] +extra-args = [ "--maxprocesses=8", "--pythonwarnings=all" ] [tool.hatch.envs.hatch-test.scripts] run = [ "python3 --version", - "pytest{env:HATCH_TEST_ARGS:} {args}"] + "pytest{env:HATCH_TEST_ARGS:} {args}", +] run-cov = [ "python3 --version", "coverage erase", - "coverage run -m pytest{env:HATCH_TEST_ARGS:} {args}" + "coverage run -m pytest{env:HATCH_TEST_ARGS:} 
{args}", ] -cov-combine = ["coverage combine"] +cov-combine = [ "coverage combine" ] cov-report = [ "coverage report", "coverage json", - "coverage html" + "coverage html", ] [[tool.hatch.envs.hatch-test.matrix]] -python = ["3.8", "3.9", "3.10", "3.11", "3.12", "pypy3.8", "pypy3.9", "pypy3.10"] +python = [ "3.8", "3.9", "3.10", "3.11", "3.12", "pypy3.8", "pypy3.9", "pypy3.10" ] # # Custom Environments @@ -113,40 +115,41 @@ python = ["3.8", "3.9", "3.10", "3.11", "3.12", "pypy3.8", "pypy3.9", "pypy3.10" [tool.hatch.envs.typing] detached = true dependencies = [ - "mypy ~= 1.11.0", + "mypy ~= 1.11", "httpx", ] [tool.hatch.envs.typing.scripts] run = [ "mypy --version", - "mypy --install-types --non-interactive {args:src/tika_client}" + "mypy --install-types --non-interactive {args:src/tika_client}", ] [tool.hatch.envs.pre-commit] template = "pre-commit" detached = true dependencies = [ - "pre-commit ~= 3.8.0", + "pre-commit ~= 4.0", + "pre-commit-uv", ] [tool.hatch.envs.pre-commit.scripts] -check = ["pre-commit run --all-files"] -update = ["pre-commit autoupdate"] +check = [ "pre-commit run --all-files" ] +update = [ "pre-commit autoupdate" ] # # Tool Configuration # + [tool.ruff] -# https://docs.astral.sh/ruff/settings/ -fix = true -output-format = "grouped" target-version = "py38" line-length = 120 -[tool.ruff.lint] +# https://docs.astral.sh/ruff/settings/ +fix = true +output-format = "grouped" # https://docs.astral.sh/ruff/rules/ -extend-select = [ +lint.extend-select = [ "A", "ARG", "B", @@ -169,15 +172,15 @@ extend-select = [ "ISC", "N", "PERF", - "PIE", "PGH", - "PTH", + "PIE", "PL", "PLC", "PLE", "PLR", "PLW", "PT", + "PTH", "Q", "RSE", "RUF", @@ -195,38 +198,47 @@ extend-select = [ "W", "YTT", ] -ignore = [ +lint.ignore = [ # Allow non-abstract empty methods in abstract base classes "B027", + # Ignore complexity + "C901", # Allow boolean positional values in function calls, like `dict.get(... 
True)` "FBT003", + "PLR0911", + "PLR0912", + "PLR0913", + "PLR0915", # Ignore checks for possible passwords - "S105", "S106", "S107", - # Ignore complexity - "C901", "PLR0911", "PLR0912", "PLR0913", "PLR0915", + "S105", + "S106", + "S107", # Ignore no author and missing issue link in TODO tags - "TD002", "TD003" + "TD002", + "TD003", ] - -[tool.ruff.lint.isort] -force-single-line = true -known-first-party = ["tika_client"] - -[tool.ruff.lint.flake8-tidy-imports] -ban-relative-imports = "all" - -[tool.ruff.lint.per-file-ignores] # Tests can use magic values, assertions, and relative imports -"tests/**/*" = ["PLR2004", - "S101", - "TID252", - # Allow more complex pytest.raises - "PT012", - "DTZ001" - ] +lint.per-file-ignores."tests/**/*" = [ + "DTZ001", + "PLR2004", + # Allow more complex pytest.raises + "PT012", + "S101", + "TID252", +] +# No relative imports +lint.flake8-tidy-imports.ban-relative-imports = "all" +# One import per line +lint.isort.force-single-line = true +# Recognize us please +lint.isort.known-first-party = [ "tika_client" ] + +[tool.pytest.ini_options] +minversion = "7.0" +testpaths = [ "tests" ] [tool.coverage.run] -source_pkgs = ["tika_client", "tests"] +source_pkgs = [ "tika_client", "tests" ] branch = true parallel = true omit = [ @@ -234,8 +246,8 @@ omit = [ ] [tool.coverage.paths] -tika_client = ["src/tika_client", "*/tika-client/src/tika_client"] -tests = ["tests", "*/tika-client/tests"] +tika_client = [ "src/tika_client", "*/tika-client/src/tika_client" ] +tests = [ "tests", "*/tika-client/tests" ] [tool.coverage.report] exclude_lines = [ @@ -255,6 +267,3 @@ warn_redundant_casts = true warn_unused_ignores = true warn_unreachable = true warn_unused_configs = true - -[tool.pytest.ini_options] -minversion = "7.0" diff --git a/tests/conftest.py b/tests/conftest.py index d0bf901..f52a47c 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -12,13 +12,13 @@ SAMPLE_DIR: Final[Path] = Path(__file__).parent.resolve() / "samples" 
-@pytest.fixture() +@pytest.fixture def tika_client() -> TikaClient: with TikaClient(tika_url=TIKA_URL, log_level=logging.INFO) as client: yield client -@pytest.fixture() +@pytest.fixture def tika_client_compressed() -> TikaClient: with TikaClient(tika_url=TIKA_URL, log_level=logging.INFO, compress=True) as client: yield client From 4d9cbf48a3b48c8cac519714e9f33e892d19027a Mon Sep 17 00:00:00 2001 From: Trenton H <797416+stumpylog@users.noreply.github.com> Date: Wed, 9 Oct 2024 07:56:53 -0700 Subject: [PATCH 08/11] Chore: Use pytest fixtures effectively (#24) --- CHANGELOG.md | 1 + tests/conftest.py | 75 ++++++++++-- tests/test_datetime_formats.py | 80 +++++++++---- tests/test_file_formats.py | 11 +- tests/test_image_files.py | 13 +-- tests/test_resource_metadata.py | 48 ++++---- tests/test_resource_recursive_metadata.py | 70 ++++++++---- tests/test_resource_tika.py | 132 +++++++++++++--------- 8 files changed, 296 insertions(+), 134 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 61412db..3307421 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -20,6 +20,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Bump pypa/gh-action-pypi-publish from 1.9.0 to 1.10.2 (by [@dependabot](https://github.com/apps/dependabot) in [#22](https://github.com/stumpylog/tika-client/pull/22)) - Update `pre-commit` to 4.0.1 ([#23](https://github.com/stumpylog/tika-client/pull/23)) +- Use pytest fixtures effectively ([#24](https://github.com/stumpylog/tika-client/pull/24)) ## [0.6.0] - 2024-07-18 diff --git a/tests/conftest.py b/tests/conftest.py index f52a47c..2a97621 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,24 +1,85 @@ import logging import os from pathlib import Path -from typing import Final +from typing import Generator import pytest from tika_client.client import TikaClient -TIKA_URL: Final[str] = os.getenv("TIKA_URL", "http://localhost:9998") -SAMPLE_DIR: Final[Path] = Path(__file__).parent.resolve() / "samples" 
+@pytest.fixture(scope="session") +def tika_host() -> str: + return os.getenv("TIKA_URL", "http://localhost:9998") + + +@pytest.fixture(scope="session") +def samples_dir() -> Path: + return Path(__file__).parent.resolve() / "samples" + + +@pytest.fixture(scope="session") +def sample_libre_office_writer_file(samples_dir: Path) -> Path: + return samples_dir / "sample-libre-office.odt" + + +@pytest.fixture(scope="session") +def sample_google_docs_to_libre_office_writer_file(samples_dir: Path) -> Path: + return samples_dir / "sample.odt" + + +@pytest.fixture(scope="session") +def sample_google_docs_to_docx_file(samples_dir: Path) -> Path: + return samples_dir / "sample.docx" + + +@pytest.fixture(scope="session") +def sample_docx_file(samples_dir: Path) -> Path: + return samples_dir / "microsoft-sample.docx" + + +@pytest.fixture(scope="session") +def sample_doc_file(samples_dir: Path) -> Path: + return samples_dir / "sample.doc" + + +@pytest.fixture(scope="session") +def sample_html_file(samples_dir: Path) -> Path: + return samples_dir / "sample.html" + + +@pytest.fixture(scope="session") +def sample_office_doc_with_images_file(samples_dir: Path) -> Path: + return samples_dir / "test-document-images.odt" + + +@pytest.fixture(scope="session") +def sample_jpeg_file(samples_dir: Path) -> Path: + return samples_dir / "sample.jpg" + + +@pytest.fixture(scope="session") +def sample_png_file(samples_dir: Path) -> Path: + return samples_dir / "sample.png" + + +@pytest.fixture(scope="session") +def sample_ods_file(samples_dir: Path) -> Path: + return samples_dir / "sample-spreadsheet.ods" + + +@pytest.fixture(scope="session") +def sample_xlsx_file(samples_dir: Path) -> Path: + return samples_dir / "sample-spreadsheet.xlsx" @pytest.fixture -def tika_client() -> TikaClient: - with TikaClient(tika_url=TIKA_URL, log_level=logging.INFO) as client: +def tika_client(tika_host: str) -> Generator[TikaClient, None, None]: + with TikaClient(tika_url=tika_host, log_level=logging.INFO) as 
client: yield client @pytest.fixture -def tika_client_compressed() -> TikaClient: - with TikaClient(tika_url=TIKA_URL, log_level=logging.INFO, compress=True) as client: +def tika_client_compressed(tika_host: str) -> Generator[TikaClient, None, None]: + with TikaClient(tika_url=tika_host, log_level=logging.INFO, compress=True) as client: yield client diff --git a/tests/test_datetime_formats.py b/tests/test_datetime_formats.py index 303b3e5..c944058 100644 --- a/tests/test_datetime_formats.py +++ b/tests/test_datetime_formats.py @@ -1,29 +1,36 @@ from datetime import datetime from datetime import timedelta from datetime import timezone +from pathlib import Path import magic import pytest from pytest_httpx import HTTPXMock -from tests.conftest import SAMPLE_DIR from tika_client.client import TikaClient from tika_client.data_models import DublinCoreKey from tika_client.data_models import TikaKey class TestDateTimeFormat: - def test_parse_offset_date_format_utc(self, tika_client: TikaClient, httpx_mock: HTTPXMock): + def test_parse_offset_date_format_utc( + self, + tika_client: TikaClient, + sample_libre_office_writer_file: Path, + httpx_mock: HTTPXMock, + ): """ Test the datetime parsing properly handles a time with a UTC timezone in the +xx:yy format """ - test_file = SAMPLE_DIR / "sample-libre-office.odt" httpx_mock.add_response( json={TikaKey.ContentType: "test", TikaKey.Parsers: [], DublinCoreKey.Created: "2023-05-17T16:30:44+00:00"}, ) - resp = tika_client.metadata.from_file(test_file, magic.from_file(str(test_file), mime=True)) + resp = tika_client.metadata.from_file( + sample_libre_office_writer_file, + magic.from_file(str(sample_libre_office_writer_file), mime=True), + ) assert resp.created == datetime( year=2023, @@ -35,17 +42,24 @@ def test_parse_offset_date_format_utc(self, tika_client: TikaClient, httpx_mock: tzinfo=timezone.utc, ) - def test_parse_offset_date_format_zulu(self, tika_client: TikaClient, httpx_mock: HTTPXMock): + def 
test_parse_offset_date_format_zulu( + self, + tika_client: TikaClient, + sample_libre_office_writer_file: Path, + httpx_mock: HTTPXMock, + ): """ Test the datetime parsing properly handles a time with a UTC timezone in the Z format """ - test_file = SAMPLE_DIR / "sample-libre-office.odt" httpx_mock.add_response( json={TikaKey.ContentType: "test", TikaKey.Parsers: [], DublinCoreKey.Created: "2023-01-17T16:35:44Z"}, ) - resp = tika_client.metadata.from_file(test_file, magic.from_file(str(test_file), mime=True)) + resp = tika_client.metadata.from_file( + sample_libre_office_writer_file, + magic.from_file(str(sample_libre_office_writer_file), mime=True), + ) assert resp.created == datetime( year=2023, @@ -57,34 +71,48 @@ def test_parse_offset_date_format_zulu(self, tika_client: TikaClient, httpx_mock tzinfo=timezone.utc, ) - def test_parse_offset_date_format_positive(self, tika_client: TikaClient, httpx_mock: HTTPXMock): + def test_parse_offset_date_format_positive( + self, + tika_client: TikaClient, + sample_libre_office_writer_file: Path, + httpx_mock: HTTPXMock, + ): """ Test the datetime parsing properly handles a time with a timezone in the +xx:yy format offset from UTC """ - test_file = SAMPLE_DIR / "sample-libre-office.odt" httpx_mock.add_response( json={TikaKey.ContentType: "test", TikaKey.Parsers: [], DublinCoreKey.Created: "2023-06-17T16:30:44+08:00"}, ) - resp = tika_client.metadata.from_file(test_file, magic.from_file(str(test_file), mime=True)) + resp = tika_client.metadata.from_file( + sample_libre_office_writer_file, + magic.from_file(str(sample_libre_office_writer_file), mime=True), + ) assert resp.created == pytest.approx( datetime(year=2023, month=6, day=17, hour=16, minute=30, second=44, tzinfo=timezone(timedelta(hours=8))), rel=timedelta(seconds=1), ) - def test_parse_offset_date_format_negative(self, tika_client: TikaClient, httpx_mock: HTTPXMock): + def test_parse_offset_date_format_negative( + self, + tika_client: TikaClient, + 
sample_libre_office_writer_file: Path, + httpx_mock: HTTPXMock, + ): """ Test the datetime parsing properly handles a time with a timezone in the -xx:yy format offset from UTC """ - test_file = SAMPLE_DIR / "sample-libre-office.odt" httpx_mock.add_response( json={TikaKey.ContentType: "test", TikaKey.Parsers: [], DublinCoreKey.Created: "2023-06-17T16:30:44-08:00"}, ) - resp = tika_client.metadata.from_file(test_file, magic.from_file(str(test_file), mime=True)) + resp = tika_client.metadata.from_file( + sample_libre_office_writer_file, + magic.from_file(str(sample_libre_office_writer_file), mime=True), + ) assert resp.created == pytest.approx( datetime( @@ -99,11 +127,15 @@ def test_parse_offset_date_format_negative(self, tika_client: TikaClient, httpx_ rel=timedelta(seconds=1), ) - def test_parse_offset_date_format_python_isoformat(self, tika_client: TikaClient, httpx_mock: HTTPXMock): + def test_parse_offset_date_format_python_isoformat( + self, + tika_client: TikaClient, + sample_libre_office_writer_file: Path, + httpx_mock: HTTPXMock, + ): """ Test the datetime parsing properly handles a time with a timezone in the ISO 8061 format (as done by Python) """ - test_file = SAMPLE_DIR / "sample-libre-office.odt" expected = datetime.now(tz=timezone.utc) @@ -111,20 +143,30 @@ def test_parse_offset_date_format_python_isoformat(self, tika_client: TikaClient json={TikaKey.ContentType: "test", TikaKey.Parsers: [], DublinCoreKey.Created: expected.isoformat()}, ) - resp = tika_client.metadata.from_file(test_file, magic.from_file(str(test_file), mime=True)) + resp = tika_client.metadata.from_file( + sample_libre_office_writer_file, + magic.from_file(str(sample_libre_office_writer_file), mime=True), + ) assert resp.created == pytest.approx(expected, rel=timedelta(seconds=1)) - def test_parse_offset_date_no_match(self, tika_client: TikaClient, httpx_mock: HTTPXMock): + def test_parse_offset_date_no_match( + self, + tika_client: TikaClient, + sample_libre_office_writer_file: Path, 
+ httpx_mock: HTTPXMock, + ): """ Test the datetime parsing properly handles a time string which doesn't match the correct formats """ - test_file = SAMPLE_DIR / "sample-libre-office.odt" httpx_mock.add_response( json={TikaKey.ContentType: "test", TikaKey.Parsers: [], DublinCoreKey.Created: "202-06-17T16:30:44-0"}, ) - resp = tika_client.metadata.from_file(test_file, magic.from_file(str(test_file), mime=True)) + resp = tika_client.metadata.from_file( + sample_libre_office_writer_file, + magic.from_file(str(sample_libre_office_writer_file), mime=True), + ) assert resp.created is None diff --git a/tests/test_file_formats.py b/tests/test_file_formats.py index e6f91fd..cd6ab16 100644 --- a/tests/test_file_formats.py +++ b/tests/test_file_formats.py @@ -1,20 +1,23 @@ from datetime import datetime +from pathlib import Path import magic -from tests.conftest import SAMPLE_DIR from tika_client.client import TikaClient class TestLibreOfficeFormats: - def test_parse_libre_office_writer_document(self, tika_client: TikaClient): + def test_parse_libre_office_writer_document(self, tika_client: TikaClient, sample_libre_office_writer_file: Path): """ Test handling of a ODT document produced by LibreOffice """ - test_file = SAMPLE_DIR / "sample-libre-office.odt" - resp = tika_client.tika.as_html.from_file(test_file, magic.from_file(str(test_file), mime=True)) + resp = tika_client.tika.as_html.from_file( + sample_libre_office_writer_file, + magic.from_file(str(sample_libre_office_writer_file), mime=True), + ) assert resp.type == "application/vnd.oasis.opendocument.text" + assert resp.content is not None assert ( "

This is a document created by LibreOffice Writer 7.5.12, on July 19th, 2023

\n" in resp.content diff --git a/tests/test_image_files.py b/tests/test_image_files.py index 3b93070..d79a08b 100644 --- a/tests/test_image_files.py +++ b/tests/test_image_files.py @@ -1,24 +1,23 @@ +from pathlib import Path + import magic -from tests.conftest import SAMPLE_DIR from tika_client.client import TikaClient class TestParseImageMetadata: - def test_image_jpeg(self, tika_client: TikaClient): + def test_image_jpeg(self, tika_client: TikaClient, sample_jpeg_file: Path): """ Test the handling of a JPEG file metadata retrieval """ - test_file = SAMPLE_DIR / "sample.jpg" - resp = tika_client.metadata.from_file(test_file, magic.from_file(str(test_file), mime=True)) + resp = tika_client.metadata.from_file(sample_jpeg_file, magic.from_file(str(sample_jpeg_file), mime=True)) assert resp.type == "image/jpeg" - def test_image_png(self, tika_client: TikaClient): + def test_image_png(self, tika_client: TikaClient, sample_png_file: Path): """ Test the handling of a PNG file metadata retrieval """ - test_file = SAMPLE_DIR / "sample.png" - resp = tika_client.metadata.from_file(test_file, magic.from_file(str(test_file), mime=True)) + resp = tika_client.metadata.from_file(sample_png_file, magic.from_file(str(sample_png_file), mime=True)) assert resp.type == "image/png" diff --git a/tests/test_resource_metadata.py b/tests/test_resource_metadata.py index a623a7d..fc9a784 100644 --- a/tests/test_resource_metadata.py +++ b/tests/test_resource_metadata.py @@ -1,76 +1,82 @@ from datetime import datetime from datetime import timezone +from pathlib import Path import httpx import magic import pytest from pytest_httpx import HTTPXMock -from tests.conftest import SAMPLE_DIR -from tests.conftest import TIKA_URL from tika_client.client import TikaClient class TestMetadataResource: - def test_metadata_from_docx(self, tika_client: TikaClient): + def test_metadata_from_docx(self, tika_client: TikaClient, sample_google_docs_to_docx_file: Path): """ Test parsing of a DOCX produced by 
Google Docs conversion """ - test_file = SAMPLE_DIR / "sample.docx" - resp = tika_client.metadata.from_file(test_file, magic.from_file(str(test_file), mime=True)) + + resp = tika_client.metadata.from_file( + sample_google_docs_to_docx_file, + magic.from_file(str(sample_google_docs_to_docx_file), mime=True), + ) assert resp.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document" assert resp.created is None - def test_metadata_from_docx_no_mime(self, tika_client: TikaClient): + def test_metadata_from_docx_no_mime(self, tika_client: TikaClient, sample_google_docs_to_docx_file: Path): """ Test parsing of a DOCX produced by Google Docs conversion, when no mime type is provided """ - test_file = SAMPLE_DIR / "sample.docx" - resp = tika_client.metadata.from_file(test_file) + + resp = tika_client.metadata.from_file(sample_google_docs_to_docx_file) assert resp.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document" assert resp.created is None - def test_metadata_from_word_docx(self, tika_client: TikaClient): + def test_metadata_from_word_docx(self, tika_client: TikaClient, sample_docx_file: Path): """ Test parsing of a DOCX produced by Microsoft Word """ - test_file = SAMPLE_DIR / "microsoft-sample.docx" - resp = tika_client.metadata.from_file(test_file, magic.from_file(str(test_file), mime=True)) + resp = tika_client.metadata.from_file(sample_docx_file, magic.from_file(str(sample_docx_file), mime=True)) assert resp.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document" assert resp.created == datetime(year=2023, month=5, day=17, hour=16, minute=41, tzinfo=timezone.utc) assert resp.modified == datetime(year=2023, month=5, day=17, hour=16, minute=44, tzinfo=timezone.utc) - def test_metadata_from_odt(self, tika_client: TikaClient): + def test_metadata_from_odt(self, tika_client: TikaClient, sample_google_docs_to_libre_office_writer_file: Path): """ Test parsing of a ODT produced by Google Docs 
conversion """ - test_file = SAMPLE_DIR / "sample.odt" - resp = tika_client.metadata.from_file(test_file, magic.from_file(str(test_file), mime=True)) + resp = tika_client.metadata.from_file( + sample_google_docs_to_libre_office_writer_file, + magic.from_file(str(sample_google_docs_to_libre_office_writer_file), mime=True), + ) assert resp.type == "application/vnd.oasis.opendocument.text" assert resp.data["generator"] == "LibreOfficeDev/6.0.5.2$Linux_X86_64 LibreOffice_project/" assert resp.created is None - def test_metadata_from_doc(self, tika_client: TikaClient): + def test_metadata_from_doc(self, tika_client: TikaClient, sample_doc_file: Path): """ Test parsing of a DOC produced by Google Docs conversion """ - test_file = SAMPLE_DIR / "sample.doc" - resp = tika_client.metadata.from_file(test_file, magic.from_file(str(test_file), mime=True)) + resp = tika_client.metadata.from_file(sample_doc_file, magic.from_file(str(sample_doc_file), mime=True)) assert resp.type == "application/msword" assert resp.language == "en" - def test_http_error(self, httpx_mock: HTTPXMock): + def test_http_error( + self, + httpx_mock: HTTPXMock, + tika_host: str, + sample_google_docs_to_libre_office_writer_file: Path, + ): """ Test handling of HTTP errors returned from Tika """ - test_file = SAMPLE_DIR / "sample.odt" httpx_mock.add_response(status_code=500) - with pytest.raises(httpx.HTTPStatusError) as err, TikaClient(tika_url=TIKA_URL) as client: - client.metadata.from_file(test_file) + with pytest.raises(httpx.HTTPStatusError) as err, TikaClient(tika_url=tika_host) as client: + client.metadata.from_file(sample_google_docs_to_libre_office_writer_file) assert err.value.response.status_code == httpx.codes.INTERNAL_SERVER_ERROR diff --git a/tests/test_resource_recursive_metadata.py b/tests/test_resource_recursive_metadata.py index 311300e..aeb2d90 100644 --- a/tests/test_resource_recursive_metadata.py +++ b/tests/test_resource_recursive_metadata.py @@ -1,88 +1,114 @@ +from pathlib import 
Path + import magic -from tests.conftest import SAMPLE_DIR from tika_client.client import TikaClient class TestRecursiveMetadataResource: - def test_r_metadata_from_docx(self, tika_client: TikaClient): - test_file = SAMPLE_DIR / "sample.docx" - documents = tika_client.rmeta.as_html.from_file(test_file, magic.from_file(str(test_file), mime=True)) + def test_r_metadata_from_docx(self, tika_client: TikaClient, sample_google_docs_to_docx_file: Path): + documents = tika_client.rmeta.as_html.from_file( + sample_google_docs_to_docx_file, + magic.from_file(str(sample_google_docs_to_docx_file), mime=True), + ) assert len(documents) == 1 document = documents[0] assert document.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document" + assert document.content is not None assert "

This is an DOCX test document, also made September 14, 2022

\n" in document.content assert document.created is None - def test_r_metadata_from_docx_plain(self, tika_client: TikaClient): - test_file = SAMPLE_DIR / "sample.docx" - documents = tika_client.rmeta.as_text.from_file(test_file, magic.from_file(str(test_file), mime=True)) + def test_r_metadata_from_docx_plain(self, tika_client: TikaClient, sample_google_docs_to_docx_file: Path): + documents = tika_client.rmeta.as_text.from_file( + sample_google_docs_to_docx_file, + magic.from_file(str(sample_google_docs_to_docx_file), mime=True), + ) assert len(documents) == 1 document = documents[0] assert document.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document" + assert document.content is not None assert "This is an DOCX test document, also made September 14, 2022" in document.content assert document.created is None - def test_r_meta_microsoft_word_docx(self, tika_client: TikaClient): - test_file = SAMPLE_DIR / "microsoft-sample.docx" - documents = tika_client.rmeta.as_html.from_file(test_file, magic.from_file(str(test_file), mime=True)) + def test_r_meta_microsoft_word_docx(self, tika_client: TikaClient, sample_docx_file: Path): + documents = tika_client.rmeta.as_html.from_file( + sample_docx_file, + magic.from_file(str(sample_docx_file), mime=True), + ) assert len(documents) == 1 document = documents[0] assert document.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document" + assert document.content is not None assert ( "

This is a sample document, generated by Microsoft Office on Wednesday, May 17, 2023.

\n

It is in English.

\n" # noqa: E501 in document.content ) - def test_r_metadata_from_odt(self, tika_client: TikaClient): - test_file = SAMPLE_DIR / "sample.odt" - documents = tika_client.rmeta.as_html.from_file(test_file, magic.from_file(str(test_file), mime=True)) + def test_r_metadata_from_odt(self, tika_client: TikaClient, sample_google_docs_to_libre_office_writer_file: Path): + documents = tika_client.rmeta.as_html.from_file( + sample_google_docs_to_libre_office_writer_file, + magic.from_file(str(sample_google_docs_to_libre_office_writer_file), mime=True), + ) assert len(documents) == 2 document = documents[0] assert document.type == "application/vnd.oasis.opendocument.text" + assert document.content is not None assert "

This is an ODT test document, created September 14, 2022

\n" in document.content assert document.created is None - def test_r_metadata_from_odt_plain(self, tika_client: TikaClient): - test_file = SAMPLE_DIR / "sample.odt" - documents = tika_client.rmeta.as_text.from_file(test_file, magic.from_file(str(test_file), mime=True)) + def test_r_metadata_from_odt_plain( + self, + tika_client: TikaClient, + sample_google_docs_to_libre_office_writer_file: Path, + ): + documents = tika_client.rmeta.as_text.from_file( + sample_google_docs_to_libre_office_writer_file, + magic.from_file(str(sample_google_docs_to_libre_office_writer_file), mime=True), + ) assert len(documents) == 2 document = documents[0] assert document.type == "application/vnd.oasis.opendocument.text" + assert document.content is not None assert "This is an ODT test document, created September 14, 2022" in document.content document = documents[1] assert document.type == "image/png" - def test_r_metadata_from_ods_plain(self, tika_client: TikaClient): - test_file = SAMPLE_DIR / "sample-spreadsheet.ods" - documents = tika_client.rmeta.as_text.from_file(test_file, magic.from_file(str(test_file), mime=True)) + def test_r_metadata_from_ods_plain(self, tika_client: TikaClient, sample_ods_file: Path): + documents = tika_client.rmeta.as_text.from_file( + sample_ods_file, + magic.from_file(str(sample_ods_file), mime=True), + ) assert len(documents) == 2 document = documents[0] + assert document.content is not None assert "This is cell A1" in document.content assert "You sunk my battleship" in document.content document = documents[1] assert document.type == "image/png" - def test_r_metadata_from_xlsx_plain(self, tika_client: TikaClient): - test_file = SAMPLE_DIR / "sample-spreadsheet.xlsx" - documents = tika_client.rmeta.as_text.from_file(test_file, magic.from_file(str(test_file), mime=True)) + def test_r_metadata_from_xlsx_plain(self, tika_client: TikaClient, sample_xlsx_file: Path): + documents = tika_client.rmeta.as_text.from_file( + sample_xlsx_file, + 
magic.from_file(str(sample_xlsx_file), mime=True), + ) assert len(documents) == 1 document = documents[0] + assert document.content is not None assert "This is cell A1" in document.content assert "You sunk my battleship" in document.content diff --git a/tests/test_resource_tika.py b/tests/test_resource_tika.py index c7296c0..989db67 100644 --- a/tests/test_resource_tika.py +++ b/tests/test_resource_tika.py @@ -1,10 +1,8 @@ import shutil -import tempfile from pathlib import Path import magic -from tests.conftest import SAMPLE_DIR from tika_client.client import TikaClient @@ -13,19 +11,22 @@ class TestParseFormatted: Test the Tika endpoint for returning HTML formatted content """ - def test_parse_docx_from_file_as_html(self, tika_client: TikaClient): - test_file = SAMPLE_DIR / "sample.docx" - resp = tika_client.tika.as_html.from_file(test_file, magic.from_file(str(test_file), mime=True)) + def test_parse_docx_from_file_as_html(self, tika_client: TikaClient, sample_google_docs_to_docx_file: Path): + resp = tika_client.tika.as_html.from_file( + sample_google_docs_to_docx_file, + magic.from_file(str(sample_google_docs_to_docx_file), mime=True), + ) assert resp.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document" + assert resp.content is not None assert "

This is an DOCX test document, also made September 14, 2022

\n" in resp.content assert resp.content_length == 6424 - def test_parse_doc_from_file_as_html(self, tika_client: TikaClient): - test_file = SAMPLE_DIR / "sample.doc" - resp = tika_client.tika.as_html.from_file(test_file, magic.from_file(str(test_file), mime=True)) + def test_parse_doc_from_file_as_html(self, tika_client: TikaClient, sample_doc_file: Path): + resp = tika_client.tika.as_html.from_file(sample_doc_file, magic.from_file(str(sample_doc_file), mime=True)) assert resp.type == "application/msword" + assert resp.content is not None assert ( "body>

This is a test document, saved in the older .doc format for Word documents (but created in Google Drive)

\n" # noqa: E501 in resp.content @@ -36,89 +37,111 @@ def test_parse_doc_from_file_as_html(self, tika_client: TikaClient): assert resp.revision == 1 assert resp.last_author == "cloudconvert_4" - def test_parse_docx_no_mime_from_file_as_html(self, tika_client: TikaClient): - test_file = SAMPLE_DIR / "sample.docx" - resp = tika_client.tika.as_html.from_file(test_file) + def test_parse_docx_no_mime_from_file_as_html(self, tika_client: TikaClient, sample_google_docs_to_docx_file: Path): + resp = tika_client.tika.as_html.from_file(sample_google_docs_to_docx_file) assert resp.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document" + assert resp.content is not None assert "

This is an DOCX test document, also made September 14, 2022

\n" in resp.content - def test_parse_microsoft_word_docx_from_file_as_html(self, tika_client: TikaClient): - test_file = SAMPLE_DIR / "microsoft-sample.docx" - resp = tika_client.tika.as_html.from_file(test_file, magic.from_file(str(test_file), mime=True)) + def test_parse_microsoft_word_docx_from_file_as_html(self, tika_client: TikaClient, sample_docx_file: Path): + resp = tika_client.tika.as_html.from_file(sample_docx_file, magic.from_file(str(sample_docx_file), mime=True)) assert resp.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document" + assert resp.content is not None assert ( "

This is a sample document, generated by Microsoft Office on Wednesday, May 17, 2023.

\n

It is in English.

\n" # noqa: E501 in resp.content ) - def test_parse_odt_from_file_as_html(self, tika_client: TikaClient): - test_file = SAMPLE_DIR / "sample.odt" - resp = tika_client.tika.as_html.from_file(test_file, magic.from_file(str(test_file), mime=True)) + def test_parse_odt_from_file_as_html( + self, + tika_client: TikaClient, + sample_google_docs_to_libre_office_writer_file: Path, + ): + resp = tika_client.tika.as_html.from_file( + sample_google_docs_to_libre_office_writer_file, + magic.from_file(str(sample_google_docs_to_libre_office_writer_file), mime=True), + ) assert resp.type == "application/vnd.oasis.opendocument.text" + assert resp.content is not None assert "

This is an ODT test document, created September 14, 2022

\n" in resp.content - def test_parse_docx_from_buffer_as_html(self, tika_client: TikaClient): - test_file = SAMPLE_DIR / "sample.docx" - content = test_file.read_bytes() + def test_parse_docx_from_buffer_as_html(self, tika_client: TikaClient, sample_google_docs_to_docx_file: Path): + content = sample_google_docs_to_docx_file.read_bytes() resp = tika_client.tika.as_html.from_buffer(content, magic.from_buffer(content, mime=True)) assert resp.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document" + assert resp.content is not None assert "

This is an DOCX test document, also made September 14, 2022

\n" in resp.content assert resp.content_length == 6183 class TestParsePlain: - def test_parse_docx_from_file_as_text(self, tika_client: TikaClient): - test_file = SAMPLE_DIR / "sample.docx" - - resp = tika_client.tika.as_text.from_file(test_file, magic.from_file(str(test_file), mime=True)) + def test_parse_docx_from_file_as_text(self, tika_client: TikaClient, sample_google_docs_to_docx_file: Path): + resp = tika_client.tika.as_text.from_file( + sample_google_docs_to_docx_file, + magic.from_file(str(sample_google_docs_to_docx_file), mime=True), + ) assert resp.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document" + assert resp.content is not None assert "This is an DOCX test document, also made September 14, 2022" in resp.content - def test_parse_odt_from_file_as_text(self, tika_client: TikaClient): - test_file = SAMPLE_DIR / "sample.odt" - - resp = tika_client.tika.as_text.from_file(test_file, magic.from_file(str(test_file), mime=True)) + def test_parse_odt_from_file_as_text( + self, + tika_client: TikaClient, + sample_google_docs_to_libre_office_writer_file: Path, + ): + resp = tika_client.tika.as_text.from_file( + sample_google_docs_to_libre_office_writer_file, + magic.from_file(str(sample_google_docs_to_libre_office_writer_file), mime=True), + ) assert resp.type == "application/vnd.oasis.opendocument.text" + assert resp.content is not None assert "This is an ODT test document, created September 14, 2022" in resp.content class TestParseContentPlain: - def test_parse_docx_from_bytes_buffer(self, tika_client: TikaClient): - test_file = SAMPLE_DIR / "sample.docx" - buffer = test_file.read_bytes() + def test_parse_docx_from_bytes_buffer(self, tika_client: TikaClient, sample_google_docs_to_docx_file: Path): + buffer = sample_google_docs_to_docx_file.read_bytes() resp = tika_client.tika.as_text.from_buffer(buffer) assert resp.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document" + assert resp.content is not None 
assert "This is an DOCX test document, also made September 14, 2022" in resp.content - def test_parse_odt_from_bytes_buffer(self, tika_client: TikaClient): - test_file = SAMPLE_DIR / "sample.odt" - buffer = test_file.read_bytes() + def test_parse_odt_from_bytes_buffer( + self, + tika_client: TikaClient, + sample_google_docs_to_libre_office_writer_file: Path, + ): + buffer = sample_google_docs_to_libre_office_writer_file.read_bytes() resp = tika_client.tika.as_text.from_buffer(buffer) assert resp.type == "application/vnd.oasis.opendocument.text" + assert resp.content is not None assert "This is an ODT test document, created September 14, 2022" in resp.content - def test_parse_odt_from_bytes_buffer_with_mime(self, tika_client: TikaClient): - test_file = SAMPLE_DIR / "sample.odt" - buffer = test_file.read_bytes() + def test_parse_odt_from_bytes_buffer_with_mime( + self, + tika_client: TikaClient, + sample_google_docs_to_libre_office_writer_file: Path, + ): + buffer = sample_google_docs_to_libre_office_writer_file.read_bytes() resp = tika_client.tika.as_text.from_buffer(buffer, "application/vnd.oasis.opendocument.text") assert resp.type == "application/vnd.oasis.opendocument.text" + assert resp.content is not None assert "This is an ODT test document, created September 14, 2022" in resp.content - def test_html_document_from_string_buffer(self, tika_client: TikaClient): - test_file = SAMPLE_DIR / "sample.html" - buffer = test_file.read_text() + def test_html_document_from_string_buffer(self, tika_client: TikaClient, sample_html_file: Path): + buffer = sample_html_file.read_text() resp = tika_client.tika.as_text.from_buffer(buffer) @@ -130,9 +153,12 @@ def test_html_document_from_string_buffer(self, tika_client: TikaClient): class TestParseContentCompress: - def test_parse_odt_from_bytes_buffer_compress(self, tika_client_compressed: TikaClient): - test_file = SAMPLE_DIR / "test-document-images.odt" - buffer = test_file.read_bytes() + def 
test_parse_odt_from_bytes_buffer_compress( + self, + tika_client_compressed: TikaClient, + sample_office_doc_with_images_file: Path, + ): + buffer = sample_office_doc_with_images_file.read_bytes() resp = tika_client_compressed.tika.as_text.from_buffer(buffer) @@ -140,16 +166,14 @@ def test_parse_odt_from_bytes_buffer_compress(self, tika_client_compressed: Tika class TestFilenameContentDisposition: - def test_non_ascii_filename(self, tika_client: TikaClient): - test_file = SAMPLE_DIR / "sample.docx" - - with tempfile.TemporaryDirectory() as temp_dir: - copy = shutil.copy( - test_file, - Path(temp_dir) / "Kostenerstattung für Meldebescheinigung Familienzuschlag.docx", - ) + def test_non_ascii_filename(self, tika_client: TikaClient, sample_google_docs_to_docx_file: Path, tmp_path: Path): + copy = shutil.copy( + sample_google_docs_to_docx_file, + tmp_path / "Kostenerstattung für Meldebescheinigung Familienzuschlag.docx", + ) - resp = tika_client.tika.as_text.from_file(copy, magic.from_file(str(copy), mime=True)) + resp = tika_client.tika.as_text.from_file(copy, magic.from_file(str(copy), mime=True)) - assert resp.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document" - assert "This is an DOCX test document, also made September 14, 2022" in resp.content + assert resp.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document" + assert resp.content is not None + assert "This is an DOCX test document, also made September 14, 2022" in resp.content From 52a19ef9d6776c39d70184cef12b5d92a1b23e77 Mon Sep 17 00:00:00 2001 From: Trenton H <797416+stumpylog@users.noreply.github.com> Date: Wed, 9 Oct 2024 08:06:00 -0700 Subject: [PATCH 09/11] Chore: Python 3.13 testing and support (#25) --- .github/workflows/ci.yml | 2 +- CHANGELOG.md | 1 + pyproject.toml | 6 +++++- 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index b841968..b6abed2 100644 --- 
a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -58,7 +58,7 @@ jobs: strategy: fail-fast: false matrix: - python-version: ['3.8', '3.9', '3.10', '3.11', '3.12', 'pypy3.8', 'pypy3.9', 'pypy3.10'] + python-version: ['3.8', '3.9', '3.10', '3.11', '3.12', '3.13', 'pypy3.8', 'pypy3.9', 'pypy3.10'] steps: - diff --git a/CHANGELOG.md b/CHANGELOG.md index 3307421..9957f20 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added - SPDX license headers were added to source files +- Official support and testing for Python 3.13 ([#25](https://github.com/stumpylog/tika-client/pull/25)) ### Fixed diff --git a/pyproject.toml b/pyproject.toml index 50621ae..40621e9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -30,6 +30,7 @@ classifiers = [ "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", "Programming Language :: Python :: Implementation :: CPython", "Programming Language :: Python :: Implementation :: PyPy", ] @@ -107,7 +108,7 @@ cov-report = [ ] [[tool.hatch.envs.hatch-test.matrix]] -python = [ "3.8", "3.9", "3.10", "3.11", "3.12", "pypy3.8", "pypy3.9", "pypy3.10" ] +python = [ "3.8", "3.9", "3.10", "3.11", "3.12", "3.13", "pypy3.8", "pypy3.9", "pypy3.10" ] # # Custom Environments @@ -233,6 +234,9 @@ lint.isort.force-single-line = true # Recognize us please lint.isort.known-first-party = [ "tika_client" ] +[tool.pyproject-fmt] +max_supported_python = "3.13" + [tool.pytest.ini_options] minversion = "7.0" testpaths = [ "tests" ] From 6748f16cf6baf8f3b3551ad74c919948e6884c27 Mon Sep 17 00:00:00 2001 From: Trenton H <797416+stumpylog@users.noreply.github.com> Date: Wed, 9 Oct 2024 09:17:49 -0700 Subject: [PATCH 10/11] Chore: Use `pytest-docker` in place of manual Docker (#26) --- .github/workflows/ci.yml | 13 -------- CHANGELOG.md | 1 + 
pyproject.toml | 1 + tests/conftest.py | 31 +++++++++++++++++-- .../docker}/docker-compose.ci-test.yml | 7 ++--- 5 files changed, 32 insertions(+), 21 deletions(-) rename {.docker => tests/docker}/docker-compose.ci-test.yml (73%) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index b6abed2..203f90e 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -63,13 +63,6 @@ jobs: steps: - uses: actions/checkout@v4 - - - name: Start containers - run: | - docker compose --file ${GITHUB_WORKSPACE}/.docker/docker-compose.ci-test.yml pull --quiet - docker compose --file ${GITHUB_WORKSPACE}/.docker/docker-compose.ci-test.yml up --detach - echo "Wait for container to be started" - sleep 5 - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v5 @@ -99,12 +92,6 @@ jobs: with: # not required for public repos, but intermittently fails otherwise token: ${{ secrets.CODECOV_TOKEN }} - - - name: Stop containers - if: always() - run: | - docker compose --file ${GITHUB_WORKSPACE}/.docker/docker-compose.ci-test.yml logs - docker compose --file ${GITHUB_WORKSPACE}/.docker/docker-compose.ci-test.yml down build: name: Build diff --git a/CHANGELOG.md b/CHANGELOG.md index 9957f20..51f05c2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,6 +22,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Bump pypa/gh-action-pypi-publish from 1.9.0 to 1.10.2 (by [@dependabot](https://github.com/apps/dependabot) in [#22](https://github.com/stumpylog/tika-client/pull/22)) - Update `pre-commit` to 4.0.1 ([#23](https://github.com/stumpylog/tika-client/pull/23)) - Use pytest fixtures effectively ([#24](https://github.com/stumpylog/tika-client/pull/24)) +- Use pytest-docker in place of manual Docker ([#26](https://github.com/stumpylog/tika-client/pull/26)) ## [0.6.0] - 2024-07-18 diff --git a/pyproject.toml b/pyproject.toml index 40621e9..7407992 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -86,6 +86,7 @@ 
extra-dependencies = [ "pytest-httpx == 0.30.0; python_version >= '3.9'", "pytest-httpx ~= 0.22; python_version < '3.9'", "python-magic", + "pytest-docker ~= 3.1", ] extra-args = [ "--maxprocesses=8", "--pythonwarnings=all" ] diff --git a/tests/conftest.py b/tests/conftest.py index 2a97621..e9789c2 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,16 +1,41 @@ import logging -import os from pathlib import Path from typing import Generator import pytest +from pytest_docker.plugin import Services from tika_client.client import TikaClient +logger = logging.getLogger("tika-client.tests") + + +@pytest.fixture(scope="session") +def docker_compose_file() -> Path: + return Path(__file__).parent / "docker" / "docker-compose.ci-test.yml" + @pytest.fixture(scope="session") -def tika_host() -> str: - return os.getenv("TIKA_URL", "http://localhost:9998") +def tika_host(docker_services: Services, docker_ip: str) -> str: + def is_responsive(url): + import httpx + + try: + response = httpx.get(url) + except httpx.HTTPError: + logger.exception("Error connecting to service") + return False + else: + return response.status_code == httpx.codes.OK + + url = f"http://{docker_ip}:{docker_services.port_for('tika', 9998)}" + + docker_services.wait_until_responsive( + timeout=30.0, + pause=1, + check=lambda: is_responsive(url), + ) + return url @pytest.fixture(scope="session") diff --git a/.docker/docker-compose.ci-test.yml b/tests/docker/docker-compose.ci-test.yml similarity index 73% rename from .docker/docker-compose.ci-test.yml rename to tests/docker/docker-compose.ci-test.yml index a273f14..c67feb2 100644 --- a/.docker/docker-compose.ci-test.yml +++ b/tests/docker/docker-compose.ci-test.yml @@ -3,11 +3,8 @@ # Can be used locally or by the CI to start the necessary container with the # correct networking for the tests -version: "3" services: tika: image: docker.io/apache/tika:latest - hostname: tika - container_name: tika - network_mode: host - restart: unless-stopped + ports: 
+ - "9998/tcp" From ee344aa75ca5098b8e9d0cd6d11006e1ecdc56e2 Mon Sep 17 00:00:00 2001 From: Trenton H <797416+stumpylog@users.noreply.github.com> Date: Wed, 9 Oct 2024 09:19:06 -0700 Subject: [PATCH 11/11] Bumps version to 0.7.0 --- CHANGELOG.md | 2 +- src/tika_client/__about__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 51f05c2..d0681ad 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,7 +5,7 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). -## [Unreleased] +## [0.7.0] - 2024-10-09 ### Added diff --git a/src/tika_client/__about__.py b/src/tika_client/__about__.py index 60cd4f7..7404394 100644 --- a/src/tika_client/__about__.py +++ b/src/tika_client/__about__.py @@ -1,4 +1,4 @@ # SPDX-FileCopyrightText: 2023-present Trenton H # # SPDX-License-Identifier: MPL-2.0 -__version__ = "0.6.0" +__version__ = "0.7.0"