From 0d7c466ca08b268cbee94358167b5a8a2d1986a8 Mon Sep 17 00:00:00 2001
From: Trenton Holmes <797416+stumpylog@users.noreply.github.com>
Date: Thu, 18 Jul 2024 08:00:39 -0700
Subject: [PATCH 01/11] Fixes version for recent fix
---
CHANGELOG.md | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 13aadb7..d4be18e 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -14,6 +14,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Update development to use `hatch test` and `hatch fmt` ([#17](https://github.com/stumpylog/tika-client/pull/17))
- Included `mypy` typing in the linting checks
+### Fixed
+
+- Typo in README codeblock by @Chaostheorie ([#19](https://github.com/stumpylog/tika-client/pull/19))
+
## [0.5.0] - 2023-11-07
### Added
@@ -28,10 +32,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- `pypa/gh-action-pypi-publish` updated to v1.8.10
- CI testing now uses the official Apache Tika image (minimal) instead of the paperless-ngx image
-### Fixed
-
-- Typo in README codeblock by @Chaostheorie ([#19](https://github.com/stumpylog/tika-client/pull/19))
-
## [0.4.0] - 2023-07-27
### Added
From fc96a15c9b6d43c9b75196d2bf52a13979aa97f6 Mon Sep 17 00:00:00 2001
From: Trenton H <797416+stumpylog@users.noreply.github.com>
Date: Thu, 8 Aug 2024 19:58:05 -0700
Subject: [PATCH 02/11] Fixes README referring to the incorrect license
---
CHANGELOG.md | 6 ++++++
README.md | 2 +-
2 files changed, 7 insertions(+), 1 deletion(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index d4be18e..4251f4b 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,6 +5,12 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
+## [Unreleased]
+
+### Fixed
+
+- Fixed the README referring to the wrong license text
+
## [0.6.0] - 2024-07-18
### Changed
diff --git a/README.md b/README.md
index dd12e53..1e8df65 100644
--- a/README.md
+++ b/README.md
@@ -87,4 +87,4 @@ This library attempts to provide a simpler interface, minimal lines of code and
## License
-`tika-client` is distributed under the terms of the [GPL-3.0-only](https://spdx.org/licenses/GPL-3.0-only.html) license.
+`tika-client` is distributed under the terms of the [Mozilla Public License 2.0](https://spdx.org/licenses/MPL-2.0.html) license.
From 5de24d69ef3fc494277ca7ad709ceffd09e30a14 Mon Sep 17 00:00:00 2001
From: Trenton H <797416+stumpylog@users.noreply.github.com>
Date: Thu, 8 Aug 2024 19:58:16 -0700
Subject: [PATCH 03/11] Adds SPDX license header to all source files
---
CHANGELOG.md | 4 ++++
src/tika_client/__about__.py | 3 +++
src/tika_client/__init__.py | 4 ++++
src/tika_client/_constants.py | 4 ++++
src/tika_client/_resource_meta.py | 4 ++++
src/tika_client/_resource_recursive.py | 4 ++++
src/tika_client/_resource_tika.py | 4 ++++
src/tika_client/_types.py | 4 ++++
src/tika_client/_utils.py | 4 ++++
src/tika_client/client.py | 4 ++++
src/tika_client/data_models.py | 3 +++
11 files changed, 42 insertions(+)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 4251f4b..24f3d6a 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
## [Unreleased]
+### Added
+
+- SPDX license headers were added to source files
+
### Fixed
- Fixed the README referring to the wrong license text
diff --git a/src/tika_client/__about__.py b/src/tika_client/__about__.py
index 906d362..60cd4f7 100644
--- a/src/tika_client/__about__.py
+++ b/src/tika_client/__about__.py
@@ -1 +1,4 @@
+# SPDX-FileCopyrightText: 2023-present Trenton H
+#
+# SPDX-License-Identifier: MPL-2.0
__version__ = "0.6.0"
diff --git a/src/tika_client/__init__.py b/src/tika_client/__init__.py
index c58c0be..26ea4af 100644
--- a/src/tika_client/__init__.py
+++ b/src/tika_client/__init__.py
@@ -1,3 +1,7 @@
+# SPDX-FileCopyrightText: 2023-present Trenton H
+#
+# SPDX-License-Identifier: MPL-2.0
+
from tika_client.client import TikaClient
from tika_client.data_models import DublinCoreKey
from tika_client.data_models import TikaKey
diff --git a/src/tika_client/_constants.py b/src/tika_client/_constants.py
index 6881726..02c32e5 100644
--- a/src/tika_client/_constants.py
+++ b/src/tika_client/_constants.py
@@ -1,3 +1,7 @@
+# SPDX-FileCopyrightText: 2023-present Trenton H
+#
+# SPDX-License-Identifier: MPL-2.0
+
from typing import Final
# Only compress content which is larger than this
diff --git a/src/tika_client/_resource_meta.py b/src/tika_client/_resource_meta.py
index 70081cb..861c06e 100644
--- a/src/tika_client/_resource_meta.py
+++ b/src/tika_client/_resource_meta.py
@@ -1,3 +1,7 @@
+# SPDX-FileCopyrightText: 2023-present Trenton H
+#
+# SPDX-License-Identifier: MPL-2.0
+
from pathlib import Path
from typing import Final
diff --git a/src/tika_client/_resource_recursive.py b/src/tika_client/_resource_recursive.py
index f04b46b..14a2bb3 100644
--- a/src/tika_client/_resource_recursive.py
+++ b/src/tika_client/_resource_recursive.py
@@ -1,3 +1,7 @@
+# SPDX-FileCopyrightText: 2023-present Trenton H
+#
+# SPDX-License-Identifier: MPL-2.0
+
from __future__ import annotations
import logging
diff --git a/src/tika_client/_resource_tika.py b/src/tika_client/_resource_tika.py
index 3e156b9..d1bff2b 100644
--- a/src/tika_client/_resource_tika.py
+++ b/src/tika_client/_resource_tika.py
@@ -1,3 +1,7 @@
+# SPDX-FileCopyrightText: 2023-present Trenton H
+#
+# SPDX-License-Identifier: MPL-2.0
+
from pathlib import Path
from typing import Final
diff --git a/src/tika_client/_types.py b/src/tika_client/_types.py
index e8f878a..1f0e5fe 100644
--- a/src/tika_client/_types.py
+++ b/src/tika_client/_types.py
@@ -1,3 +1,7 @@
+# SPDX-FileCopyrightText: 2023-present Trenton H
+#
+# SPDX-License-Identifier: MPL-2.0
+
from __future__ import annotations
import sys
diff --git a/src/tika_client/_utils.py b/src/tika_client/_utils.py
index 7b8b895..3f17c0c 100644
--- a/src/tika_client/_utils.py
+++ b/src/tika_client/_utils.py
@@ -1,3 +1,7 @@
+# SPDX-FileCopyrightText: 2023-present Trenton H
+#
+# SPDX-License-Identifier: MPL-2.0
+
from __future__ import annotations
import logging
diff --git a/src/tika_client/client.py b/src/tika_client/client.py
index 390196a..208cb4b 100644
--- a/src/tika_client/client.py
+++ b/src/tika_client/client.py
@@ -1,3 +1,7 @@
+# SPDX-FileCopyrightText: 2023-present Trenton H
+#
+# SPDX-License-Identifier: MPL-2.0
+
from __future__ import annotations
import logging
diff --git a/src/tika_client/data_models.py b/src/tika_client/data_models.py
index a39da4b..c85de10 100644
--- a/src/tika_client/data_models.py
+++ b/src/tika_client/data_models.py
@@ -1,3 +1,6 @@
+# SPDX-FileCopyrightText: 2023-present Trenton H
+#
+# SPDX-License-Identifier: MPL-2.0
from __future__ import annotations
import logging
From d71318c1e1621842858c550a6a874cb332470a81 Mon Sep 17 00:00:00 2001
From: Trenton H <797416+stumpylog@users.noreply.github.com>
Date: Thu, 8 Aug 2024 19:58:22 -0700
Subject: [PATCH 04/11] Fixes the creation of unused loggers
---
CHANGELOG.md | 1 +
src/tika_client/_resource_recursive.py | 3 ---
src/tika_client/_utils.py | 3 ---
src/tika_client/data_models.py | 3 ---
4 files changed, 1 insertion(+), 9 deletions(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 24f3d6a..1032e36 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -14,6 +14,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Fixed
- Fixed the README referring to the wrong license text
+- Fixed the creation of loggers for the library which were never utilized
## [0.6.0] - 2024-07-18
diff --git a/src/tika_client/_resource_recursive.py b/src/tika_client/_resource_recursive.py
index 14a2bb3..e23a264 100644
--- a/src/tika_client/_resource_recursive.py
+++ b/src/tika_client/_resource_recursive.py
@@ -4,7 +4,6 @@
from __future__ import annotations
-import logging
from typing import TYPE_CHECKING
from typing import Final
@@ -18,8 +17,6 @@
from tika_client._types import MimeType
from tika_client.data_models import TikaResponse
-logger = logging.getLogger("tika-client.rmeta")
-
class _TikaRmetaBase(BaseResource):
def _common_call(
diff --git a/src/tika_client/_utils.py b/src/tika_client/_utils.py
index 3f17c0c..83dc7cb 100644
--- a/src/tika_client/_utils.py
+++ b/src/tika_client/_utils.py
@@ -4,7 +4,6 @@
from __future__ import annotations
-import logging
import urllib.parse
from typing import TYPE_CHECKING
@@ -19,8 +18,6 @@
from tika_client._types import MimeType
from tika_client._types import RequestContent
-logger = logging.getLogger("tika-client.utils")
-
class BaseResource:
def __init__(self, client: Client, *, compress: bool) -> None:
diff --git a/src/tika_client/data_models.py b/src/tika_client/data_models.py
index c85de10..b541e32 100644
--- a/src/tika_client/data_models.py
+++ b/src/tika_client/data_models.py
@@ -3,7 +3,6 @@
# SPDX-License-Identifier: MPL-2.0
from __future__ import annotations
-import logging
import re
from datetime import datetime
from datetime import timedelta
@@ -12,8 +11,6 @@
# Based on https://cwiki.apache.org/confluence/display/TIKA/Metadata+Overview
-logger = logging.getLogger("tika-client.data")
-
_TIME_RE = re.compile(
r"(?P<year>\d{4})-"
r"(?P<month>\d{2})-"
From 8e0152845a4b4c9fcb5af425317108b6d93cbd78 Mon Sep 17 00:00:00 2001
From: Trenton H <797416+stumpylog@users.noreply.github.com>
Date: Thu, 8 Aug 2024 19:58:29 -0700
Subject: [PATCH 05/11] Update development tool version restrictions
---
pyproject.toml | 14 +++++++-------
1 file changed, 7 insertions(+), 7 deletions(-)
diff --git a/pyproject.toml b/pyproject.toml
index 3fa4443..4082016 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -62,7 +62,7 @@ installer = "uv"
[tool.hatch.envs.hatch-static-analysis]
# https://hatch.pypa.io/latest/config/internal/static-analysis/
-dependencies = ["ruff ~= 0.4.8"]
+dependencies = ["ruff ~= 0.5.6"]
config-path = "none"
[tool.hatch.envs.hatch-test]
@@ -71,13 +71,13 @@ parallel = true
randomize = true
dependencies = [
"coverage-enable-subprocess == 1.0",
- "coverage[toml] ~= 7.4",
+ "coverage[toml] ~= 7.6",
"pytest < 8.0; python_version < '3.9'",
- "pytest ~= 8.2; python_version >= '3.9'",
- "pytest-mock ~= 3.12",
+ "pytest ~= 8.3; python_version >= '3.9'",
+ "pytest-mock ~= 3.14",
"pytest-randomly ~= 3.15",
"pytest-rerunfailures ~= 14.0",
- "pytest-xdist[psutil] ~= 3.5",
+ "pytest-xdist[psutil] ~= 3.6",
]
extra-dependencies = [
"pytest-sugar",
@@ -113,7 +113,7 @@ python = ["3.8", "3.9", "3.10", "3.11", "3.12", "pypy3.8", "pypy3.9", "pypy3.10"
[tool.hatch.envs.typing]
detached = true
dependencies = [
- "mypy ~= 1.10.0",
+ "mypy ~= 1.11.0",
"httpx",
]
@@ -127,7 +127,7 @@ run = [
template = "pre-commit"
detached = true
dependencies = [
- "pre-commit ~= 3.7.0",
+ "pre-commit ~= 3.8.0",
]
[tool.hatch.envs.pre-commit.scripts]
From e302d1513a9ca961af5a184f4af0dbf2f73fa921 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Tue, 8 Oct 2024 14:50:39 -0700
Subject: [PATCH 06/11] Bump pypa/gh-action-pypi-publish from 1.9.0 to 1.10.2
(#22)
* Bump pypa/gh-action-pypi-publish from 1.9.0 to 1.10.2
Bumps [pypa/gh-action-pypi-publish](https://github.com/pypa/gh-action-pypi-publish) from 1.9.0 to 1.10.2.
- [Release notes](https://github.com/pypa/gh-action-pypi-publish/releases)
- [Commits](https://github.com/pypa/gh-action-pypi-publish/compare/v1.9.0...v1.10.2)
---
updated-dependencies:
- dependency-name: pypa/gh-action-pypi-publish
dependency-type: direct:production
update-type: version-update:semver-minor
...
Signed-off-by: dependabot[bot] <support@github.com>
* Changelog note
---------
Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Trenton Holmes
---
.github/workflows/ci.yml | 2 +-
CHANGELOG.md | 4 ++++
2 files changed, 5 insertions(+), 1 deletion(-)
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index b525a88..b841968 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -193,4 +193,4 @@ jobs:
path: dist
-
name: Publish build to PyPI
- uses: pypa/gh-action-pypi-publish@v1.9.0
+ uses: pypa/gh-action-pypi-publish@v1.10.2
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 1032e36..afa73b4 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -16,6 +16,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Fixed the README referring to the wrong license text
- Fixed the creation of loggers for the library which were never utilized
+### Changed
+
+- Bump pypa/gh-action-pypi-publish from 1.9.0 to 1.10.2 (by [@dependabot](https://github.com/apps/dependabot) in [#22](https://github.com/stumpylog/tika-client/pull/22))
+
## [0.6.0] - 2024-07-18
### Changed
From 8741c3bf019f8f4b9641a2efd324128eb7cb3bc9 Mon Sep 17 00:00:00 2001
From: Trenton H <797416+stumpylog@users.noreply.github.com>
Date: Tue, 8 Oct 2024 14:57:53 -0700
Subject: [PATCH 07/11] Chore: Dependency updates (#23)
* Upgrades dependencies and hook versions
* Changelog note
* Spelling fix
---------
Co-authored-by: Trenton Holmes
---
.docker/docker-compose.ci-test.yml | 2 +-
.pre-commit-config.yaml | 17 +++-
CHANGELOG.md | 1 +
pyproject.toml | 135 +++++++++++++++--------------
tests/conftest.py | 4 +-
5 files changed, 89 insertions(+), 70 deletions(-)
diff --git a/.docker/docker-compose.ci-test.yml b/.docker/docker-compose.ci-test.yml
index c7b50d8..a273f14 100644
--- a/.docker/docker-compose.ci-test.yml
+++ b/.docker/docker-compose.ci-test.yml
@@ -1,6 +1,6 @@
# docker-compose file for running testing with Tika container
# for a more end to end test of the Tika related functionality
-# Can be used locally or by the CI to start the nessecary container with the
+# Can be used locally or by the CI to start the necessary container with the
# correct networking for the tests
version: "3"
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 07d1065..e19aeb4 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -5,7 +5,7 @@
repos:
# General hooks
- repo: https://github.com/pre-commit/pre-commit-hooks
- rev: v4.6.0
+ rev: v5.0.0
hooks:
- id: check-docstring-first
- id: check-json
@@ -26,19 +26,28 @@ repos:
- svg
- id: check-case-conflict
- id: detect-private-key
- - repo: https://github.com/pre-commit/mirrors-prettier
- rev: 'v3.1.0'
+ # See https://github.com/prettier/prettier/issues/15742 for the fork reason
+ - repo: https://github.com/rbubley/mirrors-prettier
+ rev: "v3.3.3"
hooks:
- id: prettier
types_or:
- javascript
- ts
- markdown
+ - repo: https://github.com/codespell-project/codespell
+ rev: v2.3.0
+ hooks:
+ - id: codespell
# Python hooks
- repo: https://github.com/astral-sh/ruff-pre-commit
- rev: 'v0.4.8'
+ rev: 'v0.6.9'
hooks:
# Run the linter.
- id: ruff
# Run the formatter.
- id: ruff-format
+ - repo: https://github.com/tox-dev/pyproject-fmt
+ rev: "2.2.4"
+ hooks:
+ - id: pyproject-fmt
diff --git a/CHANGELOG.md b/CHANGELOG.md
index afa73b4..61412db 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -19,6 +19,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Changed
- Bump pypa/gh-action-pypi-publish from 1.9.0 to 1.10.2 (by [@dependabot](https://github.com/apps/dependabot) in [#22](https://github.com/stumpylog/tika-client/pull/22))
+- Update `pre-commit` to 4.0.1 ([#23](https://github.com/stumpylog/tika-client/pull/23))
## [0.6.0] - 2024-07-18
diff --git a/pyproject.toml b/pyproject.toml
index 4082016..50621ae 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,29 +1,29 @@
#
# Project Configuration
#
+
[build-system]
-requires = ["hatchling"]
build-backend = "hatchling.build"
+requires = [ "hatchling" ]
+
[project]
name = "tika-client"
-dynamic = ["version"]
description = "A modern REST client for Apache Tika server"
readme = "README.md"
-requires-python = ">=3.8"
+keywords = [ "api", "client", "html", "office", "pdf", "tika" ]
license = "MPL-2.0"
-keywords = ["api", "pdf", "html", "client", "office", "tika"]
authors = [
{ name = "Trenton H", email = "rda0128ou@mozmail.com" },
]
+requires-python = ">=3.8"
classifiers = [
"Development Status :: 4 - Beta",
+ "Environment :: Web Environment",
+ "Intended Audience :: Developers",
"License :: OSI Approved :: Mozilla Public License 2.0 (MPL 2.0)",
"Operating System :: OS Independent",
- "Intended Audience :: Developers",
- "Environment :: Web Environment",
"Programming Language :: Python",
- "Programming Language :: Python :: 3",
"Programming Language :: Python :: 3 :: Only",
"Programming Language :: Python :: 3.8",
"Programming Language :: Python :: 3.9",
@@ -33,28 +33,29 @@ classifiers = [
"Programming Language :: Python :: Implementation :: CPython",
"Programming Language :: Python :: Implementation :: PyPy",
]
+dynamic = [ "version" ]
dependencies = [
- "httpx ~= 0.27; python_version >= '3.9'",
- "httpx ~= 0.24; python_version < '3.9'",
- "typing-extensions; python_version < '3.11'"
+ "httpx~=0.24; python_version<'3.9'",
+ "httpx~=0.27; python_version>='3.9'",
+ "typing-extensions; python_version<'3.11'",
]
-[project.urls]
-Documentation = "https://github.com/stumpylog/tika-rest-client#readme"
-Issues = "https://github.com/stumpylog/tika-rest-client/issues"
-Source = "https://github.com/stumpylog/tika-rest-client"
-Changelog = "https://github.com/stumpylog/tika-rest-client/blob/main/CHANGELOG.md"
+urls.Changelog = "https://github.com/stumpylog/tika-rest-client/blob/main/CHANGELOG.md"
#
# Hatch Configuration
#
+urls.Documentation = "https://github.com/stumpylog/tika-rest-client#readme"
+urls.Issues = "https://github.com/stumpylog/tika-rest-client/issues"
+urls.Source = "https://github.com/stumpylog/tika-rest-client"
+
[tool.hatch.version]
path = "src/tika_client/__about__.py"
[tool.hatch.build.targets.sdist]
exclude = [
".github",
- ".docker"
+ ".docker",
]
[tool.hatch.envs.default]
@@ -62,7 +63,7 @@ installer = "uv"
[tool.hatch.envs.hatch-static-analysis]
# https://hatch.pypa.io/latest/config/internal/static-analysis/
-dependencies = ["ruff ~= 0.5.6"]
+dependencies = [ "ruff ~= 0.6" ]
config-path = "none"
[tool.hatch.envs.hatch-test]
@@ -81,31 +82,32 @@ dependencies = [
]
extra-dependencies = [
"pytest-sugar",
- "pytest-httpx ~= 0.30; python_version >= '3.9'",
+ "pytest-httpx == 0.30.0; python_version >= '3.9'",
"pytest-httpx ~= 0.22; python_version < '3.9'",
"python-magic",
]
-extra-args = ["--maxprocesses=8", "--pythonwarnings=all"]
+extra-args = [ "--maxprocesses=8", "--pythonwarnings=all" ]
[tool.hatch.envs.hatch-test.scripts]
run = [
"python3 --version",
- "pytest{env:HATCH_TEST_ARGS:} {args}"]
+ "pytest{env:HATCH_TEST_ARGS:} {args}",
+]
run-cov = [
"python3 --version",
"coverage erase",
- "coverage run -m pytest{env:HATCH_TEST_ARGS:} {args}"
+ "coverage run -m pytest{env:HATCH_TEST_ARGS:} {args}",
]
-cov-combine = ["coverage combine"]
+cov-combine = [ "coverage combine" ]
cov-report = [
"coverage report",
"coverage json",
- "coverage html"
+ "coverage html",
]
[[tool.hatch.envs.hatch-test.matrix]]
-python = ["3.8", "3.9", "3.10", "3.11", "3.12", "pypy3.8", "pypy3.9", "pypy3.10"]
+python = [ "3.8", "3.9", "3.10", "3.11", "3.12", "pypy3.8", "pypy3.9", "pypy3.10" ]
#
# Custom Environments
@@ -113,40 +115,41 @@ python = ["3.8", "3.9", "3.10", "3.11", "3.12", "pypy3.8", "pypy3.9", "pypy3.10"
[tool.hatch.envs.typing]
detached = true
dependencies = [
- "mypy ~= 1.11.0",
+ "mypy ~= 1.11",
"httpx",
]
[tool.hatch.envs.typing.scripts]
run = [
"mypy --version",
- "mypy --install-types --non-interactive {args:src/tika_client}"
+ "mypy --install-types --non-interactive {args:src/tika_client}",
]
[tool.hatch.envs.pre-commit]
template = "pre-commit"
detached = true
dependencies = [
- "pre-commit ~= 3.8.0",
+ "pre-commit ~= 4.0",
+ "pre-commit-uv",
]
[tool.hatch.envs.pre-commit.scripts]
-check = ["pre-commit run --all-files"]
-update = ["pre-commit autoupdate"]
+check = [ "pre-commit run --all-files" ]
+update = [ "pre-commit autoupdate" ]
#
# Tool Configuration
#
+
[tool.ruff]
-# https://docs.astral.sh/ruff/settings/
-fix = true
-output-format = "grouped"
target-version = "py38"
line-length = 120
-[tool.ruff.lint]
+# https://docs.astral.sh/ruff/settings/
+fix = true
+output-format = "grouped"
# https://docs.astral.sh/ruff/rules/
-extend-select = [
+lint.extend-select = [
"A",
"ARG",
"B",
@@ -169,15 +172,15 @@ extend-select = [
"ISC",
"N",
"PERF",
- "PIE",
"PGH",
- "PTH",
+ "PIE",
"PL",
"PLC",
"PLE",
"PLR",
"PLW",
"PT",
+ "PTH",
"Q",
"RSE",
"RUF",
@@ -195,38 +198,47 @@ extend-select = [
"W",
"YTT",
]
-ignore = [
+lint.ignore = [
# Allow non-abstract empty methods in abstract base classes
"B027",
+ # Ignore complexity
+ "C901",
# Allow boolean positional values in function calls, like `dict.get(... True)`
"FBT003",
+ "PLR0911",
+ "PLR0912",
+ "PLR0913",
+ "PLR0915",
# Ignore checks for possible passwords
- "S105", "S106", "S107",
- # Ignore complexity
- "C901", "PLR0911", "PLR0912", "PLR0913", "PLR0915",
+ "S105",
+ "S106",
+ "S107",
# Ignore no author and missing issue link in TODO tags
- "TD002", "TD003"
+ "TD002",
+ "TD003",
]
-
-[tool.ruff.lint.isort]
-force-single-line = true
-known-first-party = ["tika_client"]
-
-[tool.ruff.lint.flake8-tidy-imports]
-ban-relative-imports = "all"
-
-[tool.ruff.lint.per-file-ignores]
# Tests can use magic values, assertions, and relative imports
-"tests/**/*" = ["PLR2004",
- "S101",
- "TID252",
- # Allow more complex pytest.raises
- "PT012",
- "DTZ001"
- ]
+lint.per-file-ignores."tests/**/*" = [
+ "DTZ001",
+ "PLR2004",
+ # Allow more complex pytest.raises
+ "PT012",
+ "S101",
+ "TID252",
+]
+# No relative imports
+lint.flake8-tidy-imports.ban-relative-imports = "all"
+# One import per line
+lint.isort.force-single-line = true
+# Recognize us please
+lint.isort.known-first-party = [ "tika_client" ]
+
+[tool.pytest.ini_options]
+minversion = "7.0"
+testpaths = [ "tests" ]
[tool.coverage.run]
-source_pkgs = ["tika_client", "tests"]
+source_pkgs = [ "tika_client", "tests" ]
branch = true
parallel = true
omit = [
@@ -234,8 +246,8 @@ omit = [
]
[tool.coverage.paths]
-tika_client = ["src/tika_client", "*/tika-client/src/tika_client"]
-tests = ["tests", "*/tika-client/tests"]
+tika_client = [ "src/tika_client", "*/tika-client/src/tika_client" ]
+tests = [ "tests", "*/tika-client/tests" ]
[tool.coverage.report]
exclude_lines = [
@@ -255,6 +267,3 @@ warn_redundant_casts = true
warn_unused_ignores = true
warn_unreachable = true
warn_unused_configs = true
-
-[tool.pytest.ini_options]
-minversion = "7.0"
diff --git a/tests/conftest.py b/tests/conftest.py
index d0bf901..f52a47c 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -12,13 +12,13 @@
SAMPLE_DIR: Final[Path] = Path(__file__).parent.resolve() / "samples"
-@pytest.fixture()
+@pytest.fixture
def tika_client() -> TikaClient:
with TikaClient(tika_url=TIKA_URL, log_level=logging.INFO) as client:
yield client
-@pytest.fixture()
+@pytest.fixture
def tika_client_compressed() -> TikaClient:
with TikaClient(tika_url=TIKA_URL, log_level=logging.INFO, compress=True) as client:
yield client
From 4d9cbf48a3b48c8cac519714e9f33e892d19027a Mon Sep 17 00:00:00 2001
From: Trenton H <797416+stumpylog@users.noreply.github.com>
Date: Wed, 9 Oct 2024 07:56:53 -0700
Subject: [PATCH 08/11] Chore: Use pytest fixtures effectively (#24)
---
CHANGELOG.md | 1 +
tests/conftest.py | 75 ++++++++++--
tests/test_datetime_formats.py | 80 +++++++++----
tests/test_file_formats.py | 11 +-
tests/test_image_files.py | 13 +--
tests/test_resource_metadata.py | 48 ++++----
tests/test_resource_recursive_metadata.py | 70 ++++++++----
tests/test_resource_tika.py | 132 +++++++++++++---------
8 files changed, 296 insertions(+), 134 deletions(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 61412db..3307421 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -20,6 +20,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Bump pypa/gh-action-pypi-publish from 1.9.0 to 1.10.2 (by [@dependabot](https://github.com/apps/dependabot) in [#22](https://github.com/stumpylog/tika-client/pull/22))
- Update `pre-commit` to 4.0.1 ([#23](https://github.com/stumpylog/tika-client/pull/23))
+- Use pytest fixtures effectively ([#24](https://github.com/stumpylog/tika-client/pull/24))
## [0.6.0] - 2024-07-18
diff --git a/tests/conftest.py b/tests/conftest.py
index f52a47c..2a97621 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -1,24 +1,85 @@
import logging
import os
from pathlib import Path
-from typing import Final
+from typing import Generator
import pytest
from tika_client.client import TikaClient
-TIKA_URL: Final[str] = os.getenv("TIKA_URL", "http://localhost:9998")
-SAMPLE_DIR: Final[Path] = Path(__file__).parent.resolve() / "samples"
+@pytest.fixture(scope="session")
+def tika_host() -> str:
+ return os.getenv("TIKA_URL", "http://localhost:9998")
+
+
+@pytest.fixture(scope="session")
+def samples_dir() -> Path:
+ return Path(__file__).parent.resolve() / "samples"
+
+
+@pytest.fixture(scope="session")
+def sample_libre_office_writer_file(samples_dir: Path) -> Path:
+ return samples_dir / "sample-libre-office.odt"
+
+
+@pytest.fixture(scope="session")
+def sample_google_docs_to_libre_office_writer_file(samples_dir: Path) -> Path:
+ return samples_dir / "sample.odt"
+
+
+@pytest.fixture(scope="session")
+def sample_google_docs_to_docx_file(samples_dir: Path) -> Path:
+ return samples_dir / "sample.docx"
+
+
+@pytest.fixture(scope="session")
+def sample_docx_file(samples_dir: Path) -> Path:
+ return samples_dir / "microsoft-sample.docx"
+
+
+@pytest.fixture(scope="session")
+def sample_doc_file(samples_dir: Path) -> Path:
+ return samples_dir / "sample.doc"
+
+
+@pytest.fixture(scope="session")
+def sample_html_file(samples_dir: Path) -> Path:
+ return samples_dir / "sample.html"
+
+
+@pytest.fixture(scope="session")
+def sample_office_doc_with_images_file(samples_dir: Path) -> Path:
+ return samples_dir / "test-document-images.odt"
+
+
+@pytest.fixture(scope="session")
+def sample_jpeg_file(samples_dir: Path) -> Path:
+ return samples_dir / "sample.jpg"
+
+
+@pytest.fixture(scope="session")
+def sample_png_file(samples_dir: Path) -> Path:
+ return samples_dir / "sample.png"
+
+
+@pytest.fixture(scope="session")
+def sample_ods_file(samples_dir: Path) -> Path:
+ return samples_dir / "sample-spreadsheet.ods"
+
+
+@pytest.fixture(scope="session")
+def sample_xlsx_file(samples_dir: Path) -> Path:
+ return samples_dir / "sample-spreadsheet.xlsx"
@pytest.fixture
-def tika_client() -> TikaClient:
- with TikaClient(tika_url=TIKA_URL, log_level=logging.INFO) as client:
+def tika_client(tika_host: str) -> Generator[TikaClient, None, None]:
+ with TikaClient(tika_url=tika_host, log_level=logging.INFO) as client:
yield client
@pytest.fixture
-def tika_client_compressed() -> TikaClient:
- with TikaClient(tika_url=TIKA_URL, log_level=logging.INFO, compress=True) as client:
+def tika_client_compressed(tika_host: str) -> Generator[TikaClient, None, None]:
+ with TikaClient(tika_url=tika_host, log_level=logging.INFO, compress=True) as client:
yield client
diff --git a/tests/test_datetime_formats.py b/tests/test_datetime_formats.py
index 303b3e5..c944058 100644
--- a/tests/test_datetime_formats.py
+++ b/tests/test_datetime_formats.py
@@ -1,29 +1,36 @@
from datetime import datetime
from datetime import timedelta
from datetime import timezone
+from pathlib import Path
import magic
import pytest
from pytest_httpx import HTTPXMock
-from tests.conftest import SAMPLE_DIR
from tika_client.client import TikaClient
from tika_client.data_models import DublinCoreKey
from tika_client.data_models import TikaKey
class TestDateTimeFormat:
- def test_parse_offset_date_format_utc(self, tika_client: TikaClient, httpx_mock: HTTPXMock):
+ def test_parse_offset_date_format_utc(
+ self,
+ tika_client: TikaClient,
+ sample_libre_office_writer_file: Path,
+ httpx_mock: HTTPXMock,
+ ):
"""
Test the datetime parsing properly handles a time with a UTC timezone in the +xx:yy format
"""
- test_file = SAMPLE_DIR / "sample-libre-office.odt"
httpx_mock.add_response(
json={TikaKey.ContentType: "test", TikaKey.Parsers: [], DublinCoreKey.Created: "2023-05-17T16:30:44+00:00"},
)
- resp = tika_client.metadata.from_file(test_file, magic.from_file(str(test_file), mime=True))
+ resp = tika_client.metadata.from_file(
+ sample_libre_office_writer_file,
+ magic.from_file(str(sample_libre_office_writer_file), mime=True),
+ )
assert resp.created == datetime(
year=2023,
@@ -35,17 +42,24 @@ def test_parse_offset_date_format_utc(self, tika_client: TikaClient, httpx_mock:
tzinfo=timezone.utc,
)
- def test_parse_offset_date_format_zulu(self, tika_client: TikaClient, httpx_mock: HTTPXMock):
+ def test_parse_offset_date_format_zulu(
+ self,
+ tika_client: TikaClient,
+ sample_libre_office_writer_file: Path,
+ httpx_mock: HTTPXMock,
+ ):
"""
Test the datetime parsing properly handles a time with a UTC timezone in the Z format
"""
- test_file = SAMPLE_DIR / "sample-libre-office.odt"
httpx_mock.add_response(
json={TikaKey.ContentType: "test", TikaKey.Parsers: [], DublinCoreKey.Created: "2023-01-17T16:35:44Z"},
)
- resp = tika_client.metadata.from_file(test_file, magic.from_file(str(test_file), mime=True))
+ resp = tika_client.metadata.from_file(
+ sample_libre_office_writer_file,
+ magic.from_file(str(sample_libre_office_writer_file), mime=True),
+ )
assert resp.created == datetime(
year=2023,
@@ -57,34 +71,48 @@ def test_parse_offset_date_format_zulu(self, tika_client: TikaClient, httpx_mock
tzinfo=timezone.utc,
)
- def test_parse_offset_date_format_positive(self, tika_client: TikaClient, httpx_mock: HTTPXMock):
+ def test_parse_offset_date_format_positive(
+ self,
+ tika_client: TikaClient,
+ sample_libre_office_writer_file: Path,
+ httpx_mock: HTTPXMock,
+ ):
"""
Test the datetime parsing properly handles a time with a timezone in the +xx:yy format offset from UTC
"""
- test_file = SAMPLE_DIR / "sample-libre-office.odt"
httpx_mock.add_response(
json={TikaKey.ContentType: "test", TikaKey.Parsers: [], DublinCoreKey.Created: "2023-06-17T16:30:44+08:00"},
)
- resp = tika_client.metadata.from_file(test_file, magic.from_file(str(test_file), mime=True))
+ resp = tika_client.metadata.from_file(
+ sample_libre_office_writer_file,
+ magic.from_file(str(sample_libre_office_writer_file), mime=True),
+ )
assert resp.created == pytest.approx(
datetime(year=2023, month=6, day=17, hour=16, minute=30, second=44, tzinfo=timezone(timedelta(hours=8))),
rel=timedelta(seconds=1),
)
- def test_parse_offset_date_format_negative(self, tika_client: TikaClient, httpx_mock: HTTPXMock):
+ def test_parse_offset_date_format_negative(
+ self,
+ tika_client: TikaClient,
+ sample_libre_office_writer_file: Path,
+ httpx_mock: HTTPXMock,
+ ):
"""
Test the datetime parsing properly handles a time with a timezone in the -xx:yy format offset from UTC
"""
- test_file = SAMPLE_DIR / "sample-libre-office.odt"
httpx_mock.add_response(
json={TikaKey.ContentType: "test", TikaKey.Parsers: [], DublinCoreKey.Created: "2023-06-17T16:30:44-08:00"},
)
- resp = tika_client.metadata.from_file(test_file, magic.from_file(str(test_file), mime=True))
+ resp = tika_client.metadata.from_file(
+ sample_libre_office_writer_file,
+ magic.from_file(str(sample_libre_office_writer_file), mime=True),
+ )
assert resp.created == pytest.approx(
datetime(
@@ -99,11 +127,15 @@ def test_parse_offset_date_format_negative(self, tika_client: TikaClient, httpx_
rel=timedelta(seconds=1),
)
- def test_parse_offset_date_format_python_isoformat(self, tika_client: TikaClient, httpx_mock: HTTPXMock):
+ def test_parse_offset_date_format_python_isoformat(
+ self,
+ tika_client: TikaClient,
+ sample_libre_office_writer_file: Path,
+ httpx_mock: HTTPXMock,
+ ):
"""
Test the datetime parsing properly handles a time with a timezone in the ISO 8061 format (as done by Python)
"""
- test_file = SAMPLE_DIR / "sample-libre-office.odt"
expected = datetime.now(tz=timezone.utc)
@@ -111,20 +143,30 @@ def test_parse_offset_date_format_python_isoformat(self, tika_client: TikaClient
json={TikaKey.ContentType: "test", TikaKey.Parsers: [], DublinCoreKey.Created: expected.isoformat()},
)
- resp = tika_client.metadata.from_file(test_file, magic.from_file(str(test_file), mime=True))
+ resp = tika_client.metadata.from_file(
+ sample_libre_office_writer_file,
+ magic.from_file(str(sample_libre_office_writer_file), mime=True),
+ )
assert resp.created == pytest.approx(expected, rel=timedelta(seconds=1))
- def test_parse_offset_date_no_match(self, tika_client: TikaClient, httpx_mock: HTTPXMock):
+ def test_parse_offset_date_no_match(
+ self,
+ tika_client: TikaClient,
+ sample_libre_office_writer_file: Path,
+ httpx_mock: HTTPXMock,
+ ):
"""
Test the datetime parsing properly handles a time string which doesn't match the correct formats
"""
- test_file = SAMPLE_DIR / "sample-libre-office.odt"
httpx_mock.add_response(
json={TikaKey.ContentType: "test", TikaKey.Parsers: [], DublinCoreKey.Created: "202-06-17T16:30:44-0"},
)
- resp = tika_client.metadata.from_file(test_file, magic.from_file(str(test_file), mime=True))
+ resp = tika_client.metadata.from_file(
+ sample_libre_office_writer_file,
+ magic.from_file(str(sample_libre_office_writer_file), mime=True),
+ )
assert resp.created is None
diff --git a/tests/test_file_formats.py b/tests/test_file_formats.py
index e6f91fd..cd6ab16 100644
--- a/tests/test_file_formats.py
+++ b/tests/test_file_formats.py
@@ -1,20 +1,23 @@
from datetime import datetime
+from pathlib import Path
import magic
-from tests.conftest import SAMPLE_DIR
from tika_client.client import TikaClient
class TestLibreOfficeFormats:
- def test_parse_libre_office_writer_document(self, tika_client: TikaClient):
+ def test_parse_libre_office_writer_document(self, tika_client: TikaClient, sample_libre_office_writer_file: Path):
"""
Test handling of an ODT document produced by LibreOffice
"""
- test_file = SAMPLE_DIR / "sample-libre-office.odt"
- resp = tika_client.tika.as_html.from_file(test_file, magic.from_file(str(test_file), mime=True))
+ resp = tika_client.tika.as_html.from_file(
+ sample_libre_office_writer_file,
+ magic.from_file(str(sample_libre_office_writer_file), mime=True),
+ )
assert resp.type == "application/vnd.oasis.opendocument.text"
+ assert resp.content is not None
assert (
"This is a document created by LibreOffice Writer 7.5.12, on July 19th, 2023
\n"
in resp.content
diff --git a/tests/test_image_files.py b/tests/test_image_files.py
index 3b93070..d79a08b 100644
--- a/tests/test_image_files.py
+++ b/tests/test_image_files.py
@@ -1,24 +1,23 @@
+from pathlib import Path
+
import magic
-from tests.conftest import SAMPLE_DIR
from tika_client.client import TikaClient
class TestParseImageMetadata:
- def test_image_jpeg(self, tika_client: TikaClient):
+ def test_image_jpeg(self, tika_client: TikaClient, sample_jpeg_file: Path):
"""
Test the handling of a JPEG file metadata retrieval
"""
- test_file = SAMPLE_DIR / "sample.jpg"
- resp = tika_client.metadata.from_file(test_file, magic.from_file(str(test_file), mime=True))
+ resp = tika_client.metadata.from_file(sample_jpeg_file, magic.from_file(str(sample_jpeg_file), mime=True))
assert resp.type == "image/jpeg"
- def test_image_png(self, tika_client: TikaClient):
+ def test_image_png(self, tika_client: TikaClient, sample_png_file: Path):
"""
Test the handling of a PNG file metadata retrieval
"""
- test_file = SAMPLE_DIR / "sample.png"
- resp = tika_client.metadata.from_file(test_file, magic.from_file(str(test_file), mime=True))
+ resp = tika_client.metadata.from_file(sample_png_file, magic.from_file(str(sample_png_file), mime=True))
assert resp.type == "image/png"
diff --git a/tests/test_resource_metadata.py b/tests/test_resource_metadata.py
index a623a7d..fc9a784 100644
--- a/tests/test_resource_metadata.py
+++ b/tests/test_resource_metadata.py
@@ -1,76 +1,82 @@
from datetime import datetime
from datetime import timezone
+from pathlib import Path
import httpx
import magic
import pytest
from pytest_httpx import HTTPXMock
-from tests.conftest import SAMPLE_DIR
-from tests.conftest import TIKA_URL
from tika_client.client import TikaClient
class TestMetadataResource:
- def test_metadata_from_docx(self, tika_client: TikaClient):
+ def test_metadata_from_docx(self, tika_client: TikaClient, sample_google_docs_to_docx_file: Path):
"""
Test parsing of a DOCX produced by Google Docs conversion
"""
- test_file = SAMPLE_DIR / "sample.docx"
- resp = tika_client.metadata.from_file(test_file, magic.from_file(str(test_file), mime=True))
+
+ resp = tika_client.metadata.from_file(
+ sample_google_docs_to_docx_file,
+ magic.from_file(str(sample_google_docs_to_docx_file), mime=True),
+ )
assert resp.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
assert resp.created is None
- def test_metadata_from_docx_no_mime(self, tika_client: TikaClient):
+ def test_metadata_from_docx_no_mime(self, tika_client: TikaClient, sample_google_docs_to_docx_file: Path):
"""
Test parsing of a DOCX produced by Google Docs conversion, when no mime type is provided
"""
- test_file = SAMPLE_DIR / "sample.docx"
- resp = tika_client.metadata.from_file(test_file)
+
+ resp = tika_client.metadata.from_file(sample_google_docs_to_docx_file)
assert resp.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
assert resp.created is None
- def test_metadata_from_word_docx(self, tika_client: TikaClient):
+ def test_metadata_from_word_docx(self, tika_client: TikaClient, sample_docx_file: Path):
"""
Test parsing of a DOCX produced by Microsoft Word
"""
- test_file = SAMPLE_DIR / "microsoft-sample.docx"
- resp = tika_client.metadata.from_file(test_file, magic.from_file(str(test_file), mime=True))
+ resp = tika_client.metadata.from_file(sample_docx_file, magic.from_file(str(sample_docx_file), mime=True))
assert resp.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
assert resp.created == datetime(year=2023, month=5, day=17, hour=16, minute=41, tzinfo=timezone.utc)
assert resp.modified == datetime(year=2023, month=5, day=17, hour=16, minute=44, tzinfo=timezone.utc)
- def test_metadata_from_odt(self, tika_client: TikaClient):
+ def test_metadata_from_odt(self, tika_client: TikaClient, sample_google_docs_to_libre_office_writer_file: Path):
"""
Test parsing of an ODT produced by Google Docs conversion
"""
- test_file = SAMPLE_DIR / "sample.odt"
- resp = tika_client.metadata.from_file(test_file, magic.from_file(str(test_file), mime=True))
+ resp = tika_client.metadata.from_file(
+ sample_google_docs_to_libre_office_writer_file,
+ magic.from_file(str(sample_google_docs_to_libre_office_writer_file), mime=True),
+ )
assert resp.type == "application/vnd.oasis.opendocument.text"
assert resp.data["generator"] == "LibreOfficeDev/6.0.5.2$Linux_X86_64 LibreOffice_project/"
assert resp.created is None
- def test_metadata_from_doc(self, tika_client: TikaClient):
+ def test_metadata_from_doc(self, tika_client: TikaClient, sample_doc_file: Path):
"""
Test parsing of a DOC produced by Google Docs conversion
"""
- test_file = SAMPLE_DIR / "sample.doc"
- resp = tika_client.metadata.from_file(test_file, magic.from_file(str(test_file), mime=True))
+ resp = tika_client.metadata.from_file(sample_doc_file, magic.from_file(str(sample_doc_file), mime=True))
assert resp.type == "application/msword"
assert resp.language == "en"
- def test_http_error(self, httpx_mock: HTTPXMock):
+ def test_http_error(
+ self,
+ httpx_mock: HTTPXMock,
+ tika_host: str,
+ sample_google_docs_to_libre_office_writer_file: Path,
+ ):
"""
Test handling of HTTP errors returned from Tika
"""
- test_file = SAMPLE_DIR / "sample.odt"
httpx_mock.add_response(status_code=500)
- with pytest.raises(httpx.HTTPStatusError) as err, TikaClient(tika_url=TIKA_URL) as client:
- client.metadata.from_file(test_file)
+ with pytest.raises(httpx.HTTPStatusError) as err, TikaClient(tika_url=tika_host) as client:
+ client.metadata.from_file(sample_google_docs_to_libre_office_writer_file)
assert err.value.response.status_code == httpx.codes.INTERNAL_SERVER_ERROR
diff --git a/tests/test_resource_recursive_metadata.py b/tests/test_resource_recursive_metadata.py
index 311300e..aeb2d90 100644
--- a/tests/test_resource_recursive_metadata.py
+++ b/tests/test_resource_recursive_metadata.py
@@ -1,88 +1,114 @@
+from pathlib import Path
+
import magic
-from tests.conftest import SAMPLE_DIR
from tika_client.client import TikaClient
class TestRecursiveMetadataResource:
- def test_r_metadata_from_docx(self, tika_client: TikaClient):
- test_file = SAMPLE_DIR / "sample.docx"
- documents = tika_client.rmeta.as_html.from_file(test_file, magic.from_file(str(test_file), mime=True))
+ def test_r_metadata_from_docx(self, tika_client: TikaClient, sample_google_docs_to_docx_file: Path):
+ documents = tika_client.rmeta.as_html.from_file(
+ sample_google_docs_to_docx_file,
+ magic.from_file(str(sample_google_docs_to_docx_file), mime=True),
+ )
assert len(documents) == 1
document = documents[0]
assert document.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
+ assert document.content is not None
assert "This is an DOCX test document, also made September 14, 2022
\n" in document.content
assert document.created is None
- def test_r_metadata_from_docx_plain(self, tika_client: TikaClient):
- test_file = SAMPLE_DIR / "sample.docx"
- documents = tika_client.rmeta.as_text.from_file(test_file, magic.from_file(str(test_file), mime=True))
+ def test_r_metadata_from_docx_plain(self, tika_client: TikaClient, sample_google_docs_to_docx_file: Path):
+ documents = tika_client.rmeta.as_text.from_file(
+ sample_google_docs_to_docx_file,
+ magic.from_file(str(sample_google_docs_to_docx_file), mime=True),
+ )
assert len(documents) == 1
document = documents[0]
assert document.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
+ assert document.content is not None
assert "This is an DOCX test document, also made September 14, 2022" in document.content
assert document.created is None
- def test_r_meta_microsoft_word_docx(self, tika_client: TikaClient):
- test_file = SAMPLE_DIR / "microsoft-sample.docx"
- documents = tika_client.rmeta.as_html.from_file(test_file, magic.from_file(str(test_file), mime=True))
+ def test_r_meta_microsoft_word_docx(self, tika_client: TikaClient, sample_docx_file: Path):
+ documents = tika_client.rmeta.as_html.from_file(
+ sample_docx_file,
+ magic.from_file(str(sample_docx_file), mime=True),
+ )
assert len(documents) == 1
document = documents[0]
assert document.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
+ assert document.content is not None
assert (
"This is a sample document, generated by Microsoft Office on Wednesday, May 17, 2023.
\nIt is in English.
\n" # noqa: E501
in document.content
)
- def test_r_metadata_from_odt(self, tika_client: TikaClient):
- test_file = SAMPLE_DIR / "sample.odt"
- documents = tika_client.rmeta.as_html.from_file(test_file, magic.from_file(str(test_file), mime=True))
+ def test_r_metadata_from_odt(self, tika_client: TikaClient, sample_google_docs_to_libre_office_writer_file: Path):
+ documents = tika_client.rmeta.as_html.from_file(
+ sample_google_docs_to_libre_office_writer_file,
+ magic.from_file(str(sample_google_docs_to_libre_office_writer_file), mime=True),
+ )
assert len(documents) == 2
document = documents[0]
assert document.type == "application/vnd.oasis.opendocument.text"
+ assert document.content is not None
assert "This is an ODT test document, created September 14, 2022
\n" in document.content
assert document.created is None
- def test_r_metadata_from_odt_plain(self, tika_client: TikaClient):
- test_file = SAMPLE_DIR / "sample.odt"
- documents = tika_client.rmeta.as_text.from_file(test_file, magic.from_file(str(test_file), mime=True))
+ def test_r_metadata_from_odt_plain(
+ self,
+ tika_client: TikaClient,
+ sample_google_docs_to_libre_office_writer_file: Path,
+ ):
+ documents = tika_client.rmeta.as_text.from_file(
+ sample_google_docs_to_libre_office_writer_file,
+ magic.from_file(str(sample_google_docs_to_libre_office_writer_file), mime=True),
+ )
assert len(documents) == 2
document = documents[0]
assert document.type == "application/vnd.oasis.opendocument.text"
+ assert document.content is not None
assert "This is an ODT test document, created September 14, 2022" in document.content
document = documents[1]
assert document.type == "image/png"
- def test_r_metadata_from_ods_plain(self, tika_client: TikaClient):
- test_file = SAMPLE_DIR / "sample-spreadsheet.ods"
- documents = tika_client.rmeta.as_text.from_file(test_file, magic.from_file(str(test_file), mime=True))
+ def test_r_metadata_from_ods_plain(self, tika_client: TikaClient, sample_ods_file: Path):
+ documents = tika_client.rmeta.as_text.from_file(
+ sample_ods_file,
+ magic.from_file(str(sample_ods_file), mime=True),
+ )
assert len(documents) == 2
document = documents[0]
+ assert document.content is not None
assert "This is cell A1" in document.content
assert "You sunk my battleship" in document.content
document = documents[1]
assert document.type == "image/png"
- def test_r_metadata_from_xlsx_plain(self, tika_client: TikaClient):
- test_file = SAMPLE_DIR / "sample-spreadsheet.xlsx"
- documents = tika_client.rmeta.as_text.from_file(test_file, magic.from_file(str(test_file), mime=True))
+ def test_r_metadata_from_xlsx_plain(self, tika_client: TikaClient, sample_xlsx_file: Path):
+ documents = tika_client.rmeta.as_text.from_file(
+ sample_xlsx_file,
+ magic.from_file(str(sample_xlsx_file), mime=True),
+ )
assert len(documents) == 1
document = documents[0]
+ assert document.content is not None
assert "This is cell A1" in document.content
assert "You sunk my battleship" in document.content
diff --git a/tests/test_resource_tika.py b/tests/test_resource_tika.py
index c7296c0..989db67 100644
--- a/tests/test_resource_tika.py
+++ b/tests/test_resource_tika.py
@@ -1,10 +1,8 @@
import shutil
-import tempfile
from pathlib import Path
import magic
-from tests.conftest import SAMPLE_DIR
from tika_client.client import TikaClient
@@ -13,19 +11,22 @@ class TestParseFormatted:
Test the Tika endpoint for returning HTML formatted content
"""
- def test_parse_docx_from_file_as_html(self, tika_client: TikaClient):
- test_file = SAMPLE_DIR / "sample.docx"
- resp = tika_client.tika.as_html.from_file(test_file, magic.from_file(str(test_file), mime=True))
+ def test_parse_docx_from_file_as_html(self, tika_client: TikaClient, sample_google_docs_to_docx_file: Path):
+ resp = tika_client.tika.as_html.from_file(
+ sample_google_docs_to_docx_file,
+ magic.from_file(str(sample_google_docs_to_docx_file), mime=True),
+ )
assert resp.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
+ assert resp.content is not None
assert "This is an DOCX test document, also made September 14, 2022
\n" in resp.content
assert resp.content_length == 6424
- def test_parse_doc_from_file_as_html(self, tika_client: TikaClient):
- test_file = SAMPLE_DIR / "sample.doc"
- resp = tika_client.tika.as_html.from_file(test_file, magic.from_file(str(test_file), mime=True))
+ def test_parse_doc_from_file_as_html(self, tika_client: TikaClient, sample_doc_file: Path):
+ resp = tika_client.tika.as_html.from_file(sample_doc_file, magic.from_file(str(sample_doc_file), mime=True))
assert resp.type == "application/msword"
+ assert resp.content is not None
assert (
"body>This is a test document, saved in the older .doc format for Word documents (but created in Google Drive)
\n
This is an DOCX test document, also made September 14, 2022
\nThis is a sample document, generated by Microsoft Office on Wednesday, May 17, 2023.
\nIt is in English.
\nThis is an ODT test document, created September 14, 2022
\nThis is an DOCX test document, also made September 14, 2022
\n