Skip to content

Commit

Permalink
pytest fixture conversion
Browse files Browse the repository at this point in the history
  • Loading branch information
stumpylog committed Oct 9, 2024
1 parent 8741c3b commit 3a44733
Show file tree
Hide file tree
Showing 8 changed files with 296 additions and 134 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

- Bump pypa/gh-action-pypi-publish from 1.9.0 to 1.10.2 (by [@dependabot](https://github.com/apps/dependabot) in [#22](https://github.com/stumpylog/tika-client/pull/22))
- Update `pre-commit` to 4.0.1 ([#23](https://github.com/stumpylog/tika-client/pull/23))
- Use pytest fixtures effectively ([#24](https://github.com/stumpylog/tika-client/pull/24))

## [0.6.0] - 2024-07-18

Expand Down
75 changes: 68 additions & 7 deletions tests/conftest.py
Original file line number Diff line number Diff line change
@@ -1,24 +1,85 @@
import logging
import os
from pathlib import Path
from typing import Final
from typing import Generator

import pytest

from tika_client.client import TikaClient

TIKA_URL: Final[str] = os.getenv("TIKA_URL", "http://localhost:9998")

SAMPLE_DIR: Final[Path] = Path(__file__).parent.resolve() / "samples"
@pytest.fixture(scope="session")
def tika_host() -> str:
    """
    Return the Tika server URL: the value of the TIKA_URL environment variable when set,
    otherwise the local default
    """
    default_url = "http://localhost:9998"
    return os.environ.get("TIKA_URL", default_url)


@pytest.fixture(scope="session")
def samples_dir() -> Path:
    """
    Return the resolved "samples" directory located next to this file
    """
    tests_dir = Path(__file__).parent.resolve()
    return tests_dir / "samples"


@pytest.fixture(scope="session")
def sample_libre_office_writer_file(samples_dir: Path) -> Path:
    """
    Return the path of sample-libre-office.odt inside the samples directory
    """
    return samples_dir.joinpath("sample-libre-office.odt")


@pytest.fixture(scope="session")
def sample_google_docs_to_libre_office_writer_file(samples_dir: Path) -> Path:
    """
    Return the path of sample.odt inside the samples directory
    """
    return samples_dir.joinpath("sample.odt")


@pytest.fixture(scope="session")
def sample_google_docs_to_docx_file(samples_dir: Path) -> Path:
    """
    Return the path of sample.docx inside the samples directory
    """
    return samples_dir.joinpath("sample.docx")


@pytest.fixture(scope="session")
def sample_docx_file(samples_dir: Path) -> Path:
    """
    Return the path of microsoft-sample.docx inside the samples directory
    """
    return samples_dir.joinpath("microsoft-sample.docx")


@pytest.fixture(scope="session")
def sample_doc_file(samples_dir: Path) -> Path:
    """
    Return the path of sample.doc inside the samples directory
    """
    return samples_dir.joinpath("sample.doc")


@pytest.fixture(scope="session")
def sample_html_file(samples_dir: Path) -> Path:
    """
    Return the path of sample.html inside the samples directory

    Named to follow the sample_<kind>_file convention used by the other sample fixtures
    """
    return samples_dir / "sample.html"


@pytest.fixture(scope="session")
def c(sample_html_file: Path) -> Path:
    """
    Backward-compatible alias of sample_html_file, kept so that anything still
    requesting the fixture by its previous name continues to resolve
    """
    return sample_html_file


@pytest.fixture(scope="session")
def sample_office_doc_with_images_file(samples_dir: Path) -> Path:
    """
    Return the path of test-document-images.odt inside the samples directory
    """
    return samples_dir.joinpath("test-document-images.odt")


@pytest.fixture(scope="session")
def sample_jpeg_file(samples_dir: Path) -> Path:
    """
    Return the path of sample.jpg inside the samples directory
    """
    return samples_dir.joinpath("sample.jpg")


@pytest.fixture(scope="session")
def sample_png_file(samples_dir: Path) -> Path:
    """
    Return the path of sample.png inside the samples directory
    """
    return samples_dir.joinpath("sample.png")


@pytest.fixture(scope="session")
def sample_ods_file(samples_dir: Path) -> Path:
    """
    Return the path of sample-spreadsheet.ods inside the samples directory
    """
    return samples_dir.joinpath("sample-spreadsheet.ods")


@pytest.fixture(scope="session")
def sample_xlsx_file(samples_dir: Path) -> Path:
    """
    Return the path of sample-spreadsheet.xlsx inside the samples directory
    """
    return samples_dir.joinpath("sample-spreadsheet.xlsx")


@pytest.fixture
def tika_client() -> TikaClient:
with TikaClient(tika_url=TIKA_URL, log_level=logging.INFO) as client:
def tika_client(tika_host: str) -> Generator[TikaClient, None, None]:
with TikaClient(tika_url=tika_host, log_level=logging.INFO) as client:
yield client


@pytest.fixture
def tika_client_compressed() -> TikaClient:
with TikaClient(tika_url=TIKA_URL, log_level=logging.INFO, compress=True) as client:
def tika_client_compressed(tika_host: str) -> Generator[TikaClient, None, None]:
with TikaClient(tika_url=tika_host, log_level=logging.INFO, compress=True) as client:
yield client
80 changes: 61 additions & 19 deletions tests/test_datetime_formats.py
Original file line number Diff line number Diff line change
@@ -1,29 +1,36 @@
from datetime import datetime
from datetime import timedelta
from datetime import timezone
from pathlib import Path

import magic
import pytest
from pytest_httpx import HTTPXMock

from tests.conftest import SAMPLE_DIR
from tika_client.client import TikaClient
from tika_client.data_models import DublinCoreKey
from tika_client.data_models import TikaKey


class TestDateTimeFormat:
def test_parse_offset_date_format_utc(self, tika_client: TikaClient, httpx_mock: HTTPXMock):
def test_parse_offset_date_format_utc(
self,
tika_client: TikaClient,
sample_libre_office_writer_file: Path,
httpx_mock: HTTPXMock,
):
"""
Test the datetime parsing properly handles a time with a UTC timezone in the +xx:yy format
"""
test_file = SAMPLE_DIR / "sample-libre-office.odt"

httpx_mock.add_response(
json={TikaKey.ContentType: "test", TikaKey.Parsers: [], DublinCoreKey.Created: "2023-05-17T16:30:44+00:00"},
)

resp = tika_client.metadata.from_file(test_file, magic.from_file(str(test_file), mime=True))
resp = tika_client.metadata.from_file(
sample_libre_office_writer_file,
magic.from_file(str(sample_libre_office_writer_file), mime=True),
)

assert resp.created == datetime(
year=2023,
Expand All @@ -35,17 +42,24 @@ def test_parse_offset_date_format_utc(self, tika_client: TikaClient, httpx_mock:
tzinfo=timezone.utc,
)

def test_parse_offset_date_format_zulu(self, tika_client: TikaClient, httpx_mock: HTTPXMock):
def test_parse_offset_date_format_zulu(
self,
tika_client: TikaClient,
sample_libre_office_writer_file: Path,
httpx_mock: HTTPXMock,
):
"""
Test the datetime parsing properly handles a time with a UTC timezone in the Z format
"""
test_file = SAMPLE_DIR / "sample-libre-office.odt"

httpx_mock.add_response(
json={TikaKey.ContentType: "test", TikaKey.Parsers: [], DublinCoreKey.Created: "2023-01-17T16:35:44Z"},
)

resp = tika_client.metadata.from_file(test_file, magic.from_file(str(test_file), mime=True))
resp = tika_client.metadata.from_file(
sample_libre_office_writer_file,
magic.from_file(str(sample_libre_office_writer_file), mime=True),
)

assert resp.created == datetime(
year=2023,
Expand All @@ -57,34 +71,48 @@ def test_parse_offset_date_format_zulu(self, tika_client: TikaClient, httpx_mock
tzinfo=timezone.utc,
)

def test_parse_offset_date_format_positive(self, tika_client: TikaClient, httpx_mock: HTTPXMock):
def test_parse_offset_date_format_positive(
self,
tika_client: TikaClient,
sample_libre_office_writer_file: Path,
httpx_mock: HTTPXMock,
):
"""
Test the datetime parsing properly handles a time with a timezone in the +xx:yy format offset from UTC
"""
test_file = SAMPLE_DIR / "sample-libre-office.odt"

httpx_mock.add_response(
json={TikaKey.ContentType: "test", TikaKey.Parsers: [], DublinCoreKey.Created: "2023-06-17T16:30:44+08:00"},
)

resp = tika_client.metadata.from_file(test_file, magic.from_file(str(test_file), mime=True))
resp = tika_client.metadata.from_file(
sample_libre_office_writer_file,
magic.from_file(str(sample_libre_office_writer_file), mime=True),
)

assert resp.created == pytest.approx(
datetime(year=2023, month=6, day=17, hour=16, minute=30, second=44, tzinfo=timezone(timedelta(hours=8))),
rel=timedelta(seconds=1),
)

def test_parse_offset_date_format_negative(self, tika_client: TikaClient, httpx_mock: HTTPXMock):
def test_parse_offset_date_format_negative(
self,
tika_client: TikaClient,
sample_libre_office_writer_file: Path,
httpx_mock: HTTPXMock,
):
"""
Test the datetime parsing properly handles a time with a timezone in the -xx:yy format offset from UTC
"""
test_file = SAMPLE_DIR / "sample-libre-office.odt"

httpx_mock.add_response(
json={TikaKey.ContentType: "test", TikaKey.Parsers: [], DublinCoreKey.Created: "2023-06-17T16:30:44-08:00"},
)

resp = tika_client.metadata.from_file(test_file, magic.from_file(str(test_file), mime=True))
resp = tika_client.metadata.from_file(
sample_libre_office_writer_file,
magic.from_file(str(sample_libre_office_writer_file), mime=True),
)

assert resp.created == pytest.approx(
datetime(
Expand All @@ -99,32 +127,46 @@ def test_parse_offset_date_format_negative(self, tika_client: TikaClient, httpx_
rel=timedelta(seconds=1),
)

def test_parse_offset_date_format_python_isoformat(self, tika_client: TikaClient, httpx_mock: HTTPXMock):
def test_parse_offset_date_format_python_isoformat(
self,
tika_client: TikaClient,
sample_libre_office_writer_file: Path,
httpx_mock: HTTPXMock,
):
"""
Test the datetime parsing properly handles a time with a timezone in the ISO 8601 format (as done by Python)
"""
test_file = SAMPLE_DIR / "sample-libre-office.odt"

expected = datetime.now(tz=timezone.utc)

httpx_mock.add_response(
json={TikaKey.ContentType: "test", TikaKey.Parsers: [], DublinCoreKey.Created: expected.isoformat()},
)

resp = tika_client.metadata.from_file(test_file, magic.from_file(str(test_file), mime=True))
resp = tika_client.metadata.from_file(
sample_libre_office_writer_file,
magic.from_file(str(sample_libre_office_writer_file), mime=True),
)

assert resp.created == pytest.approx(expected, rel=timedelta(seconds=1))

def test_parse_offset_date_no_match(self, tika_client: TikaClient, httpx_mock: HTTPXMock):
def test_parse_offset_date_no_match(
self,
tika_client: TikaClient,
sample_libre_office_writer_file: Path,
httpx_mock: HTTPXMock,
):
"""
Test the datetime parsing properly handles a time string which doesn't match the correct formats
"""
test_file = SAMPLE_DIR / "sample-libre-office.odt"

httpx_mock.add_response(
json={TikaKey.ContentType: "test", TikaKey.Parsers: [], DublinCoreKey.Created: "202-06-17T16:30:44-0"},
)

resp = tika_client.metadata.from_file(test_file, magic.from_file(str(test_file), mime=True))
resp = tika_client.metadata.from_file(
sample_libre_office_writer_file,
magic.from_file(str(sample_libre_office_writer_file), mime=True),
)

assert resp.created is None
11 changes: 7 additions & 4 deletions tests/test_file_formats.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,23 @@
from datetime import datetime
from pathlib import Path

import magic

from tests.conftest import SAMPLE_DIR
from tika_client.client import TikaClient


class TestLibreOfficeFormats:
def test_parse_libre_office_writer_document(self, tika_client: TikaClient):
def test_parse_libre_office_writer_document(self, tika_client: TikaClient, sample_libre_office_writer_file: Path):
"""
Test handling of an ODT document produced by LibreOffice
"""
test_file = SAMPLE_DIR / "sample-libre-office.odt"
resp = tika_client.tika.as_html.from_file(test_file, magic.from_file(str(test_file), mime=True))
resp = tika_client.tika.as_html.from_file(
sample_libre_office_writer_file,
magic.from_file(str(sample_libre_office_writer_file), mime=True),
)

assert resp.type == "application/vnd.oasis.opendocument.text"
assert resp.content is not None
assert (
"<body><p>This is a document created by LibreOffice Writer 7.5.12, on July 19th, 2023</p>\n</body>"
in resp.content
Expand Down
13 changes: 6 additions & 7 deletions tests/test_image_files.py
Original file line number Diff line number Diff line change
@@ -1,24 +1,23 @@
from pathlib import Path

import magic

from tests.conftest import SAMPLE_DIR
from tika_client.client import TikaClient


class TestParseImageMetadata:
def test_image_jpeg(self, tika_client: TikaClient):
def test_image_jpeg(self, tika_client: TikaClient, sample_jpeg_file: Path):
"""
Test the handling of a JPEG file metadata retrieval
"""
test_file = SAMPLE_DIR / "sample.jpg"
resp = tika_client.metadata.from_file(test_file, magic.from_file(str(test_file), mime=True))
resp = tika_client.metadata.from_file(sample_jpeg_file, magic.from_file(str(sample_jpeg_file), mime=True))

assert resp.type == "image/jpeg"

def test_image_png(self, tika_client: TikaClient):
def test_image_png(self, tika_client: TikaClient, sample_png_file: Path):
"""
Test the handling of a PNG file metadata retrieval
"""
test_file = SAMPLE_DIR / "sample.png"
resp = tika_client.metadata.from_file(test_file, magic.from_file(str(test_file), mime=True))
resp = tika_client.metadata.from_file(sample_png_file, magic.from_file(str(sample_png_file), mime=True))

assert resp.type == "image/png"
Loading

0 comments on commit 3a44733

Please sign in to comment.