Skip to content

Commit

Permalink
Further conversions
Browse files Browse the repository at this point in the history
  • Loading branch information
Trenton Holmes committed Oct 8, 2024
1 parent 9f8ecff commit ff3836f
Show file tree
Hide file tree
Showing 7 changed files with 72 additions and 37 deletions.
3 changes: 0 additions & 3 deletions tests/conftest.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,12 @@
import logging
import os
from pathlib import Path
from typing import Final
from typing import Generator

import pytest

from tika_client.client import TikaClient

SAMPLE_DIR: Final[Path] = Path(__file__).parent.resolve() / "samples"


@pytest.fixture(scope="session")
def tika_host() -> str:
Expand Down
80 changes: 61 additions & 19 deletions tests/test_datetime_formats.py
Original file line number Diff line number Diff line change
@@ -1,29 +1,36 @@
from datetime import datetime
from datetime import timedelta
from datetime import timezone
from pathlib import Path

import magic
import pytest
from pytest_httpx import HTTPXMock

from tests.conftest import SAMPLE_DIR
from tika_client.client import TikaClient
from tika_client.data_models import DublinCoreKey
from tika_client.data_models import TikaKey


class TestDateTimeFormat:
def test_parse_offset_date_format_utc(self, tika_client: TikaClient, httpx_mock: HTTPXMock):
def test_parse_offset_date_format_utc(
self,
tika_client: TikaClient,
sample_libre_office_writer_file: Path,
httpx_mock: HTTPXMock,
):
"""
Test the datetime parsing properly handles a time with a UTC timezone in the +xx:yy format
"""
test_file = SAMPLE_DIR / "sample-libre-office.odt"

httpx_mock.add_response(
json={TikaKey.ContentType: "test", TikaKey.Parsers: [], DublinCoreKey.Created: "2023-05-17T16:30:44+00:00"},
)

resp = tika_client.metadata.from_file(test_file, magic.from_file(str(test_file), mime=True))
resp = tika_client.metadata.from_file(
sample_libre_office_writer_file,
magic.from_file(str(sample_libre_office_writer_file), mime=True),
)

assert resp.created == datetime(
year=2023,
Expand All @@ -35,17 +42,24 @@ def test_parse_offset_date_format_utc(self, tika_client: TikaClient, httpx_mock:
tzinfo=timezone.utc,
)

def test_parse_offset_date_format_zulu(self, tika_client: TikaClient, httpx_mock: HTTPXMock):
def test_parse_offset_date_format_zulu(
self,
tika_client: TikaClient,
sample_libre_office_writer_file: Path,
httpx_mock: HTTPXMock,
):
"""
Test the datetime parsing properly handles a time with a UTC timezone in the Z format
"""
test_file = SAMPLE_DIR / "sample-libre-office.odt"

httpx_mock.add_response(
json={TikaKey.ContentType: "test", TikaKey.Parsers: [], DublinCoreKey.Created: "2023-01-17T16:35:44Z"},
)

resp = tika_client.metadata.from_file(test_file, magic.from_file(str(test_file), mime=True))
resp = tika_client.metadata.from_file(
sample_libre_office_writer_file,
magic.from_file(str(sample_libre_office_writer_file), mime=True),
)

assert resp.created == datetime(
year=2023,
Expand All @@ -57,34 +71,48 @@ def test_parse_offset_date_format_zulu(self, tika_client: TikaClient, httpx_mock
tzinfo=timezone.utc,
)

def test_parse_offset_date_format_positive(self, tika_client: TikaClient, httpx_mock: HTTPXMock):
def test_parse_offset_date_format_positive(
self,
tika_client: TikaClient,
sample_libre_office_writer_file: Path,
httpx_mock: HTTPXMock,
):
"""
Test the datetime parsing properly handles a time with a timezone in the +xx:yy format offset from UTC
"""
test_file = SAMPLE_DIR / "sample-libre-office.odt"

httpx_mock.add_response(
json={TikaKey.ContentType: "test", TikaKey.Parsers: [], DublinCoreKey.Created: "2023-06-17T16:30:44+08:00"},
)

resp = tika_client.metadata.from_file(test_file, magic.from_file(str(test_file), mime=True))
resp = tika_client.metadata.from_file(
sample_libre_office_writer_file,
magic.from_file(str(sample_libre_office_writer_file), mime=True),
)

assert resp.created == pytest.approx(
datetime(year=2023, month=6, day=17, hour=16, minute=30, second=44, tzinfo=timezone(timedelta(hours=8))),
rel=timedelta(seconds=1),
)

def test_parse_offset_date_format_negative(self, tika_client: TikaClient, httpx_mock: HTTPXMock):
def test_parse_offset_date_format_negative(
self,
tika_client: TikaClient,
sample_libre_office_writer_file: Path,
httpx_mock: HTTPXMock,
):
"""
Test the datetime parsing properly handles a time with a timezone in the -xx:yy format offset from UTC
"""
test_file = SAMPLE_DIR / "sample-libre-office.odt"

httpx_mock.add_response(
json={TikaKey.ContentType: "test", TikaKey.Parsers: [], DublinCoreKey.Created: "2023-06-17T16:30:44-08:00"},
)

resp = tika_client.metadata.from_file(test_file, magic.from_file(str(test_file), mime=True))
resp = tika_client.metadata.from_file(
sample_libre_office_writer_file,
magic.from_file(str(sample_libre_office_writer_file), mime=True),
)

assert resp.created == pytest.approx(
datetime(
Expand All @@ -99,32 +127,46 @@ def test_parse_offset_date_format_negative(self, tika_client: TikaClient, httpx_
rel=timedelta(seconds=1),
)

def test_parse_offset_date_format_python_isoformat(self, tika_client: TikaClient, httpx_mock: HTTPXMock):
def test_parse_offset_date_format_python_isoformat(
self,
tika_client: TikaClient,
sample_libre_office_writer_file: Path,
httpx_mock: HTTPXMock,
):
"""
Test the datetime parsing properly handles a time with a timezone in the ISO 8601 format (as produced by Python's datetime.isoformat())
"""
test_file = SAMPLE_DIR / "sample-libre-office.odt"

expected = datetime.now(tz=timezone.utc)

httpx_mock.add_response(
json={TikaKey.ContentType: "test", TikaKey.Parsers: [], DublinCoreKey.Created: expected.isoformat()},
)

resp = tika_client.metadata.from_file(test_file, magic.from_file(str(test_file), mime=True))
resp = tika_client.metadata.from_file(
sample_libre_office_writer_file,
magic.from_file(str(sample_libre_office_writer_file), mime=True),
)

assert resp.created == pytest.approx(expected, rel=timedelta(seconds=1))

def test_parse_offset_date_no_match(self, tika_client: TikaClient, httpx_mock: HTTPXMock):
def test_parse_offset_date_no_match(
self,
tika_client: TikaClient,
sample_libre_office_writer_file: Path,
httpx_mock: HTTPXMock,
):
"""
Test the datetime parsing properly handles a time string which doesn't match the correct formats
"""
test_file = SAMPLE_DIR / "sample-libre-office.odt"

httpx_mock.add_response(
json={TikaKey.ContentType: "test", TikaKey.Parsers: [], DublinCoreKey.Created: "202-06-17T16:30:44-0"},
)

resp = tika_client.metadata.from_file(test_file, magic.from_file(str(test_file), mime=True))
resp = tika_client.metadata.from_file(
sample_libre_office_writer_file,
magic.from_file(str(sample_libre_office_writer_file), mime=True),
)

assert resp.created is None
9 changes: 5 additions & 4 deletions tests/test_file_formats.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,19 @@
from datetime import datetime
from pathlib import Path

import magic

from tests.conftest import SAMPLE_DIR
from tika_client.client import TikaClient


class TestLibreOfficeFormats:
def test_parse_libre_office_writer_document(self, tika_client: TikaClient):
def test_parse_libre_office_writer_document(self, tika_client: TikaClient, sample_libre_office_writer_file: Path):
"""
Test handling of an ODT document produced by LibreOffice
"""
test_file = SAMPLE_DIR / "sample-libre-office.odt"
resp = tika_client.tika.as_html.from_file(test_file, magic.from_file(str(test_file), mime=True))
resp = tika_client.tika.as_html.from_file(
sample_libre_office_writer_file, magic.from_file(str(sample_libre_office_writer_file), mime=True),
)

assert resp.type == "application/vnd.oasis.opendocument.text"
assert resp.content is not None
Expand Down
13 changes: 6 additions & 7 deletions tests/test_image_files.py
Original file line number Diff line number Diff line change
@@ -1,24 +1,23 @@
from pathlib import Path

import magic

from tests.conftest import SAMPLE_DIR
from tika_client.client import TikaClient


class TestParseImageMetadata:
def test_image_jpeg(self, tika_client: TikaClient):
def test_image_jpeg(self, tika_client: TikaClient, sample_jpeg_file: Path):
"""
Test metadata retrieval for a JPEG file
"""
test_file = SAMPLE_DIR / "sample.jpg"
resp = tika_client.metadata.from_file(test_file, magic.from_file(str(test_file), mime=True))
resp = tika_client.metadata.from_file(sample_jpeg_file, magic.from_file(str(sample_jpeg_file), mime=True))

assert resp.type == "image/jpeg"

def test_image_png(self, tika_client: TikaClient):
def test_image_png(self, tika_client: TikaClient, sample_png_file: Path):
"""
Test metadata retrieval for a PNG file
"""
test_file = SAMPLE_DIR / "sample.png"
resp = tika_client.metadata.from_file(test_file, magic.from_file(str(test_file), mime=True))
resp = tika_client.metadata.from_file(sample_png_file, magic.from_file(str(sample_png_file), mime=True))

assert resp.type == "image/png"
2 changes: 0 additions & 2 deletions tests/test_resource_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,6 @@
import pytest
from pytest_httpx import HTTPXMock

from tests.conftest import SAMPLE_DIR
from tests.conftest import TIKA_URL
from tika_client.client import TikaClient


Expand Down
1 change: 0 additions & 1 deletion tests/test_resource_recursive_metadata.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
import magic

from tests.conftest import SAMPLE_DIR
from tika_client.client import TikaClient


Expand Down
1 change: 0 additions & 1 deletion tests/test_resource_tika.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@

import magic

from tests.conftest import SAMPLE_DIR
from tika_client.client import TikaClient


Expand Down

0 comments on commit ff3836f

Please sign in to comment.