Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement Reader Performance Queries for /datafiles and /datasets #488

Merged
merged 18 commits into from
Oct 8, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
18 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions .github/workflows/ci-build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,10 @@ jobs:
- name: Create search_api_mapping.json
run: cp datagateway_api/search_api_mapping.json.example datagateway_api/search_api_mapping.json

# See comment in noxfile.py for explanation regarding this step
- name: Downgrade setuptools
run: poetry run pip install --upgrade setuptools==70.0.0
if: matrix.python-version == '3.8' || matrix.python-version == '3.9' || matrix.python-version == '3.10'
- name: Install dependencies
run: poetry install

Expand Down Expand Up @@ -284,6 +288,9 @@ jobs:
- name: Install Requests
run: pip install 'requests<2.30'

# See comment in noxfile.py for explanation regarding this step
- name: Downgrade setuptools
run: poetry run pip install --upgrade setuptools==70.0.0
- name: Install dependencies
run: poetry install

Expand Down Expand Up @@ -339,6 +346,9 @@ jobs:
- name: Create search_api_mapping.json
run: cd /home/runner/work/datagateway-api/datagateway-api; cp datagateway_api/search_api_mapping.json.example datagateway_api/search_api_mapping.json

# See comment in noxfile.py for explanation regarding this step
- name: Downgrade setuptools
run: poetry run pip install --upgrade setuptools==70.0.0
- name: Install dependencies
run: poetry install

Expand Down
5 changes: 5 additions & 0 deletions datagateway_api/config.yaml.example
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,11 @@ datagateway_api:
db_url: "mysql+pymysql://icatdbuser:icatdbuserpw@localhost:3306/icatdb"
icat_url: "https://localhost:8181"
icat_check_cert: false
use_reader_for_performance:
enabled: false
reader_mechanism: simple
reader_username: reader
reader_password: readerpw
search_api:
extension: "/search-api"
icat_url: "https://localhost:8181"
Expand Down
8 changes: 8 additions & 0 deletions datagateway_api/src/common/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,13 @@ def validate_extension(extension):
return extension


class UseReaderForPerformance(BaseModel):
    """
    Configuration model for the `use_reader_for_performance` section of the
    DataGateway API config. It holds the on/off switch for reader performance queries
    and the credentials used to log in as the reader account. All fields use strict
    pydantic types, so values are not coerced from other types.
    """

    # Whether eligible queries should be executed using the reader account
    enabled: StrictBool
    # Authentication mechanism used when logging in as the reader (e.g. "simple")
    reader_mechanism: StrictStr
    # Login credentials for the reader account
    reader_username: StrictStr
    reader_password: StrictStr


class DataGatewayAPI(BaseModel):
"""
Configuration model class that implements pydantic's BaseModel class to allow for
Expand All @@ -54,6 +61,7 @@ class DataGatewayAPI(BaseModel):
extension: StrictStr
icat_check_cert: Optional[StrictBool]
icat_url: Optional[StrictStr]
use_reader_for_performance: Optional[UseReaderForPerformance]

_validate_extension = validator("extension", allow_reuse=True)(validate_extension)

Expand Down
3 changes: 3 additions & 0 deletions datagateway_api/src/common/filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,9 @@ def __init__(self, field, value, operation):
"must contain two values e.g. [1, 2]",
)

def __repr__(self):
    """
    Describe the filter by its field, operation and value (in that order), which is
    useful when a filter object is written to the logs
    """
    template = "Field: {}, Operation: {}, Value: {}"
    return template.format(self.field, self.operation, self.value)


class DistinctFieldFilter(QueryFilter):
precedence = 0
Expand Down
97 changes: 84 additions & 13 deletions datagateway_api/src/datagateway_api/icat/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
ICATValidationError,
)

from datagateway_api.src.common.config import Config
from datagateway_api.src.common.date_handler import DateHandler
from datagateway_api.src.common.exceptions import (
AuthenticationError,
Expand All @@ -27,6 +28,9 @@
)
from datagateway_api.src.datagateway_api.icat.lru_cache import ExtendedLRUCache
from datagateway_api.src.datagateway_api.icat.query import ICATQuery
from datagateway_api.src.datagateway_api.icat.reader_query_handler import (
ReaderQueryHandler,
)

log = logging.getLogger()

Expand Down Expand Up @@ -298,15 +302,7 @@ def get_entity_with_filters(client, entity_type, filters):
result of the query
"""
log.info("Getting entity using request's filters")

query = ICATQuery(client, entity_type)

filter_handler = FilterOrderHandler()
filter_handler.manage_icat_filters(filters, query.query)

data = query.execute_query(client, True)

return data
return get_data_with_filters(client, entity_type, filters)


def get_count_with_filters(client, entity_type, filters):
Expand All @@ -329,15 +325,90 @@ def get_count_with_filters(client, entity_type, filters):
entity_type,
)

query = ICATQuery(client, entity_type, aggregate="COUNT")
data = get_data_with_filters(client, entity_type, filters, aggregate="COUNT")
# Only ever 1 element in a count query result
return data[0]


def get_data_with_filters(client, entity_type, filters, aggregate=None):
    """
    Fetch the records of an entity using the request's filters and an optional
    aggregate. Both `get_entity_with_filters()` and `get_count_with_filters()` delegate
    to this function.

    When reader performance queries are enabled in the config and the query is
    eligible (see ReaderQueryHandler), the user's access to the parent entity is
    checked first and the query is then executed as the reader account. In every other
    case the query is executed with the client from the request.

    :param client: ICAT client containing the session of the user making the request
    :param entity_type: The type of entity being queried
    :param filters: The filters from the request to apply to the query
    :param aggregate: Optional aggregate to apply to the query (e.g. "COUNT")
    :return: The results of the executed query
    :raises AuthenticationError: If the query is eligible for the reader account but
        the user cannot see the parent entity they have filtered on
    """
    if not is_use_reader_for_performance_enabled():
        # Feature is switched off, so execute the query as normal
        return execute_entity_query(client, entity_type, filters, aggregate=aggregate)

    handler = ReaderQueryHandler(entity_type, filters)

    if not handler.is_query_eligible_for_reader_performance():
        log.info("Query to be executed as user from request: %s", client.getUserName())
        return execute_entity_query(client, entity_type, filters, aggregate=aggregate)

    log.info("Query is eligible to be passed as reader acount")
    if not handler.is_user_authorised_to_see_entity_id(client):
        raise AuthenticationError(
            "Not authorised to access the"
            f" {ReaderQueryHandler.entity_filter_check[entity_type]}"
            " you have filtered on",
        )

    reader_client = ReaderQueryHandler.reader_client
    log.info("Query to be executed as reader account")
    try:
        return execute_entity_query(
            reader_client, entity_type, filters, aggregate=aggregate,
        )
    except ICATSessionError:
        # The cached reader session is no longer usable, so log in as the reader
        # again and retry the query once
        reader_client = handler.create_reader_client()
        return execute_entity_query(
            reader_client, entity_type, filters, aggregate=aggregate,
        )


def execute_entity_query(client, entity_type, filters, aggregate=None):
    """
    Assemble a query object for the given entity, apply the user's query filters to it
    and execute it (exactly once) by passing it to ICAT, returning the results.

    :param client: ICAT client used to build and execute the query
    :param entity_type: The type of entity to query
    :param filters: The filters from the request, applied using `FilterOrderHandler`
    :param aggregate: Optional aggregate passed to `ICATQuery` (e.g. "COUNT")
    :return: Whatever `ICATQuery.execute_query()` returns for the assembled query
    """

    query = ICATQuery(client, entity_type, aggregate=aggregate)

    filter_handler = FilterOrderHandler()
    filter_handler.manage_icat_filters(filters, query.query)

    log.debug(
        "Query on entity '%s' (aggregate: %s), executed as user: %s",
        entity_type,
        aggregate,
        client.getUserName(),
    )
    # The query is only sent to ICAT here; executing it earlier and discarding the
    # result would double the load for every request
    return query.execute_query(client, True)

def is_use_reader_for_performance_enabled() -> bool:
    """
    Returns True only if the 'use_reader_for_performance' section is present in the
    config file and 'enabled' in that section is set to true. A missing section or a
    disabled flag both give False
    """
    section = Config.config.datagateway_api.use_reader_for_performance
    return bool(section and section.enabled)


def get_first_result_with_filters(client, entity_type, filters):
Expand Down
162 changes: 162 additions & 0 deletions datagateway_api/src/datagateway_api/icat/reader_query_handler.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,162 @@
import logging
from typing import List, Optional

from icat.exception import ICATSessionError

from datagateway_api.src.common.config import Config
from datagateway_api.src.common.exceptions import PythonICATError
from datagateway_api.src.common.filter_order_handler import FilterOrderHandler
from datagateway_api.src.common.filters import QueryFilter
from datagateway_api.src.datagateway_api.icat.filters import PythonICATWhereFilter
from datagateway_api.src.datagateway_api.icat.icat_client_pool import ICATClient
from datagateway_api.src.datagateway_api.icat.query import ICATQuery

log = logging.getLogger()


class ReaderQueryHandler:
    """
    This class handles the mechanism that allows 'performance queries' to occur on
    particular endpoints. These queries are to improve performance on requests that have
    a WHERE filter on the ID of the parent entity where passing the query directly to
    ICAT can cause performance issues. This is due to the complexity of the ICAT rules,
    meaning a relatively simple SQL query is a long paragraph of SQL. The rules are
    bypassed by performing an equivalent check to see if the user can see the parent
    entity by querying for it directly. Once permissions have been verified, the user's
    original query is executed using a configurable reader account.

    On a production instance where this functionality is needed, the reader account will
    have been setup with appropriate ICAT rules to view the entities.

    Example workflow:
    - User sends request to /datafiles with a WHERE filter of dataset.id = 4
    - Query is determined as eligible
    - Dataset query is sent to ICAT with a WHERE filter of id = 4
    - If the appropriate dataset is returned, the user's original query is executed, but
      as the reader account, not the user's account
    - If no dataset is found (i.e. the user doesn't have permission to view the
      dataset) the API responds with an error instead of executing the query
    """

    # Lookup to determine which field to search whether a user has permission to view
    entity_filter_check = {"Datafile": "dataset.id", "Dataset": "investigation.id"}
    # Lookup of the parent entity that is queried to perform the permission check
    parent_entity_lookup = {"Datafile": "Dataset", "Dataset": "Investigation"}
    # Keep a cached reader_client for faster queries. A reader client is created when
    # the first instance of this class is created and is replaced whenever
    # `create_reader_client()` is called again (e.g. by a caller whose query failed
    # because the reader's session is no longer valid)
    reader_client = None

    def __init__(self, entity_type: str, filters: List[QueryFilter]) -> None:
        """
        Store the details of the request, work out whether the query is eligible to be
        executed as the reader account and make sure a cached reader client exists

        :param entity_type: The type of entity being queried (e.g. "Datafile")
        :param filters: The query filters from the request
        :raises PythonICATError: If a reader client needs creating and the reader's
            credentials aren't valid
        """
        self.entity_type = entity_type
        self.filters = filters
        # Populated by `get_where_filter_for_entity_id_check()` when a relevant WHERE
        # filter is found. Declared here so the attribute always exists
        self.where_filter_entity_id = None
        log.debug(
            "Instance of ReaderQueryHandler created for a '%s' request",
            self.entity_type,
        )
        self.reader_query_eligible = self.check_eligibility()
        if not ReaderQueryHandler.reader_client:
            self.create_reader_client()

    def create_reader_client(self) -> ICATClient:
        """
        Create a new client and login using the reader's credentials. The client is
        only assigned as the class variable once the login has succeeded, so a failed
        login never leaves an unauthenticated client in the cache. If the credentials
        aren't valid, a PythonICATError is raised (resulting in a 500). The client
        object is returned

        :return: The newly created (and logged in) reader client
        :raises PythonICATError: If ICAT rejects the reader's credentials
        """

        log.info("Creating reader_client")
        reader_config = Config.config.datagateway_api.use_reader_for_performance
        login_credentials = {
            "username": reader_config.reader_username,
            "password": reader_config.reader_password,
        }
        new_client = ICATClient("datagateway_api")
        try:
            new_client.login(reader_config.reader_mechanism, login_credentials)
        except ICATSessionError as e:
            log.error("User credentials for reader account aren't valid")
            # Chain the original exception so the underlying cause isn't lost
            raise PythonICATError(
                "Internal error with reader account configuration",
            ) from e

        ReaderQueryHandler.reader_client = new_client
        return new_client

    def check_eligibility(self) -> bool:
        """
        This function checks whether the input query can be executed as a 'reader
        performance query'. The entity of the query needs to be in `entity_filter_check`
        and an appropriate WHERE filter needs to be found
        (using `get_where_filter_for_entity_id_check()`)

        :return: True if the query can go via the reader account, False otherwise
        """
        log.info("Checking whether query is eligible to go via reader account")
        # Entities missing from `entity_filter_check` never yield a WHERE filter, so
        # this single check covers both eligibility conditions
        return self.get_where_filter_for_entity_id_check() is not None

    def is_query_eligible_for_reader_performance(self) -> bool:
        """
        Getter that returns a boolean regarding query eligibility (as calculated when
        the instance was created)
        """
        return self.reader_query_eligible

    def get_where_filter_for_entity_id_check(self) -> Optional[PythonICATWhereFilter]:
        """
        Iterate through the instance's query filters and return the first WHERE filter
        for a relevant parent entity (e.g. dataset.id or investigation.id). The WHERE
        filter must use the 'eq' operator. When a filter is found, its value is stored
        in `self.where_filter_entity_id` for use in the permission check

        :return: The matching WHERE filter, or None if there isn't one (including when
            the entity type isn't supported for reader performance queries)
        """

        id_field = ReaderQueryHandler.entity_filter_check.get(self.entity_type)
        if id_field is None:
            return None

        for query_filter in self.filters:
            if (
                isinstance(query_filter, PythonICATWhereFilter)
                and query_filter.field == id_field
                and query_filter.operation == "eq"
            ):
                log.debug(
                    "WHERE filter relevant for reader query checking: %s", query_filter,
                )
                self.where_filter_entity_id = query_filter.value
                return query_filter

        return None

    def is_user_authorised_to_see_entity_id(self, client) -> bool:
        """
        This function checks whether the user is authorised to see a parent entity (e.g.
        if they query /datafiles, whether they can see a particular dataset). A query is
        created and sent to ICAT for execution - the query is performed using the
        client provided (i.e. the user's session from the request). This should only
        be called for queries that have been found eligible, as it relies on the ID
        found by `get_where_filter_for_entity_id_check()`

        :param client: ICAT client containing the user's session
        :return: True if the parent entity is returned to the user, False otherwise
        """

        id_field = ReaderQueryHandler.entity_filter_check[self.entity_type]
        log.info(
            "Checking to see if user '%s' can see '%s' = %s",
            client.getUserName(),
            id_field,
            self.where_filter_entity_id,
        )
        access_query = ICATQuery(
            client, ReaderQueryHandler.parent_entity_lookup[self.entity_type],
        )
        id_check = PythonICATWhereFilter("id", self.where_filter_entity_id, "eq")
        access_filter_handler = FilterOrderHandler()
        access_filter_handler.manage_icat_filters([id_check], access_query.query)
        results = access_query.execute_query(client)

        if results:
            log.debug(
                "User is authorised to see '%s' '%s'",
                id_field,
                self.where_filter_entity_id,
            )
            return True

        # An empty result means ICAT's rules hid the parent entity from this user
        log.debug(
            "User is NOT authorised to see '%s' '%s'",
            id_field,
            self.where_filter_entity_id,
        )
        return False
Loading
Loading