Skip to content

Commit

Permalink
incorporating use_at_id setting for GeoNetwork metadata
Browse files Browse the repository at this point in the history
  • Loading branch information
iannesbitt committed Jan 22, 2025
1 parent 1285c95 commit 4340273
Show file tree
Hide file tree
Showing 2 changed files with 35 additions and 2 deletions.
33 changes: 32 additions & 1 deletion soscan/sonormalizepipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import sonormal.normalize
import json
import opersist.rdfutils
from pathlib import Path

def consolidate_list(l: list, sep: str=', '):
"""
Expand Down Expand Up @@ -37,8 +38,25 @@ class SoscanNormalizePipeline:
4. Get the identifier from the framed JSONLD
"""

def __init__(self):
def __init__(self, **kwargs):
    """
    Set up the pipeline logger and the ``use_at_id`` switch.

    Args:
        **kwargs: Optional settings. Only ``use_at_id`` is read; its value is
            stored unchanged on the instance and defaults to ``False`` when
            the key is not supplied.
    """
    # A missing key falls back to False, i.e. @id values are not used as identifiers.
    self.use_at_id = kwargs.get('use_at_id', False)
    self.logger = logging.getLogger("SoscanNormalize")


@classmethod
def from_crawler(cls, crawler, *args, **kwargs):
    """
    Build the pipeline from a crawler, applying a per-node ``use_at_id`` override.

    Reads ``settings.json`` from the folder named by the crawler's
    ``STORE_PATH`` setting. If that file holds a JSON object with a
    ``use_at_id`` key, its value replaces any ``use_at_id`` passed in
    ``kwargs``.

    Args:
        crawler: Object whose ``settings.get`` returns the ``STORE_PATH`` value.
        *args: Accepted for signature compatibility; not forwarded.
        **kwargs: Keyword arguments forwarded to the class constructor.

    Returns:
        A new instance created with ``cls(**kwargs)``.

    Raises:
        json.JSONDecodeError: If ``settings.json`` exists but is not valid JSON.
    """
    node_path = crawler.settings.get("STORE_PATH", None)
    # Without a store path there is no node folder to consult. Skipping here
    # avoids probing a bogus relative "None/settings.json" path.
    if node_path:
        mn_settings = Path(f'{node_path}/settings.json')
        if mn_settings.exists():
            with open(mn_settings, encoding='utf-8') as cs:
                _cs = json.load(cs)
            # Only a JSON object can carry the override; look the key up directly.
            if isinstance(_cs, dict) and 'use_at_id' in _cs:
                kwargs['use_at_id'] = _cs['use_at_id']
    return cls(**kwargs)


def process_item(self, item, spider):
self.logger.debug("process_item: %s", item["url"])
Expand Down Expand Up @@ -139,6 +157,19 @@ def process_item(self, item, spider):
for group in ids:
g += 1
self.logger.debug(f'Dataset grouping {g}: {group}')
if self.use_at_id:
# if there is no identifier and use_at_id is True, use the @id value as the Dataset identifier
# append other @id values to alt_identifiers
# This is a last resort measure and should be avoided if possible!
# it is needed for repositories that use GeoNetwork software which does not provide identifiers (as of Jan 2025)
if item["series_id"] is None:
item["series_id"] = group["@id"]
self.logger.info(f'Using @id {g} for series_id: {item["series_id"]}')
if len(group["@id"]) > 1:
item["alt_identifiers"].append(group["@id"][1:])
else:
self.logger.info(f'series_id already set: {item["series_id"]}. Appending @id value to alt_identifiers: {group["@id"]}')
item["alt_identifiers"].append(group["@id"])
if len(group["identifier"]) > 0:
if item["series_id"] is None:
item["series_id"] = group["identifier"][0]
Expand Down
4 changes: 3 additions & 1 deletion soscan/spiders/jsonldspider.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ def from_crawler(cls, crawler, *args, **kwargs):
mn_settings = Path(f'{node_path}/settings.json')
if mn_settings.exists():
with open(mn_settings) as cs:
_cs = json.loads(cs.read())
_cs: dict = json.loads(cs.read())
for s in _cs:
spider.settings.set(s, _cs[s], priority='spider')
spider.logger.info(f'Setting override from {mn_settings}: set {s} to {_cs[s]}')
Expand All @@ -109,6 +109,8 @@ def from_crawler(cls, crawler, *args, **kwargs):
spider.reversed = _cs.get(s, None)
if s in "which_jsonld":
spider.which_jsonld = _cs.get(s, None)
if s in "use_at_id":
spider.use_at_id = _cs.get(s, None)
return spider

def sitemap_filter(self, entries):
Expand Down

1 comment on commit 4340273

@iannesbitt
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

related to #68

Please sign in to comment.