Skip to content

Commit

Permalink
Merge pull request #268 from semantic-systems/integrate-gesis-kg
Browse files Browse the repository at this point in the history
Integrate gesis kg
  • Loading branch information
abdullah-rana authored Dec 26, 2024
2 parents 24981b7 + f77235b commit 332a925
Show file tree
Hide file tree
Showing 3 changed files with 237 additions and 21 deletions.
50 changes: 29 additions & 21 deletions config.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ class Config:
IEEE_API_KEY = os.environ.get("IEEE_API_KEY", "")

REQUEST_HEADER_USER_AGENT = "nfdi4dsBot/1.0 (https://www.nfdi4datascience.de/nfdi4dsBot/; [email protected])"
REQUEST_TIMEOUT = 5
REQUEST_TIMEOUT = 100

NUMBER_OF_RECORDS_TO_SHOW_ON_PAGE_LOAD = 20
NUMBER_OF_RECORDS_TO_APPEND_ON_LAZY_LOAD = 10
Expand All @@ -34,95 +34,103 @@ class Config:
# "module": "dblp_researchers",
# "search-endpoint": f"https://dblp.org/search/author/api?format=json&h={NUMBER_OF_RECORDS_FOR_SEARCH_ENDPOINT}&q=",
# },
"GESIS KG": {
"module": "gesis_kg_publication",
"search-endpoint": f"https://data.gesis.org/gesiskg/sparql?default-graph-uri=&query=",
},
"GESIS KG - Dataset": {
"module": "gesis_kg_dataset",
"search-endpoint": f"https://data.gesis.org/gesiskg/sparql?default-graph-uri=&query=",
},
"OPENALEX - Publications": {
"module": "openalex_publications",
"module": "openalex_publications",
"search-endpoint": f"https://api.openalex.org/works?page=1&per-page={NUMBER_OF_RECORDS_FOR_SEARCH_ENDPOINT}&search=",
"get-publication-endpoint": "https://api.openalex.org/works/",
"get-researcher-publications-endpoint": "https://api.openalex.org/works?filter=author.id:",
},
"OPENALEX - Researchers": {
"module": "openalex_researchers",
"module": "openalex_researchers",
"search-endpoint": f"https://api.openalex.org/authors?page=1&per-page={NUMBER_OF_RECORDS_FOR_SEARCH_ENDPOINT}&search=",
"get-researcher-endpoint": "https://api.openalex.org/authors/",
"get-researcher-publications-endpoint": "https://api.openalex.org/works?filter=author.id:",
},
"ZENODO": {
"module": "zenodo",
"module": "zenodo",
"search-endpoint": f"https://zenodo.org/api/records?size={NUMBER_OF_RECORDS_FOR_SEARCH_ENDPOINT}&q=",
"get-publication-endpoint": f"https://zenodo.org/api/records/",
},
"WIKIDATA - Publications": {
"module": "wikidata_publications",
"module": "wikidata_publications",
"search-endpoint": f"https://query.wikidata.org/sparql?format=json&query=",
},
"WIKIDATA - Researchers": {
"module": "wikidata_researchers",
"module": "wikidata_researchers",
"search-endpoint": f"https://query.wikidata.org/sparql?format=json&query=",
},
# "resodate": {
# "module": "resodate",
# "module": "resodate",
# "search-endpoint": f"https://resodate.org/resources/api/search/oer_data/_search?pretty&size={NUMBER_OF_RECORDS_FOR_SEARCH_ENDPOINT}&q=",
# },
# "OERSI": {
# "module": "oersi",
# "module": "oersi",
# "search-endpoint": f"https://oersi.org/resources/api/search/oer_data/_search?pretty&size={NUMBER_OF_RECORDS_FOR_SEARCH_ENDPOINT}&q=",
# },
"IEEE": {
"module": "ieee",
"module": "ieee",
"search-endpoint": f"http://ieeexploreapi.ieee.org/api/v1/search/articles?apikey={IEEE_API_KEY}&max_records={NUMBER_OF_RECORDS_FOR_SEARCH_ENDPOINT}&querytext=",
"get-publication-endpoint": f"http://ieeexploreapi.ieee.org/api/v1/search/articles?apikey={IEEE_API_KEY}&doi=",
},
# "EUDAT": {
# "module": "eudat",
# "module": "eudat",
# "search-endpoint": f"https://b2share.eudat.eu/api/records/?page=1&size={NUMBER_OF_RECORDS_FOR_SEARCH_ENDPOINT}&sort=bestmatch&q=",
# "record-base-url": f"https://b2share.eudat.eu/records/",
# },
"OPENAIRE - Products": {
"module": "openaire_products",
"module": "openaire_products",
"search-endpoint": f"https://api.openaire.eu/search/researchProducts?format=json&size={NUMBER_OF_RECORDS_FOR_SEARCH_ENDPOINT}&keywords=",
"get-publication-endpoint": f"https://api.openaire.eu/search/researchProducts?format=json&doi=",
},
"OPENAIRE - Projects": {
"module": "openaire_projects",
"module": "openaire_projects",
"search-endpoint": f"https://api.openaire.eu/search/projects?format=json&size={NUMBER_OF_RECORDS_FOR_SEARCH_ENDPOINT}&keywords=",
},
"ORCID": {
"module": "orcid",
"module": "orcid",
"search-endpoint": f"https://pub.orcid.org/v3.0/expanded-search/?start=0&rows={NUMBER_OF_RECORDS_FOR_SEARCH_ENDPOINT}&q=",
},
"GESIS": {
"module": "gesis",
"module": "gesis",
"search-endpoint": f"http://193.175.238.35:8089/dc/_search?size={NUMBER_OF_RECORDS_FOR_SEARCH_ENDPOINT}&q=",
},
# "CORDIS": {
# "module": "cordis",
# "module": "cordis",
# "search-endpoint": f"https://cordis.europa.eu/search?p=1&num={NUMBER_OF_RECORDS_FOR_SEARCH_ENDPOINT}&srt=Relevance:decreasing&format=json&q=contenttype='project'%20AND%20",
# },
# "ORKG": {
# "module": "orkg",
# "module": "orkg",
# "search-endpoint": f"https://orkg.org/api/resources/?size={NUMBER_OF_RECORDS_FOR_SEARCH_ENDPOINT}&q=",
# },
# "gepris": {
# "module": "gepris",
# "module": "gepris",
# "search-endpoint": f"https://gepris.dfg.de/gepris/OCTOPUS?context=projekt&hitsPerPage=1&index=0&language=en&task=doSearchSimple&keywords_criterion=",
# },
"CROSSREF - Publications": {
"module": "crossref_publications",
"module": "crossref_publications",
"search-endpoint": f"https://api.crossref.org/works?rows={NUMBER_OF_RECORDS_FOR_SEARCH_ENDPOINT}&query=",
"get-publication-endpoint": "https://api.crossref.org/works/",
"get-publication-references-endpoint": "https://api.crossref.org/works/",
},
"SEMANTIC SCHOLAR - Publications": {
"module": "semanticscholar_publications",
"module": "semanticscholar_publications",
# "search-endpoint": f"",
# "get-publication-endpoint": "https://api.semanticscholar.org/graph/v1/paper/",
"citations-endpoint": f"https://api.semanticscholar.org/graph/v1/paper/",
"recommendations-endpoint": f"https://api.semanticscholar.org/recommendations/v1/papers/forpaper/",
},
"SEMANTIC SCHOLAR - Researchers": {
"module": "semanticscholar_researchers",
"module": "semanticscholar_researchers",
# "search-endpoint": f"",
# "get-researcher-endpoint": f"https://api.semanticscholar.org/graph/v1/author/",
# "get-researcher-endpoint": f"https://api.semanticscholar.org/graph/v1/author/",
},
"RE3DATA": {
"module": "re3data",
Expand Down
111 changes: 111 additions & 0 deletions sources/gesis_kg_dataset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
from objects import thing, Article, Author, Dataset
from sources import data_retriever
import utils
from main import app
from string import Template


def _escape_sparql_literal(text: str) -> str:
    """Escape *text* so it is safe inside a double-quoted SPARQL string literal."""
    # Backslash must be escaped first, otherwise the backslashes introduced
    # for the quotes would themselves be doubled.
    return text.replace("\\", "\\\\").replace('"', '\\"')


def _binding_value(hit: dict, key: str) -> str:
    """Return the ``value`` of SPARQL binding *key* in *hit*, or ``""`` if unbound."""
    return hit.get(key, {}).get("value", "")


def _split_values(joined: str, separator: str) -> list:
    """Split a GROUP_CONCAT string into stripped, non-empty, order-preserving unique parts."""
    stripped = (part.strip() for part in joined.split(separator))
    return list(dict.fromkeys(part for part in stripped if part))


@utils.handle_exceptions
def search(source: str, search_term: str, results, failed_sources):
    """Search the GESIS Knowledge Graph for datasets whose title contains *search_term*.

    A SPARQL query is built from the search term, sent through
    ``data_retriever.retrieve_data`` to the ``search-endpoint`` configured for
    *source*, and every returned binding is converted into a ``Dataset`` that
    is appended to ``results['resources']``.

    Args:
        source: Key of the data source in ``app.config['DATA_SOURCES']``.
        search_term: Free-text term matched (via SPARQL ``CONTAINS``) against
            the dataset title.
        results: Mapping of result buckets; this function appends to the
            ``'resources'`` list.
        failed_sources: Passed through unchanged to
            ``data_retriever.retrieve_data``.
    """
    # NOTE: the whitespace normalisation applied to the rendered query below
    # also collapses the "\n " separators in this template to a single space.
    # NOTE(review): the prefix named ``rdfs`` is bound to the RDF syntax
    # namespace, so ``rdfs:type`` resolves to ``rdf:type``.
    query_template = Template('''
        PREFIX schema:<https://schema.org/>
        PREFIX rdfs:<http://www.w3.org/1999/02/22-rdf-syntax-ns#>
        SELECT ?dataset ?title ?doi ?datePublished ?license ?version ?publisher ?dateModified ?dateCreated
        (GROUP_CONCAT(DISTINCT ?contributor_name; SEPARATOR="; ") AS ?contributors)
        (GROUP_CONCAT(DISTINCT ?author_name; SEPARATOR="; ") AS ?authors)
        (GROUP_CONCAT(DISTINCT ?provider; SEPARATOR="\n ") AS ?providers)
        (GROUP_CONCAT(DISTINCT ?inLanguage; SEPARATOR="; ") AS ?languages)
        (GROUP_CONCAT(DISTINCT ?sourceInfo; SEPARATOR="\n ") AS ?sourceInfos)
        (GROUP_CONCAT(DISTINCT ?category; SEPARATOR="; ") AS ?categories)
        (GROUP_CONCAT(DISTINCT ?abstract; SEPARATOR="\n ") AS ?abstracts)
        (GROUP_CONCAT(DISTINCT ?comment; SEPARATOR="\n ") AS ?comments)
        (GROUP_CONCAT(DISTINCT ?conditionsOfAccess; SEPARATOR="\n ") AS ?conditionsOfAccesses)
        (GROUP_CONCAT(DISTINCT ?spatialCoverage_name; SEPARATOR="\n ") AS ?spatialCoverages)
        WHERE {
        ?dataset rdfs:type schema:Dataset .
        ?dataset schema:name ?title . FILTER(CONTAINS(?title, "$search_string"))
        OPTIONAL { ?dataset <https://data.gesis.org/gesiskg/schema/doi> ?doi . }
        OPTIONAL { ?dataset schema:abstract ?abstract . }
        OPTIONAL { ?dataset schema:datePublished ?datePublished . }
        OPTIONAL { ?dataset schema:provider ?provider . }
        OPTIONAL { ?dataset schema:publisher ?publisher . }
        OPTIONAL { ?dataset schema:inLanguage ?inLanguage . }
        OPTIONAL { ?dataset schema:version ?version . }
        OPTIONAL { ?dataset <https://data.gesis.org/gesiskg/schema/category> ?category . }
        OPTIONAL { ?dataset <https://data.gesis.org/gesiskg/schema/sourceInfo> ?sourceInfo . }
        OPTIONAL { ?dataset <https://data.gesis.org/gesiskg/schema/license> ?license . }
        OPTIONAL { ?dataset schema:comment ?comment . }
        OPTIONAL { ?dataset schema:conditionsOfAccess ?conditionsOfAccess . }
        OPTIONAL { ?dataset schema:dateModified ?dateModified .}
        OPTIONAL { ?dataset schema:dateCreated ?dateCreated .}
        OPTIONAL { ?dataset schema:spatialCoverage ?spatialCoverage .
        ?spatialCoverage schema:name ?spatialCoverage_name .}
        OPTIONAL { ?dataset schema:contributor ?contributor .
        ?contributor schema:name ?contributor_name .}
        OPTIONAL { ?dataset schema:author ?author .
        ?author schema:name ?author_name . }
        }
        GROUP BY ?dataset ?title ?doi ?datePublished ?license ?version ?publisher ?dateModified ?dateCreated
        LIMIT $number_of_records
    ''')

    replacement_dict = {
        # The term lands inside a double-quoted SPARQL literal, so quotes and
        # backslashes coming from the user must be escaped.
        "search_string": _escape_sparql_literal(search_term),
        "number_of_records": app.config['NUMBER_OF_RECORDS_FOR_SEARCH_ENDPOINT'],
    }
    query = query_template.substitute(replacement_dict)
    # Collapse the multi-line template into a single-line query string.
    query = ' '.join(query.split())
    search_result = data_retriever.retrieve_data(
        source=source,
        base_url=app.config['DATA_SOURCES'][source].get('search-endpoint', ''),
        search_term=query,
        failed_sources=failed_sources,
    )

    hits = search_result.get("results", {}).get("bindings", [])
    total_hits = len(hits)
    utils.log_event(type="info", message=f"{source} - {total_hits} records matched; pulled top {total_hits}")

    for hit in hits:
        dataset = Dataset()
        dataset.additionalType = "DATASET"
        dataset.identifier = _binding_value(hit, "doi")
        dataset.name = _binding_value(hit, "title")
        dataset.url = _binding_value(hit, "dataset").strip()

        dataset.datePublished = _binding_value(hit, "datePublished")
        dataset.dateCreated = _binding_value(hit, "dateCreated")
        dataset.dateModified = _binding_value(hit, "dateModified")
        dataset.version = _binding_value(hit, "version")
        dataset.license = _binding_value(hit, "license")
        dataset.publisher = _binding_value(hit, "publisher")

        # ?languages is aggregated with "; " in the query, so split on ";".
        dataset.inLanguage.extend(_split_values(_binding_value(hit, "languages"), ";"))
        # dataset.sourceOrganization = hit.get("providers", {}).get("value", "")
        # The query only projects the aggregated ?abstracts variable; there is
        # no ?abstract binding in the result rows.
        dataset.description = _binding_value(hit, "abstracts")
        dataset.publication = _binding_value(hit, "sourceInfos")

        # Authors and contributors are both "; "-joined name lists; merge them
        # into one de-duplicated list of clean names.
        combined_names = _binding_value(hit, "authors") + ";" + _binding_value(hit, "contributors")
        for person_name in _split_values(combined_names, ";"):
            _author = Author()
            _author.type = 'Person'
            _author.name = person_name
            _author.identifier = ""  # ORCID is available for few; we need to update the sparql query to pull this information
            dataset.author.append(_author)

        _source = thing()
        _source.name = 'GESIS KG - Dataset'
        _source.originalSource = dataset.publisher
        _source.identifier = dataset.identifier
        _source.url = dataset.url
        dataset.source.append(_source)

        results['resources'].append(dataset)
97 changes: 97 additions & 0 deletions sources/gesis_kg_publication.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
from objects import thing, Article, Author
from sources import data_retriever
import utils
from main import app
from string import Template


def _escape_sparql_literal(text: str) -> str:
    """Escape *text* so it is safe inside a double-quoted SPARQL string literal."""
    # Backslash must be escaped first, otherwise the backslashes introduced
    # for the quotes would themselves be doubled.
    return text.replace("\\", "\\\\").replace('"', '\\"')


def _binding_value(hit: dict, key: str) -> str:
    """Return the ``value`` of SPARQL binding *key* in *hit*, or ``""`` if unbound."""
    return hit.get(key, {}).get("value", "")


def _split_values(joined: str, separator: str) -> list:
    """Split a GROUP_CONCAT string into stripped, non-empty, order-preserving unique parts."""
    stripped = (part.strip() for part in joined.split(separator))
    return list(dict.fromkeys(part for part in stripped if part))


@utils.handle_exceptions
def search(source: str, search_term: str, results, failed_sources):
    """Search the GESIS Knowledge Graph for scholarly articles whose title contains *search_term*.

    A SPARQL query is built from the search term, sent through
    ``data_retriever.retrieve_data`` to the ``search-endpoint`` configured for
    *source*, and every returned binding is converted into an ``Article``.
    Articles with a DOI are appended to ``results['publications']``; those
    without one go to ``results['others']``.

    Args:
        source: Key of the data source in ``app.config['DATA_SOURCES']``.
        search_term: Free-text term matched (via SPARQL ``CONTAINS``) against
            the publication title.
        results: Mapping of result buckets; this function appends to the
            ``'publications'`` and ``'others'`` lists.
        failed_sources: Passed through unchanged to
            ``data_retriever.retrieve_data``.
    """
    # NOTE(review): the prefix named ``rdfs`` is bound to the RDF syntax
    # namespace, so ``rdfs:type`` resolves to ``rdf:type``.
    query_template = Template('''
        PREFIX schema:<https://schema.org/>
        PREFIX rdfs:<http://www.w3.org/1999/02/22-rdf-syntax-ns#>
        SELECT ?publication ?title ?doi ?abstract
        (GROUP_CONCAT(DISTINCT ?linksURN; SEPARATOR=", ") AS ?linksURNs)
        (GROUP_CONCAT(DISTINCT ?url; SEPARATOR=", ") AS ?urls)
        (GROUP_CONCAT(DISTINCT ?datePub; SEPARATOR=", ") AS ?datePublished)
        (GROUP_CONCAT(DISTINCT ?contributor_name; SEPARATOR="; ") AS ?contributors)
        (GROUP_CONCAT(DISTINCT ?author_name; SEPARATOR="; ") AS ?authors)
        (GROUP_CONCAT(DISTINCT ?provider; SEPARATOR=", ") AS ?providers)
        (GROUP_CONCAT(DISTINCT ?inLanguage; SEPARATOR=", ") AS ?languages)
        (GROUP_CONCAT(DISTINCT ?sourceInfo; SEPARATOR=", ") AS ?sourceInfos)
        WHERE {
        ?publication rdfs:type schema:ScholarlyArticle .
        ?publication schema:name ?title . FILTER(CONTAINS(?title, "$search_string"))
        OPTIONAL { ?publication <https://data.gesis.org/gesiskg/schema/doi> ?doi . }
        OPTIONAL { ?publication schema:abstract ?abstract . }
        OPTIONAL { ?publication <https://data.gesis.org/gesiskg/schema/linksURN> ?linksURN . }
        OPTIONAL { ?publication schema:url ?url . }
        OPTIONAL { ?publication schema:datePublished ?datePub . }
        OPTIONAL { ?publication schema:provider ?provider . }
        OPTIONAL { ?publication schema:inLanguage ?inLanguage . }
        OPTIONAL { ?publication <https://data.gesis.org/gesiskg/schema/sourceInfo> ?sourceInfo . }
        OPTIONAL { ?publication schema:contributor ?contributor .
        ?contributor schema:name ?contributor_name .}
        OPTIONAL { ?publication schema:author ?author .
        ?author schema:name ?author_name . }
        }
        GROUP BY ?publication ?title ?doi ?abstract
        LIMIT $number_of_records
    ''')

    replacement_dict = {
        # The term lands inside a double-quoted SPARQL literal, so quotes and
        # backslashes coming from the user must be escaped.
        "search_string": _escape_sparql_literal(search_term),
        "number_of_records": app.config['NUMBER_OF_RECORDS_FOR_SEARCH_ENDPOINT'],
    }
    query = query_template.substitute(replacement_dict)
    # Collapse the multi-line template into a single-line query string.
    query = ' '.join(query.split())
    search_result = data_retriever.retrieve_data(
        source=source,
        base_url=app.config['DATA_SOURCES'][source].get('search-endpoint', ''),
        search_term=query,
        failed_sources=failed_sources,
    )

    hits = search_result.get("results", {}).get("bindings", [])
    total_hits = len(hits)
    utils.log_event(type="info", message=f"{source} - {total_hits} records matched; pulled top {total_hits}")

    for hit in hits:
        publication = Article()
        publication.identifier = _binding_value(hit, "doi")
        publication.name = _binding_value(hit, "title")
        # NOTE(review): ?urls is a ", "-joined aggregate, so this may hold
        # several URLs in one string - confirm how consumers use it.
        publication.url = _binding_value(hit, "urls").strip()

        # publication.identifier = hit.get("linksURNs", {}).get("value", "")  # DOI is available for few; we need to update the sparql query to fetch this information
        publication.description = _binding_value(hit, "abstract")
        publication.datePublished = _binding_value(hit, "datePublished")
        # ?languages is aggregated with ", " in the query, so split on ",".
        publication.inLanguage.extend(_split_values(_binding_value(hit, "languages"), ","))
        # publication.sourceOrganization = hit.get("providers", {}).get("value", "")
        publication.publisher = _binding_value(hit, "sourceInfos")

        # Authors and contributors are both "; "-joined name lists; merge them
        # into one de-duplicated list of clean names.
        combined_names = _binding_value(hit, "authors") + ";" + _binding_value(hit, "contributors")
        for person_name in _split_values(combined_names, ";"):
            _author = Author()
            _author.type = 'Person'
            _author.name = person_name
            _author.identifier = ""  # ORCID is available for few; we need to update the sparql query to pull this information
            publication.author.append(_author)

        _source = thing()
        _source.name = 'GESIS KG'
        _source.originalSource = publication.publisher
        _source.identifier = publication.identifier
        _source.url = publication.url
        publication.source.append(_source)

        # Records that carry a DOI count as publications; the rest are
        # filed under "others".
        if publication.identifier != "":
            results['publications'].append(publication)
        else:
            results['others'].append(publication)

0 comments on commit 332a925

Please sign in to comment.