Merge pull request #268 from semantic-systems/integrate-gesis-kg

Integrate gesis kg
semantic-systems · Dec 26, 2024 · 332a925 · 332a925
2 parents 24981b7 + f77235b
commit 332a925
Show file tree

Hide file tree

Showing 3 changed files with 237 additions and 21 deletions.
diff --git a/config.py b/config.py
@@ -14,7 +14,7 @@ class Config:
     IEEE_API_KEY = os.environ.get("IEEE_API_KEY", "")
 
     REQUEST_HEADER_USER_AGENT = "nfdi4dsBot/1.0 (https://www.nfdi4datascience.de/nfdi4dsBot/; [email protected])"
-    REQUEST_TIMEOUT = 5
+    REQUEST_TIMEOUT = 100
 
     NUMBER_OF_RECORDS_TO_SHOW_ON_PAGE_LOAD = 20
     NUMBER_OF_RECORDS_TO_APPEND_ON_LAZY_LOAD = 10
@@ -34,95 +34,103 @@ class Config:
         #    "module": "dblp_researchers", 
         #    "search-endpoint": f"https://dblp.org/search/author/api?format=json&h={NUMBER_OF_RECORDS_FOR_SEARCH_ENDPOINT}&q=",
         # },
+        "GESIS KG": {
+            "module": "gesis_kg_publication",
+            "search-endpoint": f"https://data.gesis.org/gesiskg/sparql?default-graph-uri=&query=",
+        },
+        "GESIS KG - Dataset": {
+            "module": "gesis_kg_dataset",
+            "search-endpoint": f"https://data.gesis.org/gesiskg/sparql?default-graph-uri=&query=",
+        },
         "OPENALEX - Publications": {
-            "module": "openalex_publications", 
+            "module": "openalex_publications",
             "search-endpoint": f"https://api.openalex.org/works?page=1&per-page={NUMBER_OF_RECORDS_FOR_SEARCH_ENDPOINT}&search=",
             "get-publication-endpoint": "https://api.openalex.org/works/",
             "get-researcher-publications-endpoint": "https://api.openalex.org/works?filter=author.id:",
         },
         "OPENALEX - Researchers": {
-            "module": "openalex_researchers", 
+            "module": "openalex_researchers",
             "search-endpoint": f"https://api.openalex.org/authors?page=1&per-page={NUMBER_OF_RECORDS_FOR_SEARCH_ENDPOINT}&search=",
             "get-researcher-endpoint": "https://api.openalex.org/authors/",
             "get-researcher-publications-endpoint": "https://api.openalex.org/works?filter=author.id:",
         },
         "ZENODO": {
-            "module": "zenodo", 
+            "module": "zenodo",
             "search-endpoint": f"https://zenodo.org/api/records?size={NUMBER_OF_RECORDS_FOR_SEARCH_ENDPOINT}&q=",
             "get-publication-endpoint": f"https://zenodo.org/api/records/",
         },
         "WIKIDATA - Publications": {
-            "module": "wikidata_publications", 
+            "module": "wikidata_publications",
             "search-endpoint": f"https://query.wikidata.org/sparql?format=json&query=",
         },
         "WIKIDATA - Researchers": {
-            "module": "wikidata_researchers", 
+            "module": "wikidata_researchers",
             "search-endpoint": f"https://query.wikidata.org/sparql?format=json&query=",
         },
         # "resodate": {
-        #     "module": "resodate", 
+        #     "module": "resodate",
         #     "search-endpoint": f"https://resodate.org/resources/api/search/oer_data/_search?pretty&size={NUMBER_OF_RECORDS_FOR_SEARCH_ENDPOINT}&q=",
         # },
         # "OERSI": {
-        #     "module": "oersi", 
+        #     "module": "oersi",
         #     "search-endpoint": f"https://oersi.org/resources/api/search/oer_data/_search?pretty&size={NUMBER_OF_RECORDS_FOR_SEARCH_ENDPOINT}&q=",
         # },
         "IEEE": {
-            "module": "ieee", 
+            "module": "ieee",
             "search-endpoint": f"http://ieeexploreapi.ieee.org/api/v1/search/articles?apikey={IEEE_API_KEY}&max_records={NUMBER_OF_RECORDS_FOR_SEARCH_ENDPOINT}&querytext=",
             "get-publication-endpoint": f"http://ieeexploreapi.ieee.org/api/v1/search/articles?apikey={IEEE_API_KEY}&doi=",
         },
         # "EUDAT": {
-        #     "module": "eudat", 
+        #     "module": "eudat",
         #     "search-endpoint": f"https://b2share.eudat.eu/api/records/?page=1&size={NUMBER_OF_RECORDS_FOR_SEARCH_ENDPOINT}&sort=bestmatch&q=",
         #     "record-base-url": f"https://b2share.eudat.eu/records/",
         # },
         "OPENAIRE - Products": {
-            "module": "openaire_products", 
+            "module": "openaire_products",
             "search-endpoint": f"https://api.openaire.eu/search/researchProducts?format=json&size={NUMBER_OF_RECORDS_FOR_SEARCH_ENDPOINT}&keywords=",
             "get-publication-endpoint": f"https://api.openaire.eu/search/researchProducts?format=json&doi=",
         },
         "OPENAIRE - Projects": {
-            "module": "openaire_projects", 
+            "module": "openaire_projects",
             "search-endpoint": f"https://api.openaire.eu/search/projects?format=json&size={NUMBER_OF_RECORDS_FOR_SEARCH_ENDPOINT}&keywords=",
         },
         "ORCID": {
-            "module": "orcid", 
+            "module": "orcid",
             "search-endpoint": f"https://pub.orcid.org/v3.0/expanded-search/?start=0&rows={NUMBER_OF_RECORDS_FOR_SEARCH_ENDPOINT}&q=",
         },
         "GESIS": {
-            "module": "gesis", 
+            "module": "gesis",
             "search-endpoint": f"http://193.175.238.35:8089/dc/_search?size={NUMBER_OF_RECORDS_FOR_SEARCH_ENDPOINT}&q=",
         },
         # "CORDIS": {
-        #     "module": "cordis", 
+        #     "module": "cordis",
         #     "search-endpoint": f"https://cordis.europa.eu/search?p=1&num={NUMBER_OF_RECORDS_FOR_SEARCH_ENDPOINT}&srt=Relevance:decreasing&format=json&q=contenttype='project'%20AND%20",
         # },
         # "ORKG": {
-        #     "module": "orkg", 
+        #     "module": "orkg",
         #     "search-endpoint": f"https://orkg.org/api/resources/?size={NUMBER_OF_RECORDS_FOR_SEARCH_ENDPOINT}&q=",
         # },
         # "gepris": {
-        #     "module": "gepris", 
+        #     "module": "gepris",
         #     "search-endpoint": f"https://gepris.dfg.de/gepris/OCTOPUS?context=projekt&hitsPerPage=1&index=0&language=en&task=doSearchSimple&keywords_criterion=",
         # },
         "CROSSREF - Publications": {
-            "module": "crossref_publications", 
+            "module": "crossref_publications",
             "search-endpoint": f"https://api.crossref.org/works?rows={NUMBER_OF_RECORDS_FOR_SEARCH_ENDPOINT}&query=",
             "get-publication-endpoint": "https://api.crossref.org/works/",
             "get-publication-references-endpoint": "https://api.crossref.org/works/",
         },
         "SEMANTIC SCHOLAR - Publications": {
-            "module": "semanticscholar_publications", 
+            "module": "semanticscholar_publications",
             # "search-endpoint": f"",
             # "get-publication-endpoint": "https://api.semanticscholar.org/graph/v1/paper/",
             "citations-endpoint": f"https://api.semanticscholar.org/graph/v1/paper/",
             "recommendations-endpoint": f"https://api.semanticscholar.org/recommendations/v1/papers/forpaper/",
         },
         "SEMANTIC SCHOLAR - Researchers": {
-            "module": "semanticscholar_researchers", 
+            "module": "semanticscholar_researchers",
             # "search-endpoint": f"",
-            # "get-researcher-endpoint": f"https://api.semanticscholar.org/graph/v1/author/",            
+            # "get-researcher-endpoint": f"https://api.semanticscholar.org/graph/v1/author/",
         },
         "RE3DATA": {
             "module": "re3data",

diff --git a/sources/gesis_kg_dataset.py b/sources/gesis_kg_dataset.py
@@ -0,0 +1,111 @@
+from objects import thing, Article, Author, Dataset
+from sources import data_retriever
+import utils
+from main import app
+from string import Template
+
+
+# SPARQL query selecting datasets whose title contains the search term.
+# NOTE: the prefix is spelled "rdfs" but is bound to the RDF syntax namespace, so
+# "rdfs:type" in the query is effectively rdf:type.
+# search() collapses all whitespace runs before sending the query, which also turns the
+# "\n " GROUP_CONCAT separators below into a single space.
+_DATASET_QUERY_TEMPLATE = Template('''
+    PREFIX schema:<https://schema.org/>
+    PREFIX rdfs:<http://www.w3.org/1999/02/22-rdf-syntax-ns#>
+
+    SELECT ?dataset ?title ?doi ?datePublished ?license ?version ?publisher ?dateModified ?dateCreated
+            (GROUP_CONCAT(DISTINCT ?contributor_name; SEPARATOR="; ") AS ?contributors)
+            (GROUP_CONCAT(DISTINCT ?author_name; SEPARATOR="; ") AS ?authors)
+            (GROUP_CONCAT(DISTINCT ?provider; SEPARATOR="\n ") AS ?providers)
+            (GROUP_CONCAT(DISTINCT ?inLanguage; SEPARATOR="; ") AS ?languages)
+            (GROUP_CONCAT(DISTINCT ?sourceInfo; SEPARATOR="\n ") AS ?sourceInfos)
+            (GROUP_CONCAT(DISTINCT ?category; SEPARATOR="; ") AS ?categories)
+            (GROUP_CONCAT(DISTINCT ?abstract; SEPARATOR="\n ") AS ?abstracts)
+            (GROUP_CONCAT(DISTINCT ?comment; SEPARATOR="\n ") AS ?comments)
+            (GROUP_CONCAT(DISTINCT ?conditionsOfAccess; SEPARATOR="\n ") AS ?conditionsOfAccesses)
+            (GROUP_CONCAT(DISTINCT ?spatialCoverage_name; SEPARATOR="\n ") AS ?spatialCoverages)
+    WHERE {
+        ?dataset rdfs:type schema:Dataset .
+        ?dataset schema:name ?title . FILTER(CONTAINS(?title, "$search_string"))
+
+        OPTIONAL { ?dataset <https://data.gesis.org/gesiskg/schema/doi> ?doi . }
+        OPTIONAL { ?dataset schema:abstract ?abstract . }
+        OPTIONAL { ?dataset schema:datePublished ?datePublished . }
+        OPTIONAL { ?dataset schema:provider ?provider . }
+        OPTIONAL { ?dataset schema:publisher ?publisher . }
+        OPTIONAL { ?dataset schema:inLanguage ?inLanguage . }
+        OPTIONAL { ?dataset schema:version ?version . }
+        OPTIONAL { ?dataset <https://data.gesis.org/gesiskg/schema/category> ?category . }
+        OPTIONAL { ?dataset <https://data.gesis.org/gesiskg/schema/sourceInfo> ?sourceInfo . }
+        OPTIONAL { ?dataset <https://data.gesis.org/gesiskg/schema/license> ?license . }
+        OPTIONAL { ?dataset schema:comment ?comment . }
+        OPTIONAL { ?dataset schema:conditionsOfAccess ?conditionsOfAccess . }
+        OPTIONAL { ?dataset schema:dateModified ?dateModified .}
+        OPTIONAL { ?dataset schema:dateCreated ?dateCreated .}
+        OPTIONAL { ?dataset schema:spatialCoverage ?spatialCoverage .
+                    ?spatialCoverage schema:name ?spatialCoverage_name .}
+        OPTIONAL { ?dataset schema:contributor ?contributor .
+                    ?contributor schema:name ?contributor_name .}
+        OPTIONAL { ?dataset schema:author ?author .
+                   ?author schema:name ?author_name . }
+    }
+    GROUP BY ?dataset ?title ?doi ?datePublished ?license ?version ?publisher ?dateModified ?dateCreated
+    LIMIT $number_of_records
+    ''')
+
+
+def _escape_sparql_literal(text: str) -> str:
+    """Escape *text* so it can sit inside a double-quoted SPARQL string literal."""
+    # Backslashes first, otherwise the backslashes added for quotes would be doubled.
+    return text.replace("\\", "\\\\").replace('"', '\\"')
+
+
+def _binding_value(hit: dict, key: str) -> str:
+    """Return the 'value' of binding *key* in a result row, or "" when it is absent."""
+    return hit.get(key, {}).get("value", "")
+
+
+def _split_values(joined: str, separator: str) -> list:
+    """Split a GROUP_CONCAT string into stripped, non-empty items, de-duplicated in order."""
+    stripped = (item.strip() for item in joined.split(separator))
+    return list(dict.fromkeys(item for item in stripped if item))
+
+
+@utils.handle_exceptions
+def search(source: str, search_term: str, results, failed_sources):
+    """Search the GESIS Knowledge Graph for datasets whose title contains *search_term*.
+
+    The SPARQL query is sent through ``data_retriever.retrieve_data`` to the
+    'search-endpoint' configured for *source*; every returned binding is converted
+    into a ``Dataset`` and appended to ``results['resources']``.
+
+    Args:
+        source: key of the data source in ``app.config['DATA_SOURCES']``.
+        search_term: user-supplied text matched against dataset titles with
+            SPARQL ``CONTAINS`` (a plain substring match).
+        results: mapping whose ``'resources'`` list receives the datasets found.
+        failed_sources: forwarded unchanged to ``data_retriever.retrieve_data``.
+    """
+    # The term is user input that ends up inside a quoted SPARQL literal, so quotes and
+    # backslashes must be escaped or they would terminate the literal and corrupt the query.
+    query = _DATASET_QUERY_TEMPLATE.substitute(
+        search_string=_escape_sparql_literal(search_term),
+        number_of_records=app.config['NUMBER_OF_RECORDS_FOR_SEARCH_ENDPOINT'],
+    )
+    # Collapse every whitespace run (including newlines) so the query is a single line.
+    query = ' '.join(query.split())
+    search_result = data_retriever.retrieve_data(source=source,
+                                                 base_url=app.config['DATA_SOURCES'][source].get('search-endpoint', ''),
+                                                 search_term=query,
+                                                 failed_sources=failed_sources)
+
+    hits = search_result.get("results", {}).get("bindings", [])
+    total_hits = len(hits)
+    utils.log_event(type="info", message=f"{source} - {total_hits} records matched; pulled top {total_hits}")
+
+    for hit in hits:
+        dataset = Dataset()
+        dataset.additionalType = "DATASET"
+        dataset.identifier = _binding_value(hit, "doi")
+        dataset.name = _binding_value(hit, "title")
+        dataset.url = _binding_value(hit, "dataset").strip()
+
+        dataset.datePublished = _binding_value(hit, "datePublished")
+        dataset.dateCreated = _binding_value(hit, "dateCreated")
+        dataset.dateModified = _binding_value(hit, "dateModified")
+        dataset.version = _binding_value(hit, "version")
+        dataset.license = _binding_value(hit, "license")
+        dataset.publisher = _binding_value(hit, "publisher")
+
+        # Languages are concatenated with "; " by the query, so split on ';' rather than
+        # on spaces (which left a trailing ';' on every entry but the last).
+        for language in _split_values(_binding_value(hit, "languages"), ";"):
+            dataset.inLanguage.append(language)
+        # dataset.sourceOrganization = _binding_value(hit, "providers")
+        # The query only exposes the aggregated ?abstracts variable; there is no
+        # ?abstract binding in the result rows.
+        dataset.description = _binding_value(hit, "abstracts")
+        dataset.publication = _binding_value(hit, "sourceInfos")
+
+        # Authors and contributors are merged into one list. Names are stripped before
+        # de-duplication so "Name" and " Name" are recognised as the same person.
+        combined_names = _binding_value(hit, "authors") + ";" + _binding_value(hit, "contributors")
+        for author_name in _split_values(combined_names, ";"):
+            _author = Author()
+            _author.type = 'Person'
+            _author.name = author_name
+            _author.identifier = ""  # ORCID is available for few; we need to update the sparql query to pull this information
+            dataset.author.append(_author)
+
+        _source = thing()
+        _source.name = 'GESIS KG - Dataset'
+        _source.originalSource = dataset.publisher
+        _source.identifier = dataset.identifier
+        _source.url = dataset.url
+        dataset.source.append(_source)
+
+        results['resources'].append(dataset)
diff --git a/sources/gesis_kg_publication.py b/sources/gesis_kg_publication.py
@@ -0,0 +1,97 @@
+from objects import thing, Article, Author
+from sources import data_retriever
+import utils
+from main import app
+from string import Template
+
+
+# SPARQL query selecting scholarly articles whose title contains the search term.
+# NOTE: the prefix is spelled "rdfs" but is bound to the RDF syntax namespace, so
+# "rdfs:type" in the query is effectively rdf:type.
+_PUBLICATION_QUERY_TEMPLATE = Template('''
+    PREFIX schema:<https://schema.org/>
+    PREFIX rdfs:<http://www.w3.org/1999/02/22-rdf-syntax-ns#>
+
+    SELECT ?publication ?title ?doi ?abstract
+            (GROUP_CONCAT(DISTINCT ?linksURN; SEPARATOR=", ") AS ?linksURNs)
+            (GROUP_CONCAT(DISTINCT ?url; SEPARATOR=", ") AS ?urls)
+            (GROUP_CONCAT(DISTINCT ?datePub; SEPARATOR=", ") AS ?datePublished)
+            (GROUP_CONCAT(DISTINCT ?contributor_name; SEPARATOR="; ") AS ?contributors)
+            (GROUP_CONCAT(DISTINCT ?author_name; SEPARATOR="; ") AS ?authors)
+            (GROUP_CONCAT(DISTINCT ?provider; SEPARATOR=", ") AS ?providers)
+            (GROUP_CONCAT(DISTINCT ?inLanguage; SEPARATOR=", ") AS ?languages)
+            (GROUP_CONCAT(DISTINCT ?sourceInfo; SEPARATOR=", ") AS ?sourceInfos)
+    WHERE {
+        ?publication rdfs:type schema:ScholarlyArticle .
+        ?publication schema:name ?title . FILTER(CONTAINS(?title, "$search_string"))
+
+        OPTIONAL { ?publication <https://data.gesis.org/gesiskg/schema/doi> ?doi . }
+        OPTIONAL { ?publication schema:abstract ?abstract . }
+        OPTIONAL { ?publication <https://data.gesis.org/gesiskg/schema/linksURN> ?linksURN . }
+        OPTIONAL { ?publication schema:url ?url . }
+        OPTIONAL { ?publication schema:datePublished ?datePub . }
+        OPTIONAL { ?publication schema:provider ?provider . }
+        OPTIONAL { ?publication schema:inLanguage ?inLanguage . }
+        OPTIONAL { ?publication <https://data.gesis.org/gesiskg/schema/sourceInfo> ?sourceInfo . }
+        OPTIONAL { ?publication schema:contributor ?contributor .
+                    ?contributor schema:name ?contributor_name .}
+        OPTIONAL { ?publication schema:author ?author .
+                   ?author schema:name ?author_name . }
+    }
+    GROUP BY ?publication ?title ?doi ?abstract
+    LIMIT $number_of_records
+    ''')
+
+
+def _escape_sparql_literal(text: str) -> str:
+    """Escape *text* so it can sit inside a double-quoted SPARQL string literal."""
+    # Backslashes first, otherwise the backslashes added for quotes would be doubled.
+    return text.replace("\\", "\\\\").replace('"', '\\"')
+
+
+def _binding_value(hit: dict, key: str) -> str:
+    """Return the 'value' of binding *key* in a result row, or "" when it is absent."""
+    return hit.get(key, {}).get("value", "")
+
+
+def _split_values(joined: str, separator: str) -> list:
+    """Split a GROUP_CONCAT string into stripped, non-empty items, de-duplicated in order."""
+    stripped = (item.strip() for item in joined.split(separator))
+    return list(dict.fromkeys(item for item in stripped if item))
+
+
+@utils.handle_exceptions
+def search(source: str, search_term: str, results, failed_sources):
+    """Search the GESIS Knowledge Graph for articles whose title contains *search_term*.
+
+    The SPARQL query is sent through ``data_retriever.retrieve_data`` to the
+    'search-endpoint' configured for *source*; every returned binding is converted
+    into an ``Article``. Articles with a DOI are appended to
+    ``results['publications']``, the rest to ``results['others']``.
+
+    Args:
+        source: key of the data source in ``app.config['DATA_SOURCES']``.
+        search_term: user-supplied text matched against article titles with
+            SPARQL ``CONTAINS`` (a plain substring match).
+        results: mapping whose ``'publications'`` and ``'others'`` lists receive
+            the articles found.
+        failed_sources: forwarded unchanged to ``data_retriever.retrieve_data``.
+    """
+    # The term is user input that ends up inside a quoted SPARQL literal, so quotes and
+    # backslashes must be escaped or they would terminate the literal and corrupt the query.
+    query = _PUBLICATION_QUERY_TEMPLATE.substitute(
+        search_string=_escape_sparql_literal(search_term),
+        number_of_records=app.config['NUMBER_OF_RECORDS_FOR_SEARCH_ENDPOINT'],
+    )
+    # Collapse every whitespace run (including newlines) so the query is a single line.
+    query = ' '.join(query.split())
+    search_result = data_retriever.retrieve_data(source=source,
+                                                 base_url=app.config['DATA_SOURCES'][source].get('search-endpoint', ''),
+                                                 search_term=query,
+                                                 failed_sources=failed_sources)
+
+    hits = search_result.get("results", {}).get("bindings", [])
+    total_hits = len(hits)
+    utils.log_event(type="info", message=f"{source} - {total_hits} records matched; pulled top {total_hits}")
+
+    for hit in hits:
+        publication = Article()
+        publication.identifier = _binding_value(hit, "doi")
+        publication.name = _binding_value(hit, "title")
+        # ?urls is a ", "-joined aggregate; keep the first entry so the field holds a
+        # single usable link instead of several URLs glued together.
+        url_candidates = _split_values(_binding_value(hit, "urls"), ", ")
+        publication.url = url_candidates[0] if url_candidates else ""
+
+        # publication.identifier = _binding_value(hit, "linksURNs")  # DOI is available for few; we need to update the sparql query to fetch this information
+        publication.description = _binding_value(hit, "abstract")
+        publication.datePublished = _binding_value(hit, "datePublished")
+        # Languages are concatenated with ", " by the query, so split on ',' rather than
+        # on spaces (which left a trailing ',' on every entry but the last).
+        for language in _split_values(_binding_value(hit, "languages"), ","):
+            publication.inLanguage.append(language)
+        # publication.sourceOrganization = _binding_value(hit, "providers")
+        publication.publisher = _binding_value(hit, "sourceInfos")
+
+        # Authors and contributors are merged into one list. Names are stripped before
+        # de-duplication so "Name" and " Name" are recognised as the same person.
+        combined_names = _binding_value(hit, "authors") + ";" + _binding_value(hit, "contributors")
+        for author_name in _split_values(combined_names, ";"):
+            _author = Author()
+            _author.type = 'Person'
+            _author.name = author_name
+            _author.identifier = ""  # ORCID is available for few; we need to update the sparql query to pull this information
+            publication.author.append(_author)
+
+        _source = thing()
+        _source.name = 'GESIS KG'
+        _source.originalSource = publication.publisher
+        _source.identifier = publication.identifier
+        _source.url = publication.url
+        publication.source.append(_source)
+
+        # Records without a DOI cannot be identified as publications, so they are
+        # filed under 'others'.
+        if publication.identifier:
+            results['publications'].append(publication)
+        else:
+            results['others'].append(publication)