# views.py
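"""Flask views for parseland.

These routes parse publisher landing pages and repository pages stored in
S3, find PDF links and licenses, and forward PDF content to an external
OpenAlex PDF parser service.
"""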
import json
import os
from datetime import datetime, timedelta, timezone
from io import BytesIO
from urllib.parse import urljoin, urlencode
from bs4 import BeautifulSoup
from dateutil.parser import parse
from flask import jsonify, request, redirect, send_file
from app import app
from exceptions import APIError, BadLandingPageError
from find_pdf import find_pdf_link
from find_license import find_license_in_html
from find_bronze_hybrid import check_access_type
from publisher import cache
from publisher.controller import PublisherController
from publisher.utils import prep_message, check_bad_landing_page
from repository.controller import RepositoryController
from util import s3
from util.grobid import clean_soup
from util.s3 import get_landing_page, is_pdf


@app.route("/")
def home():
    return jsonify(
        {
            "version": "0.1",
            "app_name": "parseland",
            "msg": "Don't panic",
        }
    )


@app.route('/grobid-parse')
def grobid_parse():
    doi = request.args.get("doi")
    if doi.startswith('http'):
        doi = doi.split('doi.org/')[-1]
    params = request.args.copy()
    if 'forward' in params:
        del params['forward']
    # Re-insert the DOI in its normalized form (scheme and host stripped)
    del params['doi']
    params['doi'] = doi
    params['api_key'] = os.getenv("OPENALEX_PDF_PARSER_API_KEY")
    qs = urlencode(params)
    path = urljoin(os.getenv('OPENALEX_PDF_PARSER_URL'), 'parse-html')
    url = f'{path}?{qs}'
    return redirect(url)
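
# Example request (hypothetical DOI, shown for illustration):
#   GET /grobid-parse?doi=10.1234/abc123
# redirects to the parse-html endpoint of the external PDF parser service.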


def is_true(value: str):
    # Interpret query-string flags such as 'true', 'T', or '1' as True
    return value.lower().startswith('t') or value == '1'


@app.route('/view')
def view():
    doi = request.args.get("doi")
    try_stylize = request.args.get('try_stylize', default=False, type=is_true)
    if doi.startswith('http'):
        doi = doi.split('doi.org/')[-1]
    lp_contents = s3.get_landing_page(doi)
    if is_pdf(lp_contents):
        # Serve the stored PDF inline in the browser rather than as a download
        return send_file(BytesIO(lp_contents), mimetype='application/pdf',
                         as_attachment=False)
    soup = BeautifulSoup(lp_contents.decode(), features='lxml')
    cleaned, _ = clean_soup(soup, try_stylize)
    return str(cleaned)
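
# Example request (hypothetical DOI, shown for illustration):
#   GET /view?doi=10.1234/abc123&try_stylize=true
# returns the stored landing page, either as an inline PDF or as cleaned HTML.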


@app.route("/parse-publisher")
def parse_publisher():
    doi = request.args.get("doi")
    if doi.startswith('http'):
        doi = doi.split('doi.org/')[-1]
    check_cache = is_true(request.args.get('check_cache', 't'))
    update_cache = False
    if check_cache:
        cached = cache.get(doi)
        if cached:
            # Cached entries are JSON arrays of the form
            # [last_updated, current_s3_last_modified, response]
            cached_obj = json.loads(cached)
            five_min_ago = datetime.now(timezone.utc) - timedelta(minutes=5)
            last_updated, _, cached_response = cached_obj
            last_updated = parse(last_updated)
            if last_updated >= five_min_ago:
                print(f'Cache hit - {doi}')
                return jsonify(cached_response)
            else:
                update_cache = True
        else:
            update_cache = True
    lp_contents = get_landing_page(doi)
    grobid_parse_url = 'https://parseland.herokuapp.com/grobid-parse?doi=' + doi
    if is_pdf(lp_contents):
        # The landing page is itself a PDF; hand it off to the external parser
        params = {
            'doi': doi,
            'api_key': os.getenv("OPENALEX_PDF_PARSER_API_KEY"),
            'include_raw': 'false'
        }
        qs = urlencode(params)
        path = urljoin(os.getenv('OPENALEX_PDF_PARSER_URL'), 'parse')
        url = f'{path}?{qs}'
        return redirect(url)
    pc = PublisherController(lp_contents.decode(), doi)
    if check_bad_landing_page(pc.soup):
        raise BadLandingPageError()
    parser, parsed_msg = pc.best_parser_msg()
    message = prep_message(parsed_msg, parser)
    response = {
        "message": message,
        "metadata": {
            "parser": parser.parser_name,
            "grobid_parse_url": grobid_parse_url,
            "doi": doi,
            "doi_url": f"https://doi.org/{doi}",
        },
    }
    if check_cache and update_cache:
        # current_s3_last_modified is set to None when refreshing the cache
        cache.set(doi, None, response)
    return jsonify(response)
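
# Example request (hypothetical DOI, shown for illustration):
#   GET /parse-publisher?doi=10.1234/abc123&check_cache=false
# bypasses the five-minute cache and re-parses the stored landing page.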


@app.route("/parse-oa")
def parse_oa():
    doi = request.args.get("doi")
    if doi.startswith('http'):
        doi = doi.split('doi.org/')[-1]
    lp_contents = get_landing_page(doi)
    soup = BeautifulSoup(lp_contents.decode(), features='lxml')
    if check_bad_landing_page(soup):
        raise BadLandingPageError()
    pdf_link = find_pdf_link(soup)
    oa_license = find_license_in_html(lp_contents.decode())
    bronze_hybrid = check_access_type(lp_contents.decode(), soup)
    pdf = {
        "href": pdf_link.href,
        "anchor": pdf_link.anchor,
        "source": pdf_link.source,
    } if pdf_link else None
    print(f"PDF link: {pdf_link}")
    response = {
        "pdf": pdf,
        "license": oa_license,
        "bronze_hybrid": bronze_hybrid,
        "metadata": {
            "doi": doi,
            "doi_url": f"https://doi.org/{doi}",
        },
    }
    return jsonify(response)
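
# Example request (hypothetical DOI, shown for illustration):
#   GET /parse-oa?doi=10.1234/abc123
# reports the detected PDF link, license, and bronze/hybrid access type.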


@app.route("/parse-repository")
def parse_repository():
    page_id = request.args.get("page-id")
    rc = RepositoryController(page_id)
    parser = rc.find_parser()
    if parser.authors_found():
        message = parser.parse()
    else:
        message = parser.no_authors_output()
    response = {
        "message": message,
        "metadata": {
            "parser": parser.parser_name,
            "page_id": page_id,
        },
    }
    return jsonify(response)
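
# Example request (hypothetical page id, shown for illustration):
#   GET /parse-repository?page-id=abc123
# parses author metadata from the repository page with the matching parser.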


@app.errorhandler(APIError)
def handle_exception(err):
    """Return custom JSON when an APIError or one of its subclasses is raised."""
    response = {"error": err.description, "message": ""}
    if len(err.args) > 0:
        response["message"] = err.args[0]
    # Log the error so the different error types can be monitored
    app.logger.error("{}: {}".format(err.description, response["message"]))
    return jsonify(response), err.code


@app.route('/debug-sentry')
def trigger_error():
    # Deliberately raise ZeroDivisionError to verify Sentry error reporting
    division_by_zero = 1 / 0


if __name__ == "__main__":
    app.run(port=5001)