From 2a9c07864a4a476140ce90f97425debd77b673b6 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Thu, 30 Apr 2020 23:19:25 -0700 Subject: [PATCH] docker: update to pywb rc7 content type redirects: support default block list, with specific allow rules, per #54 --- Dockerfile | 2 +- config.yaml | 10 +++++++++ ukwa_pywb/test/config_test.yaml | 8 ++++++++ ukwa_pywb/ukwa_app.py | 36 ++++++++++++++++++++------------- 4 files changed, 41 insertions(+), 15 deletions(-) diff --git a/Dockerfile b/Dockerfile index 2552d77..ff890be 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,6 +1,6 @@ # Based on standard pywb fork -FROM webrecorder/pywb:2.4.0-rc6-test +FROM webrecorder/pywb:2.4.0-rc7 USER root WORKDIR /ukwa_pywb diff --git a/config.yaml b/config.yaml index 4e9fb7b..4115556 100644 --- a/config.yaml +++ b/config.yaml @@ -16,10 +16,20 @@ collections: Expires: 'Thu, 01 Jan 1970 00:00:00 GMT' content_type_redirects: + # allows + 'text/': 'allow' + 'image/': 'allow' + 'video/': 'allow' + 'audio/': 'allow' + 'application/javascript': 'allow' + 'text/rtf': 'https://example.com/viewer?{query}' 'application/pdf': 'https://example.com/viewer?{query}' 'application/': 'https://example.com/blocked?{query}' + + # default redirects '': 'https://example.com/blocked?{query}' + '*': 'https://example.com/blocked?{query}' # open access (with access controls) diff --git a/ukwa_pywb/test/config_test.yaml b/ukwa_pywb/test/config_test.yaml index f92e885..fdfa863 100644 --- a/ukwa_pywb/test/config_test.yaml +++ b/ukwa_pywb/test/config_test.yaml @@ -13,10 +13,18 @@ collections: Expires: 'Thu, 01 Jan 1970 00:00:00 GMT' content_type_redirects: + 'text/': 'allow' + 'image/': 'allow' + 'video/': 'allow' + 'audio/': 'allow' + 'text/rtf': 'https://example.com/viewer?{query}' 'application/pdf': 'https://example.com/viewer?{query}' 'application/': 'https://example.com/blocked?{query}' + + # default redirects '': 'https://example.com/blocked?{query}' + '*': 'https://example.com/blocked?{query}' pywb-no-locks: 
index_paths: ./integration-test/test-data/ diff --git a/ukwa_pywb/ukwa_app.py b/ukwa_pywb/ukwa_app.py index bf7c144..54c01ce 100644 --- a/ukwa_pywb/ukwa_app.py +++ b/ukwa_pywb/ukwa_app.py @@ -196,25 +196,33 @@ def render_content(self, wb_url_str, coll_config, environ): if default_response.status_headers.get('preference-applied') == 'raw': return default_response - content_type = default_response.status_headers.get("content-type") redirect_url = None - if content_type: - content_type = content_type.split(";", 1)[0] - redirect_url = ct_redirects.get(content_type) - if redirect_url is None: - redirect_url = ct_redirects.get(content_type.split("/")[0] + "/") - - # if no content-type match, check content-disposition - if not redirect_url: - content_disp = default_response.status_headers.get("content-disposition") - if content_disp and 'attachment' in content_disp: - redirect_url = ct_redirects.get('') - - if not redirect_url: + # a content-disposition containing 'attachment' takes precedence, using the '' rule if configured + content_disp = default_response.status_headers.get("content-disposition") + if content_disp and 'attachment' in content_disp: + redirect_url = ct_redirects.get('') + + # attempt to find rule by content-type + if redirect_url is None: + content_type = default_response.status_headers.get("content-type") + if content_type: + content_type = content_type.split(";", 1)[0] + redirect_url = ct_redirects.get(content_type) + # find by content-type prefix, e.g. text/ + if redirect_url is None: + redirect_url = ct_redirects.get(content_type.split("/")[0] + "/") + + # default rule if nothing else matches + if redirect_url is None: + redirect_url = ct_redirects.get('*') + + # if no redirect or rule is 'allow', then continue + if not redirect_url or redirect_url == 'allow': return default_response + # otherwise, redirect to specified url wb_url = WbUrl(wb_url_str) wb_url.mod = 'id_' loc = self.get_full_prefix(environ) + str(wb_url)