-
Notifications
You must be signed in to change notification settings - Fork 1
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Significantly reduce load time on the Show page for large datasets #1989
Changes from all commits
0856d92
e0a5464
3964f14
89fcc49
b62c634
4199222
a91e481
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -89,10 +89,10 @@ def file_list | |
if params[:id] == "NONE" | ||
# This is a special case when we render the file list for a work being created | ||
# (i.e. it does not have an id just yet) | ||
render json: [] | ||
render json: file_list_ajax_response(nil) | ||
else | ||
@work = Work.find(params[:id]) | ||
render json: @work.file_list | ||
work = Work.find(params[:id]) | ||
render json: file_list_ajax_response(work) | ||
end | ||
end | ||
|
||
|
@@ -415,5 +415,24 @@ def migrated? | |
|
||
params[:submit] == "Migrate" | ||
end | ||
|
||
# Returns a hash object that can be serialized into something that DataTables | ||
# can consume. The `data` elements includes the work's file list all other | ||
# properties are used for displaying different data elements related but not | ||
# directly on the DataTable object (e.g. the total file size) | ||
def file_list_ajax_response(work) | ||
files = [] | ||
total_size = 0 | ||
unless work.nil? | ||
files = work.file_list | ||
total_size = work.total_file_size_from_list(files) | ||
end | ||
{ | ||
data: files, | ||
total_size:, | ||
total_size_display: ActiveSupport::NumberHelper.number_to_human_size(total_size), | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 👍 |
||
total_file_count: files.count | ||
} | ||
end | ||
end | ||
# rubocop:enable Metrics/ClassLength |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -6,9 +6,15 @@ class UploadSnapshot < ApplicationRecord | |
alias_attribute :existing_files, :files | ||
|
||
def snapshot_deletions(work_changes, s3_filenames) | ||
s3_filenames_sorted = s3_filenames.sort | ||
existing_files.each do |file| | ||
filename = file["filename"] | ||
unless s3_filenames.include?(filename) | ||
# Use Ruby's Binary Search functionality instead of a plain Ruby Array `.include?` | ||
# to detect missing values in the array because the binary search performs | ||
# much faster when the list of files is large. Notice that the binary search | ||
# requires that the list of files is sorted. | ||
# See https://ruby-doc.org/3.3.6/bsearch_rdoc.html | ||
if s3_filenames_sorted.bsearch { |s3_filename| filename <=> s3_filename }.nil? | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is the other optimization, notice the use of Binary Search There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 💯 |
||
work_changes << { action: "removed", filename:, checksum: file["checksum"] } | ||
end | ||
end | ||
|
@@ -17,12 +23,12 @@ def snapshot_deletions(work_changes, s3_filenames) | |
def snapshot_modifications(work_changes, s3_files) | ||
# check for modifications | ||
s3_files.each do |s3_file| | ||
next if match?(s3_file) | ||
work_changes << if include?(s3_file) | ||
{ action: "replaced", filename: s3_file.filename, checksum: s3_file.checksum } | ||
else | ||
{ action: "added", filename: s3_file.filename, checksum: s3_file.checksum } | ||
end | ||
match = existing_files_sorted.bsearch { |file| s3_file.filename <=> file["filename"] } | ||
if match.nil? | ||
work_changes << { action: "added", filename: s3_file.filename, checksum: s3_file.checksum } | ||
elsif UploadSnapshot.checksum_compare(match["checksum"], s3_file.checksum) == false | ||
work_changes << { action: "replaced", filename: s3_file.filename, checksum: s3_file.checksum } | ||
end | ||
end | ||
end | ||
|
||
|
@@ -38,18 +44,6 @@ def filenames | |
files.map { |file| file["filename"] } | ||
end | ||
|
||
def include?(s3_file) | ||
filenames.include?(s3_file.filename) | ||
end | ||
|
||
def index(s3_file) | ||
files.index { |file| file["filename"] == s3_file.filename && checksum_compare(file["checksum"], s3_file.checksum) } | ||
end | ||
|
||
def match?(s3_file) | ||
index(s3_file).present? | ||
end | ||
|
||
def store_files(s3_files) | ||
self.files = s3_files.map { |file| { "filename" => file.filename, "checksum" => file.checksum } } | ||
end | ||
|
@@ -58,12 +52,7 @@ def self.find_by_filename(work_id:, filename:) | |
find_by("work_id = ? AND files @> ?", work_id, JSON.dump([{ filename: }])) | ||
end | ||
|
||
private | ||
|
||
def uploads | ||
work.uploads | ||
end | ||
|
||
class << self | ||
# Compares two checksums. Accounts for the case in which one of them is | ||
# a plain MD5 value and the other has been encoded with base64. | ||
# See also | ||
|
@@ -85,4 +74,15 @@ def checksum_compare(checksum1, checksum2) | |
# One of the values was not properly encoded | ||
false | ||
end | ||
end | ||
|
||
private | ||
|
||
def existing_files_sorted | ||
@existing_files_sorted ||= files.sort_by { |file| file["filename"] } | ||
end | ||
|
||
def uploads | ||
work.uploads | ||
end | ||
end |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -334,7 +334,9 @@ def artifact_uploads | |
# Returns the list of files for the work with some basic information about each of them. | ||
# This method is much faster than `uploads` because it does not return the actual S3File | ||
# objects to the client, instead it returns just a few selected data elements. | ||
# rubocop:disable Metrics/MethodLength | ||
def file_list | ||
start = Time.zone.now | ||
s3_files = approved? ? post_curation_uploads : pre_curation_uploads | ||
files_info = s3_files.map do |s3_file| | ||
{ | ||
|
@@ -349,17 +351,24 @@ def file_list | |
"is_folder": s3_file.is_folder | ||
} | ||
end | ||
log_performance(start, "file_list called for #{id}") | ||
files_info | ||
end | ||
# rubocop:enable Metrics/MethodLength | ||
|
||
def total_file_size | ||
@total_file_size ||= begin | ||
total_size = 0 | ||
file_list.each do |file| | ||
total_size += file[:size] | ||
end | ||
total_size | ||
total_size = 0 | ||
file_list.each do |file| | ||
total_size += file[:size] | ||
end | ||
total_size | ||
end | ||
|
||
# Calculates the total file size from a given list of files | ||
# This is so that we don't fetch the list twice from AWS since it can be expensive when | ||
# there are thousands of files on the work. | ||
def total_file_size_from_list(files) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 👍 |
||
files.sum { |file| file[:size] } | ||
end | ||
|
||
# Fetches the data from S3 directly bypassing ActiveStorage | ||
|
@@ -629,5 +638,14 @@ def embargo_date_as_json | |
embargo_date_iso8601.gsub(/\+.+$/, "Z") | ||
end | ||
end | ||
|
||
def log_performance(start, message) | ||
elapsed = Time.zone.now - start | ||
if elapsed > 20 | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We can tweak the threshold, but I thought it would be good to start logging as warning slow requests |
||
Rails.logger.warn("PERFORMANCE: #{message}. Elapsed: #{elapsed} seconds") | ||
else | ||
Rails.logger.info("PERFORMANCE: #{message}. Elapsed: #{elapsed} seconds") | ||
end | ||
end | ||
end | ||
# rubocop:enable Metrics/ClassLength |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2,8 +2,9 @@ | |
<div class="lux"> | ||
<div class="card"> | ||
<div class="files card-body"> | ||
<h3><%= "#{@work.upload_count} #{'File'.pluralize(@work.upload_count)} or #{'Directory'.pluralize(@work.upload_count)}"%> in | ||
<%= @work.approved? ? "post-curation" : "pre-curation" %> storage | ||
<h3> | ||
<div id="total-file-count-spinner" class="spinner-border spinner-border-sm" role="status"></div> | ||
<span id="total-file-count-in-table"></span> Files or Directories in <%= @work.approved? ? "post-curation" : "pre-curation" %> storage | ||
</h3> | ||
<table id="files-table" class="table"> | ||
<thead> | ||
|
@@ -15,14 +16,6 @@ | |
</tr> | ||
</thead> | ||
<tbody> | ||
<% if @work.upload_count.zero? %> | ||
<tr class="files"> | ||
<td><span><%= t('works.form.curation_uploads.empty') %></span></td> | ||
<td><span></span></td> | ||
<td><span></span></td> | ||
<td><span></span></td> | ||
</tr> | ||
<% end %> | ||
</tbody> | ||
<tfoot></tfoot> | ||
</table> | ||
|
@@ -75,7 +68,18 @@ | |
select: true, | ||
ajax: { | ||
url: fileListUrl, | ||
dataSrc: '' | ||
dataSrc: function(json) { | ||
// The JSON payload includes a few extra properties (fetched via AJAX because | ||
// they be slow for large datasets). Here we pick up those properties and update | ||
// the display with them. | ||
$("#total-file-size").text(json.total_size_display); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We update the page with the extra values that now come on the AJAX response |
||
$("#total-file-size-spinner").hide(); | ||
$("#total-file-count-in-table").text(json.total_file_count.toLocaleString("en-US")); | ||
$("#total-file-count-spinner").hide(); | ||
// The JSON payload include the file list in the `data` property | ||
// and that's what we return to DataTables as the "source" data. | ||
return json.data; | ||
} | ||
}, | ||
rowId: 'safe_id', | ||
columns: [ | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Notice how we return the file list plus a bunch of other values that depend on the file list