Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Significantly reduce load time on the Show page for large datasets #1989

Merged
merged 7 commits into from
Nov 20, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 22 additions & 3 deletions app/controllers/works_controller.rb
Original file line number Diff line number Diff line change
Expand Up @@ -89,10 +89,10 @@ def file_list
if params[:id] == "NONE"
# This is a special case when we render the file list for a work being created
# (i.e. it does not have an id just yet)
render json: []
render json: file_list_ajax_response(nil)
else
@work = Work.find(params[:id])
render json: @work.file_list
work = Work.find(params[:id])
render json: file_list_ajax_response(work)
end
end

Expand Down Expand Up @@ -415,5 +415,24 @@ def migrated?

params[:submit] == "Migrate"
end

# Returns a hash object that can be serialized into something that DataTables
# can consume. The `data` elements includes the work's file list all other
# properties are used for displaying different data elements related but not
# directly on the DataTable object (e.g. the total file size)
def file_list_ajax_response(work)
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Notice how we return the file list plus a bunch of other values that depend on the file list

files = []
total_size = 0
unless work.nil?
files = work.file_list
total_size = work.total_file_size_from_list(files)
end
{
data: files,
total_size:,
total_size_display: ActiveSupport::NumberHelper.number_to_human_size(total_size),
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

👍

total_file_count: files.count
}
end
end
# rubocop:enable Metrics/ClassLength
50 changes: 25 additions & 25 deletions app/models/upload_snapshot.rb
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,15 @@ class UploadSnapshot < ApplicationRecord
alias_attribute :existing_files, :files

def snapshot_deletions(work_changes, s3_filenames)
s3_filenames_sorted = s3_filenames.sort
existing_files.each do |file|
filename = file["filename"]
unless s3_filenames.include?(filename)
# Use Ruby's Binary Search functionality instead of a plain Ruby Array `.include?`
# to detect missing values in the array because the binary search performs
# much faster when the list of files is large. Notice that the binary search
# requires that the list of files is sorted.
# See https://ruby-doc.org/3.3.6/bsearch_rdoc.html
if s3_filenames_sorted.bsearch { |s3_filename| filename <=> s3_filename }.nil?
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is the other optimization, notice the use of Binary Search bsearch instead of a normal include?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

💯

work_changes << { action: "removed", filename:, checksum: file["checksum"] }
end
end
Expand All @@ -17,12 +23,12 @@ def snapshot_deletions(work_changes, s3_filenames)
def snapshot_modifications(work_changes, s3_files)
# check for modifications
s3_files.each do |s3_file|
next if match?(s3_file)
work_changes << if include?(s3_file)
{ action: "replaced", filename: s3_file.filename, checksum: s3_file.checksum }
else
{ action: "added", filename: s3_file.filename, checksum: s3_file.checksum }
end
match = existing_files_sorted.bsearch { |file| s3_file.filename <=> file["filename"] }
if match.nil?
work_changes << { action: "added", filename: s3_file.filename, checksum: s3_file.checksum }
elsif UploadSnapshot.checksum_compare(match["checksum"], s3_file.checksum) == false
work_changes << { action: "replaced", filename: s3_file.filename, checksum: s3_file.checksum }
end
end
end

Expand All @@ -38,18 +44,6 @@ def filenames
files.map { |file| file["filename"] }
end

def include?(s3_file)
filenames.include?(s3_file.filename)
end

def index(s3_file)
files.index { |file| file["filename"] == s3_file.filename && checksum_compare(file["checksum"], s3_file.checksum) }
end

def match?(s3_file)
index(s3_file).present?
end

def store_files(s3_files)
self.files = s3_files.map { |file| { "filename" => file.filename, "checksum" => file.checksum } }
end
Expand All @@ -58,12 +52,7 @@ def self.find_by_filename(work_id:, filename:)
find_by("work_id = ? AND files @> ?", work_id, JSON.dump([{ filename: }]))
end

private

def uploads
work.uploads
end

class << self
# Compares two checksums. Accounts for the case in which one of them is
# a plain MD5 value and the other has been encoded with base64.
# See also
Expand All @@ -85,4 +74,15 @@ def checksum_compare(checksum1, checksum2)
# One of the values was not properly encoded
false
end
end

private

def existing_files_sorted
@existing_files_sorted ||= files.sort_by { |file| file["filename"] }
end

def uploads
work.uploads
end
end
30 changes: 24 additions & 6 deletions app/models/work.rb
Original file line number Diff line number Diff line change
Expand Up @@ -334,7 +334,9 @@ def artifact_uploads
# Returns the list of files for the work with some basic information about each of them.
# This method is much faster than `uploads` because it does not return the actual S3File
# objects to the client, instead it returns just a few selected data elements.
# rubocop:disable Metrics/MethodLength
def file_list
start = Time.zone.now
s3_files = approved? ? post_curation_uploads : pre_curation_uploads
files_info = s3_files.map do |s3_file|
{
Expand All @@ -349,17 +351,24 @@ def file_list
"is_folder": s3_file.is_folder
}
end
log_performance(start, "file_list called for #{id}")
files_info
end
# rubocop:enable Metrics/MethodLength

def total_file_size
@total_file_size ||= begin
total_size = 0
file_list.each do |file|
total_size += file[:size]
end
total_size
total_size = 0
file_list.each do |file|
total_size += file[:size]
end
total_size
end

# Calculates the total file size from a given list of files
# This is so that we don't fetch the list twice from AWS since it can be expensive when
# there are thousands of files on the work.
def total_file_size_from_list(files)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

👍

files.sum { |file| file[:size] }
end

# Fetches the data from S3 directly bypassing ActiveStorage
Expand Down Expand Up @@ -629,5 +638,14 @@ def embargo_date_as_json
embargo_date_iso8601.gsub(/\+.+$/, "Z")
end
end

def log_performance(start, message)
elapsed = Time.zone.now - start
if elapsed > 20
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We can tweak the threshold, but I thought it would be good to start logging as warning slow requests

Rails.logger.warn("PERFORMANCE: #{message}. Elapsed: #{elapsed} seconds")
else
Rails.logger.info("PERFORMANCE: #{message}. Elapsed: #{elapsed} seconds")
end
end
end
# rubocop:enable Metrics/ClassLength
26 changes: 15 additions & 11 deletions app/views/works/_s3_resources.html.erb
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,9 @@
<div class="lux">
<div class="card">
<div class="files card-body">
<h3><%= "#{@work.upload_count} #{'File'.pluralize(@work.upload_count)} or #{'Directory'.pluralize(@work.upload_count)}"%> in
<%= @work.approved? ? "post-curation" : "pre-curation" %> storage
<h3>
<div id="total-file-count-spinner" class="spinner-border spinner-border-sm" role="status"></div>
<span id="total-file-count-in-table"></span> Files or Directories in <%= @work.approved? ? "post-curation" : "pre-curation" %> storage
</h3>
<table id="files-table" class="table">
<thead>
Expand All @@ -15,14 +16,6 @@
</tr>
</thead>
<tbody>
<% if @work.upload_count.zero? %>
<tr class="files">
<td><span><%= t('works.form.curation_uploads.empty') %></span></td>
<td><span></span></td>
<td><span></span></td>
<td><span></span></td>
</tr>
<% end %>
</tbody>
<tfoot></tfoot>
</table>
Expand Down Expand Up @@ -75,7 +68,18 @@
select: true,
ajax: {
url: fileListUrl,
dataSrc: ''
dataSrc: function(json) {
// The JSON payload includes a few extra properties (fetched via AJAX because
// they be slow for large datasets). Here we pick up those properties and update
// the display with them.
$("#total-file-size").text(json.total_size_display);
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We update the page with the extra values that now come on the AJAX response

$("#total-file-size-spinner").hide();
$("#total-file-count-in-table").text(json.total_file_count.toLocaleString("en-US"));
$("#total-file-count-spinner").hide();
// The JSON payload include the file list in the `data` property
// and that's what we return to DataTables as the "source" data.
return json.data;
}
},
rowId: 'safe_id',
columns: [
Expand Down
5 changes: 3 additions & 2 deletions app/views/works/show.html.erb
Original file line number Diff line number Diff line change
Expand Up @@ -274,7 +274,7 @@

<dt>Location</dt>
<dd>
<% if @work.upload_count > 0 %>
<% if @work.files_location_upload? %>
Amazon S3 Curation Storage
<% elsif @work.files_location_cluster? %>
PUL Research Cluster
Expand All @@ -291,7 +291,8 @@
<% end %>
<dt>Total Size</dt>
<dd>
<%= number_to_human_size(@work.total_file_size)%>
<div id="total-file-size-spinner" class="spinner-border spinner-border-sm" role="status"></div>
<span id="total-file-size"></span>
</dd>
</dl>

Expand Down
4 changes: 3 additions & 1 deletion spec/controllers/works_controller_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -597,8 +597,10 @@
allow(fake_s3_service).to receive(:client_s3_files).and_return(data)

get :file_list, params: { id: work.id }
file_list = JSON.parse(response.body)
json_response = JSON.parse(response.body)
file_list = json_response["data"]
expect(file_list.map { |f| f["filename"] }.sort).to eq(["10.34770/123-abc/#{work.id}/SCoData_combined_v1_2020-07_README.txt", "10.34770/123-abc/#{work.id}/something.jpg"])
expect(json_response["total_size"]).to be 21_518

# Check that we don't accidentally serialize the Work as part of the JSON file list.
# Including the work is bad for large records, because if for example a Work has 500 files
Expand Down
93 changes: 38 additions & 55 deletions spec/models/upload_snapshot_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,15 @@
require "rails_helper"

RSpec.describe UploadSnapshot, type: :model do
subject(:upload_snapshot) { described_class.new(files: [{ filename:, checkSum: "aaabbb111222" }], url:, work:) }
subject(:upload_snapshot) { described_class.new(files: [{ filename:, checksum: "aaabbb111222" }], url:, work:) }

let(:filename) { "us_covid_2019.csv" }
let(:url) { "http://localhost.localdomain/us_covid_2019.csv" }
let(:work) { FactoryBot.create(:approved_work) }

describe "#files" do
it "lists files associated with the snapshot" do
expect(upload_snapshot.files).to eq([{ "filename" => filename, "checkSum" => "aaabbb111222" }])
expect(upload_snapshot.files).to eq([{ "filename" => filename, "checksum" => "aaabbb111222" }])
end
end

Expand Down Expand Up @@ -121,80 +121,63 @@
end
end

describe "#include?" do
subject(:upload_snapshot) { described_class.new(files: [{ filename: "fileone", checksum: "aaabbb111222" }, { filename: "filetwo", checksum: "aaabbb111222" }], url:, work:) }

let(:s3_file) { FactoryBot.build :s3_file, filename: "fileone" }
let(:other_file) { FactoryBot.build :s3_file, filename: "other" }
it "checks if a files is part of the snamshot via name" do
expect(upload_snapshot.include?(s3_file)).to be_truthy
expect(upload_snapshot.include?(other_file)).to be_falsey
end
end

describe "#index" do
subject(:upload_snapshot) { described_class.new(files: [{ filename: "fileone", checksum: "aaabbb111222" }, { filename: "filetwo", checksum: "aaabbb111222" }], url:, work:) }

let(:s3_file) { FactoryBot.build :s3_file, filename: "filetwo", checksum: "aaabbb111222" }
let(:other_file) { FactoryBot.build :s3_file, filename: "other" }
subject(:upload_snapshot) { described_class.new(files: [{ filename: "fileone", checksum: "aaabbb111222" }, { filename: "filetwo", checksum: "aaabbb111222" }], url:, work:) }

it "lists filenames associated with the snapshot" do
expect(upload_snapshot.index(s3_file)).to eq(1)
expect(upload_snapshot.index(other_file)).to be_nil
describe "#snapshot_deletions" do
it "detects deletions" do
work_changes = []
upload_snapshot.snapshot_deletions(work_changes, ["us_covid_other.csv"])
expect(work_changes.first[:action]).to eq "removed"
expect(work_changes.first[:filename]).to eq "us_covid_2019.csv"
end

it "checks both the filename and the checksum" do
s3_file.checksum = "otherchecksum"
expect(upload_snapshot.index(s3_file)).to be_nil
it "does not detect false positives" do
work_changes = []
upload_snapshot.snapshot_deletions(work_changes, ["us_covid_2019.csv"])
expect(work_changes.count).to be 0
end
end

describe "#match?" do
subject(:upload_snapshot) { described_class.new(files: [{ filename: "fileone", checksum: "aaabbb111222" }, { filename: "filetwo", checksum: "aaabbb111222" }], url:, work:) }

let(:s3_file) { FactoryBot.build :s3_file, filename: "filetwo", checksum: "aaabbb111222" }
let(:other_file) { FactoryBot.build :s3_file, filename: "other" }
subject(:upload_snapshot) { described_class.new(files: [{ filename: "fileone", checksum: "aaabbb111222" }, { filename: "filetwo", checksum: "aaabbb111222" }], url:, work:) }

it "lists filenames associated with the snapshot" do
expect(upload_snapshot.match?(s3_file)).to be_truthy
expect(upload_snapshot.match?(other_file)).to be_falsey
describe "#snapshot_modifications" do
let(:existing_file) { FactoryBot.build :s3_file, filename: "us_covid_2019.csv", checksum: "aaabbb111222" }
let(:s3_new_file) { FactoryBot.build :s3_file, filename: "one.txt", checksum: "111" }
let(:s3_modified_file) { FactoryBot.build :s3_file, filename: "us_covid_2019.csv", checksum: "zzzyyyy999888" }
it "detects additions and modifications" do
work_changes = []
upload_snapshot.snapshot_modifications(work_changes, [existing_file, s3_new_file, s3_modified_file])
expect(work_changes.find { |change| change[:action] == "added" && change[:filename] == "one.txt" }).to_not be nil
expect(work_changes.find { |change| change[:action] == "replaced" && change[:filename] == "us_covid_2019.csv" }).to_not be nil
end

it "checks both the filename and the checksum" do
s3_file.checksum = "otherchecksum"
expect(upload_snapshot.match?(s3_file)).to be_falsey
it "does not detect false positives" do
work_changes = []
upload_snapshot.snapshot_modifications(work_changes, [existing_file])
expect(work_changes.count).to be 0
end
end

describe "checksum compare" do
let(:file1_base64) { { filename: "fileone", checksum: "98691a716ece23a77735f37b5a421253" } }
let(:file2_md5) { { filename: "filetwo", checksum: "mGkacW7OI6d3NfN7WkISUw==" } }
subject(:upload_snapshot) { described_class.new(files: [file1_base64, file2_md5], url:, work:) }
describe "#compare_checksum" do
let(:checksum1) { "98691a716ece23a77735f37b5a421253" }
let(:checksum1_encoded) { "mGkacW7OI6d3NfN7WkISUw==" }
let(:checksum2) { "xx691a716ece23a77735f37b5a421253" }

it "matches identical checksums" do
s3_file = FactoryBot.build :s3_file, filename: "fileone", checksum: "98691a716ece23a77735f37b5a421253"
expect(upload_snapshot.match?(s3_file)).to be true
expect(described_class.checksum_compare(checksum1, checksum1)).to be true
end

it "detects differences" do
s3_file = FactoryBot.build :s3_file, filename: "fileone", checksum: "xx691a716ece23a77735f37b5a421253"
expect(upload_snapshot.match?(s3_file)).to be false
expect(described_class.checksum_compare(checksum1, checksum2)).to be false
end

it "matches encoded checksums" do
s3_file = FactoryBot.build :s3_file, filename: "fileone", checksum: "mGkacW7OI6d3NfN7WkISUw=="
expect(upload_snapshot.match?(s3_file)).to be true

s3_file = FactoryBot.build :s3_file, filename: "filetwo", checksum: "98691a716ece23a77735f37b5a421253"
expect(upload_snapshot.match?(s3_file)).to be true
expect(described_class.checksum_compare(checksum1, checksum1_encoded)).to be true
end

it "does not cause issues when the checksum is nil" do
s3_file = FactoryBot.build :s3_file, filename: "fileone"
s3_file.checksum = nil
expect(upload_snapshot.match?(s3_file)).to be false
expect(described_class.checksum_compare(nil, checksum1)).to be false
expect(described_class.checksum_compare(checksum1, nil)).to be false
end

it "return false if the encoding is wrong" do
expect(described_class.checksum_compare(checksum1, checksum1_encoded + "xxx")).to be false
end
end

Expand Down