Skip to content

Commit

Permalink
Host Exports on s3 (#8795)
Browse files Browse the repository at this point in the history
* upload dumps to s3

* serve dumps from s3

* make routes for permalinks

* correct filenames

* make URLs less ugly

* rename method to be more clear

* only cache size and URL for one day

* rename to current_results_export

* use the DumpPublicResultsDatabase start_date as the timestamp for the results exports

* update api_controller

* install zip in docker container

* correct paths for uploading

* make file public-read

* run rubocop

* run rubocop

* change filenames again

* use run_start instead of run_end in test

* delete zip file after upload

* review changes

* rename method

* add .zip to the url
  • Loading branch information
FinnIckler authored and gregorbg committed Feb 2, 2024
1 parent c7df7b0 commit 84ef7f8
Show file tree
Hide file tree
Showing 8 changed files with 66 additions and 65 deletions.
1 change: 1 addition & 0 deletions WcaOnRails/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ RUN echo "deb [signed-by=/etc/apt/keyrings/nodesource.gpg] https://deb.nodesourc
RUN apt-get update && apt-get install -y \
git \
build-essential \
zip \
nodejs \
mariadb-client \
libssl-dev \
Expand Down
9 changes: 3 additions & 6 deletions WcaOnRails/app/controllers/api/v0/api_controller.rb
Original file line number Diff line number Diff line change
Expand Up @@ -182,15 +182,12 @@ def records
end

def export_public
sql_perma_path = DatabaseController.rel_download_path DbDumpHelper::RESULTS_EXPORT_FOLDER, DbDumpHelper::RESULTS_EXPORT_SQL_PERMALINK
tsv_perma_path = DatabaseController.rel_download_path DbDumpHelper::RESULTS_EXPORT_FOLDER, DbDumpHelper::RESULTS_EXPORT_TSV_PERMALINK

timestamp = DumpPublicResultsDatabase.end_date
timestamp = DumpPublicResultsDatabase.start_date

render json: {
export_date: timestamp&.iso8601,
sql_url: "#{root_url}#{sql_perma_path.delete_prefix '/'}",
tsv_url: "#{root_url}#{tsv_perma_path.delete_prefix '/'}",
sql_url: "#{sql_permalink_url}.zip",
tsv_url: "#{tsv_permalink_url}.zip",
}
end

Expand Down
50 changes: 27 additions & 23 deletions WcaOnRails/app/controllers/database_controller.rb
Original file line number Diff line number Diff line change
Expand Up @@ -4,38 +4,42 @@ class DatabaseController < ApplicationController
RESULTS_README_TEMPLATE = 'database/public_results_readme'

def results_export
@sql_filename, @sql_filesize = _link_info DbDumpHelper::RESULTS_EXPORT_SQL_PERMALINK
@tsv_filename, @tsv_filesize = _link_info DbDumpHelper::RESULTS_EXPORT_TSV_PERMALINK

@sql_rel_path = DatabaseController.rel_download_path DbDumpHelper::RESULTS_EXPORT_FOLDER, @sql_filename
@tsv_rel_path = DatabaseController.rel_download_path DbDumpHelper::RESULTS_EXPORT_FOLDER, @tsv_filename

@sql_perma_path = DatabaseController.rel_download_path DbDumpHelper::RESULTS_EXPORT_FOLDER, DbDumpHelper::RESULTS_EXPORT_SQL_PERMALINK
@tsv_perma_path = DatabaseController.rel_download_path DbDumpHelper::RESULTS_EXPORT_FOLDER, DbDumpHelper::RESULTS_EXPORT_TSV_PERMALINK
@sql_path, @sql_filesize = current_results_export("sql")
@tsv_path, @tsv_filesize = current_results_export("tsv")
@sql_filename = File.basename(@sql_path)
@tsv_filename = File.basename(@tsv_path)
end

def _link_info(permalink)
full_permalink = DbDumpHelper::RESULTS_EXPORT_FOLDER.join(permalink)

actual_filename = File.basename File.readlink(full_permalink)
# Manually resolve the relative link to be independent of runtime environments
actual_file = DbDumpHelper::RESULTS_EXPORT_FOLDER.join(actual_filename)
# Redirects the stable SQL permalink route to the URL of the current SQL
# results export.
#
# The redirect target is derived from the latest export timestamp (see
# current_results_export), so it changes with every new export. A 301 is
# cacheable by clients and would pin them to an outdated file; a temporary
# redirect (302) makes clients re-resolve the permalink on every request.
def sql_permalink
  # Only the URL is needed here; the file size (second element) is ignored.
  url, _filesize = current_results_export("sql")
  # allow_other_host is required because the target lives on S3, not on this host.
  redirect_to url, status: :found, allow_other_host: true
end

filesize_bytes = File.size? actual_file
# Redirects the stable TSV permalink route to the URL of the current TSV
# results export.
#
# The redirect target is derived from the latest export timestamp (see
# current_results_export), so it changes with every new export. A 301 is
# cacheable by clients and would pin them to an outdated file; a temporary
# redirect (302) makes clients re-resolve the permalink on every request.
def tsv_permalink
  # Must request the "tsv" export; asking for "sql" here would serve the
  # SQL dump under the TSV permalink.
  url, _filesize = current_results_export("tsv")
  # allow_other_host is required because the target lives on S3, not on this host.
  redirect_to url, status: :found, allow_other_host: true
end

[actual_filename, filesize_bytes]
# Looks up the public URL and size of the most recent results export.
#
# @param file_type [String] export flavour, "sql" or "tsv"; forwarded to
#   DbDumpHelper.result_export_file_name
# @return [Array(String, Integer)] the public S3 URL and the object size in bytes
#
# The pair is cached for one day under a key that contains the export
# timestamp, so a new export produces a new cache entry.
# NOTE(review): DumpPublicResultsDatabase.start_date appears to be nilable
# (other callers use safe navigation on it); a nil timestamp would fail when
# the file name is built — confirm whether that case needs handling here.
def current_results_export(file_type)
  export_timestamp = DumpPublicResultsDatabase.start_date

  Rails.cache.fetch("database-export-#{export_timestamp}-#{file_type}", expires_in: 1.days) do
    file_name = DbDumpHelper.result_export_file_name(file_type, export_timestamp)
    # The dump job uploads to "<RESULTS_EXPORT_FOLDER>/<file_name>", so the
    # object key (and therefore the public URL) must carry the same prefix.
    s3_key = "#{DbDumpHelper::RESULTS_EXPORT_FOLDER}/#{file_name}"
    bucket = Aws::S3::Resource.new(
      region: EnvConfig.STORAGE_AWS_REGION,
      credentials: Aws::InstanceProfileCredentials.new,
    ).bucket(DbDumpHelper::BUCKET_NAME)
    filesize_bytes = bucket.object(s3_key).content_length
    [public_s3_path(s3_key), filesize_bytes]
  end
end

def developer_export
@rel_download_path = DatabaseController.rel_download_path DbDumpHelper::DEVELOPER_EXPORT_FOLDER, DbDumpHelper::DEVELOPER_EXPORT_SQL_PERMALINK
@rel_download_path = public_s3_path(DbDumpHelper::DEVELOPER_EXPORT_SQL_PERMALINK)
end

def self.rel_download_path(base_folder, file_name)
file_path = base_folder.join file_name
relative_path = file_path.relative_path_from DbDumpHelper::EXPORT_PUBLIC_FOLDER.parent

# has to start with / or otherwise Rails resolves the controller action into the path
"/#{relative_path}"
# Builds the public HTTPS URL of an object in the export bucket.
#
# @param file_name [String] object key inside the bucket
# @return [String] path-style S3 URL for the configured region and bucket
def public_s3_path(file_name)
  region = EnvConfig.STORAGE_AWS_REGION
  bucket_name = DbDumpHelper::BUCKET_NAME
  "https://s3.#{region}.amazonaws.com/#{bucket_name}/#{file_name}"
end

def self.render_readme(rendering_engine, export_timestamp)
Expand Down
2 changes: 1 addition & 1 deletion WcaOnRails/app/jobs/dump_public_results_database.rb
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,6 @@

class DumpPublicResultsDatabase < WcaCronjob
def perform
DbDumpHelper.dump_results_db
DbDumpHelper.dump_results_db self.class.start_date
end
end
8 changes: 4 additions & 4 deletions WcaOnRails/app/views/database/results_export.html.erb
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,14 @@

<dl>
<dt>
SQL: <%= link_to(@sql_filename, @sql_rel_path) %> (<%= number_to_human_size @sql_filesize %>)
<small>(<%= link_to(t('.permalink'), @sql_perma_path) %>)</small>
SQL: <%= link_to(@sql_filename, @sql_path) %> (<%= number_to_human_size @sql_filesize %>)
<small>(<%= link_to(t('.permalink'), sql_permalink_path) %>)</small>
</dt>
<dd><%= t('.file_formats.sql') %></dd>

<dt>
TSV: <%= link_to(@tsv_filename, @tsv_rel_path) %> (<%= number_to_human_size @tsv_filesize %>)
<small>(<%= link_to(t('.permalink'), @tsv_perma_path) %>)</small>
TSV: <%= link_to(@tsv_filename, @tsv_path) %> (<%= number_to_human_size @tsv_filesize %>)
<small>(<%= link_to(t('.permalink'), tsv_permalink_path) %>)</small>
</dt>
<dd><%= t('.file_formats.tsv') %></dd>
</dl>
Expand Down
2 changes: 2 additions & 0 deletions WcaOnRails/config/routes.rb
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,8 @@
resources :media, only: [:index, :new, :create, :edit, :update, :destroy]

get 'export/results' => 'database#results_export', as: :db_results_export
get 'export/results/WCA_export.sql' => 'database#sql_permalink', as: :sql_permalink
get 'export/results/WCA_export.tsv' => 'database#tsv_permalink', as: :tsv_permalink
get 'export/developer' => 'database#developer_export', as: :db_dev_export
# redirect from the old path that used to be linked on GitHub
get 'wst/wca-developer-database-dump.zip', to: redirect('/export/developer/wca-developer-database-dump.zip')
Expand Down
57 changes: 27 additions & 30 deletions WcaOnRails/lib/db_dump_helper.rb
Original file line number Diff line number Diff line change
@@ -1,21 +1,21 @@
# frozen_string_literal: true

module DbDumpHelper
EXPORT_PUBLIC_FOLDER = Rails.root.join('public', 'export')
S3_BASE_PATH = "export"

RESULTS_EXPORT_FOLDER = EXPORT_PUBLIC_FOLDER.join('results')
RESULTS_EXPORT_FOLDER = "#{S3_BASE_PATH}/results".freeze
RESULTS_EXPORT_FILENAME = 'WCA_export'
RESULTS_EXPORT_SQL = "#{RESULTS_EXPORT_FILENAME}.sql".freeze
RESULTS_EXPORT_README = 'README.md'
RESULTS_EXPORT_METADATA = 'metadata.json'
RESULTS_EXPORT_SQL_PERMALINK = "#{RESULTS_EXPORT_SQL}.zip".freeze
RESULTS_EXPORT_TSV_PERMALINK = "#{RESULTS_EXPORT_FILENAME}.tsv.zip".freeze

DEVELOPER_EXPORT_FOLDER = EXPORT_PUBLIC_FOLDER.join('developer')
DEVELOPER_EXPORT_FILENAME = 'wca-developer-database-dump'
DEVELOPER_EXPORT_FOLDER = "#{S3_BASE_PATH}/developer".freeze
DEVELOPER_EXPORT_FILENAME = "wca-developer-database-dump"
DEVELOPER_EXPORT_SQL = "#{DEVELOPER_EXPORT_FILENAME}.sql".freeze
DEVELOPER_EXPORT_SQL_PERMALINK = "#{DEVELOPER_EXPORT_FILENAME}.zip".freeze

DEVELOPER_EXPORT_SQL_PERMALINK = "#{DEVELOPER_EXPORT_FOLDER}/#{DEVELOPER_EXPORT_FILENAME}.zip".freeze
BUCKET_NAME = 'assets.worldcubeassociation.org'
DEFAULT_DEV_PASSWORD = 'wca'

def self.dump_developer_db
Expand All @@ -38,17 +38,16 @@ def self.dump_developer_db
end

DatabaseDumper.development_dump(DEVELOPER_EXPORT_SQL)
zip_file_name = File.basename(DEVELOPER_EXPORT_SQL_PERMALINK)

self.zip_and_permalink(DEVELOPER_EXPORT_FOLDER, DEVELOPER_EXPORT_SQL_PERMALINK, nil, DEVELOPER_EXPORT_SQL)
self.zip_and_upload_to_s3(zip_file_name, DEVELOPER_EXPORT_SQL_PERMALINK, DEVELOPER_EXPORT_SQL)
end
end
end

def self.dump_results_db
def self.dump_results_db(export_timestamp = DateTime.now)
Dir.mktmpdir do |dir|
FileUtils.cd dir do
export_timestamp = DateTime.now

tsv_folder_name = "TSV_export"
FileUtils.mkpath tsv_folder_name

Expand All @@ -63,45 +62,43 @@ def self.dump_results_db
readme_template = DatabaseController.render_readme(ActionController::Base.new, export_timestamp)
File.write(RESULTS_EXPORT_README, readme_template)

# Remove old exports to save storage space
FileUtils.rm_r RESULTS_EXPORT_FOLDER, force: true, secure: true

sql_zip_filename = "WCA_export#{export_timestamp.strftime('%j')}_#{export_timestamp.strftime('%Y%m%dT%H%M%SZ')}.sql.zip"
sql_zip_filename = self.result_export_file_name("sql", export_timestamp)
sql_zip_contents = [RESULTS_EXPORT_METADATA, RESULTS_EXPORT_README, RESULTS_EXPORT_SQL]

self.zip_and_permalink(RESULTS_EXPORT_FOLDER, sql_zip_filename, RESULTS_EXPORT_SQL_PERMALINK, *sql_zip_contents)
self.zip_and_upload_to_s3(sql_zip_filename, "#{RESULTS_EXPORT_FOLDER}/#{sql_zip_filename}", *sql_zip_contents)

tsv_zip_filename = "WCA_export#{export_timestamp.strftime('%j')}_#{export_timestamp.strftime('%Y%m%dT%H%M%SZ')}.tsv.zip"
tsv_zip_filename = self.result_export_file_name("tsv", export_timestamp)
tsv_files = Dir.glob("#{tsv_folder_name}/*.tsv").map do |tsv|
FileUtils.mv(tsv, '.')
File.basename tsv
end

tsv_zip_contents = [RESULTS_EXPORT_METADATA, RESULTS_EXPORT_README] | tsv_files
self.zip_and_permalink(RESULTS_EXPORT_FOLDER, tsv_zip_filename, RESULTS_EXPORT_TSV_PERMALINK, *tsv_zip_contents)
self.zip_and_upload_to_s3(tsv_zip_filename, "#{RESULTS_EXPORT_FOLDER}/#{tsv_zip_filename}", *tsv_zip_contents)
end
end
end

def self.zip_and_permalink(root_folder, zip_filename, permalink_filename = nil, *zip_contents)
# Builds the zip file name of a results export.
#
# @param file_type [String] export flavour that becomes part of the extension (e.g. "sql", "tsv")
# @param timestamp [#strftime] moment of the export
# @return [String] "WCA_export<day of year>_<YYYYmmddTHHMMSSZ>.<file_type>.zip"
def self.result_export_file_name(file_type, timestamp)
  day_of_year = timestamp.strftime('%j')
  compact_stamp = timestamp.strftime('%Y%m%dT%H%M%SZ')
  "WCA_export#{day_of_year}_#{compact_stamp}.#{file_type}.zip"
end

def self.zip_and_upload_to_s3(zip_filename, s3_path, *zip_contents)
zip_file_list = zip_contents.join(" ")

LogTask.log_task "Zipping #{zip_contents.length} file entries to '#{zip_filename}'" do
system("zip #{zip_filename} #{zip_file_list}") || raise("Error running `zip`")
system("zip #{zip_filename} #{zip_file_list}", exception: true)
end

public_zip_path = root_folder.join(zip_filename)

LogTask.log_task "Moving zipped file to '#{public_zip_path}'" do
FileUtils.mkpath(File.dirname(public_zip_path))
FileUtils.mv(zip_filename, public_zip_path)
LogTask.log_task "Moving zipped file to 's3://#{s3_path}'" do
bucket = Aws::S3::Resource.new(
region: EnvConfig.STORAGE_AWS_REGION,
credentials: Aws::InstanceProfileCredentials.new,
).bucket(BUCKET_NAME)
bucket.object(s3_path).upload_file(zip_filename, { acl: "public-read" })

if permalink_filename.present?
permalink_zip_path = root_folder.join(permalink_filename)

# Writing a RELATIVE link, so that we can do readlink in dev and prod and not care about stuff like Docker
FileUtils.ln_s(zip_filename, permalink_zip_path, force: true)
end
# Delete the zipfile now that it's uploaded
FileUtils.rm zip_filename
end
end

Expand Down
2 changes: 1 addition & 1 deletion WcaOnRails/spec/controllers/api_controller_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -432,7 +432,7 @@
describe 'GET #export_public' do
it 'returns information about latest public export' do
export_timestamp = DateTime.current.utc
DumpPublicResultsDatabase.cronjob_statistics.update!(run_end: export_timestamp)
DumpPublicResultsDatabase.cronjob_statistics.update!(run_start: export_timestamp)

get :export_public
expect(response.status).to eq 200
Expand Down

0 comments on commit 84ef7f8

Please sign in to comment.