Skip to content

Commit

Permalink
Add first_page_only option for office doc conversion; Extend office d…
Browse files Browse the repository at this point in the history
…oc to pdf conversion timeouts
  • Loading branch information
elohanlon committed Jan 28, 2025
1 parent 5718c8c commit 96f8db3
Show file tree
Hide file tree
Showing 4 changed files with 49 additions and 13 deletions.
29 changes: 20 additions & 9 deletions lib/derivativo/conversion/office_helpers.rb
Original file line number Diff line number Diff line change
Expand Up @@ -6,15 +6,16 @@ module OfficeHelpers
PDF_HIGHER_COMPRESSION_CONVERSION_THRESHOLD = 30.megabytes

SIZE_THRESHOLD_FOR_LARGE_OFFICE_CONVERSION_DOC_TIMEOUT = 5.megabytes
SMALL_OFFICE_CONVERSION_DOC_TIMEOUT = 30.seconds
LARGE_OFFICE_CONVERSION_DOC_TIMEOUT = 120.seconds
SMALL_OFFICE_CONVERSION_DOC_TIMEOUT = 60.seconds
LARGE_OFFICE_CONVERSION_DOC_TIMEOUT = 600.seconds # We need a long timeout for larger docs (like big CSV files)

# Converts an input office document file to an output PDF file
def self.office_convert_to_pdf(src_file_path:, dst_file_path:)
def self.office_convert_to_pdf(src_file_path:, dst_file_path:, first_page_only: false)
office_convert_to_pdf_impl(
src_file_path: src_file_path,
dst_file_path: dst_file_path,
soffice_binary_path: soffice_binary_path_from_config_or_path
soffice_binary_path: soffice_binary_path_from_config_or_path,
first_page_only: first_page_only
)

# If the generated file size is less than or equal to our desired size, then we're done.
Expand All @@ -28,7 +29,8 @@ def self.office_convert_to_pdf(src_file_path:, dst_file_path:)
src_file_path: src_file_path,
dst_file_path: dst_file_path,
soffice_binary_path: soffice_binary_path_from_config_or_path,
compression_integer: compression_value_for_first_try_file_size(generated_file_size)
compression_integer: compression_value_for_first_try_file_size(generated_file_size),
first_page_only: first_page_only
)
end

Expand All @@ -50,7 +52,9 @@ def self.compression_value_for_first_try_file_size(size)
((1 / Math.log(size.to_f / 1.megabyte, 10))**2 * 100).to_i
end

def self.office_convert_to_pdf_impl(src_file_path:, dst_file_path:, soffice_binary_path:, compression_integer: 80)
def self.office_convert_to_pdf_impl(
src_file_path:, dst_file_path:, soffice_binary_path:, compression_integer: 80, first_page_only: false
)
# Create a unique, temporary home dir for this office conversion process so we can run
# multiple conversion jobs independently. If two conversion processes use the same
# home dir, the first process will block the second one.
Expand All @@ -65,7 +69,8 @@ def self.office_convert_to_pdf_impl(src_file_path:, dst_file_path:, soffice_bina
Derivativo::FileHelper.working_directory_temp_dir('office_temp_outdir') do |office_temp_outdir|
# Run conversion
cmd_to_run = conversion_command(
soffice_binary_path, src_file_path, office_temp_homedir.path, office_temp_outdir.path
soffice_binary_path, src_file_path, office_temp_homedir.path,
office_temp_outdir.path, first_page_only: first_page_only
)
Derivativo::Utils::ShellUtils.run_with_timeout(
cmd_to_run,
Expand Down Expand Up @@ -97,14 +102,20 @@ def self.conversion_timeout_for_src_file(src_file_path)
end
end

def self.conversion_command(soffice_binary_path, src_file_path, office_temp_homedir_path, office_temp_outdir_path)
def self.conversion_command(
soffice_binary_path, src_file_path, office_temp_homedir_path, office_temp_outdir_path, first_page_only: false
)
[
soffice_binary_path,
"-env:UserInstallation=file://#{office_temp_homedir_path}",
'--invisible',
'--headless',
'--convert-to',
'pdf:writer_pdf_Export',
if first_page_only
'pdf:writer_pdf_Export:\{\"PageRange\":\{\"type\":\"string\",\"value\":\"1\"\}\}'
else
'pdf:writer_pdf_Export'
end,
'--outdir',
Shellwords.escape(office_temp_outdir_path),
Shellwords.escape(src_file_path)
Expand Down
31 changes: 28 additions & 3 deletions spec/derivativo/conversion/office_helpers_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,27 @@
with_auto_deleting_tempfile('dst', '.pdf') do |dst_file|
described_class.office_convert_to_pdf(src_file_path: src_file_path, dst_file_path: dst_file.path)
expect(File.size(dst_file.path)).to be_positive
with_auto_deleting_tempfile('tempfile', '.txt') do |text_temp_file|
Derivativo::Extraction.extract_fulltext(src_file_path: dst_file.path, dst_file_path: text_temp_file.path)
file_content = File.read(text_temp_file.path)
expect(file_content).to include('Page 1!')
expect(file_content).to include('Page 2!')
end
end
end

# Checks the `first_page_only: true` option of office_convert_to_pdf: the text extracted
# from the generated PDF must contain 'Page 1!' and must not contain 'Page 2!'.
it 'only converts the first page when called with `first_page_only: true`' do
with_auto_deleting_tempfile('dst', '.pdf') do |dst_file|
described_class.office_convert_to_pdf(
src_file_path: src_file_path, dst_file_path: dst_file.path, first_page_only: true
)
# The conversion must have written a non-empty output file.
expect(File.size(dst_file.path)).to be_positive
# Extract the generated PDF's text into a temp file so its content can be inspected.
with_auto_deleting_tempfile('tempfile', '.txt') do |text_temp_file|
Derivativo::Extraction.extract_fulltext(src_file_path: dst_file.path, dst_file_path: text_temp_file.path)
file_content = File.read(text_temp_file.path)
expect(file_content).to include('Page 1!')
# Text from the second page must be absent from the output.
expect(file_content).not_to include('Page 2!')
end
end
end
end
Expand Down Expand Up @@ -43,15 +64,19 @@
expect(described_class).to receive(:office_convert_to_pdf_impl).with(
src_file_path: src_file_path,
dst_file_path: dst_file.path,
soffice_binary_path: soffice_binary_path
soffice_binary_path: soffice_binary_path,
first_page_only: false
).ordered.and_call_original
expect(described_class).to receive(:office_convert_to_pdf_impl).with(
src_file_path: src_file_path,
dst_file_path: dst_file.path,
soffice_binary_path: soffice_binary_path,
compression_integer: second_conversion_compression_integer
compression_integer: second_conversion_compression_integer,
first_page_only: false
).ordered.and_call_original
described_class.office_convert_to_pdf(src_file_path: src_file_path, dst_file_path: dst_file.path)
described_class.office_convert_to_pdf(
src_file_path: src_file_path, dst_file_path: dst_file.path, first_page_only: false
)
expect(File.size(dst_file.path)).to be_positive
end
end
Expand Down
2 changes: 1 addition & 1 deletion spec/derivativo/extraction_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
with_auto_deleting_tempfile('dst', '.txt') do |dst_file|
described_class.extract_fulltext(src_file_path: src_file_path, dst_file_path: dst_file.path)
expect(File.size(dst_file.path)).to be_positive
expect(File.read(dst_file.path)).to eq("This is an office document!\n")
expect(File.read(dst_file.path)).to eq("Page 1!\n\n\nPage 2!\n\n")
end
end
end
Expand Down
Binary file modified spec/fixtures/files/office-doc.doc
Binary file not shown.

0 comments on commit 96f8db3

Please sign in to comment.