Optimize memory usage in samples/repositories import [SCI-1665]

This commit is contained in:
Oleksii Kriuchykhin 2017-10-12 14:43:25 +02:00
parent 03743a21f6
commit 56f52ebfa7
8 changed files with 213 additions and 165 deletions

View file

@ -47,6 +47,7 @@ gem 'commit_param_routing' # Enables different submit actions in the same form t
gem 'kaminari' gem 'kaminari'
gem "i18n-js", ">= 3.0.0.rc11" # Localization in javascript files gem "i18n-js", ">= 3.0.0.rc11" # Localization in javascript files
gem 'roo', '~> 2.7.1' # Spreadsheet parser gem 'roo', '~> 2.7.1' # Spreadsheet parser
gem 'creek'
gem 'wicked_pdf' gem 'wicked_pdf'
gem 'silencer' # Silence certain Rails logs gem 'silencer' # Silence certain Rails logs
gem 'wkhtmltopdf-heroku' gem 'wkhtmltopdf-heroku'
@ -72,7 +73,6 @@ gem 'ruby-graphviz', '~> 1.2' # Graphviz for rails
gem 'tinymce-rails', '~> 4.5.7' # Rich text editor gem 'tinymce-rails', '~> 4.5.7' # Rich text editor
gem 'base62' # Used for smart annotations gem 'base62' # Used for smart annotations
gem 'newrelic_rpm'
group :development, :test do group :development, :test do
gem 'byebug' gem 'byebug'
@ -85,6 +85,7 @@ group :development, :test do
end end
group :production do group :production do
gem 'newrelic_rpm'
gem 'puma' gem 'puma'
gem 'rails_12factor' gem 'rails_12factor'
end end

View file

@ -115,6 +115,9 @@ GEM
commit_param_routing (0.0.1) commit_param_routing (0.0.1)
concurrent-ruby (1.0.0) concurrent-ruby (1.0.0)
crass (1.0.2) crass (1.0.2)
creek (1.1.2)
nokogiri (~> 1.6.0)
rubyzip (>= 1.0.0)
debug_inspector (0.0.2) debug_inspector (0.0.2)
deface (1.0.2) deface (1.0.2)
colorize (>= 0.5.8) colorize (>= 0.5.8)
@ -362,6 +365,7 @@ DEPENDENCIES
bootstrap_form bootstrap_form
byebug byebug
commit_param_routing commit_param_routing
creek
deface (~> 1.0) deface (~> 1.0)
delayed_job_active_record delayed_job_active_record
delayed_paperclip! delayed_paperclip!

View file

@ -198,12 +198,15 @@ class RepositoriesController < ApplicationController
if parsed_file.too_large? if parsed_file.too_large?
repository_response(t('general.file.size_exceeded', repository_response(t('general.file.size_exceeded',
file_size: Constants::FILE_MAX_SIZE_MB)) file_size: Constants::FILE_MAX_SIZE_MB))
elsif parsed_file.empty?
flash[:notice] = t('teams.parse_sheet.errors.empty_file')
redirect_to back and return
else else
@import_data = parsed_file.data @import_data = parsed_file.data
if parsed_file.generated_temp_file?
unless @import_data.header.any? && @import_data.columns.any?
return repository_response(t('teams.parse_sheet.errors.empty_file'))
end
@temp_file = parsed_file.generate_temp_file
if @temp_file
respond_to do |format| respond_to do |format|
format.json do format.json do
render json: { render json: {

View file

@ -7,106 +7,77 @@ class TeamsController < ApplicationController
def parse_sheet def parse_sheet
session[:return_to] ||= request.referer session[:return_to] ||= request.referer
respond_to do |format| unless params[:file]
if params[:file] return parse_sheet_error(t('teams.parse_sheet.errors.no_file_selected'))
begin end
if params[:file].size > Constants::FILE_MAX_SIZE_MB.megabytes
error = t('general.file.size_exceeded',
file_size: Constants::FILE_MAX_SIZE_MB)
return parse_sheet_error(error)
end
if params[:file].size > Constants::FILE_MAX_SIZE_MB.megabytes begin
error = t 'general.file.size_exceeded', sheet = Team.open_spreadsheet(params[:file])
file_size: Constants::FILE_MAX_SIZE_MB # Get data (it will trigger any errors as well)
if sheet.is_a?(Roo::CSV)
format.html { @header = sheet.row(1)
flash[:alert] = error @columns = sheet.row(2)
redirect_to session.delete(:return_to) elsif sheet.is_a?(Roo::Excelx)
} i = 1
format.json { sheet.each_row_streaming do |row|
render json: {message: error}, @header = row.map(&:cell_value) if i == 1
status: :unprocessable_entity @columns = row.map(&:cell_value) if i == 2
} i += 1
break if i > 2
else
sheet = Team.open_spreadsheet(params[:file])
# Check if we actually have any rows (last_row > 1)
if sheet.last_row.between?(0, 1)
flash[:notice] = t(
"teams.parse_sheet.errors.empty_file")
redirect_to session.delete(:return_to) and return
end
# Get data (it will trigger any errors as well)
@header = sheet.row(1)
@columns = sheet.row(2)
# Fill in fields for dropdown
@available_fields = @team.get_available_sample_fields
# Truncate long fields
@available_fields.update(@available_fields) do |_k, v|
v.truncate(Constants::NAME_TRUNCATION_LENGTH_DROPDOWN)
end
# Save file for next step (importing)
@temp_file = TempFile.new(
session_id: session.id,
file: params[:file]
)
if @temp_file.save
@temp_file.destroy_obsolete
# format.html
format.json {
render :json => {
:html => render_to_string({
:partial => "samples/parse_samples_modal.html.erb"
})
}
}
else
error = t("teams.parse_sheet.errors.temp_file_failure")
format.html {
flash[:alert] = error
redirect_to session.delete(:return_to)
}
format.json {
render json: {message: error},
status: :unprocessable_entity
}
end
end
rescue ArgumentError, CSV::MalformedCSVError
error = t('teams.parse_sheet.errors.invalid_file',
encoding: ''.encoding)
format.html {
flash[:alert] = error
redirect_to session.delete(:return_to)
}
format.json {
render json: {message: error},
status: :unprocessable_entity
}
rescue TypeError
error = t("teams.parse_sheet.errors.invalid_extension")
format.html {
flash[:alert] = error
redirect_to session.delete(:return_to)
}
format.json {
render json: {message: error},
status: :unprocessable_entity
}
end end
else else
error = t("teams.parse_sheet.errors.no_file_selected") i = 1
format.html { sheet.rows.each do |row|
flash[:alert] = error @header = row.values if i == 1
session[:return_to] ||= request.referer @columns = row.values if i == 2
redirect_to session.delete(:return_to) i += 1
} break if i > 2
format.json { end
render json: {message: error},
status: :unprocessable_entity
}
end end
unless @header && @header.any? && @columns && @columns.any?
return parse_sheet_error(t('teams.parse_sheet.errors.empty_file'))
end
# Fill in fields for dropdown
@available_fields = @team.get_available_sample_fields
# Truncate long fields
@available_fields.update(@available_fields) do |_k, v|
v.truncate(Constants::NAME_TRUNCATION_LENGTH_DROPDOWN)
end
# Save file for next step (importing)
@temp_file = TempFile.new(
session_id: session.id,
file: params[:file]
)
if @temp_file.save
@temp_file.destroy_obsolete
respond_to do |format|
format.json do
render json: {
html: render_to_string(
partial: 'samples/parse_samples_modal.html.erb'
)
}
end
end
else
return parse_sheet_error(
t('teams.parse_sheet.errors.temp_file_failure')
)
end
rescue ArgumentError, CSV::MalformedCSVError
return parse_sheet_error(t('teams.parse_sheet.errors.invalid_file',
encoding: ''.encoding))
rescue TypeError
return parse_sheet_error(t('teams.parse_sheet.errors.invalid_extension'))
end end
end end
@ -275,6 +246,20 @@ class TeamsController < ApplicationController
private private
# Reports a sheet-parsing failure to the client in the requested format.
#
# error - message to show; used as flash[:alert] for HTML requests and as
#         the "message" key of the JSON body.
#
# HTML requests are redirected to the location stored in
# session[:return_to] (removed from the session in the process), or to
# request.referer when nothing is stored. JSON requests receive the
# message with an :unprocessable_entity status.
# NOTE(review): when neither a stored location nor a referer is present,
# redirect_to is handed nil - confirm that case cannot occur.
def parse_sheet_error(error)
  respond_to do |format|
    format.html do
      flash[:alert] = error
      return_location = session.delete(:return_to) || request.referer
      redirect_to return_location
    end
    format.json { render json: { message: error }, status: :unprocessable_entity }
  end
end
def load_vars def load_vars
@team = Team.find_by_id(params[:id]) @team = Team.find_by_id(params[:id])

View file

@ -114,6 +114,7 @@ class Repository < ActiveRecord::Base
name_index = -1 name_index = -1
total_nr = 0 total_nr = 0
nr_of_added = 0 nr_of_added = 0
header_skipped = false
mappings.each.with_index do |(_k, value), index| mappings.each.with_index do |(_k, value), index|
if value == '-1' if value == '-1'
@ -130,54 +131,69 @@ class Repository < ActiveRecord::Base
unless col_compact.map(&:id).uniq.length == col_compact.length unless col_compact.map(&:id).uniq.length == col_compact.length
return { status: :error, nr_of_added: nr_of_added, total_nr: total_nr } return { status: :error, nr_of_added: nr_of_added, total_nr: total_nr }
end end
rows = if sheet.is_a?(Roo::CSV)
sheet
elsif sheet.is_a?(Roo::Excelx)
sheet.each_row_streaming
else
sheet.rows
end
# Now we can iterate through record data and save stuff into db # Now we can iterate through record data and save stuff into db
transaction do rows.each do |row|
(2..sheet.last_row).each do |i| # Skip empty rows
total_nr += 1 next if row.empty?
record_row = RepositoryRow.new(name: sheet.row(i)[name_index], unless header_skipped
repository: self, header_skipped = true
created_by: user, next
last_modified_by: user) end
record_row.transaction(requires_new: true) do total_nr += 1
unless record_row.save
errors = true
raise ActiveRecord::Rollback
end
row_cell_values = [] # Creek XLSX parser returns Hash of the row, Roo - Array
row = row.is_a?(Hash) ? row.values.map(&:to_s) : row.map(&:to_s)
sheet.row(i).each.with_index do |value, index| record_row = RepositoryRow.new(name: row[name_index],
if columns[index] && value repository: self,
cell_value = RepositoryTextValue.new( created_by: user,
data: value, last_modified_by: user)
created_by: user, record_row.transaction do
last_modified_by: user, unless record_row.save
repository_cell_attributes: { errors = true
repository_row: record_row, raise ActiveRecord::Rollback
repository_column: columns[index]
}
)
cell = RepositoryCell.new(repository_row: record_row,
repository_column: columns[index],
value: cell_value)
cell.skip_on_import = true
cell_value.repository_cell = cell
unless cell.valid? && cell_value.valid?
errors = true
raise ActiveRecord::Rollback
end
row_cell_values << cell_value
end
end
if RepositoryTextValue.import(row_cell_values,
recursive: true,
validate: false).failed_instances.any?
errors = true
raise ActiveRecord::Rollback
end
nr_of_added += 1
end end
row_cell_values = []
row.each.with_index do |value, index|
if columns[index] && value
cell_value = RepositoryTextValue.new(
data: value,
created_by: user,
last_modified_by: user,
repository_cell_attributes: {
repository_row: record_row,
repository_column: columns[index]
}
)
cell = RepositoryCell.new(repository_row: record_row,
repository_column: columns[index],
value: cell_value)
cell.skip_on_import = true
cell_value.repository_cell = cell
unless cell.valid? && cell_value.valid?
errors = true
raise ActiveRecord::Rollback
end
row_cell_values << cell_value
end
end
if RepositoryTextValue.import(row_cell_values,
recursive: true,
validate: false).failed_instances.any?
errors = true
raise ActiveRecord::Rollback
end
nr_of_added += 1
end end
end end
@ -199,7 +215,10 @@ class Repository < ActiveRecord::Base
# This assumption is based purely on biologist's habits # This assumption is based purely on biologist's habits
Roo::CSV.new(file_path, csv_options: { col_sep: "\t" }) Roo::CSV.new(file_path, csv_options: { col_sep: "\t" })
when '.xlsx' when '.xlsx'
Roo::Excelx.new(file_path) # The Roo Excel parser was replaced with Creek, but it can be re-enabled:
# just swap the two lines below. Only one of them can be enabled at a time.
# Roo::Excelx.new(file_path)
Creek::Book.new(file_path).sheets[0]
else else
raise TypeError raise TypeError
end end

View file

@ -45,7 +45,10 @@ class Team < ActiveRecord::Base
# This assumption is based purely on biologist's habits # This assumption is based purely on biologist's habits
Roo::CSV.new(file_path, csv_options: { col_sep: "\t" }) Roo::CSV.new(file_path, csv_options: { col_sep: "\t" })
when '.xlsx' then when '.xlsx' then
Roo::Excelx.new(file_path) # The Roo Excel parser was replaced with Creek, but it can be re-enabled:
# just swap the two lines below. Only one of them can be enabled at a time.
# Roo::Excelx.new(file_path)
Creek::Book.new(file_path).sheets[0]
else else
raise TypeError raise TypeError
end end
@ -66,6 +69,7 @@ class Team < ActiveRecord::Base
errors = false errors = false
nr_of_added = 0 nr_of_added = 0
total_nr = 0 total_nr = 0
header_skipped = false
# First let's query for all custom_fields we're referring to # First let's query for all custom_fields we're referring to
custom_fields = [] custom_fields = []
@ -91,10 +95,28 @@ class Team < ActiveRecord::Base
custom_fields << cf custom_fields << cf
end end
end end
rows = if sheet.is_a?(Roo::CSV)
sheet
elsif sheet.is_a?(Roo::Excelx)
sheet.each_row_streaming
else
sheet.rows
end
# Now we can iterate through sample data and save stuff into db # Now we can iterate through sample data and save stuff into db
(2..sheet.last_row).each do |i| rows.each do |row|
# Skip empty rows
next if row.empty?
unless header_skipped
header_skipped = true
next
end
total_nr += 1 total_nr += 1
sample = Sample.new(name: sheet.row(i)[sname_index], # Creek XLSX parser returns Hash of the row, Roo - Array
row = row.is_a?(Hash) ? row.values.map(&:to_s) : row.map(&:to_s)
sample = Sample.new(name: row[sname_index],
team: self, team: self,
user: user) user: user)
@ -104,7 +126,7 @@ class Team < ActiveRecord::Base
raise ActiveRecord::Rollback raise ActiveRecord::Rollback
end end
sheet.row(i).each.with_index do |value, index| row.each.with_index do |value, index|
if index == stype_index if index == stype_index
stype = SampleType.where(name: value.strip, team: self).take stype = SampleType.where(name: value.strip, team: self).take

View file

@ -10,43 +10,57 @@ module ImportRepository
def data def data
# Get data (it will trigger any errors as well) # Get data (it will trigger any errors as well)
header = @sheet.row(1) if @sheet.is_a?(Roo::CSV)
columns = @sheet.row(2) header = @sheet.row(1)
columns = @sheet.row(2)
elsif @sheet.is_a?(Roo::Excelx)
i = 1
@sheet.each_row_streaming do |row|
header = row.map(&:cell_value) if i == 1
columns = row.map(&:cell_value) if i == 2
i += 1
break if i > 2
end
else
i = 1
@sheet.rows.each do |row|
header = row.values if i == 1
columns = row.values if i == 2
i += 1
break if i > 2
end
end
# Fill in fields for dropdown # Fill in fields for dropdown
@repository.available_repository_fields.transform_values! do |name| @repository.available_repository_fields.transform_values! do |name|
truncate(name, length: Constants::NAME_TRUNCATION_LENGTH_DROPDOWN) truncate(name, length: Constants::NAME_TRUNCATION_LENGTH_DROPDOWN)
end end
@temp_file = TempFile.create(session_id: @session.id, file: @file) header ||= []
columns ||= []
Data.new(header, Data.new(header,
columns, columns,
@repository.available_repository_fields, @repository.available_repository_fields,
@repository, @repository)
@temp_file)
end end
def too_large? def too_large?
@file.size > Constants::FILE_MAX_SIZE_MB.megabytes @file.size > Constants::FILE_MAX_SIZE_MB.megabytes
end end
def empty? def generate_temp_file
@sheet.last_row.between?(0, 1)
end
def generated_temp_file?
# Save file for next step (importing) # Save file for next step (importing)
@temp_file = TempFile.new( temp_file = TempFile.new(
session_id: @session.id, session_id: @session.id,
file: @file file: @file
) )
if @temp_file.save if temp_file.save
@temp_file.destroy_obsolete temp_file.destroy_obsolete
return true return temp_file
end end
end end
Data = Struct.new( Data = Struct.new(
:header, :columns, :available_fields, :repository, :temp_file :header, :columns, :available_fields, :repository
) )
end end
end end

View file

@ -58,7 +58,7 @@
</tbody> </tbody>
</table> </table>
</div> </div>
<%= hidden_field_tag 'file_id', @import_data.temp_file.id %> <%= hidden_field_tag 'file_id', @temp_file.id %>
<div id="import-errors-container"> <div id="import-errors-container">
</div> </div>