Merge pull request from okriuchykhin/ok_SCI_1665

Optimize memory usage in samples/repositories import [SCI-1665]
This commit is contained in:
okriuchykhin 2017-10-17 17:26:00 +02:00 committed by GitHub
commit 8e90e041b9
10 changed files with 209 additions and 218 deletions

View file

@ -47,6 +47,7 @@ gem 'commit_param_routing' # Enables different submit actions in the same form t
gem 'kaminari'
gem "i18n-js", ">= 3.0.0.rc11" # Localization in javascript files
gem 'roo', '~> 2.7.1' # Spreadsheet parser
gem 'creek'
gem 'wicked_pdf'
gem 'silencer' # Silence certain Rails logs
gem 'wkhtmltopdf-heroku'

View file

@ -117,6 +117,9 @@ GEM
commit_param_routing (0.0.1)
concurrent-ruby (1.0.0)
crass (1.0.2)
creek (1.1.2)
nokogiri (~> 1.6.0)
rubyzip (>= 1.0.0)
debug_inspector (0.0.2)
deface (1.0.2)
colorize (>= 0.5.8)
@ -364,6 +367,7 @@ DEPENDENCIES
bootstrap_form
byebug
commit_param_routing
creek
deface (~> 1.0)
delayed_job_active_record
delayed_paperclip!

View file

@ -198,12 +198,14 @@ class RepositoriesController < ApplicationController
if parsed_file.too_large?
repository_response(t('general.file.size_exceeded',
file_size: Constants::FILE_MAX_SIZE_MB))
elsif parsed_file.empty?
flash[:notice] = t('teams.parse_sheet.errors.empty_file')
redirect_to back and return
else
@import_data = parsed_file.data
if parsed_file.generated_temp_file?
if @import_data.header.empty? || @import_data.columns.empty?
return repository_response(t('teams.parse_sheet.errors.empty_file'))
end
if (@temp_file = parsed_file.generate_temp_file)
respond_to do |format|
format.json do
render json: {

View file

@ -7,106 +7,57 @@ class TeamsController < ApplicationController
def parse_sheet
session[:return_to] ||= request.referer
respond_to do |format|
if params[:file]
begin
unless params[:file]
return parse_sheet_error(t('teams.parse_sheet.errors.no_file_selected'))
end
if params[:file].size > Constants::FILE_MAX_SIZE_MB.megabytes
error = t('general.file.size_exceeded',
file_size: Constants::FILE_MAX_SIZE_MB)
return parse_sheet_error(error)
end
if params[:file].size > Constants::FILE_MAX_SIZE_MB.megabytes
error = t 'general.file.size_exceeded',
file_size: Constants::FILE_MAX_SIZE_MB
begin
sheet = SpreadsheetParser.open_spreadsheet(params[:file])
@header, @columns = SpreadsheetParser.first_two_rows(sheet)
format.html {
flash[:alert] = error
redirect_to session.delete(:return_to)
if @header.empty? || @columns.empty?
return parse_sheet_error(t('teams.parse_sheet.errors.empty_file'))
end
# Fill in fields for dropdown
@available_fields = @team.get_available_sample_fields
# Truncate long fields
@available_fields.update(@available_fields) do |_k, v|
v.truncate(Constants::NAME_TRUNCATION_LENGTH_DROPDOWN)
end
# Save file for next step (importing)
@temp_file = TempFile.new(
session_id: session.id,
file: params[:file]
)
if @temp_file.save
@temp_file.destroy_obsolete
respond_to do |format|
format.json do
render json: {
html: render_to_string(
partial: 'samples/parse_samples_modal.html.erb'
)
}
format.json {
render json: {message: error},
status: :unprocessable_entity
}
else
sheet = Team.open_spreadsheet(params[:file])
# Check if we actually have any rows (last_row > 1)
if sheet.last_row.between?(0, 1)
flash[:notice] = t(
"teams.parse_sheet.errors.empty_file")
redirect_to session.delete(:return_to) and return
end
# Get data (it will trigger any errors as well)
@header = sheet.row(1)
@columns = sheet.row(2)
# Fill in fields for dropdown
@available_fields = @team.get_available_sample_fields
# Truncate long fields
@available_fields.update(@available_fields) do |_k, v|
v.truncate(Constants::NAME_TRUNCATION_LENGTH_DROPDOWN)
end
# Save file for next step (importing)
@temp_file = TempFile.new(
session_id: session.id,
file: params[:file]
)
if @temp_file.save
@temp_file.destroy_obsolete
# format.html
format.json {
render :json => {
:html => render_to_string({
:partial => "samples/parse_samples_modal.html.erb"
})
}
}
else
error = t("teams.parse_sheet.errors.temp_file_failure")
format.html {
flash[:alert] = error
redirect_to session.delete(:return_to)
}
format.json {
render json: {message: error},
status: :unprocessable_entity
}
end
end
rescue ArgumentError, CSV::MalformedCSVError
error = t('teams.parse_sheet.errors.invalid_file',
encoding: ''.encoding)
format.html {
flash[:alert] = error
redirect_to session.delete(:return_to)
}
format.json {
render json: {message: error},
status: :unprocessable_entity
}
rescue TypeError
error = t("teams.parse_sheet.errors.invalid_extension")
format.html {
flash[:alert] = error
redirect_to session.delete(:return_to)
}
format.json {
render json: {message: error},
status: :unprocessable_entity
}
end
else
error = t("teams.parse_sheet.errors.no_file_selected")
format.html {
flash[:alert] = error
session[:return_to] ||= request.referer
redirect_to session.delete(:return_to)
}
format.json {
render json: {message: error},
status: :unprocessable_entity
}
return parse_sheet_error(
t('teams.parse_sheet.errors.temp_file_failure')
)
end
rescue ArgumentError, CSV::MalformedCSVError
return parse_sheet_error(t('teams.parse_sheet.errors.invalid_file',
encoding: ''.encoding))
rescue TypeError
return parse_sheet_error(t('teams.parse_sheet.errors.invalid_extension'))
end
end
@ -122,7 +73,7 @@ class TeamsController < ApplicationController
if @temp_file.session_id == session.id
# Check if mappings exists or else we don't have anything to parse
if params[:mappings]
@sheet = Team.open_spreadsheet(@temp_file.file)
@sheet = SpreadsheetParser.open_spreadsheet(@temp_file.file)
# Check for duplicated values
h1 = params[:mappings].clone.delete_if { |k, v| v.empty? }
@ -275,6 +226,20 @@ class TeamsController < ApplicationController
private
# Renders a parse-sheet failure for both HTML and JSON requests:
# HTML gets a flash alert and a redirect back to the stored return
# location; JSON gets the error message with a 422 status.
def parse_sheet_error(error)
respond_to do |format|
format.html do
flash[:alert] = error
# Fall back to the referer if no return location was stored yet
session[:return_to] ||= request.referer
redirect_to session.delete(:return_to)
end
format.json do
render json: { message: error },
status: :unprocessable_entity
end
end
end
def load_vars
@team = Team.find_by_id(params[:id])

View file

@ -58,17 +58,6 @@ class Repository < ActiveRecord::Base
end
end
# Opens the given uploaded file (or Paperclip attachment) as a
# spreadsheet for importing, delegating parser selection to
# generate_file. S3-backed attachments are fetched to a local copy
# first, since the parser needs a local file path.
def open_spreadsheet(file)
filename = file.original_filename
file_path = file.path
# NOTE(review): exact class comparison — subclasses of
# Paperclip::Attachment would skip the S3 fetch; confirm intended
if file.class == Paperclip::Attachment && file.is_stored_on_s3?
fa = file.fetch
file_path = fa.path
end
generate_file(filename, file_path)
end
def available_repository_fields
fields = {}
# First and foremost add record name
@ -114,6 +103,7 @@ class Repository < ActiveRecord::Base
name_index = -1
total_nr = 0
nr_of_added = 0
header_skipped = false
mappings.each.with_index do |(_k, value), index|
if value == '-1'
@ -130,54 +120,63 @@ class Repository < ActiveRecord::Base
unless col_compact.map(&:id).uniq.length == col_compact.length
return { status: :error, nr_of_added: nr_of_added, total_nr: total_nr }
end
rows = SpreadsheetParser.spreadsheet_enumerator(sheet)
# Now we can iterate through record data and save stuff into db
transaction do
(2..sheet.last_row).each do |i|
total_nr += 1
record_row = RepositoryRow.new(name: sheet.row(i)[name_index],
repository: self,
created_by: user,
last_modified_by: user)
record_row.transaction(requires_new: true) do
unless record_row.save
errors = true
raise ActiveRecord::Rollback
end
rows.each do |row|
# Skip empty rows
next if row.empty?
unless header_skipped
header_skipped = true
next
end
total_nr += 1
row_cell_values = []
# Creek XLSX parser returns Hash of the row, Roo - Array
row = row.is_a?(Hash) ? row.values.map(&:to_s) : row.map(&:to_s)
sheet.row(i).each.with_index do |value, index|
if columns[index] && value
cell_value = RepositoryTextValue.new(
data: value,
created_by: user,
last_modified_by: user,
repository_cell_attributes: {
repository_row: record_row,
repository_column: columns[index]
}
)
cell = RepositoryCell.new(repository_row: record_row,
repository_column: columns[index],
value: cell_value)
cell.skip_on_import = true
cell_value.repository_cell = cell
unless cell.valid? && cell_value.valid?
errors = true
raise ActiveRecord::Rollback
end
row_cell_values << cell_value
end
end
if RepositoryTextValue.import(row_cell_values,
recursive: true,
validate: false).failed_instances.any?
errors = true
raise ActiveRecord::Rollback
end
nr_of_added += 1
record_row = RepositoryRow.new(name: row[name_index],
repository: self,
created_by: user,
last_modified_by: user)
record_row.transaction do
unless record_row.save
errors = true
raise ActiveRecord::Rollback
end
row_cell_values = []
row.each.with_index do |value, index|
if columns[index] && value
cell_value = RepositoryTextValue.new(
data: value,
created_by: user,
last_modified_by: user,
repository_cell_attributes: {
repository_row: record_row,
repository_column: columns[index]
}
)
cell = RepositoryCell.new(repository_row: record_row,
repository_column: columns[index],
value: cell_value)
cell.skip_on_import = true
cell_value.repository_cell = cell
unless cell.valid? && cell_value.valid?
errors = true
raise ActiveRecord::Rollback
end
row_cell_values << cell_value
end
end
if RepositoryTextValue.import(row_cell_values,
recursive: true,
validate: false).failed_instances.any?
errors = true
raise ActiveRecord::Rollback
end
nr_of_added += 1
end
end
@ -186,22 +185,4 @@ class Repository < ActiveRecord::Base
end
{ status: :ok, nr_of_added: nr_of_added, total_nr: total_nr }
end
private
# Builds a Roo spreadsheet object for the file at +file_path+, choosing
# the parser by the extension of +filename+.
# Raises TypeError for unsupported file types.
def generate_file(filename, file_path)
  case File.extname(filename)
  when '.csv'
    Roo::CSV.new(file_path, extension: :csv)
  when '.tsv', '.txt'
    # .txt is assumed tab-separated, purely based on biologists' habits
    Roo::CSV.new(file_path, csv_options: { col_sep: "\t" })
  when '.xlsx'
    Roo::Excelx.new(file_path)
  else
    raise TypeError
  end
end
end

View file

@ -26,30 +26,6 @@ class Team < ActiveRecord::Base
has_many :protocol_keywords, inverse_of: :team, dependent: :destroy
has_many :tiny_mce_assets, inverse_of: :team, dependent: :destroy
has_many :repositories, dependent: :destroy
# Opens the uploaded file as a spreadsheet, choosing the parser by file
# extension (used for importing). Raises TypeError on unknown extensions.
def self.open_spreadsheet(file)
  file_path = file.path
  # S3-backed Paperclip attachments must be fetched to a local copy first
  if file.class == Paperclip::Attachment && file.is_stored_on_s3?
    file_path = file.fetch.path
  end

  case File.extname(file.original_filename)
  when '.csv'
    Roo::CSV.new(file_path, extension: :csv)
  when '.tsv', '.txt'
    # The tab-separated assumption is based purely on biologists' habits
    Roo::CSV.new(file_path, csv_options: { col_sep: "\t" })
  when '.xlsx'
    Roo::Excelx.new(file_path)
  else
    raise TypeError
  end
end
def search_users(query = nil)
a_query = "%#{query}%"
@ -66,6 +42,7 @@ class Team < ActiveRecord::Base
errors = false
nr_of_added = 0
total_nr = 0
header_skipped = false
# First let's query for all custom_fields we're refering to
custom_fields = []
@ -91,10 +68,22 @@ class Team < ActiveRecord::Base
custom_fields << cf
end
end
rows = SpreadsheetParser.spreadsheet_enumerator(sheet)
# Now we can iterate through sample data and save stuff into db
(2..sheet.last_row).each do |i|
rows.each do |row|
# Skip empty rows
next if row.empty?
unless header_skipped
header_skipped = true
next
end
total_nr += 1
sample = Sample.new(name: sheet.row(i)[sname_index],
# Creek XLSX parser returns Hash of the row, Roo - Array
row = row.is_a?(Hash) ? row.values.map(&:to_s) : row.map(&:to_s)
sample = Sample.new(name: row[sname_index],
team: self,
user: user)
@ -104,7 +93,7 @@ class Team < ActiveRecord::Base
raise ActiveRecord::Rollback
end
sheet.row(i).each.with_index do |value, index|
row.each.with_index do |value, index|
if index == stype_index
stype = SampleType.where(team: self)
.where('name ILIKE ?', value.strip)

View file

@ -17,9 +17,11 @@ module ImportRepository
private
def run_import_actions
@repository.import_records(@repository.open_spreadsheet(@temp_file.file),
@mappings,
@user)
@repository.import_records(
SpreadsheetParser.open_spreadsheet(@temp_file.file),
@mappings,
@user
)
end
def run_checks

View file

@ -5,48 +5,40 @@ module ImportRepository
@file = options.fetch(:file)
@repository = options.fetch(:repository)
@session = options.fetch(:session)
@sheet = @repository.open_spreadsheet(@file)
@sheet = SpreadsheetParser.open_spreadsheet(@file)
end
def data
# Get data (it will trigger any errors as well)
header = @sheet.row(1)
columns = @sheet.row(2)
header, columns = SpreadsheetParser.first_two_rows(@sheet)
# Fill in fields for dropdown
@repository.available_repository_fields.transform_values! do |name|
truncate(name, length: Constants::NAME_TRUNCATION_LENGTH_DROPDOWN)
end
@temp_file = TempFile.create(session_id: @session.id, file: @file)
Data.new(header,
columns,
@repository.available_repository_fields,
@repository,
@temp_file)
@repository)
end
def too_large?
@file.size > Constants::FILE_MAX_SIZE_MB.megabytes
end
def empty?
@sheet.last_row.between?(0, 1)
end
def generated_temp_file?
def generate_temp_file
# Save file for next step (importing)
@temp_file = TempFile.new(
temp_file = TempFile.new(
session_id: @session.id,
file: @file
)
if @temp_file.save
@temp_file.destroy_obsolete
return true
if temp_file.save
temp_file.destroy_obsolete
return temp_file
end
end
Data = Struct.new(
:header, :columns, :available_fields, :repository, :temp_file
:header, :columns, :available_fields, :repository
)
end
end

View file

@ -0,0 +1,55 @@
class SpreadsheetParser
  # Opens an uploaded spreadsheet (used for importing), choosing the
  # parser by file extension. Raises TypeError for unsupported types.
  def self.open_spreadsheet(file)
    file_path = file.path
    # Paperclip attachments stored on S3 have to be fetched locally first
    if file.class == Paperclip::Attachment && file.is_stored_on_s3?
      file_path = file.fetch.path
    end

    case File.extname(file.original_filename)
    when '.csv'
      Roo::CSV.new(file_path, extension: :csv)
    when '.tsv'
      Roo::CSV.new(file_path, csv_options: { col_sep: "\t" })
    when '.txt'
      # Assume tab-separated; based purely on biologists' habits
      Roo::CSV.new(file_path, csv_options: { col_sep: "\t" })
    when '.xlsx'
      # The Roo Excelx parser was replaced with Creek; it can be enabled
      # back by swapping the lines below (only one at a time).
      # Roo::Excelx.new(file_path)
      Creek::Book.new(file_path).sheets[0]
    else
      raise TypeError
    end
  end

  # Returns an object that enumerates rows of +sheet+, regardless of
  # which parser produced it.
  def self.spreadsheet_enumerator(sheet)
    case sheet
    when Roo::CSV then sheet
    when Roo::Excelx then sheet.each_row_streaming
    else sheet.rows # Creek book sheet
    end
  end

  # Reads the first two rows (header row and first data row) from
  # +sheet+ and returns them as a pair of string arrays.
  def self.first_two_rows(sheet)
    header = []
    columns = []
    spreadsheet_enumerator(sheet).each_with_index do |row, index|
      # Creek's XLSX parser yields a Hash per row, Roo yields an Array
      values = row.is_a?(Hash) ? row.values.map(&:to_s) : row.map(&:to_s)
      header = values if index.zero?
      columns = values if index == 1
      # Stop consuming the enumerator once both rows have been read
      break if index >= 1
    end
    [header, columns]
  end
end

View file

@ -58,7 +58,7 @@
</tbody>
</table>
</div>
<%= hidden_field_tag 'file_id', @import_data.temp_file.id %>
<%= hidden_field_tag 'file_id', @temp_file.id %>
<div id="import-errors-container">
</div>