From 56f52ebfa7fa18ec6a9a1a39fe33ba7df30034bc Mon Sep 17 00:00:00 2001 From: Oleksii Kriuchykhin Date: Thu, 12 Oct 2017 14:43:25 +0200 Subject: [PATCH 1/3] Optimize memory usage in samples/repositories import [SCI-1665] --- Gemfile | 3 +- Gemfile.lock | 4 + app/controllers/repositories_controller.rb | 11 +- app/controllers/teams_controller.rb | 177 ++++++++---------- app/models/repository.rb | 107 ++++++----- app/models/team.rb | 30 ++- .../import_repository/parse_repository.rb | 44 +++-- .../_parse_records_modal.html.erb | 2 +- 8 files changed, 213 insertions(+), 165 deletions(-) diff --git a/Gemfile b/Gemfile index 6754fde93..65740b8d7 100644 --- a/Gemfile +++ b/Gemfile @@ -47,6 +47,7 @@ gem 'commit_param_routing' # Enables different submit actions in the same form t gem 'kaminari' gem "i18n-js", ">= 3.0.0.rc11" # Localization in javascript files gem 'roo', '~> 2.7.1' # Spreadsheet parser +gem 'creek' gem 'wicked_pdf' gem 'silencer' # Silence certain Rails logs gem 'wkhtmltopdf-heroku' @@ -72,7 +73,6 @@ gem 'ruby-graphviz', '~> 1.2' # Graphviz for rails gem 'tinymce-rails', '~> 4.5.7' # Rich text editor gem 'base62' # Used for smart annotations -gem 'newrelic_rpm' group :development, :test do gem 'byebug' @@ -85,6 +85,7 @@ group :development, :test do end group :production do + gem 'newrelic_rpm' gem 'puma' gem 'rails_12factor' end diff --git a/Gemfile.lock b/Gemfile.lock index aed66990c..be92e9091 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -115,6 +115,9 @@ GEM commit_param_routing (0.0.1) concurrent-ruby (1.0.0) crass (1.0.2) + creek (1.1.2) + nokogiri (~> 1.6.0) + rubyzip (>= 1.0.0) debug_inspector (0.0.2) deface (1.0.2) colorize (>= 0.5.8) @@ -362,6 +365,7 @@ DEPENDENCIES bootstrap_form byebug commit_param_routing + creek deface (~> 1.0) delayed_job_active_record delayed_paperclip! diff --git a/app/controllers/repositories_controller.rb b/app/controllers/repositories_controller.rb index bb56e7938..6b638589c 100644 --- a/app/controllers/repositories_controller.rb +++ b/app/controllers/repositories_controller.rb @@ -198,12 +198,15 @@ class RepositoriesController < ApplicationController if parsed_file.too_large? repository_response(t('general.file.size_exceeded', file_size: Constants::FILE_MAX_SIZE_MB)) - elsif parsed_file.empty? - flash[:notice] = t('teams.parse_sheet.errors.empty_file') - redirect_to back and return else @import_data = parsed_file.data - if parsed_file.generated_temp_file? + + unless @import_data.header.any? && @import_data.columns.any? + return repository_response(t('teams.parse_sheet.errors.empty_file')) + end + + @temp_file = parsed_file.generate_temp_file + if @temp_file respond_to do |format| format.json do render json: { diff --git a/app/controllers/teams_controller.rb b/app/controllers/teams_controller.rb index 622cff038..32bd3d1d8 100644 --- a/app/controllers/teams_controller.rb +++ b/app/controllers/teams_controller.rb @@ -7,106 +7,77 @@ class TeamsController < ApplicationController def parse_sheet session[:return_to] ||= request.referer - respond_to do |format| - if params[:file] - begin + unless params[:file] + return parse_sheet_error(t('teams.parse_sheet.errors.no_file_selected')) + end + if params[:file].size > Constants::FILE_MAX_SIZE_MB.megabytes + error = t('general.file.size_exceeded', + file_size: Constants::FILE_MAX_SIZE_MB) + return parse_sheet_error(error) + end - if params[:file].size > Constants::FILE_MAX_SIZE_MB.megabytes - error = t 'general.file.size_exceeded', - file_size: Constants::FILE_MAX_SIZE_MB - - format.html { - flash[:alert] = error - redirect_to session.delete(:return_to) - } - format.json { - render json: {message: error}, - status: :unprocessable_entity - } - - else - sheet = Team.open_spreadsheet(params[:file]) - - # Check if we actually have any rows (last_row > 1) - if sheet.last_row.between?(0, 1) - flash[:notice] = t( - "teams.parse_sheet.errors.empty_file") - redirect_to session.delete(:return_to) and return - end - - # Get data (it will trigger any errors as well) - @header = sheet.row(1) - @columns = sheet.row(2) - - # Fill in fields for dropdown - @available_fields = @team.get_available_sample_fields - # Truncate long fields - @available_fields.update(@available_fields) do |_k, v| - v.truncate(Constants::NAME_TRUNCATION_LENGTH_DROPDOWN) - end - - # Save file for next step (importing) - @temp_file = TempFile.new( - session_id: session.id, - file: params[:file] - ) - - if @temp_file.save - @temp_file.destroy_obsolete - # format.html - format.json { - render :json => { - :html => render_to_string({ - :partial => "samples/parse_samples_modal.html.erb" - }) - } - } - else - error = t("teams.parse_sheet.errors.temp_file_failure") - format.html { - flash[:alert] = error - redirect_to session.delete(:return_to) - } - format.json { - render json: {message: error}, - status: :unprocessable_entity - } - end - end - rescue ArgumentError, CSV::MalformedCSVError - error = t('teams.parse_sheet.errors.invalid_file', - encoding: ''.encoding) - format.html { - flash[:alert] = error - redirect_to session.delete(:return_to) - } - format.json { - render json: {message: error}, - status: :unprocessable_entity - } - rescue TypeError - error = t("teams.parse_sheet.errors.invalid_extension") - format.html { - flash[:alert] = error - redirect_to session.delete(:return_to) - } - format.json { - render json: {message: error}, - status: :unprocessable_entity - } + begin + sheet = Team.open_spreadsheet(params[:file]) + # Get data (it will trigger any errors as well) + if sheet.is_a?(Roo::CSV) + @header = sheet.row(1) + @columns = sheet.row(2) + elsif sheet.is_a?(Roo::Excelx) + i = 1 + sheet.each_row_streaming do |row| + @header = row.map(&:cell_value) if i == 1 + @columns = row.map(&:cell_value) if i == 2 + i += 1 + break if i > 2 end else - error = t("teams.parse_sheet.errors.no_file_selected") - format.html { - flash[:alert] = error - session[:return_to] ||= request.referer - redirect_to session.delete(:return_to) - } - format.json { - render json: {message: error}, - status: :unprocessable_entity - } + i = 1 + sheet.rows.each do |row| + @header = row.values if i == 1 + @columns = row.values if i == 2 + i += 1 + break if i > 2 + end end + + unless @header && @header.any? && @columns && @columns.any? + return parse_sheet_error(t('teams.parse_sheet.errors.empty_file')) + end + + # Fill in fields for dropdown + @available_fields = @team.get_available_sample_fields + # Truncate long fields + @available_fields.update(@available_fields) do |_k, v| + v.truncate(Constants::NAME_TRUNCATION_LENGTH_DROPDOWN) + end + + # Save file for next step (importing) + @temp_file = TempFile.new( + session_id: session.id, + file: params[:file] + ) + + if @temp_file.save + @temp_file.destroy_obsolete + respond_to do |format| + format.json do + render json: { + html: render_to_string( + partial: 'samples/parse_samples_modal.html.erb' + ) + } + end + end + else + return parse_sheet_error( + t('teams.parse_sheet.errors.temp_file_failure') + ) + end + rescue ArgumentError, CSV::MalformedCSVError + return parse_sheet_error(t('teams.parse_sheet.errors.invalid_file', + encoding: ''.encoding)) + rescue TypeError + return parse_sheet_error(t('teams.parse_sheet.errors.invalid_extension')) end end @@ -275,6 +246,20 @@ class TeamsController < ApplicationController private + def parse_sheet_error(error) + respond_to do |format| + format.html do + flash[:alert] = error + session[:return_to] ||= request.referer + redirect_to session.delete(:return_to) + end + format.json do + render json: { message: error }, + status: :unprocessable_entity + end + end + end + def load_vars @team = Team.find_by_id(params[:id]) diff --git a/app/models/repository.rb b/app/models/repository.rb index 64058833b..93b5a30e4 100644 --- a/app/models/repository.rb +++ b/app/models/repository.rb @@ -114,6 +114,7 @@ class Repository < ActiveRecord::Base name_index = -1 total_nr = 0 nr_of_added = 0 + header_skipped = false mappings.each.with_index do |(_k, value), index| if value == '-1' @@ -130,54 +131,69 @@ class Repository < ActiveRecord::Base unless col_compact.map(&:id).uniq.length == col_compact.length return { status: :error, nr_of_added: nr_of_added, total_nr: total_nr } end + rows = if sheet.is_a?(Roo::CSV) + sheet + elsif sheet.is_a?(Roo::Excelx) + sheet.each_row_streaming + else + sheet.rows + end # Now we can iterate through record data and save stuff into db - transaction do - (2..sheet.last_row).each do |i| - total_nr += 1 - record_row = RepositoryRow.new(name: sheet.row(i)[name_index], - repository: self, - created_by: user, - last_modified_by: user) - record_row.transaction(requires_new: true) do - unless record_row.save - errors = true - raise ActiveRecord::Rollback - end + rows.each do |row| + # Skip empty rows + next if row.empty? + unless header_skipped + header_skipped = true + next + end + total_nr += 1 - row_cell_values = [] + # Creek XLSX parser returns Hash of the row, Roo - Array + row = row.is_a?(Hash) ? row.values.map(&:to_s) : row.map(&:to_s) - sheet.row(i).each.with_index do |value, index| - if columns[index] && value - cell_value = RepositoryTextValue.new( - data: value, - created_by: user, - last_modified_by: user, - repository_cell_attributes: { - repository_row: record_row, - repository_column: columns[index] - } - ) - cell = RepositoryCell.new(repository_row: record_row, - repository_column: columns[index], - value: cell_value) - cell.skip_on_import = true - cell_value.repository_cell = cell - unless cell.valid? && cell_value.valid? - errors = true - raise ActiveRecord::Rollback - end - row_cell_values << cell_value - end - end - if RepositoryTextValue.import(row_cell_values, - recursive: true, - validate: false).failed_instances.any? - errors = true - raise ActiveRecord::Rollback - end - nr_of_added += 1 + record_row = RepositoryRow.new(name: row[name_index], + repository: self, + created_by: user, + last_modified_by: user) + record_row.transaction do + unless record_row.save + errors = true + raise ActiveRecord::Rollback end + + row_cell_values = [] + + row.each.with_index do |value, index| + if columns[index] && value + cell_value = RepositoryTextValue.new( + data: value, + created_by: user, + last_modified_by: user, + repository_cell_attributes: { + repository_row: record_row, + repository_column: columns[index] + } + ) + cell = RepositoryCell.new(repository_row: record_row, + repository_column: columns[index], + value: cell_value) + cell.skip_on_import = true + cell_value.repository_cell = cell + unless cell.valid? && cell_value.valid? + errors = true + raise ActiveRecord::Rollback + end + row_cell_values << cell_value + end + end + if RepositoryTextValue.import(row_cell_values, + recursive: true, + validate: false).failed_instances.any? + errors = true + raise ActiveRecord::Rollback + end + nr_of_added += 1 end end @@ -199,7 +215,10 @@ class Repository < ActiveRecord::Base # This assumption is based purely on biologist's habits Roo::CSV.new(file_path, csv_options: { col_sep: "\t" }) when '.xlsx' - Roo::Excelx.new(file_path) + # Roo Excel parcel was replaced with Creek, but it can be enabled back, + # just swap lines below. But only one can be enabled at the same time. + # Roo::Excelx.new(file_path) + Creek::Book.new(file_path).sheets[0] else raise TypeError end diff --git a/app/models/team.rb b/app/models/team.rb index 04a45375a..57284d2d1 100644 --- a/app/models/team.rb +++ b/app/models/team.rb @@ -45,7 +45,10 @@ class Team < ActiveRecord::Base # This assumption is based purely on biologist's habits Roo::CSV.new(file_path, csv_options: { col_sep: "\t" }) when '.xlsx' then - Roo::Excelx.new(file_path) + # Roo Excel parcel was replaced with Creek, but it can be enabled back, + # just swap lines below. But only one can be enabled at the same time. + # Roo::Excelx.new(file_path) + Creek::Book.new(file_path).sheets[0] else raise TypeError end @@ -66,6 +69,7 @@ class Team < ActiveRecord::Base errors = false nr_of_added = 0 total_nr = 0 + header_skipped = false # First let's query for all custom_fields we're refering to custom_fields = [] @@ -91,10 +95,28 @@ class Team < ActiveRecord::Base custom_fields << cf end end + + rows = if sheet.is_a?(Roo::CSV) + sheet + elsif sheet.is_a?(Roo::Excelx) + sheet.each_row_streaming + else + sheet.rows + end + # Now we can iterate through sample data and save stuff into db - (2..sheet.last_row).each do |i| + rows.each do |row| + # Skip empty rows + next if row.empty? + unless header_skipped + header_skipped = true + next + end total_nr += 1 - sample = Sample.new(name: sheet.row(i)[sname_index], + # Creek XLSX parser returns Hash of the row, Roo - Array + row = row.is_a?(Hash) ? row.values.map(&:to_s) : row.map(&:to_s) + + sample = Sample.new(name: row[sname_index], team: self, user: user) @@ -104,7 +126,7 @@ class Team < ActiveRecord::Base raise ActiveRecord::Rollback end - sheet.row(i).each.with_index do |value, index| + row.each.with_index do |value, index| if index == stype_index stype = SampleType.where(name: value.strip, team: self).take diff --git a/app/services/import_repository/parse_repository.rb b/app/services/import_repository/parse_repository.rb index 192f4b61e..23fe97b67 100644 --- a/app/services/import_repository/parse_repository.rb +++ b/app/services/import_repository/parse_repository.rb @@ -10,43 +10,57 @@ module ImportRepository def data # Get data (it will trigger any errors as well) - header = @sheet.row(1) - columns = @sheet.row(2) + if @sheet.is_a?(Roo::CSV) + header = @sheet.row(1) + columns = @sheet.row(2) + elsif @sheet.is_a?(Roo::Excelx) + i = 1 + @sheet.each_row_streaming do |row| + header = row.map(&:cell_value) if i == 1 + columns = row.map(&:cell_value) if i == 2 + i += 1 + break if i > 2 + end + else + i = 1 + @sheet.rows.each do |row| + header = row.values if i == 1 + columns = row.values if i == 2 + i += 1 + break if i > 2 + end + end # Fill in fields for dropdown @repository.available_repository_fields.transform_values! do |name| truncate(name, length: Constants::NAME_TRUNCATION_LENGTH_DROPDOWN) end - @temp_file = TempFile.create(session_id: @session.id, file: @file) + header ||= [] + columns ||= [] Data.new(header, columns, @repository.available_repository_fields, - @repository, - @temp_file) + @repository) end def too_large? @file.size > Constants::FILE_MAX_SIZE_MB.megabytes end - def empty? - @sheet.last_row.between?(0, 1) - end - - def generated_temp_file? + def generate_temp_file # Save file for next step (importing) - @temp_file = TempFile.new( + temp_file = TempFile.new( session_id: @session.id, file: @file ) - if @temp_file.save - @temp_file.destroy_obsolete - return true + if temp_file.save + temp_file.destroy_obsolete + return temp_file end end Data = Struct.new( - :header, :columns, :available_fields, :repository, :temp_file + :header, :columns, :available_fields, :repository ) end end diff --git a/app/views/repositories/_parse_records_modal.html.erb b/app/views/repositories/_parse_records_modal.html.erb index ccab285b1..7a15968ba 100644 --- a/app/views/repositories/_parse_records_modal.html.erb +++ b/app/views/repositories/_parse_records_modal.html.erb @@ -58,7 +58,7 @@ - <%= hidden_field_tag 'file_id', @import_data.temp_file.id %> + <%= hidden_field_tag 'file_id', @temp_file.id %>
From c8d2ae70fa40fb0fe1bbdf3dc2063015d5e9ef7e Mon Sep 17 00:00:00 2001 From: Oleksii Kriuchykhin Date: Tue, 17 Oct 2017 14:42:06 +0200 Subject: [PATCH 2/3] Move duplicated code to service [SCI-1665] --- Gemfile | 2 +- app/controllers/repositories_controller.rb | 3 +- app/controllers/teams_controller.rb | 28 ++-------- app/models/repository.rb | 40 +------------- app/models/team.rb | 35 +----------- .../import_repository/import_records.rb | 8 ++- .../import_repository/parse_repository.rb | 26 +-------- app/services/spreadsheet_parser.rb | 55 +++++++++++++++++++ 8 files changed, 70 insertions(+), 127 deletions(-) create mode 100644 app/services/spreadsheet_parser.rb diff --git a/Gemfile b/Gemfile index 65740b8d7..9a8576cd9 100644 --- a/Gemfile +++ b/Gemfile @@ -73,6 +73,7 @@ gem 'ruby-graphviz', '~> 1.2' # Graphviz for rails gem 'tinymce-rails', '~> 4.5.7' # Rich text editor gem 'base62' # Used for smart annotations +gem 'newrelic_rpm' group :development, :test do gem 'byebug' @@ -85,7 +86,6 @@ group :development, :test do end group :production do - gem 'newrelic_rpm' gem 'puma' gem 'rails_12factor' end diff --git a/app/controllers/repositories_controller.rb b/app/controllers/repositories_controller.rb index 6b638589c..7454f1b81 100644 --- a/app/controllers/repositories_controller.rb +++ b/app/controllers/repositories_controller.rb @@ -205,8 +205,7 @@ class RepositoriesController < ApplicationController return repository_response(t('teams.parse_sheet.errors.empty_file')) end - @temp_file = parsed_file.generate_temp_file - if @temp_file + if (@temp_file = parsed_file.generate_temp_file) respond_to do |format| format.json do render json: { diff --git a/app/controllers/teams_controller.rb b/app/controllers/teams_controller.rb index 32bd3d1d8..c09e06389 100644 --- a/app/controllers/teams_controller.rb +++ b/app/controllers/teams_controller.rb @@ -17,30 +17,10 @@ class TeamsController < ApplicationController end begin - sheet = Team.open_spreadsheet(params[:file]) - # Get data (it will trigger any errors as well) - if sheet.is_a?(Roo::CSV) - @header = sheet.row(1) - @columns = sheet.row(2) - elsif sheet.is_a?(Roo::Excelx) - i = 1 - sheet.each_row_streaming do |row| - @header = row.map(&:cell_value) if i == 1 - @columns = row.map(&:cell_value) if i == 2 - i += 1 - break if i > 2 - end - else - i = 1 - sheet.rows.each do |row| - @header = row.values if i == 1 - @columns = row.values if i == 2 - i += 1 - break if i > 2 - end - end + sheet = SpreadsheetParser.open_spreadsheet(params[:file]) + @header, @columns = SpreadsheetParser.first_two_rows(sheet) - unless @header && @header.any? && @columns && @columns.any? + unless @header.any? && @columns.any? return parse_sheet_error(t('teams.parse_sheet.errors.empty_file')) end @@ -93,7 +73,7 @@ class TeamsController < ApplicationController if @temp_file.session_id == session.id # Check if mappings exists or else we don't have anything to parse if params[:mappings] - @sheet = Team.open_spreadsheet(@temp_file.file) + @sheet = SpreadsheetParser.open_spreadsheet(@temp_file.file) # Check for duplicated values h1 = params[:mappings].clone.delete_if { |k, v| v.empty? } diff --git a/app/models/repository.rb b/app/models/repository.rb index 93b5a30e4..349651882 100644 --- a/app/models/repository.rb +++ b/app/models/repository.rb @@ -58,17 +58,6 @@ class Repository < ActiveRecord::Base end end - def open_spreadsheet(file) - filename = file.original_filename - file_path = file.path - - if file.class == Paperclip::Attachment && file.is_stored_on_s3? - fa = file.fetch - file_path = fa.path - end - generate_file(filename, file_path) - end - def available_repository_fields fields = {} # First and foremost add record name @@ -131,13 +120,7 @@ class Repository < ActiveRecord::Base unless col_compact.map(&:id).uniq.length == col_compact.length return { status: :error, nr_of_added: nr_of_added, total_nr: total_nr } end - rows = if sheet.is_a?(Roo::CSV) - sheet - elsif sheet.is_a?(Roo::Excelx) - sheet.each_row_streaming - else - sheet.rows - end + rows = SpreadsheetParser.spreadsheet_enumerator(sheet) # Now we can iterate through record data and save stuff into db rows.each do |row| @@ -202,25 +185,4 @@ class Repository < ActiveRecord::Base end { status: :ok, nr_of_added: nr_of_added, total_nr: total_nr } end - - private - - def generate_file(filename, file_path) - case File.extname(filename) - when '.csv' - Roo::CSV.new(file_path, extension: :csv) - when '.tsv' - Roo::CSV.new(file_path, csv_options: { col_sep: "\t" }) - when '.txt' - # This assumption is based purely on biologist's habits - Roo::CSV.new(file_path, csv_options: { col_sep: "\t" }) - when '.xlsx' - # Roo Excel parcel was replaced with Creek, but it can be enabled back, - # just swap lines below. But only one can be enabled at the same time. - # Roo::Excelx.new(file_path) - Creek::Book.new(file_path).sheets[0] - else - raise TypeError - end - end end diff --git a/app/models/team.rb b/app/models/team.rb index 57284d2d1..ad22d7d42 100644 --- a/app/models/team.rb +++ b/app/models/team.rb @@ -26,33 +26,6 @@ class Team < ActiveRecord::Base has_many :protocol_keywords, inverse_of: :team, dependent: :destroy has_many :tiny_mce_assets, inverse_of: :team, dependent: :destroy has_many :repositories, dependent: :destroy - # Based on file's extension opens file (used for importing) - def self.open_spreadsheet(file) - filename = file.original_filename - file_path = file.path - - if file.class == Paperclip::Attachment and file.is_stored_on_s3? - fa = file.fetch - file_path = fa.path - end - - case File.extname(filename) - when '.csv' then - Roo::CSV.new(file_path, extension: :csv) - when '.tsv' then - Roo::CSV.new(file_path, csv_options: { col_sep: "\t" }) - when '.txt' then - # This assumption is based purely on biologist's habits - Roo::CSV.new(file_path, csv_options: { col_sep: "\t" }) - when '.xlsx' then - # Roo Excel parcel was replaced with Creek, but it can be enabled back, - # just swap lines below. But only one can be enabled at the same time. - # Roo::Excelx.new(file_path) - Creek::Book.new(file_path).sheets[0] - else - raise TypeError - end - end def search_users(query = nil) a_query = "%#{query}%" @@ -96,13 +69,7 @@ class Team < ActiveRecord::Base end end - rows = if sheet.is_a?(Roo::CSV) - sheet - elsif sheet.is_a?(Roo::Excelx) - sheet.each_row_streaming - else - sheet.rows - end + rows = SpreadsheetParser.spreadsheet_enumerator(sheet) # Now we can iterate through sample data and save stuff into db rows.each do |row| diff --git a/app/services/import_repository/import_records.rb b/app/services/import_repository/import_records.rb index e9deea8b4..ba80a4b0a 100644 --- a/app/services/import_repository/import_records.rb +++ b/app/services/import_repository/import_records.rb @@ -17,9 +17,11 @@ module ImportRepository private def run_import_actions - @repository.import_records(@repository.open_spreadsheet(@temp_file.file), - @mappings, - @user) + @repository.import_records( + SpreadsheetParser.open_spreadsheet(@temp_file.file), + @mappings, + @user + ) end def run_checks diff --git a/app/services/import_repository/parse_repository.rb b/app/services/import_repository/parse_repository.rb index 23fe97b67..222b71274 100644 --- a/app/services/import_repository/parse_repository.rb +++ b/app/services/import_repository/parse_repository.rb @@ -5,37 +5,15 @@ module ImportRepository @file = options.fetch(:file) @repository = options.fetch(:repository) @session = options.fetch(:session) - @sheet = @repository.open_spreadsheet(@file) + @sheet = SpreadsheetParser.open_spreadsheet(@file) end def data - # Get data (it will trigger any errors as well) - if @sheet.is_a?(Roo::CSV) - header = @sheet.row(1) - columns = @sheet.row(2) - elsif @sheet.is_a?(Roo::Excelx) - i = 1 - @sheet.each_row_streaming do |row| - header = row.map(&:cell_value) if i == 1 - columns = row.map(&:cell_value) if i == 2 - i += 1 - break if i > 2 - end - else - i = 1 - @sheet.rows.each do |row| - header = row.values if i == 1 - columns = row.values if i == 2 - i += 1 - break if i > 2 - end - end + header, columns = SpreadsheetParser.first_two_rows(@sheet) # Fill in fields for dropdown @repository.available_repository_fields.transform_values! do |name| truncate(name, length: Constants::NAME_TRUNCATION_LENGTH_DROPDOWN) end - header ||= [] - columns ||= [] Data.new(header, columns, @repository.available_repository_fields, diff --git a/app/services/spreadsheet_parser.rb b/app/services/spreadsheet_parser.rb new file mode 100644 index 000000000..3d5a64774 --- /dev/null +++ b/app/services/spreadsheet_parser.rb @@ -0,0 +1,55 @@ +class SpreadsheetParser + # Based on file's extension opens file (used for importing) + def self.open_spreadsheet(file) + filename = file.original_filename + file_path = file.path + + if file.class == Paperclip::Attachment && file.is_stored_on_s3? + fa = file.fetch + file_path = fa.path + end + + case File.extname(filename) + when '.csv' + Roo::CSV.new(file_path, extension: :csv) + when '.tsv' + Roo::CSV.new(file_path, csv_options: { col_sep: "\t" }) + when '.txt' + # This assumption is based purely on biologist's habits + Roo::CSV.new(file_path, csv_options: { col_sep: "\t" }) + when '.xlsx' + # Roo Excel parcel was replaced with Creek, but it can be enabled back, + # just swap lines below. But only one can be enabled at the same time. + # Roo::Excelx.new(file_path) + Creek::Book.new(file_path).sheets[0] + else + raise TypeError + end + end + + def self.spreadsheet_enumerator(sheet) + if sheet.is_a?(Roo::CSV) + sheet + elsif sheet.is_a?(Roo::Excelx) + sheet.each_row_streaming + else + sheet.rows + end + end + + def self.first_two_rows(sheet) + rows = spreadsheet_enumerator(sheet) + header = [] + columns = [] + i = 1 + rows.each do |row| + # Creek XLSX parser returns Hash of the row, Roo - Array + row = row.is_a?(Hash) ? row.values.map(&:to_s) : row.map(&:to_s) + header = row if i == 1 && row + columns = row if i == 2 && row + i += 1 + break if i > 2 + end + return header, columns + end +end From cd0aba1896f6d9b72246e09f03dbf94e1b003344 Mon Sep 17 00:00:00 2001 From: Oleksii Kriuchykhin Date: Tue, 17 Oct 2017 15:23:54 +0200 Subject: [PATCH 3/3] Change any? to empty? [SCI-1665] --- app/controllers/repositories_controller.rb | 2 +- app/controllers/teams_controller.rb | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/app/controllers/repositories_controller.rb b/app/controllers/repositories_controller.rb index 7454f1b81..e356471e8 100644 --- a/app/controllers/repositories_controller.rb +++ b/app/controllers/repositories_controller.rb @@ -201,7 +201,7 @@ class RepositoriesController < ApplicationController else @import_data = parsed_file.data - unless @import_data.header.any? && @import_data.columns.any? + if @import_data.header.empty? || @import_data.columns.empty? return repository_response(t('teams.parse_sheet.errors.empty_file')) end diff --git a/app/controllers/teams_controller.rb b/app/controllers/teams_controller.rb index c09e06389..982b34d11 100644 --- a/app/controllers/teams_controller.rb +++ b/app/controllers/teams_controller.rb @@ -20,7 +20,7 @@ class TeamsController < ApplicationController sheet = SpreadsheetParser.open_spreadsheet(params[:file]) @header, @columns = SpreadsheetParser.first_two_rows(sheet) - unless @header.any? && @columns.any? + if @header.empty? || @columns.empty? return parse_sheet_error(t('teams.parse_sheet.errors.empty_file')) end