# scinote-web/app/services/spreadsheet_parser.rb
# (repository viewer metadata: 78 lines, 2 KiB, Ruby; history timestamp 2019-09-05 14:35:11 +08:00)
# frozen_string_literal: true

# Class-level helpers for opening spreadsheet files with Roo and reading
# their rows (used for importing).
class SpreadsheetParser
  # Opens +file+ with the Roo reader that matches its file extension.
  #
  # The extension is read from +file.original_filename+ when the object's
  # class is named +UploadedFile+ (in any namespace); otherwise it is read
  # from the basename of +file.path+. Matching is case-insensitive, so
  # "DATA.CSV" is handled like "data.csv".
  #
  # @param file [#path] object exposing the location of the file on disk
  # @return [Roo::CSV, Roo::Excelx] reader opened on +file.path+
  # @raise [TypeError] when the extension is not .csv, .tsv, .txt or .xlsx
  def self.open_spreadsheet(file)
    file_path = file.path
    # +to_s+ guards against anonymous classes, whose +name+ is nil.
    uploaded = file.class.name.to_s.split('::').last == 'UploadedFile'
    filename = uploaded ? file.original_filename : File.basename(file_path)
    extension = File.extname(filename).downcase

    case extension
    when '.csv'
      Roo::CSV.new(file_path, extension: :csv)
    when '.tsv', '.txt'
      # Treating .txt as tab-separated is an assumption based purely on
      # biologists' habits.
      Roo::CSV.new(file_path, csv_options: { col_sep: "\t" })
    when '.xlsx'
      Roo::Excelx.new(file_path)
    else
      raise TypeError, "Unsupported spreadsheet extension: #{extension.inspect}"
    end
  end

  # Returns an object whose rows can be enumerated, depending on the sheet type.
  #
  # @param sheet [Roo::CSV, Roo::Excelx, #rows]
  # @return [Object] the sheet itself for Roo::CSV, the result of
  #   +each_row_streaming(pad_cells: true)+ for Roo::Excelx, and +sheet.rows+
  #   for anything else
  def self.spreadsheet_enumerator(sheet)
    case sheet
    when Roo::CSV
      sheet
    when Roo::Excelx
      # NOTE(review): pad_cells presumably yields nil placeholders for empty
      # cells; parse_row tolerates nil cells either way - confirm in Roo docs.
      sheet.each_row_streaming(pad_cells: true)
    else
      sheet.rows
    end
  end

  # Parses the first two rows of +sheet+.
  #
  # The first row is parsed as the header row, the second as a data row.
  #
  # @param sheet [Roo::CSV, Roo::Excelx, #rows]
  # @param date_format [String, nil] forwarded to {parse_row}
  # @return [Array(Array, Array)] +[header, columns]+; a row that is missing
  #   from the sheet is returned as an empty array
  def self.first_two_rows(sheet, date_format: nil)
    parsed = spreadsheet_enumerator(sheet).take(2).each_with_index.map do |row_values, index|
      parse_row(row_values, sheet, header: index.zero?, date_format: date_format)
    end
    header, columns = parsed

    [header || [], columns || []]
  end

  # Converts one raw row into an array of plain values.
  #
  # Header rows, and every row of a sheet that is not a Roo::Excelx, are
  # converted cell by cell with +to_s+. Data rows of a Roo::Excelx sheet are
  # converted per cell type (see +excelx_cell_value+).
  #
  # @param row [Enumerable] raw cells of a single row
  # @param sheet [Object] the sheet the row belongs to
  # @param header [Boolean] whether +row+ is the header row
  # @param date_format [String, nil] strftime pattern applied to Date values
  #   in Roo::Excelx data rows
  # @return [Array]
  def self.parse_row(row, sheet, header: false, date_format: nil)
    return row.map(&:to_s) if header || !sheet.is_a?(Roo::Excelx)

    row.map { |cell| excelx_cell_value(cell, date_format) }
  end

  # Returns the IDs (first column) that occur more than once in +sheet+,
  # ignoring the header row.
  #
  # @param sheet [Enumerable] rows, each responding to +first+
  # @return [Array] duplicated IDs, each listed once, in order of first
  #   appearance
  def self.duplicate_ids(sheet)
    # Extract IDs from the sheet, dropping the header row.
    ids = sheet.drop(1).map(&:first)
    # Keep only the IDs that appear more than once.
    ids.group_by(&:itself).select { |_, occurrences| occurrences.size > 1 }.keys
  end

  # Converts a single cell of a Roo::Excelx data row.
  #
  # * nil cell: nil
  # * number cell whose format is 'General': +value.to_d+ (nil if no value)
  # * Date value while +date_format+ is given: +strftime(date_format)+
  # * anything else: the cell's +formatted_value+
  def self.excelx_cell_value(cell, date_format)
    return if cell.nil?

    if cell.is_a?(Roo::Excelx::Cell::Number) && cell.format == 'General'
      cell.value&.to_d
    elsif date_format && cell.value.is_a?(Date)
      cell.value.strftime(date_format)
    else
      cell.formatted_value
    end
  end
  private_class_method :excelx_cell_value
end