From 085cd8dea639ef9853cb6ce5260f5d4c813cc1e0 Mon Sep 17 00:00:00 2001 From: Alex Kriuchykhin Date: Mon, 7 Apr 2025 14:27:55 +0200 Subject: [PATCH] Add size limits for file text extraction and preview generation, remove unused data column from asset_text_data [SCI-11717] (#8369) --- app/controllers/assets_controller.rb | 6 +----- app/models/asset.rb | 3 +-- app/models/asset_text_datum.rb | 14 +------------- app/serializers/asset_serializer.rb | 4 ++-- app/utilities/active_storage_file_util.rb | 1 + config/initializers/constants.rb | 4 ++++ ...remove_data_column_from_asset_text_data.rb | 7 +++++++ db/schema.rb | 1 - .../analyzer/text_extraction_analyzer.rb | 19 +++++++++++-------- spec/factories/asset_text_datums.rb | 1 - spec/models/asset_text_datum_spec.rb | 6 ------ 11 files changed, 28 insertions(+), 38 deletions(-) create mode 100644 db/migrate/20250321114914_remove_data_column_from_asset_text_data.rb diff --git a/app/controllers/assets_controller.rb b/app/controllers/assets_controller.rb index 46e908b52..6f34f3d96 100644 --- a/app/controllers/assets_controller.rb +++ b/app/controllers/assets_controller.rb @@ -147,11 +147,7 @@ class AssetsController < ApplicationController end def show - if @asset - render json: @asset, serializer: AssetSerializer, user: current_user - else - render json: { error: 'Asset not found' }, status: :not_found - end + render json: @asset, serializer: AssetSerializer, user: current_user end def edit diff --git a/app/models/asset.rb b/app/models/asset.rb index be090803c..af7bd78e8 100644 --- a/app/models/asset.rb +++ b/app/models/asset.rb @@ -225,8 +225,8 @@ class Asset < ApplicationRecord def pdf_preview_ready? return false if pdf_preview_processing - return true if file_pdf_preview.attached? + return false unless previewable_document?(blob) PdfPreviewJob.perform_later(id) ActiveRecord::Base.no_touching { update(pdf_preview_processing: true) } @@ -259,7 +259,6 @@ class Asset < ApplicationRecord es = file_size if asset_text_datum.present? && asset_text_datum.persisted? asset_text_datum.reload - es += get_octet_length_record(asset_text_datum, :data) es += get_octet_length_record(asset_text_datum, :data_vector) end es *= Constants::ASSET_ESTIMATED_SIZE_FACTOR diff --git a/app/models/asset_text_datum.rb b/app/models/asset_text_datum.rb index 3b00cfe78..7e8f7ec10 100644 --- a/app/models/asset_text_datum.rb +++ b/app/models/asset_text_datum.rb @@ -3,18 +3,6 @@ class AssetTextDatum < ApplicationRecord include SearchableModel - validates :data, presence: true - validates :asset, presence: true, uniqueness: true + validates :asset, uniqueness: true belongs_to :asset, inverse_of: :asset_text_datum - - after_save :update_ts_index - - def update_ts_index - if saved_change_to_data? - sql = "UPDATE asset_text_data " + - "SET data_vector = to_tsvector(data) " + - "WHERE id = " + Integer(id).to_s - AssetTextDatum.connection.execute(sql) - end - end end diff --git a/app/serializers/asset_serializer.rb b/app/serializers/asset_serializer.rb index 18017dedf..0d34c6940 100644 --- a/app/serializers/asset_serializer.rb +++ b/app/serializers/asset_serializer.rb @@ -90,11 +90,11 @@ class AssetSerializer < ActiveModel::Serializer end def pdf - return unless object.pdf? + return unless object.pdf? || object.file_pdf_preview.attached? { url: object.pdf? ? asset_download_path(object) : asset_pdf_preview_path(object), - size: !object.pdf? && object.pdf_preview_ready? ? object.file_pdf_preview&.blob&.byte_size : object.file_size, + size: object.pdf? ? object.file_size : object.file_pdf_preview&.blob&.byte_size, worker_url: ActionController::Base.helpers.asset_path('pdf_js_worker.js') } end diff --git a/app/utilities/active_storage_file_util.rb b/app/utilities/active_storage_file_util.rb index 61bfde28c..0f6361b92 100644 --- a/app/utilities/active_storage_file_util.rb +++ b/app/utilities/active_storage_file_util.rb @@ -4,6 +4,7 @@ module ActiveStorageFileUtil # Method expects instance of ActiveStorage::Blob as argument def previewable_document?(blob) return false if blob.blank? + return false if blob.byte_size > Constants::PREVIEW_MAX_FILE_SIZE previewable = Constants::PREVIEWABLE_FILE_TYPES.include?(blob.content_type) diff --git a/config/initializers/constants.rb b/config/initializers/constants.rb index 0fd1b166e..ece5328ed 100644 --- a/config/initializers/constants.rb +++ b/config/initializers/constants.rb @@ -328,11 +328,15 @@ class Constants 'text/plain' ].freeze + TEXT_EXTRACT_MAX_FILE_SIZE = ENV['TEXT_EXTRACT_MAX_FILE_SIZE_MB'] ? ENV['TEXT_EXTRACT_MAX_FILE_SIZE_MB'].to_i : 50.megabytes + PREVIEWABLE_FILE_TYPES = TEXT_EXTRACT_FILE_TYPES # default preview timeout to 15 minutes PREVIEW_TIMEOUT_SECONDS = ENV['PREVIEW_TIMEOUT_SECONDS'] ? ENV['PREVIEW_TIMEOUT_SECONDS'].to_i : 900 + PREVIEW_MAX_FILE_SIZE = ENV['PREVIEW_MAX_FILE_SIZE_MB'] ? ENV['PREVIEW_MAX_FILE_SIZE_MB'].to_i : 50.megabytes + WHITELISTED_IMAGE_TYPES = [ 'gif', 'jpeg', 'pjpeg', 'png', 'x-png', 'svg+xml', 'bmp', 'tiff', 'jpg' ].freeze diff --git a/db/migrate/20250321114914_remove_data_column_from_asset_text_data.rb b/db/migrate/20250321114914_remove_data_column_from_asset_text_data.rb new file mode 100644 index 000000000..02a59603d --- /dev/null +++ b/db/migrate/20250321114914_remove_data_column_from_asset_text_data.rb @@ -0,0 +1,7 @@ +# frozen_string_literal: true + +class RemoveDataColumnFromAssetTextData < ActiveRecord::Migration[7.0] + def change + remove_column :asset_text_data, :data, :text + end +end diff --git a/db/schema.rb b/db/schema.rb index 3c6536756..1ea8abab1 100644 --- a/db/schema.rb +++ b/db/schema.rb @@ -90,7 +90,6 @@ ActiveRecord::Schema[7.0].define(version: 2025_03_25_124848) do end create_table "asset_text_data", force: :cascade do |t| - t.text "data", null: false t.bigint "asset_id", null: false t.datetime "created_at", precision: nil, null: false t.datetime "updated_at", precision: nil, null: false diff --git a/lib/active_storage/analyzer/text_extraction_analyzer.rb b/lib/active_storage/analyzer/text_extraction_analyzer.rb index 41d22f54e..f6cba74a8 100644 --- a/lib/active_storage/analyzer/text_extraction_analyzer.rb +++ b/lib/active_storage/analyzer/text_extraction_analyzer.rb @@ -5,7 +5,9 @@ module ActiveStorage DEFAULT_TIKA_PATH = 'tika-app.jar' def self.accept?(blob) - blob.content_type.in?(Constants::TEXT_EXTRACT_FILE_TYPES) && blob.attachments.where(record_type: 'Asset').any? + blob.content_type.in?(Constants::TEXT_EXTRACT_FILE_TYPES) && + blob.byte_size <= Constants::TEXT_EXTRACT_MAX_FILE_SIZE && + blob.attachments.where(record_type: 'Asset').any? end def self.analyze_later? @@ -50,14 +52,15 @@ module ActiveStorage def create_or_update_text_data(text_data) @blob.attachments.where(record_type: 'Asset').each do |attachemnt| asset = attachemnt.record - if asset.asset_text_datum.present? - # Update existing text datum if it exists - asset.asset_text_datum.update!(data: text_data) - else - # Create new text datum - asset.create_asset_text_datum!(data: text_data) - end + asset.create_asset_text_datum! if asset.asset_text_datum.blank? + sql = ActiveRecord::Base.sanitize_sql_array( + [ + 'UPDATE "asset_text_data" SET "data_vector" = to_tsvector(:text_data) WHERE "id" = :id', + { text_data: text_data, id: asset.asset_text_datum.id } + ] + ) + AssetTextDatum.connection.execute(sql) asset.update_estimated_size Rails.logger.info "Asset #{asset.id}: file text successfully extracted" diff --git a/spec/factories/asset_text_datums.rb b/spec/factories/asset_text_datums.rb index 34c364647..fc38e0c4b 100644 --- a/spec/factories/asset_text_datums.rb +++ b/spec/factories/asset_text_datums.rb @@ -2,7 +2,6 @@ FactoryBot.define do factory :asset_text_datum do - data { "Sample name\tSample type\n" + "sample6\tsample\n" + "\n" } asset end end diff --git a/spec/models/asset_text_datum_spec.rb b/spec/models/asset_text_datum_spec.rb index 5fbb875ef..79f6f26fd 100644 --- a/spec/models/asset_text_datum_spec.rb +++ b/spec/models/asset_text_datum_spec.rb @@ -15,7 +15,6 @@ describe AssetTextDatum, type: :model do describe 'Database table' do it { should have_db_column :id } - it { should have_db_column :data } it { should have_db_column :asset_id } it { should have_db_column :created_at } it { should have_db_column :data_vector } @@ -26,12 +25,7 @@ describe AssetTextDatum, type: :model do end describe 'Validations' do - describe '#data' do - it { is_expected.to validate_presence_of(:data) } - end - describe '#asset' do - it { is_expected.to validate_presence_of(:asset) } it { expect(asset_text_datum).to validate_uniqueness_of(:asset) } end end