From 063aa5a470c4f24d6cecc85dbc38df571fb407c4 Mon Sep 17 00:00:00 2001
From: Alex Kriuchykhin
Date: Thu, 20 Feb 2025 09:41:41 +0100
Subject: [PATCH] Install tika text extractor directly from Dockerfiles, remove old yomu gem [SCI-11589] (#8257)

---
 Dockerfile                                              | 5 +++++
 Dockerfile.production                                   | 5 +++++
 Gemfile                                                 | 1 -
 Gemfile.lock                                            | 1 -
 app/models/concerns/tiny_mce_images.rb                  | 2 ++
 lib/active_storage/analyzer/text_extraction_analyzer.rb | 6 +++++-
 6 files changed, 17 insertions(+), 3 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index da11bb47f..c8c236fd2 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,6 +1,9 @@
 FROM ruby:3.2-bookworm
 MAINTAINER SciNote
 
+ARG TIKA_DIST_URL="https://dlcdn.apache.org/tika/2.9.3/tika-app-2.9.3.jar"
+ENV TIKA_PATH=/usr/local/bin/tika-app.jar
+
 # additional dependecies
 # libreoffice for file preview generation
 RUN apt-get update -qq && \
@@ -23,6 +26,8 @@ RUN apt-get update -qq && \
   chromium \
   chromium-sandbox \
   yarnpkg && \
+  wget -O $TIKA_PATH $TIKA_DIST_URL && \
+  chmod +x $TIKA_PATH && \
   ln -s /usr/lib/x86_64-linux-gnu/libvips.so.42 /usr/lib/x86_64-linux-gnu/libvips.so && \
   rm -rf /var/lib/apt/lists/*
 
diff --git a/Dockerfile.production b/Dockerfile.production
index 2a8af22f0..aedacaf84 100644
--- a/Dockerfile.production
+++ b/Dockerfile.production
@@ -44,6 +44,9 @@ RUN \
 FROM ruby:3.2-bookworm AS runner
 MAINTAINER SciNote
 
+ARG TIKA_DIST_URL="https://dlcdn.apache.org/tika/2.9.3/tika-app-2.9.3.jar"
+ENV TIKA_PATH=/usr/local/bin/tika-app.jar
+
 RUN rm -f /etc/apt/apt.conf.d/docker-clean; echo 'Binary::apt::APT::Keep-Downloaded-Packages "true";' > /etc/apt/apt.conf.d/keep-cache
 RUN \
   --mount=type=cache,target=/var/cache/apt,sharing=locked \
@@ -79,6 +82,8 @@ RUN \
   chromium-sandbox \
   libfile-mimeinfo-perl \
   yarnpkg && \
+  wget -O $TIKA_PATH $TIKA_DIST_URL && \
+  chmod +x $TIKA_PATH && \
   /usr/share/nodejs/yarn/bin/yarn add puppeteer@npm:puppeteer-core@^22.15.0 && \
   apt-get install -y libreoffice && \
   ln -s /usr/lib/x86_64-linux-gnu/libvips.so.42 /usr/lib/x86_64-linux-gnu/libvips.so
diff --git a/Gemfile b/Gemfile
index af5807835..62b9e7530 100644
--- a/Gemfile
+++ b/Gemfile
@@ -17,7 +17,6 @@ gem 'recaptcha'
 gem 'sanitize'
 gem 'sprockets-rails'
 gem 'view_component'
-gem 'yomu', git: 'https://github.com/scinote-eln/yomu', branch: 'master'
 
 # Gems for OAuth2 subsystem
 gem 'doorkeeper', '>= 4.6'
diff --git a/Gemfile.lock b/Gemfile.lock
index b08be085f..f0aef81a6 100644
--- a/Gemfile.lock
+++ b/Gemfile.lock
@@ -893,7 +893,6 @@ DEPENDENCIES
   webmock
   whacamole
   wicked_pdf
-  yomu!
   zip-zip
 
 RUBY VERSION
diff --git a/app/models/concerns/tiny_mce_images.rb b/app/models/concerns/tiny_mce_images.rb
index cc0bcaa88..eae246af5 100644
--- a/app/models/concerns/tiny_mce_images.rb
+++ b/app/models/concerns/tiny_mce_images.rb
@@ -1,5 +1,7 @@
 # frozen_string_literal: true
 
+require 'mime/types'
+
 module TinyMceImages
   extend ActiveSupport::Concern
 
diff --git a/lib/active_storage/analyzer/text_extraction_analyzer.rb b/lib/active_storage/analyzer/text_extraction_analyzer.rb
index 5925f042b..41d22f54e 100644
--- a/lib/active_storage/analyzer/text_extraction_analyzer.rb
+++ b/lib/active_storage/analyzer/text_extraction_analyzer.rb
@@ -2,6 +2,8 @@
 
 module ActiveStorage
   class Analyzer::TextExtractionAnalyzer < Analyzer
+    DEFAULT_TIKA_PATH = 'tika-app.jar'
+
     def self.accept?(blob)
       blob.content_type.in?(Constants::TEXT_EXTRACT_FILE_TYPES) && blob.attachments.where(record_type: 'Asset').any?
     end
@@ -40,7 +42,9 @@ module ActiveStorage
     end
 
     def process_other(file)
-      text_data = Yomu.new(file.path).text
+      tika_path = ENV.fetch('TIKA_PATH', DEFAULT_TIKA_PATH)
+      # Block form of IO.popen closes the pipe and reaps the java child process once its output has been read
+      text_data = IO.popen(['java', '-Djava.awt.headless=true', '-jar', tika_path, '-t', file.path], 'r', &:read)
       create_or_update_text_data(text_data)
     end
 