Install tika text extractor directly from Dockerfiles, remove old yomu gem [SCI-11589] (#8257)

This commit is contained in:
Alex Kriuchykhin 2025-02-20 09:41:41 +01:00 committed by GitHub
parent 5d17d7d0a1
commit 063aa5a470
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
6 changed files with 16 additions and 3 deletions

View file

@ -1,6 +1,9 @@
FROM ruby:3.2-bookworm
MAINTAINER SciNote <info@scinote.net>
# Apache Tika app jar fetched at build time (override with --build-arg).
# archive.apache.org keeps every release permanently, whereas dlcdn.apache.org
# only serves the current releases, so a pinned version stays downloadable
# after newer Tika versions are published.
ARG TIKA_DIST_URL="https://archive.apache.org/dist/tika/2.9.3/tika-app-2.9.3.jar"
# Path inside the image that the jar is downloaded to by the wget step below.
ENV TIKA_PATH=/usr/local/bin/tika-app.jar
# additional dependencies
# libreoffice for file preview generation
RUN apt-get update -qq && \
@ -23,6 +26,8 @@ RUN apt-get update -qq && \
chromium \
chromium-sandbox \
yarnpkg && \
wget -O $TIKA_PATH $TIKA_DIST_URL && \
chmod +x $TIKA_PATH && \
ln -s /usr/lib/x86_64-linux-gnu/libvips.so.42 /usr/lib/x86_64-linux-gnu/libvips.so && \
rm -rf /var/lib/apt/lists/*

View file

@ -44,6 +44,9 @@ RUN \
FROM ruby:3.2-bookworm AS runner
MAINTAINER SciNote <info@scinote.net>
# Apache Tika app jar fetched at build time (override with --build-arg).
# archive.apache.org keeps every release permanently, whereas dlcdn.apache.org
# only serves the current releases, so a pinned version stays downloadable
# after newer Tika versions are published.
ARG TIKA_DIST_URL="https://archive.apache.org/dist/tika/2.9.3/tika-app-2.9.3.jar"
# Path inside the image that the jar is downloaded to by the wget step below.
ENV TIKA_PATH=/usr/local/bin/tika-app.jar
RUN rm -f /etc/apt/apt.conf.d/docker-clean; echo 'Binary::apt::APT::Keep-Downloaded-Packages "true";' > /etc/apt/apt.conf.d/keep-cache
RUN \
--mount=type=cache,target=/var/cache/apt,sharing=locked \
@ -79,6 +82,8 @@ RUN \
chromium-sandbox \
libfile-mimeinfo-perl \
yarnpkg && \
wget -O $TIKA_PATH $TIKA_DIST_URL && \
chmod +x $TIKA_PATH && \
/usr/share/nodejs/yarn/bin/yarn add puppeteer@npm:puppeteer-core@^22.15.0 && \
apt-get install -y libreoffice && \
ln -s /usr/lib/x86_64-linux-gnu/libvips.so.42 /usr/lib/x86_64-linux-gnu/libvips.so

View file

@ -17,7 +17,6 @@ gem 'recaptcha'
gem 'sanitize'
gem 'sprockets-rails'
gem 'view_component'
gem 'yomu', git: 'https://github.com/scinote-eln/yomu', branch: 'master'
# Gems for OAuth2 subsystem
gem 'doorkeeper', '>= 4.6'

View file

@ -893,7 +893,6 @@ DEPENDENCIES
webmock
whacamole
wicked_pdf
yomu!
zip-zip
RUBY VERSION

View file

@ -1,5 +1,7 @@
# frozen_string_literal: true
require 'mime/types'
module TinyMceImages
extend ActiveSupport::Concern

View file

@ -2,6 +2,8 @@
module ActiveStorage
class Analyzer::TextExtractionAnalyzer < Analyzer
# Fallback location of the Tika app jar, used by process_other when the
# TIKA_PATH environment variable is not set.
DEFAULT_TIKA_PATH = 'tika-app.jar'
# Decides whether this analyzer should handle +blob+.
#
# A blob qualifies when its content type is listed in
# Constants::TEXT_EXTRACT_FILE_TYPES and at least one of its attachments
# has record_type 'Asset'.
def self.accept?(blob)
  extractable_type = blob.content_type.in?(Constants::TEXT_EXTRACT_FILE_TYPES)
  extractable_type && blob.attachments.where(record_type: 'Asset').any?
end
@ -40,7 +42,8 @@ module ActiveStorage
end
# Extracts plain text from +file+ with the Apache Tika command-line app and
# hands the result to create_or_update_text_data.
#
# The jar location is taken from the TIKA_PATH environment variable and falls
# back to DEFAULT_TIKA_PATH when the variable is not set.
#
# @param file [#path] object whose +path+ points at the file to extract from
def process_other(file)
  tika_path = ENV.fetch('TIKA_PATH', DEFAULT_TIKA_PATH)
  # Array form: the command is executed without a shell, so file.path is
  # passed to java as a single literal argument.
  command = ['java', '-Djava.awt.headless=true', '-jar', tika_path, '-t', file.path]
  # Block form closes the pipe and waits for the java process once the
  # output has been read, instead of leaving both open until GC.
  text_data = IO.popen(command, 'r', &:read)
  create_or_update_text_data(text_data)
end