mirror of
https://github.com/scinote-eln/scinote-web.git
synced 2025-09-04 20:25:22 +08:00
Install tika text extractor directly from Dockerfiles, remove old yomu gem [SCI-11589] (#8257)
This commit is contained in:
parent
5d17d7d0a1
commit
063aa5a470
6 changed files with 16 additions and 3 deletions
|
@ -1,6 +1,9 @@
|
|||
FROM ruby:3.2-bookworm
|
||||
MAINTAINER SciNote <info@scinote.net>
|
||||
|
||||
ARG TIKA_DIST_URL="https://dlcdn.apache.org/tika/2.9.3/tika-app-2.9.3.jar"
|
||||
ENV TIKA_PATH=/usr/local/bin/tika-app.jar
|
||||
|
||||
# additional dependencies
|
||||
# libreoffice for file preview generation
|
||||
RUN apt-get update -qq && \
|
||||
|
@ -23,6 +26,8 @@ RUN apt-get update -qq && \
|
|||
chromium \
|
||||
chromium-sandbox \
|
||||
yarnpkg && \
|
||||
wget -O $TIKA_PATH $TIKA_DIST_URL && \
|
||||
chmod +x $TIKA_PATH && \
|
||||
ln -s /usr/lib/x86_64-linux-gnu/libvips.so.42 /usr/lib/x86_64-linux-gnu/libvips.so && \
|
||||
rm -rf /var/lib/apt/lists/*
|
||||
|
||||
|
|
|
@ -44,6 +44,9 @@ RUN \
|
|||
FROM ruby:3.2-bookworm AS runner
|
||||
MAINTAINER SciNote <info@scinote.net>
|
||||
|
||||
ARG TIKA_DIST_URL="https://dlcdn.apache.org/tika/2.9.3/tika-app-2.9.3.jar"
|
||||
ENV TIKA_PATH=/usr/local/bin/tika-app.jar
|
||||
|
||||
RUN rm -f /etc/apt/apt.conf.d/docker-clean; echo 'Binary::apt::APT::Keep-Downloaded-Packages "true";' > /etc/apt/apt.conf.d/keep-cache
|
||||
RUN \
|
||||
--mount=type=cache,target=/var/cache/apt,sharing=locked \
|
||||
|
@ -79,6 +82,8 @@ RUN \
|
|||
chromium-sandbox \
|
||||
libfile-mimeinfo-perl \
|
||||
yarnpkg && \
|
||||
wget -O $TIKA_PATH $TIKA_DIST_URL && \
|
||||
chmod +x $TIKA_PATH && \
|
||||
/usr/share/nodejs/yarn/bin/yarn add puppeteer@npm:puppeteer-core@^22.15.0 && \
|
||||
apt-get install -y libreoffice && \
|
||||
ln -s /usr/lib/x86_64-linux-gnu/libvips.so.42 /usr/lib/x86_64-linux-gnu/libvips.so
|
||||
|
|
1
Gemfile
1
Gemfile
|
@ -17,7 +17,6 @@ gem 'recaptcha'
|
|||
gem 'sanitize'
|
||||
gem 'sprockets-rails'
|
||||
gem 'view_component'
|
||||
gem 'yomu', git: 'https://github.com/scinote-eln/yomu', branch: 'master'
|
||||
|
||||
# Gems for OAuth2 subsystem
|
||||
gem 'doorkeeper', '>= 4.6'
|
||||
|
|
|
@ -893,7 +893,6 @@ DEPENDENCIES
|
|||
webmock
|
||||
whacamole
|
||||
wicked_pdf
|
||||
yomu!
|
||||
zip-zip
|
||||
|
||||
RUBY VERSION
|
||||
|
|
|
@ -1,5 +1,7 @@
|
|||
# frozen_string_literal: true
|
||||
|
||||
require 'mime/types'
|
||||
|
||||
module TinyMceImages
|
||||
extend ActiveSupport::Concern
|
||||
|
||||
|
|
|
@ -2,6 +2,8 @@
|
|||
|
||||
module ActiveStorage
|
||||
class Analyzer::TextExtractionAnalyzer < Analyzer
|
||||
DEFAULT_TIKA_PATH = 'tika-app.jar'
|
||||
|
||||
def self.accept?(blob)
|
||||
blob.content_type.in?(Constants::TEXT_EXTRACT_FILE_TYPES) && blob.attachments.where(record_type: 'Asset').any?
|
||||
end
|
||||
|
@ -40,7 +42,8 @@ module ActiveStorage
|
|||
end
|
||||
|
||||
def process_other(file)
|
||||
text_data = Yomu.new(file.path).text
|
||||
tika_path = ENV['TIKA_PATH'] || DEFAULT_TIKA_PATH
|
||||
text_data = IO.popen(['java', '-Djava.awt.headless=true', '-jar', tika_path, '-t', file.path], 'r').read
|
||||
create_or_update_text_data(text_data)
|
||||
end
|
||||
|
||||
|
|
Loading…
Add table
Reference in a new issue