From f8581c225d9cf697cf6eb9caaccb4d99114e025a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jos=C3=A9=20Valim?=
Date: Mon, 25 Sep 2023 18:45:19 +0200
Subject: [PATCH] Support audio transcription on drag and drop (#2227)

---
 config/config.exs                   |  3 +-
 lib/livebook/runtime/definitions.ex | 86 ++++++++++++++++++-----------
 2 files changed, 55 insertions(+), 34 deletions(-)

diff --git a/config/config.exs b/config/config.exs
index ac02d4eed..a4257e964 100644
--- a/config/config.exs
+++ b/config/config.exs
@@ -16,8 +16,9 @@ config :logger, :console,
 # Use Jason for JSON parsing in Phoenix
 config :phoenix, :json_library, Jason
 
-# Add mime type to upload notebooks with `Phoenix.LiveView.Upload`
+# Additional mime types
 config :mime, :types, %{
+  "audio/m4a" => ["m4a"],
   "text/plain" => ["livemd"]
 }
 
diff --git a/lib/livebook/runtime/definitions.ex b/lib/livebook/runtime/definitions.ex
index 4436c555c..ca77553aa 100644
--- a/lib/livebook/runtime/definitions.ex
+++ b/lib/livebook/runtime/definitions.ex
@@ -31,12 +31,12 @@ defmodule Livebook.Runtime.Definitions do
 
   kino_bumblebee = %{
     name: "kino_bumblebee",
-    dependency: %{dep: {:kino_bumblebee, "~> 0.3.0"}, config: []}
+    dependency: %{dep: {:kino_bumblebee, github: "livebook-dev/kino_bumblebee"}, config: []}
   }
 
   exla = %{
     name: "exla",
-    dependency: %{dep: {:exla, "~> 0.5.1"}, config: [nx: [default_backend: EXLA.Backend]]}
+    dependency: %{dep: {:exla, ">= 0.0.0"}, config: [nx: [default_backend: EXLA.Backend]]}
   }
 
   torchx = %{
@@ -338,40 +338,60 @@ defmodule Livebook.Runtime.Definitions do
       type: :file_action,
       file_types: ["audio/*"],
       description: "Transcribe speech",
-      source: """
-      # To explore more models, see "+ Smart" > "Neural Network task"
-
-      {:ok, model_info} = Bumblebee.load_model({:hf, "openai/whisper-tiny"})
-      {:ok, featurizer} = Bumblebee.load_featurizer({:hf, "openai/whisper-tiny"})
-      {:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, "openai/whisper-tiny"})
-      {:ok, generation_config} = Bumblebee.load_generation_config({:hf, "openai/whisper-tiny"})
-      generation_config = Bumblebee.configure(generation_config, max_new_tokens: 100)
-
-      #{if windows? do
+      source:
         """
-        serving = Bumblebee.Audio.speech_to_text_whisper(model_info, featurizer, tokenizer, generation_config,
-          chunk_num_seconds: 30,
-          timestamps: :segments,
-          compile: [batch_size: 4]
-        )\
-        """
-      else
-        """
-        serving =
-          Bumblebee.Audio.speech_to_text_whisper(model_info, featurizer, tokenizer, generation_config,
-            chunk_num_seconds: 30,
-            timestamps: :segments,
-            compile: [batch_size: 4],
-            defn_options: [compiler: EXLA]
-          )\
-        """
-      end}
+        # To explore more models, see "+ Smart" > "Neural Network task"
 
-      path = Kino.FS.file_path("{{NAME}}")
-      output = Nx.Serving.run(serving, {:file, path})
+        {:ok, model_info} = Bumblebee.load_model({:hf, "openai/whisper-tiny"})
+        {:ok, featurizer} = Bumblebee.load_featurizer({:hf, "openai/whisper-tiny"})
+        {:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, "openai/whisper-tiny"})
+        {:ok, generation_config} = Bumblebee.load_generation_config({:hf, "openai/whisper-tiny"})
+        generation_config = Bumblebee.configure(generation_config, max_new_tokens: 100)
 
-      # output.chunks |> Enum.map_join(& &1.text) |> String.trim()\
-      """,
+        """ <>
+          if windows? do
+            """
+            serving =
+              Bumblebee.Audio.speech_to_text_whisper(model_info, featurizer, tokenizer, generation_config,
+                chunk_num_seconds: 30,
+                timestamps: :segments,
+                stream: true,
+                compile: [batch_size: 4]
+              )
+            """
+          else
+            """
+            serving =
+              Bumblebee.Audio.speech_to_text_whisper(model_info, featurizer, tokenizer, generation_config,
+                chunk_num_seconds: 30,
+                timestamps: :segments,
+                stream: true,
+                compile: [batch_size: 4],
+                defn_options: [compiler: EXLA]
+              )
+            """
+          end <>
+          ~S"""
+
+          path = Kino.FS.file_path("{{NAME}}")
+          Kino.render(Kino.Text.new("(Start of transcription)", chunk: true))
+
+          for chunk <- Nx.Serving.run(serving, {:file, path}) do
+            [start_mark, end_mark] =
+              for seconds <- [chunk.start_timestamp_seconds, chunk.end_timestamp_seconds] do
+                seconds
+                |> round()
+                |> Time.from_seconds_after_midnight()
+                |> Time.to_string()
+              end
+
+            text = "\n#{start_mark}-#{end_mark}: #{chunk.text}"
+            Kino.render(Kino.Text.new(text, chunk: true))
+          end
+
+          Kino.render(Kino.Text.new("\n(End of transcription)", chunk: true))
+          :ok
+          """,
       packages: [kino_bumblebee, nx_backend_package]
     },
     %{