Support audio transcription on drag and drop (#2227)

2025-09-06 04:54:29 +08:00 · 2023-09-25 18:45:19 +02:00 · 2023-09-25 18:45:19 +02:00 · f8581c225d
commit f8581c225d
parent f25d6663d8
2 changed files with 55 additions and 34 deletions
--- a/config/config.exs
+++ b/config/config.exs
@@ -16,8 +16,9 @@ config :logger, :console,
 # Use Jason for JSON parsing in Phoenix
 config :phoenix, :json_library, Jason
-# Add mime type to upload notebooks with `Phoenix.LiveView.Upload`
+# Additional mime types
 config :mime, :types, %{
+  "audio/m4a" => ["m4a"],
  "text/plain" => ["livemd"]
 }
--- a/lib/livebook/runtime/definitions.ex
+++ b/lib/livebook/runtime/definitions.ex
@@ -31,12 +31,12 @@ defmodule Livebook.Runtime.Definitions do
  kino_bumblebee = %{
    name: "kino_bumblebee",
-    dependency: %{dep: {:kino_bumblebee, "~> 0.3.0"}, config: []}
+    dependency: %{dep: {:kino_bumblebee, github: "livebook-dev/kino_bumblebee"}, config: []}
  }
  exla = %{
    name: "exla",
-    dependency: %{dep: {:exla, "~> 0.5.1"}, config: [nx: [default_backend: EXLA.Backend]]}
+    dependency: %{dep: {:exla, ">= 0.0.0"}, config: [nx: [default_backend: EXLA.Backend]]}
  }
  torchx = %{
@@ -338,40 +338,60 @@ defmodule Livebook.Runtime.Definitions do
      type: :file_action,
      file_types: ["audio/*"],
      description: "Transcribe speech",
-      source: """
+      source:
      # To explore more models, see "+ Smart" > "Neural Network task"
      {:ok, model_info} = Bumblebee.load_model({:hf, "openai/whisper-tiny"})
      {:ok, featurizer} = Bumblebee.load_featurizer({:hf, "openai/whisper-tiny"})
      {:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, "openai/whisper-tiny"})
      {:ok, generation_config} = Bumblebee.load_generation_config({:hf, "openai/whisper-tiny"})
      generation_config = Bumblebee.configure(generation_config, max_new_tokens: 100)
      #{if windows? do
        """
-        serving = Bumblebee.Audio.speech_to_text_whisper(model_info, featurizer, tokenizer, generation_config,
+        # To explore more models, see "+ Smart" > "Neural Network task"
          chunk_num_seconds: 30,
          timestamps: :segments,
          compile: [batch_size: 4]
        )\
        """
      else
        """
        serving =
          Bumblebee.Audio.speech_to_text_whisper(model_info, featurizer, tokenizer, generation_config,
            chunk_num_seconds: 30,
            timestamps: :segments,
            compile: [batch_size: 4],
            defn_options: [compiler: EXLA]
          )\
        """
      end}
-      path = Kino.FS.file_path("{{NAME}}")
+        {:ok, model_info} = Bumblebee.load_model({:hf, "openai/whisper-tiny"})
-      output = Nx.Serving.run(serving, {:file, path})
+        {:ok, featurizer} = Bumblebee.load_featurizer({:hf, "openai/whisper-tiny"})
        {:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, "openai/whisper-tiny"})
        {:ok, generation_config} = Bumblebee.load_generation_config({:hf, "openai/whisper-tiny"})
        generation_config = Bumblebee.configure(generation_config, max_new_tokens: 100)
-      # output.chunks |> Enum.map_join(& &1.text) |> String.trim()\
+        """ <>
-      """,
+          if windows? do
            """
            serving =
              Bumblebee.Audio.speech_to_text_whisper(model_info, featurizer, tokenizer, generation_config,
                chunk_num_seconds: 30,
                timestamps: :segments,
                stream: true,
                compile: [batch_size: 4]
              )
            """
          else
            """
            serving =
              Bumblebee.Audio.speech_to_text_whisper(model_info, featurizer, tokenizer, generation_config,
                chunk_num_seconds: 30,
                timestamps: :segments,
                stream: true,
                compile: [batch_size: 4],
                defn_options: [compiler: EXLA]
              )
            """
          end <>
          ~S"""
          path = Kino.FS.file_path("{{NAME}}")
          Kino.render(Kino.Text.new("(Start of transcription)", chunk: true))
          for chunk <- Nx.Serving.run(serving, {:file, path}) do
            [start_mark, end_mark] =
              for seconds <- [chunk.start_timestamp_seconds, chunk.end_timestamp_seconds] do
                seconds
                |> round()
                |> Time.from_seconds_after_midnight()
                |> Time.to_string()
              end
            text = "\n#{start_mark}-#{end_mark}: #{chunk.text}"
            Kino.render(Kino.Text.new(text, chunk: true))
          end
          Kino.render(Kino.Text.new("\n(End of transcription)", chunk: true))
          :ok
          """,
      packages: [kino_bumblebee, nx_backend_package]
    },
    %{