diff --git a/lib/livebook/fly_api.ex b/lib/livebook/fly_api.ex index 21ab280c9..8cb1fc216 100644 --- a/lib/livebook/fly_api.ex +++ b/lib/livebook/fly_api.ex @@ -188,6 +188,22 @@ defmodule Livebook.FlyAPI do %{id: machine["id"], private_ip: machine["private_ip"]} end + @doc """ + Deletes the given machine. + """ + @spec delete_machine(String.t(), String.t(), String.t()) :: :ok | {:error, error} + def delete_machine(token, app_name, machine_id) do + params = %{force: true} + + with {:ok, _data} <- + flaps_request(token, "/v1/apps/#{app_name}/machines/#{machine_id}", + method: :delete, + params: params + ) do + :ok + end + end + @doc """ Waits for the machine to start. """ diff --git a/lib/livebook/runtime.ex b/lib/livebook/runtime.ex index ef6824e50..85585b33a 100644 --- a/lib/livebook/runtime.ex +++ b/lib/livebook/runtime.ex @@ -806,13 +806,20 @@ defprotocol Livebook.Runtime do The `runtime` should be the struct updated with all information necessary for further communication. - In case the initialization is a particularly involved process, the - process may send updates to the caller: + In case the initialization is a particularly involved, the process + may send updates to the caller: * `{:runtime_connect_info, pid, info}` Where `info` is a few word text describing the current initialization step. + + If the caller decides to abort the initialization, they can forecefully + kill the process. The runtime resources should already be tolerant + to abrupt Livebook termination and autodestroy through monitoring + and timeouts. However, when the initialization process gets killed, + it may be desirable to eagerly remove the resources it has already + allocated, which can be achieved with an additional watcher process. """ @spec connect(t()) :: pid() def connect(runtime) diff --git a/lib/livebook/runtime/fly.ex b/lib/livebook/runtime/fly.ex index 8213060cb..9ea467806 100644 --- a/lib/livebook/runtime/fly.ex +++ b/lib/livebook/runtime/fly.ex @@ -116,6 +116,14 @@ defmodule Livebook.Runtime.Fly do |> :erlang.term_to_binary() |> Base.encode64() + parent = self() + + {:ok, watcher_pid} = + DynamicSupervisor.start_child( + Livebook.RuntimeSupervisor, + {Task, fn -> watcher(parent, config) end} + ) + with :ok <- (if config.volume_id && runtime.previous_machine_id do with_log(caller, "await resources", fn -> @@ -128,6 +136,7 @@ defmodule Livebook.Runtime.Fly do with_log(caller, "create machine", fn -> create_machine(config, runtime_data) end), + _ <- send(watcher_pid, {:machine_created, machine_id}), child_node <- :"#{node_base}@#{machine_id}.vm.#{config.app_name}.internal", {:ok, proxy_port} <- with_log(caller, "start proxy", fn -> @@ -151,6 +160,8 @@ defmodule Livebook.Runtime.Fly do send(primary_pid, :node_initialized) + send(watcher_pid, :done) + runtime = %{runtime | node: child_node, server_pid: server_pid, machine_id: machine_id} send(caller, {:runtime_connect_done, self(), {:ok, runtime}}) @@ -172,6 +183,29 @@ defmodule Livebook.Runtime.Fly do {:noreply, state} end + defp watcher(parent, config) do + ref = Process.monitor(parent) + watcher_loop(%{ref: ref, config: config, machine_id: nil}) + end + + defp watcher_loop(state) do + receive do + {:DOWN, ref, :process, _pid, _reason} when ref == state.ref -> + # If the parent process is killed, we try to eagerly free the + # created resources + if machine_id = state.machine_id do + config = state.config + _ = Livebook.FlyAPI.delete_machine(config.token, config.app_name, machine_id) + end + + {:machine_created, machine_id} -> + watcher_loop(%{state | machine_id: machine_id}) + + :done -> + :ok + end + end + defp create_machine(config, runtime_data) do base_image = Enum.find(Livebook.Config.docker_images(), &(&1.tag == config.docker_tag)) image = "ghcr.io/livebook-dev/livebook:#{base_image.tag}" diff --git a/lib/livebook/session.ex b/lib/livebook/session.ex index 9a15e485c..1d09dcb05 100644 --- a/lib/livebook/session.ex +++ b/lib/livebook/session.ex @@ -2424,9 +2424,15 @@ defmodule Livebook.Session do end defp handle_action(state, {:disconnect_runtime, runtime}) do - Runtime.disconnect(runtime) - state = %{state | runtime_monitor_ref: nil} - after_runtime_disconnected(state) + if state.runtime_connect do + Process.demonitor(state.runtime_connect.ref, [:flush]) + Process.exit(state.runtime_connect.pid, :kill) + %{state | runtime_connect: nil} + else + Runtime.disconnect(runtime) + state = %{state | runtime_monitor_ref: nil} + after_runtime_disconnected(state) + end end defp handle_action(state, {:start_evaluation, cell, section, evaluation_opts}) do diff --git a/lib/livebook/session/data.ex b/lib/livebook/session/data.ex index 83dd7caed..9af001556 100644 --- a/lib/livebook/session/data.ex +++ b/lib/livebook/session/data.ex @@ -921,7 +921,7 @@ defmodule Livebook.Session.Data do end def apply_operation(data, {:disconnect_runtime, _client_id}) do - with :connected <- data.runtime_status do + with true <- data.runtime_status in [:connecting, :connected] do data |> with_actions() |> disconnect_runtime() diff --git a/lib/livebook_web/live/session_live/fly_runtime_component.ex b/lib/livebook_web/live/session_live/fly_runtime_component.ex index e2fc4f4d9..5693fa987 100644 --- a/lib/livebook_web/live/session_live/fly_runtime_component.ex +++ b/lib/livebook_web/live/session_live/fly_runtime_component.ex @@ -122,16 +122,27 @@ defmodule LivebookWeb.SessionLive.FlyRuntimeComponent do />