Allow disconnecting Fly runtime during initialization (#2776)

This commit is contained in:
Jonatan Kłosko 2024-09-09 16:27:12 +02:00 committed by GitHub
parent 85ae48ee3c
commit 2eb5963efe
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
8 changed files with 98 additions and 22 deletions

View file

@ -188,6 +188,22 @@ defmodule Livebook.FlyAPI do
%{id: machine["id"], private_ip: machine["private_ip"]}
end
@doc """
Deletes the given machine.
"""
@spec delete_machine(String.t(), String.t(), String.t()) :: :ok | {:error, error}
def delete_machine(token, app_name, machine_id) do
params = %{force: true}
with {:ok, _data} <-
flaps_request(token, "/v1/apps/#{app_name}/machines/#{machine_id}",
method: :delete,
params: params
) do
:ok
end
end
@doc """
Waits for the machine to start.
"""

View file

@ -806,13 +806,20 @@ defprotocol Livebook.Runtime do
The `runtime` should be the struct updated with all information
necessary for further communication.
In case the initialization is a particularly involved process, the
process may send updates to the caller:
In case the initialization is a particularly involved, the process
may send updates to the caller:
* `{:runtime_connect_info, pid, info}`
Where `info` is a few word text describing the current initialization
step.
If the caller decides to abort the initialization, they can forecefully
kill the process. The runtime resources should already be tolerant
to abrupt Livebook termination and autodestroy through monitoring
and timeouts. However, when the initialization process gets killed,
it may be desirable to eagerly remove the resources it has already
allocated, which can be achieved with an additional watcher process.
"""
@spec connect(t()) :: pid()
def connect(runtime)

View file

@ -116,6 +116,14 @@ defmodule Livebook.Runtime.Fly do
|> :erlang.term_to_binary()
|> Base.encode64()
parent = self()
{:ok, watcher_pid} =
DynamicSupervisor.start_child(
Livebook.RuntimeSupervisor,
{Task, fn -> watcher(parent, config) end}
)
with :ok <-
(if config.volume_id && runtime.previous_machine_id do
with_log(caller, "await resources", fn ->
@ -128,6 +136,7 @@ defmodule Livebook.Runtime.Fly do
with_log(caller, "create machine", fn ->
create_machine(config, runtime_data)
end),
_ <- send(watcher_pid, {:machine_created, machine_id}),
child_node <- :"#{node_base}@#{machine_id}.vm.#{config.app_name}.internal",
{:ok, proxy_port} <-
with_log(caller, "start proxy", fn ->
@ -151,6 +160,8 @@ defmodule Livebook.Runtime.Fly do
send(primary_pid, :node_initialized)
send(watcher_pid, :done)
runtime = %{runtime | node: child_node, server_pid: server_pid, machine_id: machine_id}
send(caller, {:runtime_connect_done, self(), {:ok, runtime}})
@ -172,6 +183,29 @@ defmodule Livebook.Runtime.Fly do
{:noreply, state}
end
defp watcher(parent, config) do
ref = Process.monitor(parent)
watcher_loop(%{ref: ref, config: config, machine_id: nil})
end
defp watcher_loop(state) do
receive do
{:DOWN, ref, :process, _pid, _reason} when ref == state.ref ->
# If the parent process is killed, we try to eagerly free the
# created resources
if machine_id = state.machine_id do
config = state.config
_ = Livebook.FlyAPI.delete_machine(config.token, config.app_name, machine_id)
end
{:machine_created, machine_id} ->
watcher_loop(%{state | machine_id: machine_id})
:done ->
:ok
end
end
defp create_machine(config, runtime_data) do
base_image = Enum.find(Livebook.Config.docker_images(), &(&1.tag == config.docker_tag))
image = "ghcr.io/livebook-dev/livebook:#{base_image.tag}"

View file

@ -2424,9 +2424,15 @@ defmodule Livebook.Session do
end
defp handle_action(state, {:disconnect_runtime, runtime}) do
Runtime.disconnect(runtime)
state = %{state | runtime_monitor_ref: nil}
after_runtime_disconnected(state)
if state.runtime_connect do
Process.demonitor(state.runtime_connect.ref, [:flush])
Process.exit(state.runtime_connect.pid, :kill)
%{state | runtime_connect: nil}
else
Runtime.disconnect(runtime)
state = %{state | runtime_monitor_ref: nil}
after_runtime_disconnected(state)
end
end
defp handle_action(state, {:start_evaluation, cell, section, evaluation_opts}) do

View file

@ -921,7 +921,7 @@ defmodule Livebook.Session.Data do
end
def apply_operation(data, {:disconnect_runtime, _client_id}) do
with :connected <- data.runtime_status do
with true <- data.runtime_status in [:connecting, :connected] do
data
|> with_actions()
|> disconnect_runtime()

View file

@ -122,16 +122,27 @@ defmodule LivebookWeb.SessionLive.FlyRuntimeComponent do
/>
<div class="mt-8">
<.button
phx-click="init"
phx-target={@myself}
disabled={
@runtime_status == :connecting or not @specs_changeset.valid? or
volume_errors(@volume_id, @volumes, @region) != []
}
>
<%= label(@app_name, @runtime, @runtime_status) %>
</.button>
<div class="flex gap-2">
<.button
phx-click="init"
phx-target={@myself}
disabled={
@runtime_status == :connecting or not @specs_changeset.valid? or
volume_errors(@volume_id, @volumes, @region) != []
}
>
<%= label(@app_name, @runtime, @runtime_status) %>
</.button>
<.button
:if={@runtime_status == :connecting}
color="red"
outlined
phx-click="disconnect"
phx-target={@myself}
>
Disconnect
</.button>
</div>
<div
:if={reconnecting?(@app_name, @runtime) && @runtime_connect_info}
class="mt-4 scroll-mb-8"
@ -582,6 +593,11 @@ defmodule LivebookWeb.SessionLive.FlyRuntimeComponent do
{:noreply, socket}
end
def handle_event("disconnect", %{}, socket) do
Session.disconnect_runtime(socket.assigns.session.pid)
{:noreply, socket}
end
def handle_event("open_save_config", %{}, socket) do
changeset = config_secret_changeset(socket, %{name: @config_secret_prefix})
save_config = %{changeset: changeset, inflight: false, error: false}

View file

@ -676,7 +676,7 @@ defmodule LivebookWeb.SessionLive.Render do
</.button>
<.button
:if={@data_view.runtime_status == :connected}
:if={@data_view.runtime_status in [:connected, :connecting]}
color="red"
outlined
type="button"

View file

@ -3940,11 +3940,8 @@ defmodule Livebook.Session.DataTest do
end
describe "apply_operation/2 given :disconnect_runtime" do
test "returns an error if the runtime is not connected" do
data =
data_after_operations!([
{:connect_runtime, @cid}
])
test "returns an error if the runtime is disconnected" do
data = Data.new()
operation = {:disconnect_runtime, @cid}