Show a message when a smart cell crashes and allow restarts (#1381)

This commit is contained in:
Jonatan Kłosko 2022-09-02 15:50:13 +02:00 committed by GitHub
parent e35d026422
commit ab9ad0369c
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
9 changed files with 203 additions and 13 deletions

View file

@ -409,6 +409,10 @@ defprotocol Livebook.Runtime do
* `{:runtime_smart_cell_started, ref, %{js_view: js_view(), source: String.t()}}`
In case of an unexpected failure it should also send
* `{:runtime_smart_cell_down, ref}`
## Communication
Apart from the regular JS view communication, the cell sends updates

View file

@ -229,7 +229,8 @@ defmodule Livebook.Runtime.ErlDist.RuntimeServer do
{:noreply,
state
|> handle_down_evaluator(message)
|> handle_down_scan_binding(message)}
|> handle_down_scan_binding(message)
|> handle_down_smart_cell(message)}
end
def handle_info({:evaluation_finished, locator}, state) do
@ -275,13 +276,25 @@ defmodule Livebook.Runtime.ErlDist.RuntimeServer do
end
defp handle_down_scan_binding(state, {:DOWN, monitor_ref, :process, _, _}) do
Enum.find_value(state.smart_cells, fn
{ref, %{scan_binding_monitor_ref: ^monitor_ref}} -> ref
_ -> nil
end)
state.smart_cells
|> Enum.find(fn {_ref, info} -> info.scan_binding_monitor_ref == monitor_ref end)
|> case do
{ref, _info} -> finish_scan_binding(ref, state)
nil -> state
ref -> finish_scan_binding(ref, state)
end
end
defp handle_down_smart_cell(state, {:DOWN, monitor_ref, :process, _, _}) do
state.smart_cells
|> Enum.find(fn {_ref, info} -> info.monitor_ref == monitor_ref end)
|> case do
{ref, _info} ->
send(state.owner, {:runtime_smart_cell_down, ref})
{_, state} = pop_in(state.smart_cells[ref])
state
nil ->
state
end
end
@ -412,6 +425,7 @@ defmodule Livebook.Runtime.ErlDist.RuntimeServer do
info = %{
pid: pid,
monitor_ref: Process.monitor(pid),
scan_binding: scan_binding,
base_locator: base_locator,
scan_binding_pending: false,

View file

@ -338,7 +338,15 @@ defmodule Livebook.Session do
end
@doc """
Sends cell convertion request to the server.
Sends cell recover request to the server.
"""
@spec recover_smart_cell(pid(), Cell.id()) :: :ok
def recover_smart_cell(pid, cell_id) do
GenServer.cast(pid, {:recover_smart_cell, self(), cell_id})
end
@doc """
Sends cell conversion request to the server.
"""
@spec convert_smart_cell(pid(), Cell.id()) :: :ok
def convert_smart_cell(pid, cell_id) do
@ -794,6 +802,12 @@ defmodule Livebook.Session do
{:noreply, handle_operation(state, operation)}
end
def handle_cast({:recover_smart_cell, client_pid, cell_id}, state) do
client_id = client_id(state, client_pid)
operation = {:recover_smart_cell, client_id, cell_id}
{:noreply, handle_operation(state, operation)}
end
def handle_cast({:convert_smart_cell, client_pid, cell_id}, state) do
client_id = client_id(state, client_pid)
@ -1068,6 +1082,11 @@ defmodule Livebook.Session do
end
end
def handle_info({:runtime_smart_cell_down, id}, state) do
operation = {:smart_cell_down, @client_id, id}
{:noreply, handle_operation(state, operation)}
end
def handle_info({:runtime_smart_cell_update, id, attrs, source, info}, state) do
case Notebook.fetch_cell_and_section(state.data.notebook, id) do
{:ok, cell, _section} ->

View file

@ -112,7 +112,7 @@ defmodule Livebook.Session.Data do
@type cell_evaluation_validity :: :fresh | :evaluated | :stale | :aborted
@type cell_evaluation_status :: :ready | :queued | :evaluating
@type smart_cell_status :: :dead | :starting | :started
@type smart_cell_status :: :dead | :starting | :started | :down
@type input_id :: String.t()
@ -175,6 +175,8 @@ defmodule Livebook.Session.Data do
Cell.Smart.editor() | nil}
| {:update_smart_cell, client_id(), Cell.id(), Cell.Smart.attrs(), Delta.t(),
reevaluate :: boolean()}
| {:smart_cell_down, client_id(), Cell.id()}
| {:recover_smart_cell, client_id(), Cell.id()}
| {:erase_outputs, client_id()}
| {:set_notebook_name, client_id(), String.t()}
| {:set_section_name, client_id(), Section.id(), String.t()}
@ -586,6 +588,31 @@ defmodule Livebook.Session.Data do
end
end
def apply_operation(data, {:smart_cell_down, _client_id, id}) do
with {:ok, %Cell.Smart{} = cell, _section} <-
Notebook.fetch_cell_and_section(data.notebook, id) do
data
|> with_actions()
|> smart_cell_down(cell)
|> wrap_ok()
else
_ -> :error
end
end
def apply_operation(data, {:recover_smart_cell, _client_id, cell_id}) do
with {:ok, %Cell.Smart{} = cell, section} <-
Notebook.fetch_cell_and_section(data.notebook, cell_id),
:down <- data.cell_infos[cell_id].status do
data
|> with_actions()
|> recover_smart_cell(cell, section)
|> wrap_ok()
else
_ -> :error
end
end
def apply_operation(data, {:erase_outputs, _client_id}) do
data
|> with_actions()
@ -819,7 +846,7 @@ defmodule Livebook.Session.Data do
info = data.cell_infos[cell.id]
data_actions =
if is_struct(cell, Cell.Smart) and info.status != :dead do
if is_struct(cell, Cell.Smart) and info.status in [:started, :starting] do
add_action(data_actions, {:stop_smart_cell, cell})
else
data_actions
@ -1290,6 +1317,11 @@ defmodule Livebook.Session.Data do
|> add_action({:broadcast_delta, client_id, updated_cell, :primary, delta})
end
defp smart_cell_down(data_actions, cell) do
data_actions
|> update_cell_info!(cell.id, &%{&1 | status: :down})
end
defp maybe_queue_updated_smart_cell({data, _} = data_actions, cell, section, reevaluate) do
info = data.cell_infos[cell.id]
@ -1305,6 +1337,14 @@ defmodule Livebook.Session.Data do
end
end
defp recover_smart_cell({data, _} = data_actions, cell, section) do
if Runtime.connected?(data.runtime) do
start_smart_cell(data_actions, cell, section)
else
data_actions
end
end
defp erase_outputs({data, _} = data_actions) do
data_actions
|> clear_all_evaluation()
@ -1499,15 +1539,19 @@ defmodule Livebook.Session.Data do
cells_ready_to_start = Enum.filter(dead_cells, fn {cell, _} -> cell.kind in kinds end)
reduce(data_actions, cells_ready_to_start, fn data_actions, {cell, section} ->
data_actions
|> update_cell_info!(cell.id, &%{&1 | status: :starting})
|> add_action({:start_smart_cell, cell, section})
start_smart_cell(data_actions, cell, section)
end)
else
data_actions
end
end
defp start_smart_cell(data_actions, cell, section) do
data_actions
|> update_cell_info!(cell.id, &%{&1 | status: :starting})
|> add_action({:start_smart_cell, cell, section})
end
defp dead_smart_cells_with_section(data) do
for section <- Notebook.all_sections(data.notebook),
%Cell.Smart{} = cell <- section.cells,
@ -1583,7 +1627,7 @@ defmodule Livebook.Session.Data do
defp update_smart_cell_bases({data, _} = data_actions, prev_data) do
alive_smart_cell_ids =
for {%Cell.Smart{} = cell, _} <- Notebook.cells_with_section(data.notebook),
data.cell_infos[cell.id].status != :dead,
data.cell_infos[cell.id].status in [:started, :starting],
into: MapSet.new(),
do: cell.id

View file

@ -906,6 +906,13 @@ defmodule LivebookWeb.SessionLive do
{:noreply, socket}
end
def handle_event("recover_smart_cell", %{"cell_id" => cell_id}, socket) do
assert_policy!(socket, :read)
Session.recover_smart_cell(socket.assigns.session.pid, cell_id)
{:noreply, socket}
end
def handle_event("convert_smart_cell", %{"cell_id" => cell_id}, socket) do
assert_policy!(socket, :edit)
Session.convert_smart_cell(socket.assigns.session.pid, cell_id)

View file

@ -210,6 +210,18 @@ defmodule LivebookWeb.SessionLive.CellComponent do
Run the notebook setup to show the contents of this Smart cell.
<% end %>
</div>
<% :down -> %>
<div class="info-box flex justify-between items-center">
<span>
The Smart cell crashed unexpectedly, this is most likely a bug.
</span>
<button
class="button-base button-gray"
phx-click={JS.push("recover_smart_cell", value: %{cell_id: @cell_view.id})}
>
Restart Smart cell
</button>
</div>
<% :starting -> %>
<div class="delay-200">
<.content_skeleton empty={false} />

View file

@ -250,6 +250,14 @@ defmodule Livebook.Runtime.ErlDist.RuntimeServerTest do
assert_receive {:runtime_smart_cell_started, "ref", %{js_view: %{}, source: "source"}}
end
@tag opts: @opts
test "notifies runtime owner when a smart cell goes down", %{pid: pid} do
RuntimeServer.start_smart_cell(pid, "dumb", "ref", %{}, {:c1, nil})
assert_receive {:runtime_smart_cell_started, "ref", %{js_view: %{pid: pid}}}
Process.exit(pid, :crashed)
assert_receive {:runtime_smart_cell_down, "ref"}
end
@tag opts: @opts
test "once started scans binding and sends the result to the cell server", %{pid: pid} do
RuntimeServer.start_smart_cell(pid, "dumb", "ref", %{}, {:c1, nil})

View file

@ -2735,6 +2735,56 @@ defmodule Livebook.Session.DataTest do
end
end
describe "apply_operation/2 given :smart_cell_down" do
test "updates the smart cell status" do
data =
data_after_operations!([
{:insert_section, @cid, 0, "s1"},
{:set_runtime, @cid, connected_noop_runtime()},
{:set_smart_cell_definitions, @cid, @smart_cell_definitions},
{:insert_cell, @cid, "s1", 0, :smart, "c1", %{kind: "text"}}
])
operation = {:smart_cell_down, @cid, "c1"}
assert {:ok, %{cell_infos: %{"c1" => %{status: :down}}}, _actions} =
Data.apply_operation(data, operation)
end
end
describe "apply_operation/2 given :recover_smart_cell" do
test "returns an error if the smart cell is not down" do
data =
data_after_operations!([
{:insert_section, @cid, 0, "s1"},
{:set_runtime, @cid, connected_noop_runtime()},
{:set_smart_cell_definitions, @cid, @smart_cell_definitions},
{:insert_cell, @cid, "s1", 0, :smart, "c1", %{kind: "text"}}
])
operation = {:recover_smart_cell, @cid, "c1"}
assert :error = Data.apply_operation(data, operation)
end
test "marks the smart cell as starting if there is a runtime" do
data =
data_after_operations!([
{:insert_section, @cid, 0, "s1"},
{:set_runtime, @cid, connected_noop_runtime()},
{:set_smart_cell_definitions, @cid, @smart_cell_definitions},
{:insert_cell, @cid, "s1", 0, :smart, "c1", %{kind: "text"}},
{:smart_cell_down, @cid, "c1"}
])
operation = {:recover_smart_cell, @cid, "c1"}
assert {:ok, %{cell_infos: %{"c1" => %{status: :starting}}},
[{:start_smart_cell, %{id: "c1"}, %{id: "s1"}}]} =
Data.apply_operation(data, operation)
end
end
describe "apply_operation/2 given :erase_outputs" do
test "clears all sections evaluation and queues" do
data =

View file

@ -105,6 +105,38 @@ defmodule Livebook.SessionTest do
end
end
describe "recover_smart_cell/2" do
test "sends a recover operations to subscribers and starts the smart cell" do
smart_cell = %{Notebook.Cell.new(:smart) | kind: "text", source: "content"}
section = %{Notebook.Section.new() | cells: [smart_cell]}
notebook = %{Notebook.new() | sections: [section]}
session = start_session(notebook: notebook)
send(
session.pid,
{:runtime_smart_cell_definitions, [%{kind: "text", name: "Text", requirement: nil}]}
)
send(
session.pid,
{:runtime_smart_cell_started, smart_cell.id,
%{source: "content!", js_view: %{}, editor: nil}}
)
send(session.pid, {:runtime_smart_cell_down, smart_cell.id})
Session.subscribe(session.id)
Session.recover_smart_cell(session.pid, smart_cell.id)
cell_id = smart_cell.id
assert_receive {:operation, {:recover_smart_cell, _client_id, ^cell_id}}
assert_receive {:operation, {:smart_cell_started, _, ^cell_id, _, _, _}}
end
end
describe "convert_smart_cell/2" do
test "sends a delete and insert operations to subscribers" do
smart_cell = %{Notebook.Cell.new(:smart) | kind: "text", source: "content"}