Improve waiting for Fly machine startup (#2720)

This commit is contained in:
Jonatan Kłosko 2024-07-24 18:07:16 +02:00 committed by GitHub
parent d71dedd842
commit 5686771646
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 11 additions and 7 deletions

View file

@ -193,11 +193,18 @@ defmodule Livebook.FlyAPI do
"""
@spec await_machine_started(String.t(), String.t(), String.t()) :: :ok | {:error, error}
def await_machine_started(token, app_name, machine_id) do
# The maximum supported timeout is 60s, but the machine may take
# longer to start if it uses a large Docker image (such as CUDA),
# provided the image is not already in the Fly cache. To achieve
# a longer wait, we retry request timeouts (and possible network
# errors).
with {:ok, _data} <-
flaps_request(token, "/v1/apps/#{app_name}/machines/#{machine_id}/wait",
params: %{state: "started", timeout: 60},
receive_timeout: 90_000,
retry: false
retry: :safe_transient,
max_retries: 4,
retry_log_level: false
) do
:ok
end

View file

@ -214,13 +214,10 @@ defmodule Livebook.Runtime.Fly do
:ok ->
:ok
{:error, %{status: 408}} ->
{:error,
"timed out while waiting for the machine to start. See the app" <>
" logs in the Fly.io dashboard to determine the reason"}
{:error, %{message: message}} ->
{:error, "failed while waiting for the machine to started, reason: #{message}"}
{:error,
"failed while waiting for the machine to start, reason: #{message}." <>
" See the app logs in the Fly.io dashboard to determine the reason"}
end
end