mirror of
https://github.com/knadh/listmonk.git
synced 2025-11-10 01:31:24 +08:00
Fix and refactor subscriber batch fetching in campaign processing.
This has been a hair-pulling rabbit hole of an issue. #1931 and others. When the `next-campaign-subscribers` query that fetches $n subscribers per batch for a campaign returns no results, the manager assumes that the campaign is done and marks as finished. Marathon debugging revealed fundamental flaws in qyery's logic that would incorrectly return 0 rows under certain conditions. - Based on the "layout" of subscribers for eg: a series of blocklisted subscribers between confirmed subscribers. A series of unconfirmed subscribers in a batch belonging to a double opt-in list. - Bulk import blocklisting users, but not marking their subscriptions as 'unsubscribed'. - Conditions spread across multiple CTEs resulted in returning an arbitrary number of rows and $N per batch as the selected $N rows would get filtered out elsewhere, possibly even becoming 0. After fixing this and testing it on our prod instance that has 15 million subscribers and ~70 million subscriptions in the `subscriber_lists` table, ended up discovered significant inefficiences in Postgres query planning. When `subscriber_lists` and campaign list IDs are joined dynamically (CTE or ANY() or any kind of JOIN that involves) a query, the Postgres query planner is unable to use the right indexes. After testing dozens of approaches, discovered that statically passing the values to join on (hardcoding or passing via parametrized $1 vars), the query uses the right indexes. The difference is staggering. For the particular scenario on our large prod DB to pull a batch, ~15 seconds vs. ~50ms, a whopping 300x improvement! This patch splits `next-campaign-subscribers` into two separate queries, one which fetches campaign metadata and list_ids, whose values are then passed statically to the next query to fetch subscribers by batch. In addition, it fixes and refactors broken filtering and counting logic in `create-campaign` and `next-campaign` queries. Closes #1931, #1993, #1986.
This commit is contained in:
parent
ee119b0088
commit
cea65c009d
5 changed files with 103 additions and 75 deletions
|
|
@ -20,6 +20,14 @@ type store struct {
|
||||||
h *http.Client
|
h *http.Client
|
||||||
}
|
}
|
||||||
|
|
||||||
|
type runningCamp struct {
|
||||||
|
CampaignID int `db:"campaign_id"`
|
||||||
|
CampaignType string `db:"campaign_type"`
|
||||||
|
LastSubscriberID int `db:"last_subscriber_id"`
|
||||||
|
MaxSubscriberID int `db:"max_subscriber_id"`
|
||||||
|
ListID int `db:"list_id"`
|
||||||
|
}
|
||||||
|
|
||||||
func newManagerStore(q *models.Queries, c *core.Core, m media.Store) *store {
|
func newManagerStore(q *models.Queries, c *core.Core, m media.Store) *store {
|
||||||
return &store{
|
return &store{
|
||||||
queries: q,
|
queries: q,
|
||||||
|
|
@ -42,8 +50,22 @@ func (s *store) NextCampaigns(currentIDs []int64, sentCounts []int64) ([]*models
|
||||||
// and every batch takes the last ID of the last batch and fetches the next
|
// and every batch takes the last ID of the last batch and fetches the next
|
||||||
// batch above that.
|
// batch above that.
|
||||||
func (s *store) NextSubscribers(campID, limit int) ([]models.Subscriber, error) {
|
func (s *store) NextSubscribers(campID, limit int) ([]models.Subscriber, error) {
|
||||||
|
var camps []runningCamp
|
||||||
|
if err := s.queries.GetRunningCampaign.Select(&camps, campID); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
var listIDs []int
|
||||||
|
for _, c := range camps {
|
||||||
|
listIDs = append(listIDs, c.ListID)
|
||||||
|
}
|
||||||
|
|
||||||
|
if len(listIDs) == 0 {
|
||||||
|
return nil, nil
|
||||||
|
}
|
||||||
|
|
||||||
var out []models.Subscriber
|
var out []models.Subscriber
|
||||||
err := s.queries.NextCampaignSubscribers.Select(&out, campID, limit)
|
err := s.queries.NextCampaignSubscribers.Select(&out, camps[0].CampaignID, camps[0].CampaignType, camps[0].LastSubscriberID, camps[0].MaxSubscriberID, pq.Array(listIDs), limit)
|
||||||
return out, err
|
return out, err
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -15,6 +15,12 @@ import (
|
||||||
|
|
||||||
// V4_0_0 performs the DB migrations.
|
// V4_0_0 performs the DB migrations.
|
||||||
func V4_0_0(db *sqlx.DB, fs stuffbin.FileSystem, ko *koanf.Koanf, lo *log.Logger) error {
|
func V4_0_0(db *sqlx.DB, fs stuffbin.FileSystem, ko *koanf.Koanf, lo *log.Logger) error {
|
||||||
|
lo.Println("IMPORTANT: this upgrade might take a while if you have a large database. Please be patient ...")
|
||||||
|
|
||||||
|
if _, err := db.Exec(`CREATE INDEX IF NOT EXISTS idx_subs_id_status ON subscribers(id, status);`); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
if _, err := db.Exec(`
|
if _, err := db.Exec(`
|
||||||
CREATE EXTENSION IF NOT EXISTS pgcrypto;
|
CREATE EXTENSION IF NOT EXISTS pgcrypto;
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -76,6 +76,7 @@ type Queries struct {
|
||||||
DeleteCampaignLinkClicks *sqlx.Stmt `query:"delete-campaign-link-clicks"`
|
DeleteCampaignLinkClicks *sqlx.Stmt `query:"delete-campaign-link-clicks"`
|
||||||
|
|
||||||
NextCampaigns *sqlx.Stmt `query:"next-campaigns"`
|
NextCampaigns *sqlx.Stmt `query:"next-campaigns"`
|
||||||
|
GetRunningCampaign *sqlx.Stmt `query:"get-running-campaign"`
|
||||||
NextCampaignSubscribers *sqlx.Stmt `query:"next-campaign-subscribers"`
|
NextCampaignSubscribers *sqlx.Stmt `query:"next-campaign-subscribers"`
|
||||||
GetOneCampaignSubscriber *sqlx.Stmt `query:"get-one-campaign-subscriber"`
|
GetOneCampaignSubscriber *sqlx.Stmt `query:"get-one-campaign-subscriber"`
|
||||||
UpdateCampaign *sqlx.Stmt `query:"update-campaign"`
|
UpdateCampaign *sqlx.Stmt `query:"update-campaign"`
|
||||||
|
|
|
||||||
146
queries.sql
146
queries.sql
|
|
@ -488,31 +488,23 @@ DELETE FROM lists WHERE id = ALL($1);
|
||||||
-- campaigns
|
-- campaigns
|
||||||
-- name: create-campaign
|
-- name: create-campaign
|
||||||
-- This creates the campaign and inserts campaign_lists relationships.
|
-- This creates the campaign and inserts campaign_lists relationships.
|
||||||
WITH campLists AS (
|
WITH tpl AS (
|
||||||
-- Get the list_ids and their optin statuses for the campaigns found in the previous step.
|
|
||||||
SELECT lists.id AS list_id, campaign_id, optin FROM lists
|
|
||||||
INNER JOIN campaign_lists ON (campaign_lists.list_id = lists.id)
|
|
||||||
WHERE lists.id = ANY($14::INT[])
|
|
||||||
),
|
|
||||||
tpl AS (
|
|
||||||
-- If there's no template_id given, use the default template.
|
-- If there's no template_id given, use the default template.
|
||||||
SELECT (CASE WHEN $13 = 0 THEN id ELSE $13 END) AS id FROM templates WHERE is_default IS TRUE
|
SELECT (CASE WHEN $13 = 0 THEN id ELSE $13 END) AS id FROM templates WHERE is_default IS TRUE
|
||||||
),
|
),
|
||||||
counts AS (
|
counts AS (
|
||||||
SELECT COALESCE(COUNT(id), 0) as to_send, COALESCE(MAX(id), 0) as max_sub_id
|
-- This is going to be slow on large databases.
|
||||||
FROM subscribers
|
SELECT
|
||||||
LEFT JOIN campLists ON (campLists.campaign_id = ANY($14::INT[]))
|
COALESCE(COUNT(DISTINCT sl.subscriber_id), 0) AS to_send, COALESCE(MAX(s.id), 0) AS max_sub_id
|
||||||
LEFT JOIN subscriber_lists ON (
|
FROM subscriber_lists sl
|
||||||
subscriber_lists.status != 'unsubscribed' AND
|
JOIN lists l ON sl.list_id = l.id
|
||||||
subscribers.id = subscriber_lists.subscriber_id AND
|
JOIN subscribers s ON sl.subscriber_id = s.id
|
||||||
subscriber_lists.list_id = campLists.list_id AND
|
WHERE sl.list_id = ANY($14::INT[])
|
||||||
|
AND s.status != 'blocklisted'
|
||||||
-- For double opt-in lists, consider only 'confirmed' subscriptions. For single opt-ins,
|
AND (
|
||||||
-- any status except for 'unsubscribed' (already excluded above) works.
|
(l.optin = 'double' AND sl.status = 'confirmed') OR
|
||||||
(CASE WHEN campLists.optin = 'double' THEN subscriber_lists.status = 'confirmed' ELSE true END)
|
(l.optin != 'double' AND sl.status != 'unsubscribed')
|
||||||
)
|
)
|
||||||
WHERE subscriber_lists.list_id=ANY($14::INT[])
|
|
||||||
AND subscribers.status='enabled'
|
|
||||||
),
|
),
|
||||||
camp AS (
|
camp AS (
|
||||||
INSERT INTO campaigns (uuid, type, name, subject, from_email, body, altbody, content_type, send_at, headers, tags, messenger, template_id, to_send, max_subscriber_id, archive, archive_slug, archive_template_id, archive_meta)
|
INSERT INTO campaigns (uuid, type, name, subject, from_email, body, altbody, content_type, send_at, headers, tags, messenger, template_id, to_send, max_subscriber_id, archive, archive_slug, archive_template_id, archive_meta)
|
||||||
|
|
@ -672,27 +664,18 @@ campMedia AS (
|
||||||
GROUP BY campaign_id
|
GROUP BY campaign_id
|
||||||
),
|
),
|
||||||
counts AS (
|
counts AS (
|
||||||
-- For each campaign above, get the total number of subscribers and the max_subscriber_id
|
SELECT camps.id AS campaign_id, COUNT(DISTINCT sl.subscriber_id) AS to_send, COALESCE(MAX(sl.subscriber_id), 0) AS max_subscriber_id
|
||||||
-- across all its lists.
|
|
||||||
SELECT id AS campaign_id,
|
|
||||||
COUNT(DISTINCT(subscriber_lists.subscriber_id)) AS to_send,
|
|
||||||
COALESCE(MAX(subscriber_lists.subscriber_id), 0) AS max_subscriber_id
|
|
||||||
FROM camps
|
FROM camps
|
||||||
LEFT JOIN campLists ON (campLists.campaign_id = camps.id)
|
JOIN campLists cl ON cl.campaign_id = camps.id
|
||||||
LEFT JOIN subscriber_lists ON (
|
JOIN subscriber_lists sl ON sl.list_id = cl.list_id
|
||||||
subscriber_lists.list_id = campLists.list_id AND
|
AND (
|
||||||
(CASE
|
CASE
|
||||||
-- For optin campaigns, only e-mail 'unconfirmed' subscribers belonging to 'double' optin lists.
|
WHEN camps.type = 'optin' THEN sl.status = 'unconfirmed' AND cl.optin = 'double'
|
||||||
WHEN camps.type = 'optin' THEN subscriber_lists.status = 'unconfirmed' AND campLists.optin = 'double'
|
WHEN cl.optin = 'double' THEN sl.status = 'confirmed'
|
||||||
|
ELSE sl.status != 'unsubscribed'
|
||||||
-- For regular campaigns with double optin lists, only e-mail 'confirmed' subscribers.
|
END
|
||||||
WHEN campLists.optin = 'double' THEN subscriber_lists.status = 'confirmed'
|
)
|
||||||
|
JOIN subscribers s ON (s.id = sl.subscriber_id AND s.status != 'blocklisted')
|
||||||
-- For regular campaigns with non-double optin lists, e-mail everyone
|
|
||||||
-- except unsubscribed subscribers.
|
|
||||||
ELSE subscriber_lists.status != 'unsubscribed'
|
|
||||||
END)
|
|
||||||
)
|
|
||||||
GROUP BY camps.id
|
GROUP BY camps.id
|
||||||
),
|
),
|
||||||
updateCounts AS (
|
updateCounts AS (
|
||||||
|
|
@ -757,48 +740,63 @@ SELECT COUNT(%s) AS "count", url
|
||||||
WHERE campaign_id=ANY($1) AND link_clicks.created_at >= $2 AND link_clicks.created_at <= $3
|
WHERE campaign_id=ANY($1) AND link_clicks.created_at >= $2 AND link_clicks.created_at <= $3
|
||||||
GROUP BY links.url ORDER BY "count" DESC LIMIT 50;
|
GROUP BY links.url ORDER BY "count" DESC LIMIT 50;
|
||||||
|
|
||||||
|
-- name: get-running-campaign
|
||||||
|
-- Returns the metadata for a running campaign that is required by next-campaign-subscribers to retrieve
|
||||||
|
-- a batch of campaign subscribers for processing.
|
||||||
|
SELECT campaigns.id AS campaign_id, campaigns.type as campaign_type, last_subscriber_id, max_subscriber_id, lists.id AS list_id
|
||||||
|
FROM campaigns
|
||||||
|
LEFT JOIN campaign_lists ON (campaign_lists.campaign_id = campaigns.id)
|
||||||
|
LEFT JOIN lists ON (lists.id = campaign_lists.list_id)
|
||||||
|
WHERE campaigns.id = $1 AND status='running';
|
||||||
|
|
||||||
-- name: next-campaign-subscribers
|
-- name: next-campaign-subscribers
|
||||||
-- Returns a batch of subscribers in a given campaign starting from the last checkpoint
|
-- Returns a batch of subscribers in a given campaign starting from the last checkpoint
|
||||||
-- (last_subscriber_id). Every fetch updates the checkpoint and the sent count, which means
|
-- (last_subscriber_id). Every fetch updates the checkpoint and the sent count, which means
|
||||||
-- every fetch returns a new batch of subscribers until all rows are exhausted.
|
-- every fetch returns a new batch of subscribers until all rows are exhausted.
|
||||||
WITH camps AS (
|
--
|
||||||
SELECT last_subscriber_id, max_subscriber_id, type FROM campaigns WHERE id = $1 AND status='running'
|
-- In previous versions, get-running-campaign + this was a single query spread across multiple
|
||||||
),
|
-- CTEs, but despite numerous permutations and combinations, Postgres query planner simply would not use
|
||||||
campLists AS (
|
-- the right indexes on subscriber_lists when the JOIN or ids were referenced dynamically from campLists
|
||||||
|
-- (be it a CTE or various kinds of joins). However, statically providing the list IDs to JOIN on ($5::INT[])
|
||||||
|
-- the query planner works as expected. The difference is staggering. ~15 seconds on a subscribers table with 15m
|
||||||
|
-- rows and a subscriber_lists table with 70 million rows when fetching subscribers for a campaign with a single list,
|
||||||
|
-- vs. a few million seconds using this current approach.
|
||||||
|
WITH campLists AS (
|
||||||
SELECT lists.id AS list_id, optin FROM lists
|
SELECT lists.id AS list_id, optin FROM lists
|
||||||
LEFT JOIN campaign_lists ON (campaign_lists.list_id = lists.id)
|
LEFT JOIN campaign_lists ON campaign_lists.list_id = lists.id
|
||||||
WHERE campaign_lists.campaign_id = $1
|
WHERE campaign_lists.campaign_id = $1
|
||||||
),
|
),
|
||||||
subIDs AS (
|
|
||||||
SELECT DISTINCT ON (subscriber_lists.subscriber_id) subscriber_id, list_id, status FROM subscriber_lists
|
|
||||||
WHERE
|
|
||||||
-- ARRAY_AGG is 20x faster instead of a simple SELECT because the query planner
|
|
||||||
-- understands the CTE's cardinality after the scalar array conversion. Huh.
|
|
||||||
list_id = ANY((SELECT ARRAY_AGG(list_id) FROM campLists)::INT[]) AND
|
|
||||||
status != 'unsubscribed' AND
|
|
||||||
subscriber_id > (SELECT last_subscriber_id FROM camps) AND
|
|
||||||
subscriber_id <= (SELECT max_subscriber_id FROM camps)
|
|
||||||
ORDER BY subscriber_id LIMIT $2
|
|
||||||
),
|
|
||||||
subs AS (
|
subs AS (
|
||||||
SELECT subscribers.* FROM subIDs
|
SELECT s.*
|
||||||
LEFT JOIN campLists ON (campLists.list_id = subIDs.list_id)
|
FROM (
|
||||||
INNER JOIN subscribers ON (
|
SELECT DISTINCT s.id
|
||||||
subscribers.status != 'blocklisted' AND
|
FROM subscriber_lists sl
|
||||||
subscribers.id = subIDs.subscriber_id AND
|
JOIN campLists ON sl.list_id = campLists.list_id
|
||||||
|
JOIN subscribers s ON s.id = sl.subscriber_id
|
||||||
|
WHERE
|
||||||
|
sl.list_id = ANY($5::INT[])
|
||||||
|
-- last_subscriber_id
|
||||||
|
AND s.id > $3
|
||||||
|
-- max_subscriber_id
|
||||||
|
AND s.id <= $4
|
||||||
|
-- Subscriber should not be blacklisted.
|
||||||
|
AND s.status != 'blocklisted'
|
||||||
|
AND (
|
||||||
|
-- If it's an optin campaign and the list is double-optin, only pick unconfirmed subscribers.
|
||||||
|
($2 = 'optin' AND sl.status = 'unconfirmed' AND campLists.optin = 'double')
|
||||||
|
OR (
|
||||||
|
-- It is a regular campaign.
|
||||||
|
$2 != 'optin' AND (
|
||||||
|
-- It is a double optin list. Only pick confirmed subscribers.
|
||||||
|
(campLists.optin = 'double' AND sl.status = 'confirmed') OR
|
||||||
|
|
||||||
(CASE
|
-- It is a single optin list. Pick all non-unsubscribed subscribers.
|
||||||
-- For optin campaigns, only e-mail 'unconfirmed' subscribers.
|
(campLists.optin != 'double' AND sl.status != 'unsubscribed')
|
||||||
WHEN (SELECT type FROM camps) = 'optin' THEN subIDs.status = 'unconfirmed' AND campLists.optin = 'double'
|
)
|
||||||
|
)
|
||||||
-- For regular campaigns with double optin lists, only e-mail 'confirmed' subscribers.
|
)
|
||||||
WHEN campLists.optin = 'double' THEN subIDs.status = 'confirmed'
|
ORDER BY s.id LIMIT $6
|
||||||
|
) subIDs JOIN subscribers s ON (s.id = subIDs.id) ORDER BY s.id
|
||||||
-- For regular campaigns with non-double optin lists, e-mail everyone
|
|
||||||
-- except unsubscribed subscribers.
|
|
||||||
ELSE subIDs.status != 'unsubscribed'
|
|
||||||
END)
|
|
||||||
)
|
|
||||||
),
|
),
|
||||||
u AS (
|
u AS (
|
||||||
UPDATE campaigns
|
UPDATE campaigns
|
||||||
|
|
|
||||||
|
|
@ -28,6 +28,7 @@ CREATE TABLE subscribers (
|
||||||
);
|
);
|
||||||
DROP INDEX IF EXISTS idx_subs_email; CREATE UNIQUE INDEX idx_subs_email ON subscribers(LOWER(email));
|
DROP INDEX IF EXISTS idx_subs_email; CREATE UNIQUE INDEX idx_subs_email ON subscribers(LOWER(email));
|
||||||
DROP INDEX IF EXISTS idx_subs_status; CREATE INDEX idx_subs_status ON subscribers(status);
|
DROP INDEX IF EXISTS idx_subs_status; CREATE INDEX idx_subs_status ON subscribers(status);
|
||||||
|
DROP INDEX IF EXISTS idx_subs_id_status; CREATE INDEX idx_subs_id_status ON subscribers(id, status);
|
||||||
DROP INDEX IF EXISTS idx_subs_created_at; CREATE INDEX idx_subs_created_at ON subscribers(created_at);
|
DROP INDEX IF EXISTS idx_subs_created_at; CREATE INDEX idx_subs_created_at ON subscribers(created_at);
|
||||||
DROP INDEX IF EXISTS idx_subs_updated_at; CREATE INDEX idx_subs_updated_at ON subscribers(updated_at);
|
DROP INDEX IF EXISTS idx_subs_updated_at; CREATE INDEX idx_subs_updated_at ON subscribers(updated_at);
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue