From b9c2110472485c02ad135d60a391737cbb649e04 Mon Sep 17 00:00:00 2001 From: hq1 Date: Thu, 13 Apr 2023 12:09:39 +0200 Subject: [PATCH] V2 migration tweaks for self hosted release (#2825) * Get rid of PASS_V2_SCHEMA_MIGRATION * Use in-memory domain lookup + regular table settings * Remove faulty date arithmetic + prev part calculation * Set V2_MIGRATION_DONE in Mix.env == :dev * Mute credo --- config/.env.dev | 1 + lib/plausible/data_migration/numeric_ids.ex | 60 ++++++++++--------- .../sql/create-dict-from-static-file.sql.eex | 9 --- .../sql/create-domains-lookup.sql.eex | 1 + .../NumericIDs/sql/create-events-v2.sql.eex | 2 +- .../NumericIDs/sql/create-sessions-v2.sql.eex | 2 +- .../NumericIDs/sql/drop-dict.sql.eex | 1 - .../sql/drop-domains-lookup.sql.eex | 1 + .../sql/insert-into-tmp-events-v2.sql.eex | 17 +----- .../sql/insert-into-tmp-sessions-v2.sql.eex | 17 +----- .../20230320094327_create_v2_schemas.exs | 14 ++--- 11 files changed, 44 insertions(+), 81 deletions(-) delete mode 100644 priv/data_migrations/NumericIDs/sql/create-dict-from-static-file.sql.eex create mode 100644 priv/data_migrations/NumericIDs/sql/create-domains-lookup.sql.eex delete mode 100644 priv/data_migrations/NumericIDs/sql/drop-dict.sql.eex create mode 100644 priv/data_migrations/NumericIDs/sql/drop-domains-lookup.sql.eex diff --git a/config/.env.dev b/config/.env.dev index d7e46e3854..1f9cb25d33 100644 --- a/config/.env.dev +++ b/config/.env.dev @@ -16,3 +16,4 @@ GOOGLE_CLIENT_ID=875387135161-l8tp53dpt7fdhdg9m1pc3vl42si95rh0.apps.googleuserco GOOGLE_CLIENT_SECRET=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 PROMEX_DISABLED=false +V2_MIGRATION_DONE=1 diff --git a/lib/plausible/data_migration/numeric_ids.ex b/lib/plausible/data_migration/numeric_ids.ex index 4774f465b6..004f8ca452 100644 --- a/lib/plausible/data_migration/numeric_ids.ex +++ b/lib/plausible/data_migration/numeric_ids.ex @@ -5,7 +5,20 @@ defmodule Plausible.DataMigration.NumericIDs do """ use Plausible.DataMigration, dir: "NumericIDs" - @table_settings "SETTINGS index_granularity = 8192, storage_policy = 'tiered'" + import Ecto.Query + + defmodule DomainsLookup do + @moduledoc false + use Ecto.Schema + + @primary_key false + schema "domains_lookup" do + field :site_id, Ch.Types.UInt64 + field :domain, :string + end + end + + @table_settings "SETTINGS index_granularity = 8192" def ready?() do Application.get_env(:plausible, :v2_migration_done) || false @@ -24,13 +37,6 @@ defmodule Plausible.DataMigration.NumericIDs do max_threads = "NUMERIC_IDS_MIGRATION_MAX_THREADS" |> System.get_env("16") |> String.to_integer() - # TBD: There's most likely a bug in Clickhouse defining Postgres dictionaries, - # we'll use a static URL for now - dict_url = Keyword.get(opts, :dict_url) || System.get_env("DOMAINS_DICT_URL") || "" - - dict_password = - Keyword.get(opts, :dict_password) || System.get_env("DOMAINS_DICT_PASSWORD") || "" - table_settings = Keyword.get(opts, :table_settings) || System.get_env("NUMERIC_IDS_TABLE_SETTINGS") || @table_settings @@ -38,12 +44,7 @@ defmodule Plausible.DataMigration.NumericIDs do start_from = Keyword.get(opts, :start_from) || System.get_env("NUMERIC_IDS_PARTITION_START_FROM") - stop_at = - Keyword.get(opts, :stop_at) || System.get_env("NUMERIC_IDS_PARTITION_STOP_AT") || - previous_part() - - (byte_size(dict_url) > 0 and byte_size(dict_password) > 0) || - raise "Set DOMAINS_DICT_URL and DOMAINS_DICT_PASSWORD" + stop_at = Keyword.get(opts, :stop_at) || System.get_env("NUMERIC_IDS_PARTITION_STOP_AT") @repo.start(db_url, max_threads) @@ -64,7 +65,6 @@ defmodule Plausible.DataMigration.NumericIDs do Got the following migration settings: - max_threads: #{max_threads} - - dict_url: #{dict_url} - dict_password: ✅ - table_settings: #{table_settings} - db url: #{db_url} @@ -94,13 +94,7 @@ defmodule Plausible.DataMigration.NumericIDs do {:ok, _} = run_sql_fn.("drop-sessions-v2", cluster?: cluster?) {:ok, _} = run_sql_fn.("drop-tmp-events-v2", []) {:ok, _} = run_sql_fn.("drop-tmp-sessions-v2", []) - {:ok, _} = run_sql_fn.("drop-dict", []) - - {:ok, _} = - run_sql_fn.("create-dict-from-static-file", - dict_url: dict_url, - dict_password: dict_password - ) + {:ok, _} = run_sql_fn.("drop-domains-lookup", []) {:ok, _} = run_sql_fn.("create-events-v2", table_settings: table_settings, cluster?: cluster?) @@ -110,6 +104,21 @@ defmodule Plausible.DataMigration.NumericIDs do {:ok, _} = run_sql_fn.("create-tmp-events-v2", table_settings: table_settings) {:ok, _} = run_sql_fn.("create-tmp-sessions-v2", table_settings: table_settings) + case run_sql_fn.("create-domains-lookup", table_settings: table_settings) do + {:ok, _} -> + confirm_fn.("Populate domains-lookup with postgres sites", fn -> + mappings = + Plausible.Site + |> select([s], %{site_id: s.id, domain: s.domain}) + |> Plausible.Repo.all() + + @repo.insert_all(DomainsLookup, mappings) + end) + + _ -> + :ignore + end + confirm_fn.("Start migration? (starting from partition: #{start_from})", fn -> IO.puts("start.. #{DateTime.utc_now()}") @@ -135,11 +144,4 @@ defmodule Plausible.DataMigration.NumericIDs do IO.puts("end.. #{DateTime.utc_now()}") end) end - - defp previous_part() do - now = NaiveDateTime.utc_now() - month = String.pad_leading("#{now.month - 1}", 2, "0") - year = "#{now.year}" - "#{year}#{month}" - end end diff --git a/priv/data_migrations/NumericIDs/sql/create-dict-from-static-file.sql.eex b/priv/data_migrations/NumericIDs/sql/create-dict-from-static-file.sql.eex deleted file mode 100644 index be5c45a217..0000000000 --- a/priv/data_migrations/NumericIDs/sql/create-dict-from-static-file.sql.eex +++ /dev/null @@ -1,9 +0,0 @@ -CREATE DICTIONARY domains_lookup ( -`id` UInt64, -`site_id` UInt64, -`domain` String, -`partition` String -) PRIMARY KEY domain -SOURCE(HTTP(URL '<%= @dict_url %>' CREDENTIALS(user 'user' password '<%= @dict_password %>') FORMAT 'CSVWithNames')) -LIFETIME(3600) -LAYOUT(COMPLEX_KEY_HASHED()) diff --git a/priv/data_migrations/NumericIDs/sql/create-domains-lookup.sql.eex b/priv/data_migrations/NumericIDs/sql/create-domains-lookup.sql.eex new file mode 100644 index 0000000000..b18d7fb5f4 --- /dev/null +++ b/priv/data_migrations/NumericIDs/sql/create-domains-lookup.sql.eex @@ -0,0 +1 @@ +CREATE TABLE IF NOT EXISTS domains_lookup(site_id UInt64, domain String) ENGINE Memory; diff --git a/priv/data_migrations/NumericIDs/sql/create-events-v2.sql.eex b/priv/data_migrations/NumericIDs/sql/create-events-v2.sql.eex index 6413354cdf..ebe5fc8619 100644 --- a/priv/data_migrations/NumericIDs/sql/create-events-v2.sql.eex +++ b/priv/data_migrations/NumericIDs/sql/create-events-v2.sql.eex @@ -1,4 +1,4 @@ -CREATE TABLE events_v2 <%= if @cluster? do %>ON CLUSTER '{cluster}'<% end %> +CREATE TABLE IF NOT EXISTS events_v2 <%= if @cluster? do %>ON CLUSTER '{cluster}'<% end %> ( `timestamp` DateTime CODEC(Delta(4), LZ4), `name` LowCardinality(String), diff --git a/priv/data_migrations/NumericIDs/sql/create-sessions-v2.sql.eex b/priv/data_migrations/NumericIDs/sql/create-sessions-v2.sql.eex index 38d1e5eebc..e03d910762 100644 --- a/priv/data_migrations/NumericIDs/sql/create-sessions-v2.sql.eex +++ b/priv/data_migrations/NumericIDs/sql/create-sessions-v2.sql.eex @@ -1,4 +1,4 @@ -CREATE TABLE sessions_v2 <%= if @cluster? do %>ON CLUSTER '{cluster}'<% end %> +CREATE TABLE IF NOT EXISTS sessions_v2 <%= if @cluster? do %>ON CLUSTER '{cluster}'<% end %> ( `session_id` UInt64, `sign` Int8, diff --git a/priv/data_migrations/NumericIDs/sql/drop-dict.sql.eex b/priv/data_migrations/NumericIDs/sql/drop-dict.sql.eex deleted file mode 100644 index 3e4f226437..0000000000 --- a/priv/data_migrations/NumericIDs/sql/drop-dict.sql.eex +++ /dev/null @@ -1 +0,0 @@ -DROP DICTIONARY IF EXISTS domains_lookup SYNC diff --git a/priv/data_migrations/NumericIDs/sql/drop-domains-lookup.sql.eex b/priv/data_migrations/NumericIDs/sql/drop-domains-lookup.sql.eex new file mode 100644 index 0000000000..6ca833f126 --- /dev/null +++ b/priv/data_migrations/NumericIDs/sql/drop-domains-lookup.sql.eex @@ -0,0 +1 @@ +DROP TABLE IF EXISTS domains_lookup SYNC; diff --git a/priv/data_migrations/NumericIDs/sql/insert-into-tmp-events-v2.sql.eex b/priv/data_migrations/NumericIDs/sql/insert-into-tmp-events-v2.sql.eex index 454ac09962..c5fd79b4d9 100644 --- a/priv/data_migrations/NumericIDs/sql/insert-into-tmp-events-v2.sql.eex +++ b/priv/data_migrations/NumericIDs/sql/insert-into-tmp-events-v2.sql.eex @@ -10,9 +10,7 @@ INSERT INTO tmp_events_v2 ( transferred_from ) SELECT - dictGet( - 'domains_lookup', 'site_id', domain - ), + (SELECT site_id from domains_lookup WHERE domain = domain LIMIT 1), timestamp, name, user_id, @@ -40,15 +38,4 @@ SELECT transferred_from FROM events -WHERE - ( - domain IN ( - SELECT - domain - FROM - dictionary('domains_lookup') - WHERE - partition <= '<%= @partition %>' - ) - ) -AND (_partition_id = '<%= @partition %>') +WHERE (_partition_id = '<%= @partition %>') diff --git a/priv/data_migrations/NumericIDs/sql/insert-into-tmp-sessions-v2.sql.eex b/priv/data_migrations/NumericIDs/sql/insert-into-tmp-sessions-v2.sql.eex index 194e877af2..c50ca8cbda 100644 --- a/priv/data_migrations/NumericIDs/sql/insert-into-tmp-sessions-v2.sql.eex +++ b/priv/data_migrations/NumericIDs/sql/insert-into-tmp-sessions-v2.sql.eex @@ -6,9 +6,7 @@ operating_system_version, subdivision1_code, subdivision2_code, city_geoname_id, utm_term, transferred_from, entry_meta.key, entry_meta.value ) SELECT - dictGet( - 'domains_lookup', 'site_id', domain - ), + (SELECT site_id from domains_lookup WHERE domain = domain LIMIT 1), session_id, sign, user_id, hostname, timestamp, start, is_bounce, entry_page, exit_page, pageviews, events, duration, referrer, referrer_source, country_code, screen_size, operating_system, browser, utm_medium, utm_source, utm_campaign, browser_version, @@ -16,15 +14,4 @@ SELECT utm_term, transferred_from, entry_meta.key, entry_meta.value FROM sessions -WHERE - ( - domain IN ( - SELECT - domain - FROM - dictionary('domains_lookup') - WHERE - partition <= '<%= @partition %>' - ) - ) -AND (_partition_id = '<%= @partition %>') +WHERE (_partition_id = '<%= @partition %>') diff --git a/priv/ingest_repo/migrations/20230320094327_create_v2_schemas.exs b/priv/ingest_repo/migrations/20230320094327_create_v2_schemas.exs index a77591f64d..b881f5eae6 100644 --- a/priv/ingest_repo/migrations/20230320094327_create_v2_schemas.exs +++ b/priv/ingest_repo/migrations/20230320094327_create_v2_schemas.exs @@ -1,13 +1,11 @@ defmodule Plausible.IngestRepo.Migrations.CreateV2Schemas do @moduledoc """ Normally, for live environments the migration will be done via - `DataMigration.NumericIDs` module (TBD). In which case PASS_V2_SCHEMA_MIGRATION - environment variable needs to be set, to only make the standard migrate - command write an entry into schema_migrations. + `DataMigration.NumericIDs` module. For tests, and entirely new small, self-hosted instances however, we want to keep the ability of preparing the database without enforcing - any data migration. + any _data_ migration. """ use Ecto.Migration @@ -18,12 +16,8 @@ defmodule Plausible.IngestRepo.Migrations.CreateV2Schemas do @settings "SETTINGS index_granularity = 8192" def up do - if System.get_env("PASS_V2_SCHEMA_MIGRATION") do - :ok - else - execute unwrap("create-events-v2", table_settings: @settings, cluster?: @cluster?) - execute unwrap("create-sessions-v2", table_settings: @settings, cluster?: @cluster?) - end + execute unwrap("create-events-v2", table_settings: @settings, cluster?: @cluster?) + execute unwrap("create-sessions-v2", table_settings: @settings, cluster?: @cluster?) end def down do