analytics/lib/mix/tasks/hydrate_clickhouse.ex
Uku Taht d1b703f007
Clickhouse (#63)
* Get stats from clickhosue

* Pull stats from clickhouse

* Use correct Query namespace

* Use Clickhouse in unit tests

* Use Clickhouse in stats controller tests

* Use fixtures for unit tests

* Add Clickhouse to travis

* Use Clickhouse session store for sessions

* Add garbage collection to session store

* Reload session state from Clickhouse on server restart

* Query from sessions table

* Trap exits in event write buffer

* Run hydration without starting the whole app

* Make session length 30 minutes

* Revert changes to fingerprint schema

* Remove clickhouse from fingerprint sessions

* Flush buffers before shutdown

* Use old stats when merging

* Remove old session schema

* Fix tests with CH sessions

* Add has_pageviews? to Stats
2020-05-18 12:44:52 +03:00

167 lines
5.2 KiB
Elixir

defmodule Mix.Tasks.HydrateClickhouse do
use Mix.Task
use Plausible.Repo
require Logger
def run(args) do
Application.ensure_all_started(:db_connection)
Application.ensure_all_started(:hackney)
clickhouse_config = Application.get_env(:plausible, :clickhouse)
Clickhousex.start_link(Keyword.merge([scheme: :http, port: 8123, name: :clickhouse], clickhouse_config))
Ecto.Migrator.with_repo(Plausible.Repo, fn repo ->
execute(repo, args)
end)
end
def execute(repo, _args \\ []) do
create_events()
create_sessions()
hydrate_events(repo)
end
def create_events() do
ddl = """
CREATE TABLE IF NOT EXISTS events (
timestamp DateTime,
name String,
domain String,
user_id FixedString(64),
hostname String,
pathname String,
referrer Nullable(String),
referrer_source Nullable(String),
initial_referrer Nullable(String),
initial_referrer_source Nullable(String),
country_code Nullable(FixedString(2)),
screen_size Nullable(String),
operating_system Nullable(String),
browser Nullable(String)
) ENGINE = MergeTree()
PARTITION BY toYYYYMM(timestamp)
ORDER BY (name, domain, timestamp, user_id)
SETTINGS index_granularity = 8192
"""
Clickhousex.query(:clickhouse, ddl, [])
|> log
end
def create_sessions() do
ddl = """
CREATE TABLE IF NOT EXISTS sessions (
session_id UUID,
sign Int8,
domain String,
user_id FixedString(64),
hostname String,
timestamp DateTime,
start DateTime,
is_bounce UInt8,
entry_page Nullable(String),
exit_page Nullable(String),
pageviews Int32,
events Int32,
duration UInt32,
referrer Nullable(String),
referrer_source Nullable(String),
country_code Nullable(FixedString(2)),
screen_size Nullable(String),
operating_system Nullable(String),
browser Nullable(String)
) ENGINE = CollapsingMergeTree(sign)
PARTITION BY toYYYYMM(start)
ORDER BY (domain, start, user_id, session_id)
SETTINGS index_granularity = 8192
"""
Clickhousex.query(:clickhouse, ddl, [])
|> log
end
def chunk_query(queryable, chunk_size, repo) do
chunk_stream = Stream.unfold(0, fn page_number ->
offset = chunk_size * page_number
page = from(
q in queryable,
offset: ^offset,
limit: ^chunk_size
) |> repo.all(timeout: :infinity)
{page, page_number + 1}
end)
Stream.take_while(chunk_stream, fn [] -> false; _ -> true end)
end
def escape_quote(s) do
String.replace(s, "'", "''")
end
def hydrate_events(repo, _args \\ []) do
event_chunks = from(e in Plausible.Event, where: e.domain == "plausible.io", order_by: e.id) |> chunk_query(10_000, repo)
Enum.reduce(event_chunks, %{}, fn events, session_cache ->
{session_cache, sessions} = Enum.reduce(events, {session_cache, []}, fn event, {session_cache, sessions} ->
found_session = session_cache[event.fingerprint]
active = is_active?(found_session, event)
cond do
found_session && active ->
new_session = update_session(found_session, event)
{
Map.put(session_cache, event.fingerprint, new_session),
[%{new_session | sign: 1}, %{found_session | sign: -1} | sessions]
}
found_session && !active ->
new_session = new_session_from_event(event)
{
Map.put(session_cache, event.fingerprint, new_session),
[new_session | sessions]
}
true ->
new_session = new_session_from_event(event)
{
Map.put(session_cache, event.fingerprint, new_session),
[new_session | sessions]
}
end
end)
Plausible.Clickhouse.insert_events(events)
Plausible.Clickhouse.insert_sessions(sessions)
session_cache
end)
end
defp is_active?(session, event) do
session && Timex.diff(event.timestamp, session.timestamp, :minute) <= 29
end
defp update_session(session, event) do
%{session | timestamp: event.timestamp, exit_page: event.pathname, is_bounce: false, duration: Timex.diff(event.timestamp, session.start, :second), pageviews: (if event.name == "pageview", do: session.pageviews + 1, else: session.pageviews), events: session.events + 1}
end
defp new_session_from_event(event) do
%Plausible.ClickhouseSession{
sign: 1,
session_id: UUID.uuid4(),
hostname: event.hostname,
domain: event.domain,
user_id: event.fingerprint,
entry_page: event.pathname,
exit_page: event.pathname,
is_bounce: true,
duration: 0,
pageviews: (if event.name == "pageview", do: 1, else: 0),
events: 1,
referrer: event.referrer,
referrer_source: event.referrer_source,
country_code: event.country_code,
operating_system: event.operating_system,
browser: event.browser,
timestamp: event.timestamp,
start: event.timestamp
}
end
defp log({:ok, res}), do: Logger.info("#{inspect res}")
defp log({:error, e}), do: Logger.error("[ERROR] #{inspect e}")
end