CSV imports (no UI) (#3895)

* encode/decode date range in filenames

* Update lib/plausible/imported/csv_importer.ex

Co-authored-by: Adrian Gruntkowski <adrian.gruntkowski@gmail.com>

* Update lib/plausible/imported/csv_importer.ex

Co-authored-by: Adrian Gruntkowski <adrian.gruntkowski@gmail.com>

* drop unused functions

* send failure email if there is no data to export

* use PlausibleWeb.Email.mailer_email_from()

* ensure we get dates from minmax date query

---------

Co-authored-by: Adrian Gruntkowski <adrian.gruntkowski@gmail.com>
This commit is contained in:
ruslandoga 2024-03-19 19:06:47 +08:00 committed by GitHub
parent 4242b52be4
commit 279e89c693
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
9 changed files with 290 additions and 125 deletions

View File

@ -27,3 +27,4 @@ S3_SECRET_ACCESS_KEY=minioadmin
S3_REGION=us-east-1
S3_ENDPOINT=http://localhost:10000
S3_EXPORTS_BUCKET=dev-exports
S3_IMPORTS_BUCKET=dev-imports

View File

@ -22,3 +22,4 @@ S3_SECRET_ACCESS_KEY=minioadmin
S3_REGION=us-east-1
S3_ENDPOINT=http://localhost:10000
S3_EXPORTS_BUCKET=test-exports
S3_IMPORTS_BUCKET=test-imports

View File

@ -736,6 +736,10 @@ unless s3_disabled? do
%{
name: "S3_EXPORTS_BUCKET",
example: "my-csv-exports-bucket"
},
%{
name: "S3_IMPORTS_BUCKET",
example: "my-csv-imports-bucket"
}
]
@ -771,5 +775,7 @@ unless s3_disabled? do
host: s3_host,
port: s3_port
config :plausible, Plausible.S3, exports_bucket: s3_env_value.("S3_EXPORTS_BUCKET")
config :plausible, Plausible.S3,
exports_bucket: s3_env_value.("S3_EXPORTS_BUCKET"),
imports_bucket: s3_env_value.("S3_IMPORTS_BUCKET")
end

View File

@ -20,7 +20,13 @@ defmodule Plausible.Imported.CSVImporter do
@impl true
def import_data(site_import, opts) do
%{id: import_id, site_id: site_id} = site_import
%{
id: import_id,
site_id: site_id,
start_date: start_date,
end_date: end_date
} = site_import
uploads = Keyword.fetch!(opts, :uploads)
%{access_key_id: s3_access_key_id, secret_access_key: s3_secret_access_key} =
@ -31,14 +37,10 @@ defmodule Plausible.Imported.CSVImporter do
|> Keyword.replace!(:pool_size, 1)
|> Ch.start_link()
ranges =
Enum.map(uploads, fn upload ->
Enum.each(uploads, fn upload ->
%{"filename" => filename, "s3_url" => s3_url} = upload
".csv" = Path.extname(filename)
table = Path.rootname(filename)
ensure_importable_table!(table)
{table, _, _} = parse_filename!(filename)
s3_structure = input_structure!(table)
statement =
@ -46,6 +48,7 @@ defmodule Plausible.Imported.CSVImporter do
INSERT INTO {table:Identifier} \
SELECT {site_id:UInt64} AS site_id, *, {import_id:UInt64} AS import_id \
FROM s3({s3_url:String},{s3_access_key_id:String},{s3_secret_access_key:String},{s3_format:String},{s3_structure:String}) \
WHERE date >= {start_date:Date} AND date <= {end_date:Date}\
"""
params =
@ -57,26 +60,13 @@ defmodule Plausible.Imported.CSVImporter do
"s3_access_key_id" => s3_access_key_id,
"s3_secret_access_key" => s3_secret_access_key,
"s3_format" => "CSVWithNames",
"s3_structure" => s3_structure
"s3_structure" => s3_structure,
"start_date" => start_date,
"end_date" => end_date
}
Ch.query!(ch, statement, params, timeout: :infinity)
%Ch.Result{rows: [[min_date, max_date]]} =
Ch.query!(
ch,
"SELECT min(date), max(date) FROM {table:Identifier} WHERE site_id = {site_id:UInt64} AND import_id = {import_id:UInt64}",
%{"table" => table, "site_id" => site_id, "import_id" => import_id}
)
Date.range(min_date, max_date)
end)
{:ok,
%{
start_date: Enum.min_by(ranges, & &1.first, Date).first,
end_date: Enum.max_by(ranges, & &1.last, Date).last
}}
rescue
# we are cancelling on any argument or ClickHouse errors
e in [ArgumentError, Ch.Error] ->
@ -103,12 +93,85 @@ defmodule Plausible.Imported.CSVImporter do
"date Date, visitors UInt64, pageviews UInt64, bounces UInt64, visits UInt64, visit_duration UInt64"
}
for {table, input_structure} <- input_structures do
defp input_structure!(unquote(table)), do: unquote(input_structure)
defp ensure_importable_table!(unquote(table)), do: :ok
@doc """
Extracts min/max date range from a list of uploads.
Examples:
iex> date_range([
...> %{"filename" => "imported_devices_20190101_20210101.csv"},
...> "imported_pages_20200101_20220101.csv"
...> ])
Date.range(~D[2019-01-01], ~D[2022-01-01])
iex> date_range([])
** (ArgumentError) empty uploads
"""
@spec date_range([String.t() | %{String.t() => String.t()}, ...]) :: Date.Range.t()
def date_range([_ | _] = uploads), do: date_range(uploads, _start_date = nil, _end_date = nil)
def date_range([]), do: raise(ArgumentError, "empty uploads")
defp date_range([upload | uploads], prev_start_date, prev_end_date) do
filename =
case upload do
%{"filename" => filename} -> filename
filename when is_binary(filename) -> filename
end
defp ensure_importable_table!(table) do
raise ArgumentError, "table #{table} is not supported for data import"
{_table, start_date, end_date} = parse_filename!(filename)
start_date =
if prev_start_date do
Enum.min([start_date, prev_start_date], Date)
else
start_date
end
end_date =
if prev_end_date do
Enum.max([end_date, prev_end_date], Date)
else
end_date
end
date_range(uploads, start_date, end_date)
end
defp date_range([], first, last), do: Date.range(first, last)
# Parses a compact `YYYYMMDD` date (as embedded in import filenames) into a `Date`.
#
# Raises `ArgumentError` for anything that is not exactly eight bytes forming a
# valid calendar date, so that callers rescuing `ArgumentError` (and the
# "invalid filename" contract of `parse_filename!/1`) also cover malformed dates.
@spec parse_date!(String.t()) :: Date.t()
defp parse_date!(<<year::4-bytes, month::2-bytes, day::2-bytes>>) do
  # re-assemble into ISO 8601 so the standard library does the validation
  Date.from_iso8601!(year <> "-" <> month <> "-" <> day)
end

defp parse_date!(date) do
  raise ArgumentError, "invalid date: #{inspect(date)}"
end
@doc """
Extracts table name and min/max dates from the filename.
Examples:
iex> parse_filename!("my_data.csv")
** (ArgumentError) invalid filename
iex> parse_filename!("imported_devices_00010101_20250101.csv")
{"imported_devices", ~D[0001-01-01], ~D[2025-01-01]}
"""
@spec parse_filename!(String.t()) ::
{table :: String.t(), start_date :: Date.t(), end_date :: Date.t()}
# Bodiless head: the actual clauses are generated below, one per importable table.
def parse_filename!(filename)
# `input_structures` is a compile-time list of `{table, input_structure}` pairs
# defined earlier in the module; each iteration emits clauses specialised to
# that table name via `unquote/1`.
for {table, input_structure} <- input_structures do
# ClickHouse column structure string for the given table.
defp input_structure!(unquote(table)), do: unquote(input_structure)
# Matches exactly `<table>_<8 bytes>_<8 bytes>.csv`; the two 8-byte segments
# are the start and end dates and are handed to `parse_date!/1`.
def parse_filename!(
<<unquote(table)::bytes, ?_, start_date::8-bytes, ?_, end_date::8-bytes, ".csv">>
) do
{unquote(table), parse_date!(start_date), parse_date!(end_date)}
end
end
# Fallback: any filename that did not match a generated clause (unknown table,
# wrong segment lengths, wrong extension) is rejected.
def parse_filename!(_filename) do
raise ArgumentError, "invalid filename"
end
end

View File

@ -18,9 +18,64 @@ defmodule Plausible.S3 do
@spec exports_bucket :: String.t()
def exports_bucket, do: config(:exports_bucket)
@doc """
Returns the pre-configured S3 bucket for CSV imports.
config :plausible, Plausible.S3,
imports_bucket: System.fetch_env!("S3_IMPORTS_BUCKET")
Example:
iex> imports_bucket()
"test-imports"
"""
@spec imports_bucket :: String.t()
def imports_bucket do
  # looked up in this module's application config under `:imports_bucket`
  config(:imports_bucket)
end
# Whole configuration stored for this module in the `:plausible` app env;
# raises if it has not been configured.
defp config do
  Application.fetch_env!(:plausible, __MODULE__)
end

# Single required setting from that configuration; raises when the key is absent.
defp config(key) do
  settings = config()
  Keyword.fetch!(settings, key)
end
@doc """
Presigns an upload for an imported file.
In the current implementation the bucket always goes into the path component.
Example:
iex> %{
...> s3_url: "http://localhost:10000/test-imports/123/imported_browsers.csv",
...> presigned_url: "http://localhost:10000/test-imports/123/imported_browsers.csv?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=minioadmin" <> _
...> } = import_presign_upload(_site_id = 123, _filename = "imported_browsers.csv")
"""
def import_presign_upload(site_id, filename) do
  # objects are namespaced per site: "<site_id>/<filename>"
  object_key = Path.join(to_string(site_id), filename)

  # presign a PUT against the imports bucket; a failure here is a crash
  {:ok, presigned_url} =
    :s3
    |> ExAws.Config.new()
    |> ExAws.S3.presigned_url(:put, imports_bucket(), object_key)

  %{
    s3_url: extract_s3_url(presigned_url),
    presigned_url: presigned_url
  }
end
# Strips the query string from a presigned URL, leaving the plain object URL.
#
# In dev and test envs ClickHouse has to reach MinIO under a different host,
# so there the host is swapped for the value of the S3_CLICKHOUSE_HOST env var
# (when it is set). The choice between the two variants is made at compile time.
if Mix.env() in [:dev, :test, :small_dev, :small_test] do
  defp extract_s3_url(presigned_url) do
    [object_url, _query] = String.split(presigned_url, "?")

    case System.get_env("S3_CLICKHOUSE_HOST") do
      nil ->
        object_url

      ch_host ->
        object_url
        |> URI.parse()
        |> then(&%URI{&1 | host: ch_host})
        |> URI.to_string()
    end
  end
else
  defp extract_s3_url(presigned_url) do
    [object_url, _query] = String.split(presigned_url, "?")
    object_url
  end
end
@doc """
Chunks and uploads Zip archive to the provided S3 destination.
@ -77,6 +132,12 @@ defmodule Plausible.S3 do
@doc """
Returns `access_key_id` and `secret_access_key` to be used by ClickHouse during imports from S3.
Example:
iex> import_clickhouse_credentials()
%{access_key_id: "minioadmin", secret_access_key: "minioadmin"}
"""
@spec import_clickhouse_credentials ::
%{access_key_id: String.t(), secret_access_key: String.t()}

View File

@ -25,22 +25,34 @@ defmodule Plausible.Workers.ExportCSV do
|> Keyword.replace!(:pool_size, 1)
|> Ch.start_link()
# NOTE: should we use site.timezone?
# %Ch.Result{rows: [[min_date, max_date]]} =
# Ch.query!(
# ch,
# "SELECT toDate(min(timestamp)), toDate(max(timestamp)) FROM events_v2 WHERE site_id={site_id:UInt64}",
# %{"site_id" => site_id}
# )
%Ch.Result{rows: [[%Date{} = min_date, %Date{} = max_date]]} =
Ch.query!(
ch,
"SELECT toDate(min(timestamp)), toDate(max(timestamp)) FROM events_v2 WHERE site_id={site_id:UInt64}",
%{"site_id" => site_id}
)
if max_date == ~D[1970-01-01] do
# NOTE: replace with proper Plausible.Email template
Plausible.Mailer.deliver_now!(
Bamboo.Email.new_email(
from: PlausibleWeb.Email.mailer_email_from(),
to: email,
subject: "EXPORT FAILURE",
text_body: "there is nothing to export"
)
)
else
download_url =
DBConnection.run(
ch,
fn conn ->
conn
|> Plausible.Exports.stream_archive(
# date_range: Date.range(min_date, max_date)
Plausible.Exports.export_queries(site_id, extname: ".csv"),
Plausible.Exports.export_queries(site_id,
date_range: Date.range(min_date, max_date),
extname: ".csv"
),
format: "CSVWithNames"
)
|> Plausible.S3.export_upload_multipart(s3_bucket, s3_path, s3_config_overrides(args))
@ -51,7 +63,7 @@ defmodule Plausible.Workers.ExportCSV do
# NOTE: replace with proper Plausible.Email template
Plausible.Mailer.deliver_now!(
Bamboo.Email.new_email(
from: "plausible@email.com",
from: PlausibleWeb.Email.mailer_email_from(),
to: email,
subject: "EXPORT SUCCESS",
text_body: """
@ -62,6 +74,7 @@ defmodule Plausible.Workers.ExportCSV do
"""
)
)
end
:ok
end

View File

@ -198,7 +198,8 @@ defmodule Plausible.ConfigTest do
{"S3_SECRET_ACCESS_KEY", nil},
{"S3_REGION", nil},
{"S3_ENDPOINT", nil},
{"S3_EXPORTS_BUCKET", nil}
{"S3_EXPORTS_BUCKET", nil},
{"S3_IMPORTS_BUCKET", nil}
]
result =
@ -211,13 +212,14 @@ defmodule Plausible.ConfigTest do
assert %ArgumentError{} = result
assert Exception.message(result) == """
Missing S3 configuration. Please set S3_ACCESS_KEY_ID, S3_SECRET_ACCESS_KEY, S3_REGION, S3_ENDPOINT, S3_EXPORTS_BUCKET environment variable(s):
Missing S3 configuration. Please set S3_ACCESS_KEY_ID, S3_SECRET_ACCESS_KEY, S3_REGION, S3_ENDPOINT, S3_EXPORTS_BUCKET, S3_IMPORTS_BUCKET environment variable(s):
\tS3_ACCESS_KEY_ID=AKIAIOSFODNN7EXAMPLE
\tS3_SECRET_ACCESS_KEY=wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY
\tS3_REGION=us-east-1
\tS3_ENDPOINT=https://<ACCOUNT_ID>.r2.cloudflarestorage.com
\tS3_EXPORTS_BUCKET=my-csv-exports-bucket
\tS3_IMPORTS_BUCKET=my-csv-imports-bucket
"""
end
@ -227,7 +229,8 @@ defmodule Plausible.ConfigTest do
{"S3_SECRET_ACCESS_KEY", nil},
{"S3_REGION", "eu-north-1"},
{"S3_ENDPOINT", nil},
{"S3_EXPORTS_BUCKET", "my-exports"}
{"S3_EXPORTS_BUCKET", "my-exports"},
{"S3_IMPORTS_BUCKET", nil}
]
result =
@ -240,10 +243,11 @@ defmodule Plausible.ConfigTest do
assert %ArgumentError{} = result
assert Exception.message(result) == """
Missing S3 configuration. Please set S3_SECRET_ACCESS_KEY, S3_ENDPOINT environment variable(s):
Missing S3 configuration. Please set S3_SECRET_ACCESS_KEY, S3_ENDPOINT, S3_IMPORTS_BUCKET environment variable(s):
\tS3_SECRET_ACCESS_KEY=wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY
\tS3_ENDPOINT=https://<ACCOUNT_ID>.r2.cloudflarestorage.com
\tS3_IMPORTS_BUCKET=my-csv-imports-bucket
"""
end
@ -253,7 +257,8 @@ defmodule Plausible.ConfigTest do
{"S3_SECRET_ACCESS_KEY", "minioadmin"},
{"S3_REGION", "us-east-1"},
{"S3_ENDPOINT", "http://localhost:6000"},
{"S3_EXPORTS_BUCKET", "my-exports"}
{"S3_EXPORTS_BUCKET", "my-exports"},
{"S3_IMPORTS_BUCKET", "my-imports"}
]
config = runtime_config(env)
@ -266,8 +271,9 @@ defmodule Plausible.ConfigTest do
s3: [scheme: "http://", host: "localhost", port: 6000]
]
assert get_in(runtime_config(env), [:plausible, Plausible.S3]) == [
exports_bucket: "my-exports"
assert get_in(config, [:plausible, Plausible.S3]) == [
exports_bucket: "my-exports",
imports_bucket: "my-imports"
]
end
end

View File

@ -4,6 +4,8 @@ defmodule Plausible.Imported.CSVImporterTest do
alias Testcontainers.MinioContainer
require SiteImport
doctest CSVImporter, import: true
@moduletag :minio
setup_all do
@ -55,9 +57,12 @@ defmodule Plausible.Imported.CSVImporterTest do
"imported_visitors"
]
start_date = "20231001"
end_date = "20240102"
uploads =
Enum.map(tables, fn table ->
filename = "#{table}.csv"
filename = "#{table}_#{start_date}_#{end_date}.csv"
%{
"filename" => filename,
@ -65,11 +70,12 @@ defmodule Plausible.Imported.CSVImporterTest do
}
end)
date_range = CSVImporter.date_range(uploads)
assert {:ok, job} =
CSVImporter.new_import(site, user,
# to satisfy the non null constraints on the table I'm providing "0" dates (according to ClickHouse)
start_date: ~D[1970-01-01],
end_date: ~D[1970-01-01],
start_date: date_range.first,
end_date: date_range.last,
uploads: uploads
)
@ -80,8 +86,8 @@ defmodule Plausible.Imported.CSVImporterTest do
%{
id: ^import_id,
source: :csv,
start_date: ~D[1970-01-01],
end_date: ~D[1970-01-01],
start_date: ~D[2023-10-01],
end_date: ~D[2024-01-02],
status: SiteImport.pending()
}
] = Plausible.Imported.list_all_imports(site)
@ -97,7 +103,7 @@ defmodule Plausible.Imported.CSVImporterTest do
test "imports tables from S3", %{site: site, user: user, s3: s3, container: minio} do
csvs = [
%{
name: "imported_browsers.csv",
name: "imported_browsers_20211230_20211231.csv",
body: """
"date","browser","visitors","visits","visit_duration","bounces"
"2021-12-30","Amazon Silk",2,2,0,2
@ -122,7 +128,7 @@ defmodule Plausible.Imported.CSVImporterTest do
"""
},
%{
name: "imported_devices.csv",
name: "imported_devices_20211230_20220102.csv",
body: """
"date","device","visitors","visits","visit_duration","bounces"
"2021-12-30","Desktop",25,28,75,27
@ -140,7 +146,7 @@ defmodule Plausible.Imported.CSVImporterTest do
"""
},
%{
name: "imported_entry_pages.csv",
name: "imported_entry_pages_20211230_20211231.csv",
body: """
"date","visitors","entrances","visit_duration","bounces","entry_page"
"2021-12-30",6,6,0,6,"/14776416252794997127"
@ -173,7 +179,7 @@ defmodule Plausible.Imported.CSVImporterTest do
"""
},
%{
name: "imported_exit_pages.csv",
name: "imported_exit_pages_20211230_20211231.csv",
body: """
"date","visitors","exits","exit_page"
"2021-12-30",6,6,"/14776416252794997127"
@ -198,7 +204,7 @@ defmodule Plausible.Imported.CSVImporterTest do
"""
},
%{
name: "imported_locations.csv",
name: "imported_locations_20211230_20211231.csv",
body: """
"date","country","region","city","visitors","visits","visit_duration","bounces"
"2021-12-30","AU","",0,1,1,43,0
@ -235,7 +241,7 @@ defmodule Plausible.Imported.CSVImporterTest do
"""
},
%{
name: "imported_operating_systems.csv",
name: "imported_operating_systems_20211230_20220101.csv",
body: """
"date","operating_system","visitors","visits","visit_duration","bounces"
"2021-12-30","Android",25,26,254,24
@ -255,7 +261,7 @@ defmodule Plausible.Imported.CSVImporterTest do
"""
},
%{
name: "imported_pages.csv",
name: "imported_pages_20211230_20220101.csv",
body: """
"date","visitors","pageviews","exits","time_on_page","hostname","page"
"2021-12-30",1,1,0,43,"lucky.numbers.com","/14776416252794997127"
@ -277,7 +283,7 @@ defmodule Plausible.Imported.CSVImporterTest do
"""
},
%{
name: "imported_sources.csv",
name: "imported_sources_20211230_20220106.csv",
body: """
"date","source","utm_medium","utm_campaign","utm_content","utm_term","visitors","visits","visit_duration","bounces"
"2021-12-30","","","","","",25,26,254,24
@ -307,7 +313,7 @@ defmodule Plausible.Imported.CSVImporterTest do
"""
},
%{
name: "imported_visitors.csv",
name: "imported_visitors_20111225_20111230.csv",
body: """
"date","visitors","pageviews","bounces","visits","visit_duration"
"2011-12-25",5,50,2,7,8640
@ -327,13 +333,12 @@ defmodule Plausible.Imported.CSVImporterTest do
%{"filename" => name, "s3_url" => minio_url(minio, "imports", key)}
end
date_range = CSVImporter.date_range(uploads)
{:ok, job} =
CSVImporter.new_import(
site,
user,
# to satisfy the non null constraints on the table I'm providing "0" dates (according to ClickHouse)
start_date: ~D[1970-01-01],
end_date: ~D[1970-01-01],
CSVImporter.new_import(site, user,
start_date: date_range.first,
end_date: date_range.last,
uploads: uploads
)
@ -341,7 +346,6 @@ defmodule Plausible.Imported.CSVImporterTest do
assert :ok = Plausible.Workers.ImportAnalytics.perform(job)
# on successful import the start and end dates are updated
assert %SiteImport{
start_date: ~D[2011-12-25],
end_date: ~D[2022-01-06],
@ -355,7 +359,7 @@ defmodule Plausible.Imported.CSVImporterTest do
test "fails on invalid CSV", %{site: site, user: user, s3: s3, container: minio} do
csvs = [
%{
name: "imported_browsers.csv",
name: "imported_browsers_20211230_20211231.csv",
body: """
"date","browser","visitors","visits","visit_duration","bounces"
"2021-12-30","Amazon Silk",2,2,0,2
@ -368,7 +372,7 @@ defmodule Plausible.Imported.CSVImporterTest do
"""
},
%{
name: "imported_devices.csv",
name: "imported_devices_20211230_20211231.csv",
body: """
"date","device","visitors","visit_duration","bounces"
"2021-12-30","Desktop",28,ehhhh....
@ -383,12 +387,12 @@ defmodule Plausible.Imported.CSVImporterTest do
%{"filename" => name, "s3_url" => minio_url(minio, "imports", key)}
end
date_range = CSVImporter.date_range(uploads)
{:ok, job} =
CSVImporter.new_import(
site,
user,
start_date: ~D[1970-01-01],
end_date: ~D[1970-01-01],
CSVImporter.new_import(site, user,
start_date: date_range.first,
end_date: date_range.last,
uploads: uploads
)
@ -508,12 +512,12 @@ defmodule Plausible.Imported.CSVImporterTest do
end)
# run importer
date_range = CSVImporter.date_range(uploads)
{:ok, job} =
CSVImporter.new_import(
site,
user,
start_date: ~D[1970-01-01],
end_date: ~D[1970-01-01],
CSVImporter.new_import(site, user,
start_date: date_range.first,
end_date: date_range.last,
uploads: uploads
)
@ -533,7 +537,13 @@ defmodule Plausible.Imported.CSVImporterTest do
end
# Builds the URL under which the given bucket/key is addressed for the MinIO
# test container. The base address depends on the host architecture string:
# on "darwin" the container's own IP and port 9000 are used, otherwise the
# 172.17.0.1 address together with the port from the container's connection opts.
defp minio_url(minio, bucket, key) do
  system_arch = to_string(:erlang.system_info(:system_architecture))

  base_url =
    if String.contains?(system_arch, "darwin") do
      "http://#{minio.ip_address}:9000"
    else
      mapped_port = Keyword.fetch!(MinioContainer.connection_opts(minio), :port)
      "http://172.17.0.1:#{mapped_port}"
    end

  Path.join([base_url, bucket, key])
end
end

View File

@ -0,0 +1,4 @@
# Test module whose only job is to execute the `iex>` examples embedded in the
# `Plausible.S3` docs as doctests.
defmodule Plausible.S3Test do
use ExUnit.Case, async: true
# `import: true` lets the examples call the module's functions unqualified
# (e.g. `imports_bucket()` instead of `Plausible.S3.imports_bucket()`).
doctest Plausible.S3, import: true
end