mirror of
https://github.com/adambard/learnxinyminutes-docs.git
synced 2024-11-26 09:21:00 +03:00
[dbt/en] dbt framework add
This commit is contained in:
parent
810ab009d5
commit
8c8790c7e1
218
dbt.html.markdown
Normal file
218
dbt.html.markdown
Normal file
@ -0,0 +1,218 @@
|
||||
---
|
||||
language: dbt
|
||||
filename: learndbt.sql
|
||||
contributors:
|
||||
- ["isabel faulds", "https://github.com/manyshapes"]
|
||||
|
||||
---
|
||||
|
||||
Data Build Tool or dbt™ (core) is an open-source command-line tool and framework
|
||||
for data transformation workflows. There is also a dbt cloud for managed, hosted
|
||||
dbt services with extended capabilities. dbt (core) utilizes .sql, .py, .yml,
|
||||
and .bash or any cli for orchestrating data warehouse command executions and
|
||||
generating .json artifacts.
|
||||
|
||||
dbt is agnostic to cloud provider, warehouse, and sql dialect. dbt works most
|
||||
effectively with version controlled systems and is implemented commonly with
|
||||
[git](git.html.markdown). dbt leverages [jinja](jinja.html.markdown) functions
|
||||
for dynamic values within [yaml](yaml.html.markdown), [sql](sql.html.markdown)
|
||||
and [python](python.html.markdown) .
|
||||
|
||||
### dbt SQL Models
|
||||
|
||||
```sql
|
||||
-- models can have optional configs specified within them; the arguments are
-- described here because sql comments are not valid inside a jinja expression
--   alias                - names in database will be filenames unless an alias
--   materialized         - models can be views, tables, incremental (updated
--                          tables), ephemeral (inlined as ctes), and
--                          snapshots (historical tables)
--   incremental_strategy - if incremental, data capture methods can be
--                          specified with a strategy
--   on_schema_change     - that can fail if the data schema changes
{{ config(
    alias='report',
    materialized='incremental',
    incremental_strategy='delete+insert',
    on_schema_change='fail'
) }}

-- dbt emphasizes cte based development
with staging_data as (
    select
        *,
        current_timestamp as loaded_date
    -- with refs to use other dbt seeds and models
    from {{ ref('seed_data') }}
)

select
    _id,
    account_id,
    transaction_date,
    total_usd,
    loaded_date
from staging_data
{# dbt can use {% if %} statements for conditional code block executions and
   is_incremental() to check if the current model is being run incrementally;
   a jinja comment is used here because jinja tags written inside sql
   comments are still rendered #}
{% if is_incremental() %}
-- {{ this }} self references the current model
where transaction_date > (select max(transaction_date) from {{ this }})
-- the code block is not executed if the if statement is not met
{% endif %}
|
||||
|
||||
|
||||
-- Snapshot models preserve historical data, or slowly changing dimensions
-- config arguments (described here, comments cannot live inside jinja calls):
--   unique_key          - specifying a unique id
--   strategy/check_cols - dbt will create new records for snapshotted data if
--                         there are changes in the tracked columns
{% snapshot snapshotted_model %}
{{
    config(
        unique_key='_id',
        strategy='check',
        check_cols=['account_manager'],
        snapshot_date='snapshot_date'
    )
}}

with staging_data as (
    select * from {{ ref('staging_account_transactions') }}
),

final as (
    select
        _id,
        transaction_date,
        account_id,
        -- when a change is detected in account_manager a new row will be saved
        account_manager,
        -- the row can have the more recent date of the snapshot for ordering
        current_timestamp as snapshot_date,
        total_transaction_amount,
        -- saved, reusable sql operations can be performed with dbt macros
        {{ get_account_status('last_transaction_date', 'last_payment_date') }}
            as account_status
    -- read from the cte declared above so the ref() dependency is used
    from staging_data
)

select * from final
{% endsnapshot %}
|
||||
|
||||
|
||||
-- macros are saved in files like macros/account_management_macros
-- a macro is defined with the variables it's expecting
{% macro get_account_status(last_transaction_date, last_payment_date) %}
{# the sql saved for a macro will be performed on the given fields; each
   argument is interpolated so the column names passed by the caller are used.
   jinja comments keep these notes out of the compiled sql at every call site #}
case
    when {{ last_transaction_date }} < {{ last_payment_date }}
        and {{ last_payment_date }} < current_date - interval '1 year'
        then 'Dormant'
    when {{ last_transaction_date }}
        > {{ last_payment_date }} - interval '90 days'
        then 'Overdue'
    else 'Active'
end
{# endmacro marks the end of the macro code block #}
{% endmacro %}
|
||||
|
||||
```
|
||||
|
||||
## Configurations
|
||||
|
||||
```yml
|
||||
#########################################################
# dbt_project.yml
#########################################################
#
# cli commands are executed in same working directory as dbt_project.yml
# dbt_project.yml will always have following values
name: organization_project_name
version: "1.0"
profile: database_profile

# And many optional values with implicit defaults
# like folder locations
model-paths: ["models"]
# or the output of the model
models:
  # this key must match the project `name` above for the configs to apply
  organization_project_name:
    +materialized: view # Default materialization for models
# these configurations are hierarchical and will act as defaults for files
# without config blocks
|
||||
|
||||
#########################################################
# profiles.yml
#########################################################
# The profile specified in dbt_project.yml is defined within `profiles.yml`
database_profile:
  # Like all dbt files it can contain hard coded values;
  # the target must be the name of one of the outputs defined below
  target: dev
  outputs:
    dev:
      type: postgres
      # or environment variables using jinja
      user: "{{ env_var('POSTGRES_USER') }}"
      password: "{{ env_var('POSTGRES_PW') }}"
      # with defaults for variables if not available
      database: "{{ env_var('POSTGRES_DB', 'core') }}"
      # and python augmentation of variables
      schema: "{{ '_'.join([env_var('POSTGRES_USER').replace('.', '_').upper()
        , env_var('POSTGRES_SCHEMA') ]) }}"
      role: "{{ env_var('POSTGRES_ROLE')}}"
|
||||
```
|
||||
|
||||
### CLI Commands
|
||||
|
||||
```bash
|
||||
# cli commands are executed in same working directory as dbt_project.yml

# .csv files are seeded into database
dbt seed
# .sql or .py models are materialized in the database as tables or views
dbt run
# .sql or .yml tests can be performed
dbt test
# models can be materialized, run, tested, and snapshotted
dbt build
# a command can specify a model
dbt build --select final_model
# with upstream dependencies
dbt build --select +final_model
# and / or downstream dependencies
dbt build --select +final_model+
# metadata can be generated on materialized models
dbt docs generate
# resources in the project (models, seeds, snapshots, tests) can be listed
dbt list
# full command list available in
dbt --help
|
||||
```
|
||||
|
||||
### Repository Structure
|
||||
|
||||
```text
|
||||
dbt has a default file structure when configurations do not define location
|
||||
|
||||
repository/
|
||||
└── dbt/
|
||||
├── dbt_project.yml # Required
|
||||
├── profiles.yml # Required
|
||||
├── models/ # Required , optional name
|
||||
│ ├── staging/ # Optional subfolders
|
||||
│ | └── staging_model.sql
|
||||
│ └── final_model.sql
|
||||
├── macros/ # Optional macro functions
|
||||
│ └── custom_macros.sql
|
||||
├── snapshots/ # Optional snapshot models
|
||||
│ └── snapshot.sql
|
||||
├── seeds/ # Optional csv files
|
||||
│ └── seed_data.csv
|
||||
├── logs/ # Output location
|
||||
├── target/ # Output location
|
||||
└── tests/ # Optional model tests
|
||||
└── custom_tests.sql
|
||||
```
|
||||
|
||||
## Further Reading
|
||||
|
||||
* [dbt logging](https://docs.getdbt.com/reference/global-configs/logs) - dbt documentation on outputs logs that can capture execution &
|
||||
debug logging
|
||||
* [dbt metadata artifacts](https://docs.getdbt.com/reference/artifacts/dbt-artifacts) - dbt documentation on generated artifacts, such as
|
||||
json documents for detailing attributes & metadata of a project
|
Loading…
Reference in New Issue
Block a user