Contents — happy new year everyone

This commit is contained in:
Adam Smith 2021-12-31 23:54:19 -08:00
commit 70fa808fcd
6259 changed files with 935979 additions and 0 deletions

4
LICENSES.md Normal file
View File

@ -0,0 +1,4 @@
## [Android Robot icon](./sidebar/src/assets/editorIcons/android-studio@2x.png)
- License
> The Android robot is reproduced or modified from work created and shared by Google and used according to terms described in the Creative Commons 3.0 Attribution License.
- Guidelines at https://developer.android.com/distribute/marketing-tools/brand-guidelines#android_robot

195
Makefile Executable file
View File

@ -0,0 +1,195 @@
#!/usr/bin/env make
# This variable is used to set the environment variable UIDEBUG
# in sidebar-bindata. It is initialized with the value of UIDEBUG
# from the environment but is explicitly set to 1 when
# usernode-debug-ui is run.
UI_DEBUG := $(UIDEBUG)
# Allow for a test backend to be passed into the makefile, but make sure that we have
# a sensible default too
REACT_APP_TEST_BACKEND := $(if $(REACT_APP_TEST_BACKEND),$(REACT_APP_TEST_BACKEND),https://staging.kite.com)
GITCOMMIT := $(shell git rev-parse HEAD)
.PHONY: run-standalone
default: install-standalone
#####################################
# Go build and verification tools #
#####################################
install-ci-deps:
# This target contains a minimal set of tools needed by CI.
# Do not add things here lightly!
go get -u golang.org/x/lint/golint
go get -u golang.org/x/tools/cmd/goimports
go get github.com/jteeuwen/go-bindata/...
go get gotest.tools/gotestsum
install-deps: install-ci-deps
# Protocol buffers
go get github.com/golang/protobuf/proto
go get github.com/golang/protobuf/protoc-gen-go
# Install some utilities
go install github.com/kiteco/kiteco/kite-go/cmds/printjson
go install github.com/kiteco/kiteco/kite-go/cmds/importchanged
datadeps-bindata:
go install github.com/kiteco/kiteco/kite-go/client/internal/kitelocal/cmds/datadeps-bindata
build-datadeps:
./scripts/build_datadeps.sh
generate:
go generate ./...
test:
# Run gotestsum with codecov reports for kite-go and kite-golib
gotestsum --raw-command scripts/go_test_coverage ./kite-go/... ./kite-golib/...
# Run gotestsum for checking build & test for local-pipelines (not part of codecov)
gotestsum ./local-pipelines/...
# Only run the data race checker on goroutine-heavy packages
go test -race \
./kite-go/sandbox \
./kite-go/client/internal/client \
./kite-go/client/internal/clientapp \
./kite-go/health/cmds/healthd \
./kite-go/core \
./kite-go/lang/python/pythonlocal
# Linux only, run tests with libtcmalloc overriding malloc, free, ...
test-tcmalloc:
LD_PRELOAD="${PWD}/linux/tcmalloc/libtcmalloc_minimal_debug.so" ${MAKE} test
build:
go build -v ./kite-go/... ./kite-golib/... ./local-pipelines/... ./kite-answers/...
vet:
# Run go-vet on all directories
go vet ./kite-go/... ./kite-golib/... ./local-pipelines/... ./kite-answers/...
lint:
true ./scripts/custom_lint.sh
# Run golint only on files that are not auto-generated
find kite-go kite-golib local-pipelines kite-answers -name "*.go" | grep -v ".pb.go" | grep -v "bindata.go" | grep -v "stackoverflow-xml.go" | grep -v "lsp/types/protocol.go" | xargs -I file golint file > /tmp/golint.test 2>&1
cat /tmp/golint.test
! test -s /tmp/golint.test
fmt:
find kite-go kite-golib local-pipelines kite-answers -name "*.go" | grep -v "bindata.go" | grep -v ".*.pb.go" | grep -v "/corpus/go/.*.go" | xargs -I file goimports -l=true file > /tmp/gofmt.test 2>/dev/null
cat /tmp/gofmt.test
! test -s /tmp/gofmt.test
check-client-fatal:
true git grep 'log.Fatal' ./kite-go/client/internal/ ':(exclude)*_test.go' ':(exclude)*/cmds/*' > /tmp/fatal.test 2>&1
cat /tmp/fatal.test
! test -s /tmp/fatal.test
bin-check:
! git status --porcelain --untracked-files=no | sed s/".* "// | xargs -I f file ../f | grep -E '(ELF|x86)'
verify: fmt lint vet bin-check build test
pull-frontend-docker:
docker pull kiteco/build-frontend
install-libtensorflow:
sudo rm -f /usr/local/lib/libtensorflow* || true
curl -L "https://s3-us-west-1.amazonaws.com/kite-data/tensorflow/libtensorflow-cpu-`go env GOOS`-x86_64-1.15.0.tar.gz" | sudo tar -C /usr/local -xz
install-libtensorflow-avx2:
sudo rm -f /usr/local/lib/libtensorflow* || true
curl -L "https://s3-us-west-1.amazonaws.com/kite-data/tensorflow/libtensorflow-cpu-`go env GOOS`-x86_64-avx2-1.15.0.tar.gz" | sudo tar -C /usr/local -xz
#######################################
# Webapp assets/bindata generation #
#######################################
# Ref for seemingly extravagant npm invocations: https://github.com/imagemin/pngquant-bin/issues/52#issuecomment-260247356
webapp-deps: pull-frontend-docker
docker run --rm -v "$(PWD)":/kiteco -w /kiteco/web/app\
-t kiteco/build-frontend\
/bin/bash -c "npm config set //registry.npmjs.org/:_authToken=$(NPM_TOKEN); npm rebuild --quiet; npm uninstall --quiet; npm install --quiet"
webapp-tests: webapp-deps
# TODO(tarak): Use the right commands to run the tests here?
docker run --rm -v "$(PWD)":/kiteco -w /kiteco/web/app\
-t kiteco/build-frontend\
/bin/bash -c "npm config set //registry.npmjs.org/:_authToken=$(NPM_TOKEN); npm run build-test"
docker run --rm -v "$(PWD)":/kiteco -w /kiteco/web/app -t kiteco/build-frontend npm test
webapp-build: webapp-deps
docker run --rm -v "$(PWD)":/kiteco -w /kiteco/web/app\
-t kiteco/build-frontend\
/bin/bash -c "npm config set //registry.npmjs.org/:_authToken=$(NPM_TOKEN); npm run build"
webapp-build-dev: webapp-deps
docker run --rm -v "$(PWD)":/kiteco -w /kiteco/web/app\
-e "REACT_APP_ENV=development"\
-t kiteco/build-frontend\
/bin/bash -c "npm config set //registry.npmjs.org/:_authToken=$(NPM_TOKEN); npm run build"
webapp-build-staging: webapp-deps
docker run --rm -v "$(PWD)":/kiteco -w /kiteco/web/app\
-e "REACT_APP_BACKEND=https://staging.kite.com" -e "REACT_APP_ENV=staging"\
-t kiteco/build-frontend\
/bin/bash -c "npm config set //registry.npmjs.org/:_authToken=$(NPM_TOKEN); npm run build"
webapp-build-prod: webapp-deps
docker run --rm -v "$(PWD)":/kiteco -w /kiteco/web/app\
-e "REACT_APP_BACKEND=https://alpha.kite.com" -e "REACT_APP_ENV=production"\
-t kiteco/build-frontend\
/bin/bash -c "npm config set //registry.npmjs.org/:_authToken=$(NPM_TOKEN); npm run build"
webapp-build-testing: webapp-deps
docker run --rm -v "$(PWD)":/kiteco -w /kiteco/web/app\
-e "REACT_APP_BACKEND=$(REACT_APP_TEST_BACKEND)" -e "REACT_APP_ENV=development"\
-t kiteco/build-frontend\
/bin/bash -c "npm config set //registry.npmjs.org/:_authToken=$(NPM_TOKEN); npm run build"
#######################################
# kited.exe: windows #
#######################################
force:
kited.exe: force
go build -buildmode=exe \
-ldflags "-H windowsgui -X github.com/kiteco/kiteco/kite-go/client/internal/clientapp.gitCommit=$(GITCOMMIT)" \
github.com/kiteco/kiteco/kite-go/client/cmds/kited
WINDOWS_BUILD_VERSION ?= "9.9.9.9"
KiteSetup.exe: kited.exe kite-lsp.exe
mv kited.exe windows/
mv kite-lsp.exe windows/
mkdir -p windows/installer/current_build_bin/out
cd windows/installer && ./nant.bat -D:prevPatchVersion="${WINDOWS_PATCH_BASE}" -D:buildnumstring="${WINDOWS_BUILD_VERSION}" build
KiteUpdateInfo.xml: KiteSetup.exe
@cd windows/tools/kite_update_signer_cmd/bin/Debug && ./KiteUpdateSignerCmd.exe ${WINDOWS_PASS}
KitePatchUpdateInfo.xml: KiteSetup.exe
@[[ -n "${WINDOWS_PATCH_BASE}" ]] && cd windows/tools/kite_patch_update_signer_cmd/bin/Debug && ./KitePatchUpdateSignerCmd.exe ${WINDOWS_PASS}
kite-lsp.exe: force
go build \
-ldflags "-H windowsgui" \
github.com/kiteco/kiteco/kite-go/lsp/cmds/kite-lsp
kite-windows: KiteSetup.exe KiteUpdateInfo.xml KitePatchUpdateInfo.xml
#######################################
install-standalone:
./scripts/standalone.sh install
run-standalone:
./scripts/standalone.sh run
run-web-node:
go run github.com/kiteco/kiteco/kite-go/cmds/web-node/

233
README.md Normal file
View File

@ -0,0 +1,233 @@
Getting started with the codebase
=================================
Our codebase is primarily located at [github.com/kiteco/kiteco](http://github.com/kiteco/kiteco). There are a few auxiliary repositories that host very experimental code, but the goal is to make the “kiteco” repository the single source of truth for all of our services.
Summary (TL;DR)
---------------
* Our codebase is primarily Go. (`kite-go`, `kite-golib` directories)
* Infrastructure uses Terraform for AWS provisioning, and Fabric/shell scripts for deployment and management of remote hosts (`devops` directory)
* You need VPN credentials to access any of our remote AWS (or Azure) hosts.
* Platform-specific logic & instructions live in subdirectories `osx`, `windows`, `linux`. You probably don't need these.
Git LFS
--
We use [Git LFS](https://git-lfs.github.com/) to store our various `bindata.go` files. You will need to install the command line tool to get the contents of those files when you pull the repository. Installation instructions are on their website, but on macOS you can install it by running (from inside the `kiteco` repository):
```
brew update
brew install git-lfs
git lfs install
```
Then do a `git pull` to get the bindata.go files. If they do not download from LFS, try running `git lfs pull` (you should only need to do this once - subsequent `git pull`s should update the bindata correctly).
### Optional: Improving Performance
`git lfs install` installs a [smudge filter](https://git-scm.com/docs/gitattributes) that automatically downloads and replaces the contents of newly checked out "pointer files" with their content.
By default, smudge filters operate on checked-out blobs in sequence, so they cannot download in batch as `git lfs pull` does.
Furthermore, by default, git checkouts will block on downloading the new LFS files, which can be annoying.
You might prefer to disable the smudge filter (this can be run even if you've already run the regular `git lfs install`):
```
git lfs install --skip-smudge
git lfs pull
```
Then, when building after a new checkout, you may see an error of the form "expected package got ident."
This occurs because `go` reads some Go files and sees the Git LFS pointers instead of the actual data file.
At this point, you can download the latest files with `git lfs pull` and rebuilding should work.
Nothing needs to be done when pushing LFS blobs. That will still happen automatically.
Go
--
The bulk of our code is currently in Go.
This can be found at [github.com/kiteco/kiteco/kite-go](http://github.com/kiteco/kiteco/kite-go).
To get started working in this part of the codebase, first make sure your Go environment is set up correctly (i.e. Go is installed, `$GOPATH` is set, etc.).
Note that you will need Go 1.15.3 installed locally. The following steps will get you going.
Set `$GOPATH` in your `.profile` / `.bashrc` / `.bash_profile` / `.zshrc`, e.g.:
```sh
export GOROOT=/usr/local/go
export GOPATH=$HOME/go
export PATH=$PATH:$GOROOT/bin:$GOPATH/bin
```
Make sure to create these directories as well:
```sh
mkdir $HOME/go
mkdir $HOME/go/src $HOME/go/bin $HOME/go/pkg
```
If you are on a Mac and set the above in either .bashrc or .zshrc, make sure to load it in either your .profile or .bash_profile.
See [this](http://www.joshstaiger.org/archives/2005/07/bash_profile_vs.html) for an explanation.
It would be useful to become familiar with how `go` code is organized. Check out https://golang.org/doc/code.html for more on this topic.
Navigate to where the `kiteco` repo will live in your `GOPATH`, and clone the repo.
```sh
# Create kiteco directory within GOPATH, and clone the repo there
mkdir -p ~/go/src/github.com/kiteco
cd ~/go/src/github.com/kiteco
git clone git@github.com:kiteco/kiteco
```
To install the latest version of Go that's compatible with our codebase, run:
```sh
cd ~/go/src/github.com/kiteco/kiteco
cd devops/scripts
./install-golang.sh
```
From here, just run `make install-deps` from the root of the `kiteco` repo to get basic utilities installed.
```sh
# Install dependencies
make install-deps
```
Use `./scripts/update-golang-version.sh` if you'd like to make Kite require a newer version of Golang.
### Tensorflow
For development builds (see below), you may need to have Tensorflow installed globally on your system.
```bash
make install-libtensorflow
```
Building Kite
-------------
You're now ready to build Kite! First, build the sidebar for your platform
```bash
./osx/build_electron.sh force
# ./linux/build_electron.sh force
# ./windows/build_electron.sh force
```
This build is separate from the Kite daemon build, so you must manually rebuild the sidebar as needed.
Now build and run Kite:
```bash
make run-standalone
```
Note that this is not a full Kite build, but is the recommended approach for development, as it is much faster.
Some functionality is disabled in the development build (depending on the platform):
- Kite system tray icon
- Updater service
Development
-----------
You should be able to develop, build, and test Kite entirely on your local machine.
However, we do have cloud instances & VMs available for running larger jobs and for
[testing our cloud services](VAGRANT.md).
### Dependency Management with Go Modules
We use the [Go Modules](https://blog.golang.org/using-go-modules) system for dependency management.
General tips:
- make sure you are working in `~/go/src/github.com/kiteco/kiteco` and not through a symlink
- make sure deps are updated to the versions in `go.mod`: `go mod download`
- Set `$GOPRIVATE` in your `.profile` / `.bashrc` / `.bash_profile` / `.zshrc`, e.g.: `export GOPRIVATE=github.com/kiteco/*`.
To add or update a dependency, all you need to do is `go get` it, which
will automatically update the `go.mod` and `go.sum` files. To remove a dependency,
remove references to it in the code and run `go mod tidy`. In general, make sure to
run `go mod tidy` to make sure all new dependencies have been added and unused ones
have been removed before committing any dependency changes.
The process for updating a dependency is:
- `go get -u github.com/foo/bar`
- (optional) run any `go` command, such as `go build`, `go test`
- `go mod tidy`
- `git add go.mod go.sum`
- `git commit ...`
The process for adding a dependency is:
- `go get github.com/foo/bar`
- edit code to import "github.com/foo/bar"
- `go mod tidy`
- `git add go.mod go.sum`
- `git commit ...`
#### HTTPS Auth
When fetching dependencies, `go` may attempt to clone private repositories via HTTPS, requiring manual authentication.
Instead, you can add the following section to your `~/.gitconfig` in order to force SSH authentication:
```
[url "git@github.com:"]
insteadOf = https://github.com/
```
### Datasets, Datadeps
We bundle a lot of pre-computed datasets & machine learning models into the Kite app
through the use of a custom filemap & encoding on top of [go-bindata](https://github.com/jteeuwen/go-bindata).
The data, located in `kite-go/client/datadeps`, is kept in Git-LFS.
All needed data files are first stored on S3.
There are pointers at various places in our codebase to S3 URIs.
After updating references to these datasets, the datadeps file must be manually rebuilt:
```
$ ./scripts/build_datadeps.sh
```
This will bundle all data that is loaded at Kite initialization time.
You must ensure the needed data is loaded at initialization, otherwise it will not be included!
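For illustration, here is a minimal sketch, not the actual datadeps API, of what such an S3-URI pointer and its load-at-initialization usage typically look like. The constant name and S3 path below are hypothetical; only `fileutil.NewReader` is taken from this codebase.
```go
package example

import (
	"io/ioutil"
	"log"

	"github.com/kiteco/kiteco/kite-golib/fileutil"
)

// exampleModelPath is a hypothetical dataset pointer; real pointers live next
// to the code that uses them, and build_datadeps.sh bundles whatever is loaded
// at initialization time.
const exampleModelPath = "s3://kite-data/example/model.bin"

// loadExampleModel reads the dataset at initialization so that it ends up in
// the bundled datadeps file.
func loadExampleModel() []byte {
	r, err := fileutil.NewReader(exampleModelPath)
	if err != nil {
		log.Fatalf("error opening %s: %v", exampleModelPath, err)
	}
	defer r.Close()
	buf, err := ioutil.ReadAll(r)
	if err != nil {
		log.Fatalf("error reading %s: %v", exampleModelPath, err)
	}
	return buf
}
```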
### Logs
Some logs are displayed in Xcode, but most are written to a log file:
```shell
tail -F ~/.kite/logs/client.log
```
### Testing and Continuous Integration
Your Go code should pass several quality criteria before being allowed into the master branch. Travis CI (https://travis-ci.org/) acts as the gatekeeper between pull requests and merging. You can test your code before pushing to a pull request to speed up the process by navigating to the `kite-go` directory and running `make *` commands directly (any of `make (fmt|lint|vet|bin-check|build|test)`).
### VPN Access
You will need access to our VPN to connect to our backend hosts.
* Get VPN credentials (*.ovpn file) from @tarak (You will need to type in a password IRL - don't IM/chat it)
* Install Tunnelblick for OS X (https://code.google.com/p/tunnelblick/)
* Double click on the “.ovpn” file that contains your credentials.
* Tunnelblick should automatically apply the configuration; look for its icon in the OS X status bar
* Click on the Tunnelblick icon, select your config, and enter your VPN password. (**NOTE**: Tunnelblick will complain saying the IP hasn't changed. Check the box to disable the message and continue.)
* Ping 'test-0.kite.com' and make sure it resolves. It's okay if the pings time out; ICMP is disabled by default on AWS instances.
### SSH Access
Kite's Dropbox has ssh credentials for all the machines on AWS and Azure under Shared > Engineering > keys > kite-dev.pem and Shared > Engineering > keys > kite-dev-azure. Place both of these in your .ssh directory, i.e. ~/.ssh/kite-dev.pem. As a convenience, you should add the following to your `~/.ssh/config`:
```
Host *.kite.com
ForwardAgent yes
IdentityFile ~/.ssh/kite-dev.pem
User ubuntu
# Test instances are on Azure
Host test-*.kite.com
User ubuntu
IdentityFile ~/.ssh/kite-dev-azure
```
Don't forget to set appropriate permissions on the credential files (e.g. 700).

46
VAGRANT.md Normal file
View File

@ -0,0 +1,46 @@
Cloud Development
=================
In the past, Kite's language analysis facilities ran on an AWS/Azure backend instead of the user's machine.
Several backend components remain, including the symbol API that serves web docs and the service behind the web sandbox.
### Vagrant
We use VMs for backend development to guarantee a consistent environment between development and production.
To get this set up, first [set up Vagrant](vagrant-boxes/kite-dev/README.md).
Once you have a shell in the virtual machine, the kiteco repo's working directory should be at:
```sh
$HOME/go/src/github.com/kiteco/kiteco
```
NOTE: This is a symlink to `/kiteco`, mounted as an NFS share in the `Vagrantfile`.
All commands (`make *`, `go build`, etc) must be run from the full `$HOME/go/src/github.com/kiteco/kiteco` path (not a symlinked directory).
From here, you may need to repeat some of the steps from the original dev setup, e.g:
```sh
# Install dependencies
make install-deps
```
Because `user-node` requires too many resources to run locally, test instances are available on AWS/Azure for running and testing your development changes to `user-node`.
Please see https://kite.quip.com/Phk4AB8lLqh9 for a list of test instances; we no longer have per-developer test instances,
so please notify others before deploying the backend or otherwise running resource-intensive processes.
Once you have your test instance, you can deploy your local changes to it by running:
```sh
cd ~/go/src/github.com/kiteco/kiteco
./scripts/deploy_test.sh test-N.kite.com
```
#### Infrastructure
Our AWS infrastructure makes use of [Terraform](http://www.terraform.io/). Terraform helps us manage our AWS topology. Please do not modify this unless you know what you are doing :). Our Terraform configuration files can be found in [github.com/kiteco/kiteco/devops/terraform](http://github.com/kiteco/kiteco/devops/terraform).
We use Fabric to execute some commands on remote hosts (others are simply shell scripts that invoke SSH). The Fabric scripts can be found at [github.com/kiteco/kiteco/devops/fabric](http://github.com/kiteco/kiteco/devops/fabric).

1
airflow/.dockerignore Normal file
View File

@ -0,0 +1 @@
dev/**

2
airflow/.gitignore vendored Normal file
View File

@ -0,0 +1,2 @@
dev/
terraform/.terraform.lock.hcl

2
airflow/.pep8 Normal file
View File

@ -0,0 +1,2 @@
[flake8]
max-line-length = 160

25
airflow/Dockerfile Normal file
View File

@ -0,0 +1,25 @@
FROM apache/airflow:1.10.12
USER airflow
ARG KITECO=${AIRFLOW_HOME}/kiteco
WORKDIR ${KITECO}
COPY --chown=airflow:root airflow/requirements.txt airflow/MANIFEST.in airflow/setup.py ./airflow/
COPY --chown=airflow:root airflow/kite_airflow ./airflow/kite_airflow
COPY --chown=airflow:root kite-python/metrics ./kite-python/metrics
RUN python -m pip install --user --upgrade pip && \
python -m pip install --user --no-cache-dir -r airflow/requirements.txt && \
python -m pip install --user ./airflow/ && \
python -m pip install --user ./kite-python/metrics/
WORKDIR ${AIRFLOW_HOME}
RUN rm -rf dags
RUN ln -s ${KITECO}/airflow/kite_airflow/dags dags
RUN mkdir conf
COPY airflow/conf/prod/airflow.cfg conf/
VOLUME /opt/airflow/conf
RUN ln -s conf/airflow.cfg .

2
airflow/MANIFEST.in Normal file
View File

@ -0,0 +1,2 @@
graft kite_airflow/templates
graft kite_airflow/files

32
airflow/Makefile Normal file
View File

@ -0,0 +1,32 @@
ECR_REPO_URL=XXXXXXX.dkr.ecr.us-west-1.amazonaws.com
ECR_PACKAGE_NAME=kite-airflow
TAG=$(shell git rev-parse --short HEAD)
docker.login:
aws ecr get-login-password --region us-west-1 | docker login --username AWS --password-stdin $(ECR_REPO_URL)
docker.build:
docker build -t $(ECR_PACKAGE_NAME):$(TAG) ../ -f Dockerfile
cd containers/monetizable && make TAG=$(TAG) docker.build
docker.tag:
docker tag $(ECR_PACKAGE_NAME):$(TAG) $(ECR_REPO_URL)/$(ECR_PACKAGE_NAME):$(TAG)
cd containers/monetizable && make TAG=$(TAG) docker.tag
docker.push:
docker push $(ECR_REPO_URL)/$(ECR_PACKAGE_NAME):$(TAG)
cd containers/monetizable && make TAG=$(TAG) docker.push
docker.all: docker.login docker.build docker.tag docker.push
deployment.apply:
docker run --rm -w /opt/terraform -v $(PWD)/terraform:/opt/terraform -v $(HOME)/.gcloud/:/root/.gcloud -v $(HOME)/.aws/:/root/.aws -v $(HOME)/.config/gcloud/:/root/.config/gcloud -it ljfranklin/terraform-resource:latest terraform apply -var tag=$(TAG)
deployment.list:
@aws ecs list-tasks --cluster airflow | jq -r '.taskArns[]' | xargs aws ecs describe-tasks --cluster airflow --tasks | jq -r '.tasks[] | {"group": .group, "container": .containers[]} | (.group + " " + .container.name + " " + (.container.image | capture(".*:(?<tag>[0-9a-f]+)$$") | .tag) + " " + .container.lastStatus)'
deployment.shell:
exec docker run --rm -w /opt/terraform -v $(PWD)/terraform:/opt/terraform -v $(HOME)/.gcloud/:/root/.gcloud -v $(HOME)/.aws/:/root/.aws -v $(HOME)/.config/gcloud/:/root/.config/gcloud -it ljfranklin/terraform-resource:latest /bin/bash
python.lint:
flake8 kite_airflow

32
airflow/README.md Normal file
View File

@ -0,0 +1,32 @@
Kite Airflow
============
UI
-------------
Airflow is deployed to https://airflow.kite.dev (VPN required).
How to Deploy
-------------
Requirements:
* AWS CLI
* JQ (https://stedolan.github.io/jq/download/)
* Docker
Deployment:
* Log in to AWS ECR: `make docker.login`
* Deploy: `make docker.all` followed by `make deployment.apply`
* Confirm the Terraform deploy by typing "yes"
To see deployment status:
* `make deployment.list`
Adding metrics to kite status 1d
--------------------------------
* Ensure the field is in dags/files/kite_status.schema.yaml.
* Add the aggregation to dags/templates/athena/queries/kite_status_1d.tmpl.sql.
* Deploy.
* Manually trigger the DAG "update_kite_status_schema": http://XXXXXXX:8080/admin/airflow/tree?dag_id=update_kite_status_schema
* Let the kite_status_1d jobs run at their normally-scheduled time.

View File

@ -0,0 +1,22 @@
[core]
executor = CeleryExecutor
load_examples = False
remote_logging = True
remote_log_conn_id = aws_us_east_1
remote_base_log_folder = s3://kite-backend-logs/airflow/logs
enable_xcom_pickling = False
dag_concurrency = 32
max_active_runs_per_dag = 32
dag_file_processor_timeout = 6000
parallelism = 64
[scheduler]
max_threads = 8
[celery]
broker_url = XXXXXXX
worker_concurrency = 16
[secrets]
backend = kite_airflow.secrets_backend.SecretsManagerBackend
backend_kwargs = {"connections_prefix": "airflow/connections", "variables_prefix": "airflow/variables", "region_name": "us-west-1"}

View File

@ -0,0 +1 @@
build/

View File

@ -0,0 +1,13 @@
FROM ubuntu:20.04
# Fixes x509: certificate signed by unknown authority when fetching from AWS
RUN apt-get update && apt-get install -y \
ca-certificates \
&& rm -rf /var/lib/apt/lists/*
ARG BUILD_HASH=unset
ENV BUILD_HASH=$BUILD_HASH
WORKDIR /opt/svc/
COPY build/monetizable .
ENTRYPOINT ["./monetizable"]

View File

@ -0,0 +1,41 @@
DEV_TAG=airflow_monetizable_dev:latest
KITECO=$(PWD)/../../..
DOCKER_RUN_CMD=docker run --rm -it -e AWS_ACCESS_KEY_ID=$(AWS_ACCESS_KEY_ID) -e AWS_SECRET_ACCESS_KEY=$(AWS_SECRET_ACCESS_KEY) -v $(KITECO):/go/src/github.com/kiteco/kiteco $(DEV_TAG)
ECR_REPO_URL=XXXXXXX.dkr.ecr.us-west-1.amazonaws.com
ECR_PACKAGE_NAME=kite-airflow-monetizable
TAG=$(shell git rev-parse --short HEAD)
KITECO=$${PWD%/kiteco/**}/kiteco
CWD_RELATIVE=$${PWD\#/**/kiteco}
GO_IMAGE=golang:1.15.3-buster
dev.build: build/monetizable
docker build --build-arg BUILD_HASH=$(TAG) -t $(DEV_TAG) .
dev.shell:
@exec $(DOCKER_RUN_CMD) /bin/bash
docker.login:
aws ecr get-login-password --region us-west-1 | docker login --username AWS --password-stdin $(ECR_REPO_URL)
docker.build: build/monetizable
docker build --build-arg BUILD_HASH=$(TAG) -t $(ECR_PACKAGE_NAME):$(TAG) .
docker.tag:
docker tag $(ECR_PACKAGE_NAME):$(TAG) $(ECR_REPO_URL)/$(ECR_PACKAGE_NAME):$(TAG)
docker.push:
docker push $(ECR_REPO_URL)/$(ECR_PACKAGE_NAME):$(TAG)
docker.all: docker.login docker.build docker.tag docker.push
build/monetizable: build main.go
docker run --rm -e "GOPRIVATE=github.com/kiteco/*" \
-v $(KITECO):/go/src/github.com/kiteco/kiteco \
-v $(PWD)/build:/build \
-w /go/src/github.com/kiteco/$(CWD_RELATIVE) \
$(GO_IMAGE) go build -o /build/monetizable .
build:
mkdir -p build

View File

@ -0,0 +1,112 @@
package main
import (
"bufio"
"compress/gzip"
"encoding/json"
"flag"
"fmt"
"log"
"net/url"
"os"
"path"
"strings"
"time"
"github.com/kiteco/kiteco/kite-golib/awsutil"
"github.com/kiteco/kiteco/kite-golib/conversion/monetizable"
"github.com/kiteco/kiteco/kite-golib/fileutil"
)
// Result type
type Result struct {
Score float64 `json:"score"`
Timestamp int64 `json:"timestamp"`
Userid string `json:"userid"`
ModelVersion string `json:"model_version"`
}
// Inputs alias
type Inputs = monetizable.Inputs
// Input type
type Input struct {
Userid string `json:"userid"`
Inputs
}
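// main reads the gzipped JSON-lines exports under the S3 prefix given by -data,
// scores each record with the monetizable model, and writes one JSON result file
// per input under -dest.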
func main() {
var dataPath string
flag.StringVar(&dataPath, "data", "", "path to data directory")
var region string
flag.StringVar(&region, "region", "us-east-1", "AWS region of source data path")
var destPath string
flag.StringVar(&destPath, "dest", "", "path to destination directory")
flag.Parse()
buildHash := os.Getenv("BUILD_HASH")
runTS := time.Now().Unix()
dataURL, err := url.Parse(dataPath)
if err != nil {
log.Fatalf("Error parsing data path, %v", err)
}
keys, err := awsutil.S3ListObjects(region, dataURL.Hostname(), strings.TrimPrefix(dataURL.Path, "/"))
if err != nil {
log.Fatalf("Error listing data directory, %v", err)
}
for _, key := range keys {
srcFilename := fmt.Sprintf("s3://%s", path.Join(dataURL.Hostname(), key))
dstFilename := fmt.Sprintf("%s/%s", destPath, fmt.Sprintf("%s.json", strings.TrimSuffix(path.Base(key), ".gz")))
log.Printf("Processing file %s, destination=%s", srcFilename, dstFilename)
handleFile(srcFilename, dstFilename, buildHash, runTS)
}
}
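// handleFile streams one gzipped JSON-lines file from S3, computes a monetizable
// score per record, and encodes the results as JSON to dstFilename.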
func handleFile(srcFilename string, dstFilename string, buildHash string, runTS int64) {
zReader, err := fileutil.NewReader(srcFilename)
if err != nil {
log.Fatalf("Error reading data file %s", srcFilename)
}
defer zReader.Close()
gr, err := gzip.NewReader(zReader)
if err != nil {
log.Fatalf("Error reading gzip data in file %s", srcFilename)
}
defer gr.Close()
outf, err := fileutil.NewBufferedWriter(dstFilename)
if err != nil {
log.Fatalf("Error opening file %s for writing, %v", dstFilename, err)
}
defer outf.Close()
writer := json.NewEncoder(outf)
scanner := bufio.NewScanner(gr)
for scanner.Scan() {
var input Input
err = json.Unmarshal(scanner.Bytes(), &input)
if err != nil {
log.Fatalf("Error parsing JSON from %s, %v", srcFilename, err)
}
score, err := monetizable.Score(input.Inputs)
if err != nil {
log.Fatalf("Error computing score, %v", err)
}
var result = Result{
Score: score,
Timestamp: runTS,
Userid: input.Userid,
ModelVersion: buildHash,
}
if err := writer.Encode(result); err != nil {
log.Fatalf("Error writing result data file %s, %v", dstFilename, err)
}
}
}

View File

@ -0,0 +1,71 @@
version: "3"
services:
postgres:
image: "postgres:9.6"
container_name: "postgres"
environment:
- POSTGRES_USER=airflow
- POSTGRES_PASSWORD=XXXXXXX
- POSTGRES_DB=airflow
ports:
- "5432:5432"
volumes:
- ./dev/data/postgres:/var/lib/postgresql/data
initdb:
build:
context: ../
dockerfile: ./airflow/Dockerfile
entrypoint: airflow initdb
depends_on:
- postgres
volumes:
- ./conf/dev:/opt/airflow/conf
- $HOME/.aws/:/home/airflow/.aws
webserver:
build:
context: ../
dockerfile: ./airflow/Dockerfile
restart: always
depends_on:
- initdb
environment:
- AWS_ACCESS_KEY_ID
- AWS_SECRET_ACCESS_KEY
- AWS_SESSION_TOKEN
volumes:
- ../:/opt/airflow/kiteco
- ./conf/dev:/opt/airflow/conf
- ./dev/logs:/opt/airflow/logs
- $HOME/.aws/:/home/airflow/.aws
ports:
- "8080:8080"
entrypoint: airflow webserver
healthcheck:
test: ["CMD-SHELL", "[ -f /opt/airflow/airflow-webserver.pid ]"]
interval: 30s
timeout: 30s
retries: 3
scheduler:
build:
context: ../
dockerfile: ./airflow/Dockerfile
restart: always
depends_on:
- initdb
environment:
- AWS_ACCESS_KEY_ID
- AWS_SECRET_ACCESS_KEY
- AWS_SESSION_TOKEN
volumes:
- ../:/opt/airflow/kiteco
- ./conf/dev:/opt/airflow/conf
- ./dev/logs:/opt/airflow/logs
- $HOME/.aws/:/home/airflow/.aws
ports:
- "8793:8793"
entrypoint: airflow scheduler
healthcheck:
test: ["CMD-SHELL", "[ -f /opt/airflow/airflow-scheduler.pid ]"]
interval: 30s
timeout: 30s
retries: 3

View File

View File

View File

@ -0,0 +1,18 @@
from airflow.models import Variable
from kite_airflow.common import utils
CIO_CREDENTIALS = Variable.get('cio_credentials' if utils.is_production() else 'cio_credentials_dev', deserialize_json=True)
CIO_MAX_CONCURRENT_REQUESTS = 50
MP_CREDENTIALS = Variable.get('mixpanel_credentials' if utils.is_production() else 'mixpanel_credentials_dev', deserialize_json=True)
MP_MAX_CONCURRENT_REQUESTS = 100
# S3
AWS_CONN_ID = 'aws_us_east_1'
BUCKET = 'kite-metrics' if utils.is_production() else 'kite-metrics-test'
DIR_SCRATCH_SPACE = 'athena-scratch-space'
DIR_SCRATCH_URI = 's3://{}/{}'.format(BUCKET, DIR_SCRATCH_SPACE)
# Athena
DB_KITE_METRICS = 'kite_metrics'

View File

@ -0,0 +1,73 @@
import csv
import codecs
from airflow.hooks.S3_hook import S3Hook
from kite_airflow.common import utils
from kite_airflow.common import configs
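# Pull the Athena result filename from XCom (pushed by task_id) and return a
# csv.DictReader over the matching CSV in the S3 scratch space.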
def get_scratch_csv_dict_reader(ti, task_id, sub_directory):
s3 = S3Hook(configs.AWS_CONN_ID)
filename = ti.xcom_pull(task_ids=task_id)
s3key = s3.get_key(
'{}/{}/{}.csv'.format(configs.DIR_SCRATCH_SPACE, sub_directory, filename),
configs.BUCKET,
)
return csv.DictReader(
codecs.getreader("utf-8")(s3key.get()['Body'])
)
def get_full_scratch_space_csv(ti, task_id, sub_directory):
reader = get_scratch_csv_dict_reader(ti, task_id, sub_directory)
row_list = []
for row in reader:
row_list.append(row)
return row_list
def get_line_of_scratch_space_csv(ti, task_id, sub_directory):
reader = get_scratch_csv_dict_reader(ti, task_id, sub_directory)
i = 0
for row in reader:
i += 1
yield i, row
def get_csv_file_as_dict(bucket, file_path):
s3 = S3Hook(configs.AWS_CONN_ID)
s3key = s3.get_key(file_path, bucket)
reader = csv.DictReader(codecs.getreader("utf-8")(s3key.get()['Body']))
row_list = []
for row in reader:
row_list.append(row)
return row_list
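# Serialize a list of dicts to CSV and upload it to S3. Note: keys and values are
# joined with ',' without CSV quoting, so fields must not contain commas.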
def write_dict_on_csv_file(bucket, file_path, data_list):
if(len(data_list) == 0):
return
s3_hook = S3Hook(configs.AWS_CONN_ID)
upload_data_list = []
keys = data_list[0].keys()
upload_data_list.append(','.join(keys))
for item in data_list:
values = item.values()
upload_data_list.append(','.join(values))
s3_hook.load_bytes(
'\n'.join(upload_data_list).encode('utf-8'),
file_path,
bucket,
replace=True,
)

View File

@ -0,0 +1,35 @@
import ast
import datetime
import time
import uuid
import json
from airflow.models import Variable
import kite_metrics
kite_status_config = kite_metrics.load_context('kite_status')
def is_production():
return Variable.get('env', 'dev') == 'production'
def get_supported_languages():
return kite_status_config['languages']
def get_unique_suffix():
return '-{}-{}.json'.format(
get_date_time_in_ISO(),
uuid.uuid4().hex,
)
def get_date_time_in_ISO():
date_time = datetime.datetime.fromtimestamp(time.time())
return date_time.isoformat() + 'Z'
def string_to_dict(string):
return ast.literal_eval(string.replace('=', ':'))

View File

View File

@ -0,0 +1,287 @@
import concurrent.futures
import datetime
import threading
from airflow import DAG
from airflow.operators.python_operator import PythonOperator
from airflow.contrib.operators.aws_athena_operator import AWSAthenaOperator
from jinja2 import PackageLoader
from customerio import CustomerIO
from kite_airflow.common import configs
from kite_airflow.common import utils
from kite_airflow.common import files
from kite_airflow.slack_alerts import task_fail_slack_alert
DIR_BASE_URI = 's3://{}/{}'.format(configs.BUCKET, 'coding-stats-mail')
DIR_APPROX_PERCENTILES = 'approx_percentiles'
DIR_DAILY_ACTIVE_USERS = 'daily_active_users'
DIR_CODING_STATS = 'coding_stats'
TABLE_DAILY_ACTIVE_USERS = 'kite_daily_active_users' if utils.is_production() else 'kite_daily_active_users_dev'
USER_LIMIT = -1  # set to a positive row count to shorten runs during development; -1 disables the limit
NUM_OF_WEEKS = 6
EVENT_STATS_EMAIL = 'send_stats_email_weekly'
cio_local = threading.local()
default_args = {
'owner': 'airflow',
'depends_on_past': False,
'start_date': datetime.datetime(2021, 1, 24),
'email_on_failure': False,
'email_on_retry': False,
'retries': 0,
'retry_delay': datetime.timedelta(minutes=5),
'on_failure_callback': task_fail_slack_alert,
}
kite_coding_stats_email_dag = DAG(
'kite_coding_stats_mail',
description='Weekly coding stats emails to users that are active in last 2 weeks.',
default_args=default_args,
schedule_interval='0 20 * * SUN', # Every Sunday 20:00
jinja_environment_kwargs={
'loader': PackageLoader('kite_airflow', 'templates')
},
)
approx_percentiles_op = AWSAthenaOperator(
aws_conn_id=configs.AWS_CONN_ID,
task_id='get_approx_percentiles',
query='athena/coding_stats_mail/queries/approx_percentiles.sql',
params={
'languages': utils.get_supported_languages(),
},
output_location='{}/{}/'.format(configs.DIR_SCRATCH_URI, DIR_APPROX_PERCENTILES),
database=configs.DB_KITE_METRICS,
dag=kite_coding_stats_email_dag,
)
drop_daily_active_users_op = AWSAthenaOperator(
aws_conn_id=configs.AWS_CONN_ID,
task_id='drop_daily_active_users',
query='athena/coding_stats_mail/queries/drop_daily_active_users.sql',
params={
'table_name': TABLE_DAILY_ACTIVE_USERS,
},
output_location='{}/{}/'.format(configs.DIR_SCRATCH_URI, DIR_DAILY_ACTIVE_USERS),
database=configs.DB_KITE_METRICS,
dag=kite_coding_stats_email_dag,
)
create_daily_active_users_op = AWSAthenaOperator(
aws_conn_id=configs.AWS_CONN_ID,
task_id='create_daily_active_users',
query='athena/coding_stats_mail/tables/kite_daily_active_users.sql',
params={
'table_name': TABLE_DAILY_ACTIVE_USERS,
'data_location': '{}/{}/'.format(DIR_BASE_URI, DIR_DAILY_ACTIVE_USERS),
},
output_location='{}/{}/'.format(configs.DIR_SCRATCH_URI, DIR_DAILY_ACTIVE_USERS),
database=configs.DB_KITE_METRICS,
dag=kite_coding_stats_email_dag,
)
update_daily_active_users_op = AWSAthenaOperator(
aws_conn_id=configs.AWS_CONN_ID,
task_id='update_daily_active_users',
query='athena/coding_stats_mail/queries/update_daily_active_users.sql',
params={
'table_name': TABLE_DAILY_ACTIVE_USERS,
'languages': utils.get_supported_languages(),
},
output_location='{}/{}/'.format(configs.DIR_SCRATCH_URI, DIR_DAILY_ACTIVE_USERS),
database=configs.DB_KITE_METRICS,
dag=kite_coding_stats_email_dag,
)
coding_stats_op = AWSAthenaOperator(
aws_conn_id=configs.AWS_CONN_ID,
task_id='coding_stats',
query='athena/coding_stats_mail/queries/coding_stats.sql',
params={
'table_daily_active_users': TABLE_DAILY_ACTIVE_USERS,
'languages': utils.get_supported_languages(),
'num_of_weeks': NUM_OF_WEEKS,
},
output_location='{}/{}/'.format(configs.DIR_SCRATCH_URI, DIR_CODING_STATS),
database=configs.DB_KITE_METRICS,
dag=kite_coding_stats_email_dag,
)
def get_approx_percentiles(ti):
percentiles_list = files.get_full_scratch_space_csv(
ti,
approx_percentiles_op.task_id,
DIR_APPROX_PERCENTILES,
)[0]
approx_percentiles = []
for percentile_index in range(1, 100):
approx_percentiles.append({
"percentile": percentile_index,
"value": float(percentiles_list[f'pct_{percentile_index}']),
})
return approx_percentiles
def get_coding_time_percentile(coding_hours, percentiles):
max_coding_time_percentile = 0
for index in range(len(percentiles)):
if percentiles[index]["value"] <= coding_hours:
max_coding_time_percentile = percentiles[index]["percentile"]
return max_coding_time_percentile
def is_inactive_user(record):
'''
Check whether a user is inactive by looking at the first two weeks of data:
sum the first two weeks of coding_hours and completions_selected, and return
True only if both sums are zero.
'''
return (
(record['coding_hours'].get(0, 0) + record['coding_hours'].get(1, 0) == 0) and
(record['completions_selected'].get(0, 0) + record['completions_selected'].get(1, 0) == 0)
)
def get_track_object(coding_stat_row, execution_date, all_percentiles):
'''Returns the track_object OR None in case of inactive user'''
# transforms coding stat data to their respective types
coding_stat_row['total_weeks'] = int(coding_stat_row['total_weeks'])
coding_stat_row['streak'] = int(coding_stat_row['streak'])
coding_stat_row['completions_selected'] = utils.string_to_dict(coding_stat_row['completions_selected'])
coding_stat_row['coding_hours'] = utils.string_to_dict(coding_stat_row['coding_hours'])
coding_stat_row['python_hours'] = utils.string_to_dict(coding_stat_row['python_hours'])
if is_inactive_user(coding_stat_row):
return None
coding_time_graph = []
max_coding_hours = max(coding_stat_row['coding_hours'].values())
max_python_hours = max(coding_stat_row['python_hours'].values())
exec_date_end = datetime.datetime(execution_date.year, execution_date.month, execution_date.day) + datetime.timedelta(days=7)
sat_offset = (exec_date_end.weekday() - 5) % 7
sun_offset = (exec_date_end.weekday() - 6) % 7
for week_index in range(NUM_OF_WEEKS - 1, -1, -1):
start_date = exec_date_end.replace(hour=0, minute=0, second=0) - datetime.timedelta(days=7 * (week_index + 1) + sun_offset) # Sunday 12:00am
end_date = exec_date_end.replace(hour=23, minute=59, second=59) - datetime.timedelta(days=7 * week_index + sat_offset) # Saturday 11:59:59pm
coding_hours = coding_stat_row['coding_hours'].get(week_index, 0)
python_hours = coding_stat_row['python_hours'].get(week_index, 0)
completions_selected = coding_stat_row['completions_selected'].get(week_index, 0)
coding_time_graph.append({
'start_date': int(start_date.timestamp()),
'end_date': int(end_date.timestamp()),
'coding_hours': coding_hours,
'scaled_coding_hours': coding_hours / max_coding_hours if max_coding_hours > 0 else 0,
'py_hours': python_hours,
'scaled_py_hours': python_hours / max_python_hours if max_python_hours > 0 else 0,
'completions_used': completions_selected,
'time_saved': python_hours * 0.18,
})
return dict(
all_time_weeks = coding_stat_row['total_weeks'],
streak = coding_stat_row['streak'],
coding_time_percentile = get_coding_time_percentile(
coding_stat_row['coding_hours'].get(week_index, 0),
all_percentiles
),
coding_time_graph = coding_time_graph,
)
def iteration(ti, execution_date, storage_task_name):
all_percentiles = get_approx_percentiles(ti)
start_row = ti.xcom_pull(task_ids=storage_task_name, key='progress')
for i, coding_stat_row in files.get_line_of_scratch_space_csv(ti, coding_stats_op.task_id, DIR_CODING_STATS):
if i <= start_row:
continue
yield (
i,
coding_stat_row['userid'],
get_track_object(coding_stat_row, execution_date, all_percentiles)
)
if i == USER_LIMIT:
return
def send_event_to_cio(item):
i, userid, track_object = item
if not hasattr(cio_local, 'client'):
cio_local.client = CustomerIO(configs.CIO_CREDENTIALS['site_id'], configs.CIO_CREDENTIALS['api_key'])
if track_object != None:
cio_local.client.track(customer_id=userid, name=EVENT_STATS_EMAIL, **track_object)
return i
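# Stream coding-stats rows to Customer.io through a bounded thread pool, checkpointing
# the last processed row index to XCom so a retried run can resume where it failed.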
def submissions_to_cio(ti, execution_date, dag_run, storage_task_name, **context):
queue_size = 100
futures = []
records_iter = iteration(ti, execution_date, storage_task_name)
has_values = True
with concurrent.futures.ThreadPoolExecutor(max_workers=configs.CIO_MAX_CONCURRENT_REQUESTS) as executor:
while has_values:
while len(futures) < queue_size:
try:
futures.append(executor.submit(send_event_to_cio, next(records_iter)))
except StopIteration:
has_values = False
break
mode = concurrent.futures.FIRST_COMPLETED if has_values else concurrent.futures.ALL_COMPLETED
done, not_done = concurrent.futures.wait(futures, timeout=6000, return_when=mode)
futures = list(not_done)
for future in done:
try:
i = future.result()
except Exception:
dag_run.get_task_instance(storage_task_name).xcom_push(
key='progress',
value=i - configs.CIO_MAX_CONCURRENT_REQUESTS  # back off by the pool size since, with threading, we can't know the exact last successful index
)
raise
progress_storage_operator = PythonOperator(
python_callable=lambda ti, **kwargs: ti.xcom_push(key='progress', value=0),
task_id='progress_storage_{}'.format(submissions_to_cio.__name__),
dag=kite_coding_stats_email_dag,
provide_context=True,
)
submissions_to_cio_operator = PythonOperator(
python_callable=submissions_to_cio,
task_id=submissions_to_cio.__name__,
dag=kite_coding_stats_email_dag,
provide_context=True,
op_kwargs={'storage_task_name': 'progress_storage_{}'.format(submissions_to_cio.__name__)}
)
(
approx_percentiles_op,
drop_daily_active_users_op >> create_daily_active_users_op >> update_daily_active_users_op >> coding_stats_op,
progress_storage_operator,
) >> submissions_to_cio_operator

View File

@ -0,0 +1,92 @@
import requests
import time
from airflow.models import Variable
from airflow import DAG
import pendulum
import datetime
import json
from airflow.operators.python_operator import PythonOperator
from kite_airflow.slack_alerts import task_fail_slack_alert
KIBANA_VERSION = '7.9.3'
KIBANA_URL = XXXXXXX
SLACK_URL = 'https://slack.com/api/files.upload'
local_tz = pendulum.timezone('America/Los_Angeles')
default_args = {
'owner': 'airflow',
'depends_on_past': False,
'start_date': datetime.datetime(2020, 10, 27, tzinfo=local_tz),
'email_on_failure': False,
'email_on_retry': False,
'retries': 0,
'on_failure_callback': task_fail_slack_alert,
}
dag = DAG(
'slack_dashboards',
default_args=default_args,
description='Render and post dashboards to Slack.',
schedule_interval='0 10 * * *',
)
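# Enqueue PNG renders of the configured Kibana dashboards, poll until each render is
# ready (retrying on 503), then upload the images to Slack via files.upload.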
def dashboards(conf, **context):
import logging
logger = logging.getLogger("airflow.task")
kibana_requests_kwargs = {'headers': {'kbn-version': KIBANA_VERSION}, 'auth': ('elastic', Variable.get('elastic_password'))}
dashboards = Variable.get("slack_dashboards", deserialize_json=True)
enqueued = []
for dashboard in dashboards:
res = requests.post(dashboard['url'], **kibana_requests_kwargs)
if res.status_code != 200:
raise Exception("Error requesting dashboard, config={}, code={}, response={}".format(json.dumps(dashboard), res.status_code, res.text))
logger.info("ENQUEUE RES={}".format(res.json()))
enqueued.append(res.json())
errors = []
for dashboard, rendered_url in zip(dashboards, enqueued):
logger.info('Waiting for dashboard "{}"'.format(dashboard['slackParams']['title']))
while True:
res = requests.get("{}{}".format(KIBANA_URL, rendered_url['path']), **kibana_requests_kwargs)
if res.status_code == 503:
logger.info('Received 503 response, sleeping.')
time.sleep(60)
continue
elif res.status_code != 200:
errors.append('Error fetching rendered dashboard, config={}, code={}, response={}'.format(json.dumps(dashboard), res.status_code, res.text))
break
logger.info('Kibana response: code={}, response={}'.format(res.status_code, res.content))
filename = dashboard['slackParams']['filename']
logger.info('Slack request: files={}, headers={}, url={}'.format({
'file': (filename, res.content),
**{k: (None, v) for k, v in dashboard['slackParams'].items()},
}, {'Authorization': 'Bearer {}'.format(Variable.get('slack_token'))}, SLACK_URL))
slack_res = requests.post(
SLACK_URL,
files={
'file': (filename, res.content),
**{k: (None, v) for k, v in dashboard['slackParams'].items()},
},
headers={'Authorization': 'Bearer {}'.format(Variable.get('slack_token'))}
)
logger.info('Slack response: code={}, response={}'.format(slack_res.status_code, slack_res.text))
break
if errors:
raise Exception('\n'.join(errors))
dashboards_operator = PythonOperator(
python_callable=dashboards,
task_id='dashboards',
dag=dag,
provide_context=True,
)

View File

@ -0,0 +1,363 @@
import logging
import datetime
import tempfile
import requests
import yaml
import json
import gzip
import re
import time
from airflow import DAG
from jinja2 import Template
import customerio
from airflow.contrib.operators.aws_athena_operator import AWSAthenaOperator
from airflow.hooks.S3_hook import S3Hook
from airflow.hooks.postgres_hook import PostgresHook
from airflow.operators.python_operator import PythonOperator
from airflow.models import Variable
from airflow.sensors.external_task_sensor import ExternalTaskSensor
from airflow.contrib.operators.s3_list_operator import S3ListOperator
from kite_airflow.dags.kite_status_1d import dag as kits_status_1d_dag, read_s3_json_files
from jinja2 import PackageLoader
import concurrent.futures
import pkg_resources
import kite_metrics
from kite_airflow.slack_alerts import task_fail_slack_alert
logger = logging.getLogger(__name__)
default_args = {
'owner': 'airflow',
'depends_on_past': True,
'start_date': datetime.datetime(2020, 6, 28),
'email_on_failure': False,
'email_on_retry': False,
'retries': 1,
'retry_delay': datetime.timedelta(minutes=5),
'on_failure_callback': task_fail_slack_alert,
}
DATA_LOC = 's3://kite-metrics/firehose/kite_status/'
PROD_RESULT_LOC_PREFIX = 's3://kite-metrics/athena-results'
kite_status_config = kite_metrics.load_context('kite_status')
LANGS = kite_status_config['languages']
EDITORS = kite_status_config['editors']
contat_props_tmpl = Template(pkg_resources.resource_string('kite_airflow', 'files/hubspot_contactprops.yaml').decode('utf8'))
contact_props_yaml = contat_props_tmpl.render(editors=EDITORS, langs=LANGS)
contact_props = yaml.load(contact_props_yaml, Loader=yaml.FullLoader)
dag = DAG(
'hubspot_user_metrics',
default_args=default_args,
description='Syncs Kite user metrics from Athena to Hubspot and Customer.io.',
schedule_interval='30 0 * * *',
max_active_runs=1,
jinja_environment_kwargs={
'loader': PackageLoader('kite_airflow', 'templates')
},
)
previous_dag_run_sensor = ExternalTaskSensor(
task_id='previous_dag_run_sensor',
dag=dag,
external_dag_id=dag.dag_id,
execution_delta=datetime.timedelta(days=1),
mode='reschedule',
)
kite_status_dag_run_sensor = ExternalTaskSensor(
task_id='kite_status_dag_run_sensor',
dag=dag,
execution_delta=datetime.timedelta(minutes=20),
external_dag_id=kits_status_1d_dag.dag_id,
mode='reschedule',
)
drop_intermediate_table = AWSAthenaOperator(
aws_conn_id='aws_us_east_1',
task_id='drop_intermediate_table',
query='DROP TABLE kite_metrics.hubspot_intermediate',
output_location='s3://kite-metrics-test/athena-results/ddl',
database='kite_metrics',
dag=dag,
)
create_intermediate_table = AWSAthenaOperator(
aws_conn_id='aws_us_east_1',
task_id='create_intermediate_table',
query='athena/tables/hubspot_intermediate.tmpl.sql',
output_location='s3://kite-metrics-test/athena-results/ddl',
database='kite_metrics',
dag=dag,
params={'props': contact_props},
)
(previous_dag_run_sensor, kite_status_dag_run_sensor) >> drop_intermediate_table >> create_intermediate_table
insert_deltas = AWSAthenaOperator(
aws_conn_id='aws_us_east_1',
task_id='insert_deltas',
query='athena/queries/hubspot_delta.tmpl.sql',
output_location='s3://kite-metrics-test/athena-results/ddl',
database='kite_metrics',
params={'props': contact_props},
dag=dag,
)
insert_deltas >> AWSAthenaOperator(
aws_conn_id='aws_us_east_1',
task_id='cleanup_delta_table',
query='DROP TABLE hubspot_delta_{{ds_nodash}}',
output_location='s3://kite-metrics-test/athena-results/ddl',
database='kite_metrics',
dag=dag,
)
(previous_dag_run_sensor, kite_status_dag_run_sensor) >> insert_deltas
EMAIL_RE = re.compile(r'^\s*[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\s*$', re.I)
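# Read the Hubspot export files from S3, drop records with invalid email domains
# (checked against the IANA TLD list), derive primary language/editor fields, and
# push contact properties to Hubspot in batches of 100.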
def write_contact_prop_data(ti, **context):
props = [p['name'] for p in contact_props if 'label' in p]
props.append('user_id')
s3 = S3Hook('aws_us_east_1')
buffer = []
# Hubspot validates emails against some list of domain extensions. Go fetch a list to replicate that.
domains_resp = requests.get('https://data.iana.org/TLD/tlds-alpha-by-domain.txt')
domains = set([d.lower() for d in domains_resp.text.split('\n') if re.match('^[a-z]+$', d.lower())])
counter = 0
for file in sorted(ti.xcom_pull(task_ids='list_hubspot_json_files')):
obj = s3.get_key(file, 'kite-metrics')
for line in gzip.open(obj.get()['Body']):
counter += 1
if counter % 1000 == 0:
logger.info('Processed {} records'.format(counter))
rec = json.loads(line)
email = rec['email']
if not EMAIL_RE.match(email) or email.rsplit('.', 1)[1] not in domains:
logger.info('Skipping invalid email address {}'.format(email))
continue
if any([rec.get('{}_percentage'.format(key)) is not None for key in LANGS]):
rec['user_data_primary_language'] = max(LANGS, key=lambda x: rec.get('{}_percentage'.format(x)) or 0)
if any([rec.get('python_edits_in_{}'.format(key)) for key in EDITORS]):
rec['user_data_primary_python_editor'] = max(EDITORS, key=lambda x: rec.get('python_edits_in_{}'.format(x)) or 0)
hs_props = {prop: rec[prop] for prop in props if rec.get(prop) is not None}
hs_props['kite_lifecycle_stages'] = 'User' # This property is called 'Source' in HS
buffer.append({'email': email, 'properties': [{'property': prop, 'value': value} for prop, value in hs_props.items()]})
if len(buffer) >= 100:
make_hubspot_request('contacts/v1/contact/batch', buffer)
buffer = []
if buffer:
make_hubspot_request('contacts/v1/contact/batch', buffer)
def copy_kite_users():
pg_hook = PostgresHook(postgres_conn_id='community')
s3 = S3Hook('aws_us_east_1')
tf = tempfile.NamedTemporaryFile()
pg_hook.copy_expert("COPY public.user (id, name, email) TO STDOUT WITH (FORMAT csv)", tf.name)
s3.load_file(tf.name, 'enrichment/kite/users/users.csv', bucket_name='kite-metrics', replace=True)
copy_kite_users_operator = PythonOperator(
python_callable=copy_kite_users,
task_id=copy_kite_users.__name__,
dag=dag,
)
setup_partitions = AWSAthenaOperator(
aws_conn_id='aws_us_east_1',
task_id='setup_final_partitions',
query='MSCK REPAIR TABLE hubspot_intermediate',
output_location='s3://kite-metrics-test/athena-results/ddl',
database='kite_metrics',
dag=dag,
)
[create_intermediate_table, insert_deltas] >> AWSAthenaOperator(
aws_conn_id='aws_us_east_1',
task_id='insert_rollups',
query='athena/queries/hubspot_rollup.tmpl.sql',
output_location='s3://kite-metrics-test/athena-results/ddl',
database='kite_metrics',
dag=dag,
params={
'scalar_props': [p for p in contact_props if 'agg' in p['sql']],
'map_props': [p for p in contact_props if 'map_agg' in p['sql']],
'scalar_time_rollups': set([prop['sql']['agg_days'] for prop in contact_props if 'agg_days' in prop['sql']]),
},
) >> AWSAthenaOperator(
aws_conn_id='aws_us_east_1',
task_id='cleanup_rollup_table',
query='DROP TABLE hubspot_rollup_{{ds_nodash}}',
output_location='s3://kite-metrics-test/athena-results/ddl',
database='kite_metrics',
dag=dag,
) >> setup_partitions
(copy_kite_users_operator, setup_partitions) >> AWSAthenaOperator(
aws_conn_id='aws_us_east_1',
task_id='create_hubspot_final_table',
query='athena/queries/hubspot_final.tmpl.sql',
output_location='s3://kite-metrics-test/athena-results/ddl',
database='kite_metrics',
dag=dag,
params={
'scalar_props': [p for p in contact_props if 'map_agg' not in p['sql']],
'map_props': [p for p in contact_props if 'map_agg' in p['sql']],
},
) >> S3ListOperator(
aws_conn_id='aws_us_east_1',
task_id='list_hubspot_json_files',
bucket='kite-metrics',
prefix='athena/hubspot/final/{{ds}}/',
delimiter='/',
dag=dag,
) >> PythonOperator(
python_callable=write_contact_prop_data,
task_id=write_contact_prop_data.__name__,
dag=dag,
provide_context=True,
) >> AWSAthenaOperator(
aws_conn_id='aws_us_east_1',
task_id='cleanup_final_table',
query='DROP TABLE hubspot_final_{{ds_nodash}}',
output_location='s3://kite-metrics-test/athena-results/ddl',
database='kite_metrics',
dag=dag,
)
def write_cio_profile_attrs(task_instance, execution_date, dag_run, **context):
cio_creds = Variable.get('cio_credentials', deserialize_json=True)
iter_records = read_s3_json_files('kite-metrics', task_instance.xcom_pull(task_ids='list_cio_json_files'))
def iter():
for i, rec in enumerate(iter_records):
if not rec['id'] or not all(ord(c) < 128 for c in rec['id']):
continue
if 'time_zone' in rec:
rec['timezone'] = rec.pop('time_zone')
yield i, rec
def call_cio(item):
i, kwargs = item
customerio.CustomerIO(cio_creds['site_id'], cio_creds['api_key']).identify(**kwargs)
return i
queue_size = 100
pool_size = 20
futures = []
records_iter = iter()
max_i = 0
has_values = True
with concurrent.futures.ThreadPoolExecutor(max_workers=pool_size) as executor:
while has_values:
while len(futures) < queue_size:
try:
futures.append(executor.submit(call_cio, next(records_iter)))
except StopIteration:
has_values = False
break
mode = concurrent.futures.FIRST_COMPLETED if has_values else concurrent.futures.ALL_COMPLETED
done, not_done = concurrent.futures.wait(futures, timeout=6000, return_when=mode)
futures = list(not_done)
for future in done:
i = future.result()
if max_i > 0 and (i // 1000) > (max_i // 1000):
logger.info("Processed line {}".format(i))
max_i = max(max_i, i)
setup_partitions >> AWSAthenaOperator(
aws_conn_id='aws_us_east_1',
task_id='create_cio_table',
query='athena/queries/cio_profile_attrs.tmpl.sql',
output_location='s3://kite-metrics-test/athena-results/ddl',
database='kite_metrics',
dag=dag,
params={
'props': ["country_name", "city_name", "subdivision_1_name", "time_zone"]
},
) >> S3ListOperator(
aws_conn_id='aws_us_east_1',
task_id='list_cio_json_files',
bucket='kite-metrics',
prefix='athena/cio_profile_attrs/{{ds}}/',
delimiter='/',
dag=dag,
) >> PythonOperator(
python_callable=write_cio_profile_attrs,
task_id=write_cio_profile_attrs.__name__,
dag=dag,
provide_context=True,
) >> AWSAthenaOperator(
aws_conn_id='aws_us_east_1',
task_id='cleanup_cio_table',
query='DROP TABLE cio_profile_attrs_{{ds_nodash}}',
output_location='s3://kite-metrics-test/athena-results/ddl',
database='kite_metrics',
dag=dag,
)
USER_DATA_PROPGROUP_NAME = 'user_data'
MAX_TRIES = 3
def make_hubspot_request(path, data=None, method=None, tries=0):
url = 'https://api.hubapi.com/{}?hapikey={}'.format(path, Variable.get('hubspot_apikey'))
req_fn = getattr(requests, method) if method else (requests.post if data else requests.get)
resp = req_fn(url, **({'json': data} if data else {}))
tries = tries + 1
if resp.status_code == 502 and tries < MAX_TRIES:
logger.warn('Got 502 from Hubspot API, sleeping 60 seconds before retry.')
time.sleep(60)
return make_hubspot_request(path, data, method, tries)
if resp.status_code >= 300:
raise Exception('Error make hubspot request, code={}, response={}'.format(resp.status_code, resp.text))
return resp
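# Sync the contact property definitions rendered from hubspot_contactprops.yaml into
# Hubspot's user_data group: create any missing properties and update ones whose
# definition changed.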
def update_contact_props():
props = make_hubspot_request('properties/v1/contacts/properties').json()
props_dict = {prop['name']: prop for prop in props if prop['groupName'] == USER_DATA_PROPGROUP_NAME}
for prop in contact_props:
if 'label' not in prop:
continue
prop = prop.copy()
prop.pop('sql', None)
prop['groupName'] = USER_DATA_PROPGROUP_NAME
if prop['name'] not in props_dict:
make_hubspot_request('properties/v1/contacts/properties', prop)
continue
if {k: v for k, v in props_dict[prop['name']].items() if k in prop} == prop:
continue
make_hubspot_request('properties/v1/contacts/properties/named/{}'.format(prop['name']), prop, 'put')
update_contact_props_operator = PythonOperator(
python_callable=update_contact_props,
task_id=update_contact_props.__name__,
dag=dag,
)
previous_dag_run_sensor >> update_contact_props_operator

View File

@ -0,0 +1,96 @@
import datetime
import logging
import time
from airflow import DAG
from airflow.operators.python_operator import PythonOperator
from airflow.models import Variable
from jinja2 import PackageLoader
import mixpanel
from kite_airflow.dags.hubspot import make_hubspot_request
from kite_airflow.plugins.google import GoogleSheetsRangeOperator
from kite_airflow.slack_alerts import task_fail_slack_alert
logger = logging.getLogger(__name__)
default_args = {
'owner': 'airflow',
'depends_on_past': True,
'start_date': datetime.datetime(2021, 1, 7),
'email_on_failure': False,
'email_on_retry': False,
'retries': 1,
'retry_delay': datetime.timedelta(minutes=5),
'on_failure_callback': task_fail_slack_alert,
}
dag = DAG(
'hubspot_companies',
default_args=default_args,
description='Synchronizes user company data from Hubspot to other systems.',
schedule_interval='0 12 * * *',
max_active_runs=1,
jinja_environment_kwargs={
'loader': PackageLoader('kite_airflow', 'templates')
},
)
MP_COMPANY_PROP = 'Company name'
def write_company_assignments(ti, **ctx):
mp_consumer = mixpanel.BufferedConsumer(max_size=100)
mp_client = mixpanel.Mixpanel(Variable.get('mixpanel_credentials', deserialize_json=True)['token'], consumer=mp_consumer)
logger.info("Fetching company list")
supported_companies = [rec[0] for rec in ti.xcom_pull(task_ids='get_companies_sheet')['values']]
for company in supported_companies:
logger.info("Starting processing for company {}".format(company))
params = {
'limit': 100,
'filterGroups': [{'filters': [
{'propertyName': 'company', 'operator': 'EQ', 'value': company},
{'propertyName': 'user_id', 'operator': 'HAS_PROPERTY'}
]}],
'properties': ['user_id'],
}
n_done = 0
while True:
resp = make_hubspot_request('crm/v3/objects/contacts/search', params).json()
if resp['total'] == 0:
raise Exception('No results for company "{}". Is it mis-spelled?'.format(company))
for res in resp['results']:
mp_client.people_set(
res['properties']['user_id'],
{MP_COMPANY_PROP: company},
meta={'$ignore_time': 'true', '$ip': 0})
n_done += 1
logger.info(" {} / {} records processed".format(n_done, resp['total']))
after = resp.get('paging', {}).get('next', {}).get('after')
if not after:
break
params['after'] = after
time.sleep(20)
mp_consumer.flush()
GoogleSheetsRangeOperator(
gcp_conn_id='google_cloud_kite_dev',
spreadsheet_id='XXXXXXX',
range="'Companies to Import to Mixpanel'!CompanyNames",
task_id='get_companies_sheet',
dag=dag,
) >> PythonOperator(
python_callable=write_company_assignments,
task_id=write_company_assignments.__name__,
dag=dag,
provide_context=True,
)

View File

@ -0,0 +1,314 @@
from airflow import DAG
import datetime
from airflow.hooks.S3_hook import S3Hook
from elasticsearch import Elasticsearch
import json
import gzip
import io
import logging
import base64
from elasticsearch.helpers import bulk
from airflow.operators.python_operator import PythonOperator
from airflow.models import Variable
from airflow.contrib.operators.s3_list_operator import S3ListOperator
from airflow.models.xcom import XCom
import itertools
from airflow.operators.python_operator import ShortCircuitOperator
from jinja2 import PackageLoader
import time
from kite_airflow.plugins.google import GoogleSheetsRangeOperator
import kite_metrics
from kite_airflow.slack_alerts import task_fail_slack_alert
logger = logging.getLogger(__name__)
INDEX_GRANULARITY = datetime.timedelta(days=10)
BUCKET = 'kite-metrics'
KS_INDEX_PREFIX = 'kite_status'
def resolve_dotted_path(doc, path):
container = doc
field_name = path
while '.' in field_name:
container_name, field_name = field_name.split('.', 1)
if container_name not in container:
return None, None
container = container[container_name]
if field_name in container:
return container, field_name
return None, None
def get_index_shard(dt, granularity, epoch=datetime.date(1970, 1, 1)):
date = datetime.date(dt.year, dt.month, dt.day)
rounded = epoch + (date - epoch) // granularity * granularity
return rounded.isoformat()
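# Minimal self-contained sketch (runs only when this module is executed directly; the
# document and dates are made-up examples, not real metrics payloads) of the two helpers
# above: resolve_dotted_path walks a dotted path into nested dicts, and get_index_shard
# buckets a timestamp into INDEX_GRANULARITY-sized shards anchored at the Unix epoch.
if __name__ == '__main__':
    _doc = {'cpu_info': {'sum': '1.5'}}
    _container, _field = resolve_dotted_path(_doc, 'cpu_info.sum')
    assert _container is _doc['cpu_info'] and _field == 'sum'
    assert resolve_dotted_path(_doc, 'missing.field') == (None, None)
    # 1970-01-25 is 24 days after the epoch, so it lands in the 10-day shard starting 1970-01-21
    assert get_index_shard(datetime.datetime(1970, 1, 25, 12), INDEX_GRANULARITY) == '1970-01-21'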
def iter_s3_file(s3_hook, bucket, key):
json_file = s3_hook.get_key(key, bucket)
for line in gzip.open(json_file.get()['Body']):
yield json.loads(line)
def client_event_convert_fn(docs, index_date_suffix, deployments):
for doc in docs:
if 'messageId' not in doc:
continue
if 'properties' not in doc:
continue
event = doc.get('event')
if event == 'Index Build':
index_prefix = 'index_build'
elif event == 'Completion Stats':
index_prefix = 'completions_selected'
else:
continue
index_name = '{}_{}'.format(index_prefix, index_date_suffix)
for field in ['originalTimestamp']:
if field in doc:
del doc[field]
for field in ['repo_stats', 'receivedAt', 'sentAt', 'sent_at', 'parse_info.parse_errors']:
container, field_name = resolve_dotted_path(doc['properties'], field)
if container:
del container[field_name]
for field in ['cpu_info.sum', 'lexical_metrics.score']:
container, field_name = resolve_dotted_path(doc['properties'], field)
if container:
container[field_name] = float(container[field_name])
for field in ['completion_stats']:
if field in doc['properties']:
# completion_stats is a gzipped, base64-encoded JSON list
data = doc['properties'][field]
data = base64.b64decode(data)
data = gzip.GzipFile(fileobj=io.BytesIO(data)).read()
data = json.loads(data)
del doc['properties'][field]
# create one document per completion stat
i = 0
for stat in data:
i += 1
elem = doc
for key in stat:
elem['properties'][key] = stat[key]
yield {
'_index': index_name,
'_id': doc['messageId'] + "-" + str(i),
'_source': elem
}
else:
yield {
'_index': index_name,
'_id': doc['messageId'],
'_source': doc
}
def scrub(a_dict, schema):
res = {}
for k, v in schema['properties'].items():
if k not in a_dict:
continue
a_val = a_dict[k]
elastic = v.get('elastic', False)
if isinstance(a_val, dict):
if elastic:
res[k] = {k1: v1 for k1, v1 in a_val.items() if k1}
elif 'properties' in v:
res[k] = scrub(a_val, v)
continue
if elastic:
res[k] = a_val
return res
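# Worked example for scrub (schema and document are illustrative, not the real kite_status
# schema): only keys flagged `elastic` are kept, and schema nodes with their own
# 'properties' are scrubbed recursively.
#   schema = {'properties': {'userId': {'elastic': True},
#                            'properties': {'properties': {'python_edit': {'elastic': True},
#                                                          'secret': {}}}}}
#   scrub({'userId': 'u1', 'properties': {'python_edit': 3, 'secret': 'x'}, 'extra': 1}, schema)
#     == {'userId': 'u1', 'properties': {'python_edit': 3}}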
kite_status_config = kite_metrics.load_context('kite_status')
kite_status_schema = kite_metrics.load_schema('kite_status')
def kite_status_convert_fn(docs, index_date_suffix, deployments):
total_time = 0
for i, doc in enumerate(docs):
if i and i % 10000 == 0:
logger.info('Done {} records, avg time / record={}'.format(i, total_time / i))
start_time = time.perf_counter()
if doc.get('event') != 'kite_status':
total_time += (time.perf_counter() - start_time)
continue
if not doc.get('messageId'):
total_time += (time.perf_counter() - start_time)
continue
if 'properties' not in doc:
total_time += (time.perf_counter() - start_time)
continue
if sum(doc['properties'].get('{}_events'.format(lang), 0) for lang in kite_status_config['languages']) == 0:
total_time += (time.perf_counter() - start_time)
continue
index_name = '{}_active_{}'.format(KS_INDEX_PREFIX, index_date_suffix)
doc = scrub(doc, kite_status_schema)
for field in ['cpu_samples_list', 'active_cpu_samples_list']:
if not doc['properties'].get(field):
continue
p = field.split('_')[:-2]
new_field = '_'.join(['max'] + p)
doc['properties'][new_field] = max(map(float, doc['properties'][field]))
# We got some bogus timestamps, TODO: validate and cleanup data
for field in ['license_expire', 'plan_end']:
if isinstance(doc['properties'].get(field), int):
if 0 < doc['properties'][field] < 2524636800:
doc['properties'][field] = datetime.datetime.fromtimestamp(doc['properties'][field])
else:
del doc['properties'][field]
# Next block is for backwards compatibility only; it
# can be removed once the content of the PR https://github.com/kiteco/kiteco/pull/10638/ has been released to
# most of our users
for field in ['cpu_samples', 'active_cpu_samples']:
if field in doc['properties']:
samples_str = doc['properties'].pop(field)
if len(samples_str) == 0:
continue
p = field.split('_')[:-1]
new_field = '_'.join(['max'] + p)
doc['properties'][new_field] = max(map(float, samples_str.split(',')))
deployment_id = doc['properties'].get('server_deployment_id')
if deployment_id and deployment_id in deployments:
doc['properties']['server_deployment_name'] = deployments[deployment_id]
doc['payload_size'] = len(doc)  # note: number of top-level keys, not bytes
total_time += (time.perf_counter() - start_time)
yield {'_index': index_name, '_id': doc['messageId'], '_source': doc}
kite_status_dag = DAG(
'elastic_load_kite_status',
description='Load kite_status to Kibana.',
default_args={
'retries': 1,
'retry_delay': datetime.timedelta(minutes=5),
'start_date': datetime.datetime(2020, 10, 15),
'on_failure_callback': task_fail_slack_alert,
},
schedule_interval='*/10 * * * *',
jinja_environment_kwargs={
'loader': PackageLoader('kite_airflow', 'templates')
},
)
client_events_dag = DAG(
'elastic_load_client_events',
description='Load client_events to Kibana.',
default_args={
'retries': 1,
'retry_delay': datetime.timedelta(minutes=5),
'start_date': datetime.datetime(2020, 10, 15),
'on_failure_callback': task_fail_slack_alert,
},
schedule_interval='*/10 * * * *',
jinja_environment_kwargs={
'loader': PackageLoader('kite_airflow', 'templates')
},
)
convert_fns = {'kite_status': kite_status_convert_fn, 'client_events': client_event_convert_fn}
def bulk_index_metrics(bucket, s3_keys, granularity, key, deployments):
s3_hook = S3Hook('aws_us_east_1')
es = Elasticsearch(
cloud_id="metrics:XXXXXXX",
http_auth=("elastic", Variable.get('elastic_password')),
)
def iter():
for s3_key in s3_keys:
dt = datetime.date(*map(int, s3_key.split('/')[2:5]))
index_date_suffix = get_index_shard(dt, granularity)
for rec in convert_fns[key](iter_s3_file(s3_hook, bucket, s3_key), index_date_suffix, deployments):
yield rec
bulk(es, iter())
def skip_no_new_files(ti, **kwargs):
prev_files = set(itertools.chain(*[result.value for result in XCom.get_many(
execution_date=ti.execution_date,
dag_ids=ti.dag_id,
task_ids=ti.task_id,
include_prior_dates=True,
limit=100
)]))
all_files = set((ti.xcom_pull(task_ids='list_prev_json_files') or []) + (ti.xcom_pull(task_ids='list_next_json_files') or []))
curr_files = list(all_files - prev_files)
ti.xcom_push(key='curr_files', value=curr_files)
return len(curr_files) > 0
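# Roughly how the short-circuit above works: XCom.get_many with include_prior_dates=True
# pulls this task's XCom values from up to 100 earlier runs (in practice the 'curr_files'
# lists it pushed), so prev_files holds the S3 keys already handed to the loader; only keys
# listed this run but never seen before are pushed as 'curr_files', and returning False
# skips the downstream tasks when nothing new has arrived.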
for key, dag in [('kite_status', kite_status_dag), ('client_events', client_events_dag)]:
list_ops = [
S3ListOperator(
aws_conn_id='aws_us_east_1',
task_id='list_{}_json_files'.format(k),
bucket='kite-metrics',
prefix="firehose/{}/{{{{ (execution_date + macros.timedelta(hours={})).format('%Y/%m/%d/%H') }}}}/".format(key, diff),
delimiter='/',
dag=dag,
) for k, diff in [('prev', 0), ('next', 1)]
]
def load_fn(ti, params, **kwargs):
s3_keys = ti.xcom_pull(task_ids=skip_no_new_files.__name__, key='curr_files')
logger.info("Loading files {}".format(', '.join(s3_keys)))
deployments_data = ti.xcom_pull(task_ids='copy_server_deployments')['values']
id_col = deployments_data[1].index('Deployment ID')
name_col = deployments_data[1].index('Name')
deployments = {d[id_col]: d[name_col] for d in deployments_data[2:] if len(d) > max(id_col, name_col) and d[name_col].strip()}
bulk_index_metrics(BUCKET, s3_keys, INDEX_GRANULARITY, params['key'], deployments)
return s3_keys
list_ops >> ShortCircuitOperator(
task_id=skip_no_new_files.__name__,
python_callable=skip_no_new_files,
dag=dag,
provide_context=True,
depends_on_past=True,
) >> GoogleSheetsRangeOperator(
gcp_conn_id='google_cloud_kite_dev',
spreadsheet_id='1-XXXXXXX',
range='A:D',
task_id='copy_server_deployments',
dag=dag,
provide_context=True,
) >> PythonOperator(
python_callable=load_fn,
task_id='load_{}'.format(key),
dag=dag,
provide_context=True,
params={'key': key}
)

View File

@ -0,0 +1,366 @@
from datetime import timedelta
import base64
import hashlib
import mixpanel
import gzip
import json
import customerio
from airflow.contrib.operators.s3_list_operator import S3ListOperator
# The DAG object; we'll need this to instantiate a DAG
from airflow import DAG
# Operators; we need this to operate!
from airflow.contrib.operators.aws_athena_operator import AWSAthenaOperator
from airflow.models import Variable
from airflow.hooks.S3_hook import S3Hook
from airflow.operators.python_operator import PythonOperator
from airflow.operators.python_operator import ShortCircuitOperator
import logging
import datetime
from jinja2 import PackageLoader
import kite_metrics
from kite_airflow.slack_alerts import task_fail_slack_alert
logger = logging.getLogger(__name__)
MP_START_DATE = datetime.datetime(2020, 5, 29)
default_args = {
'owner': 'airflow',
'depends_on_past': False,
'start_date': datetime.datetime(2020, 5, 24),
'email_on_failure': False,
'email_on_retry': False,
'retries': 0,
'retry_delay': timedelta(minutes=5),
'on_failure_callback': task_fail_slack_alert,
}
DATA_LOC = 's3://kite-metrics/firehose/kite_status/'
PROD_RESULT_LOC_PREFIX = 's3://kite-metrics/athena-results'
dag = DAG(
'kite_status_1d',
default_args=default_args,
description='Daily kite_status rollups exported to Mixpanel, Customer.io and Elasticsearch.',
schedule_interval='10 0 * * *',
jinja_environment_kwargs={
'loader': PackageLoader('kite_airflow', 'templates')
},
)
kite_status_config = kite_metrics.load_context('kite_status')
kite_status_schema = kite_metrics.load_schema('kite_status')
schema_reload_ops = []
for table_name in ['kite_status', 'kite_status_segment', 'kite_status_normalized']:
schema_reload_ops.append(AWSAthenaOperator(
aws_conn_id='aws_us_east_1',
task_id='drop_{}'.format(table_name),
query='DROP TABLE {{params.table_name}}',
output_location='s3://kite-metrics-test/athena-results/ddl',
database='kite_metrics',
dag=dag,
params={'table_name': table_name},
) >> AWSAthenaOperator(
aws_conn_id='aws_us_east_1',
task_id='create_{}'.format(table_name),
query='athena/tables/{}.tmpl.sql'.format(table_name),
output_location='s3://kite-metrics-test/athena-results/ddl',
database='kite_metrics',
dag=dag,
params={'schema': kite_status_schema, 'table_name': table_name}
))
insert_kite_status_normalized = AWSAthenaOperator(
aws_conn_id='aws_us_east_1',
task_id='insert_kite_status_normalized',
query='athena/queries/kite_status_normalized.tmpl.sql',
output_location='s3://kite-metrics-test/athena-results/ddl',
database='kite_metrics',
dag=dag,
params={'schema': kite_status_schema}
)
cleanup_kite_status_normalized_table = AWSAthenaOperator(
aws_conn_id='aws_us_east_1',
task_id='cleanup_kite_status_normalized_table',
query='DROP TABLE kite_status_normalized_{{ds_nodash}}',
output_location='s3://kite-metrics-test/athena-results/ddl',
database='kite_metrics',
dag=dag,
)
schema_reload_ops >> insert_kite_status_normalized >> cleanup_kite_status_normalized_table
def read_s3_json_files(bucket, file_list):
s3 = S3Hook('aws_us_east_1')
for file in sorted(file_list):
obj = s3.get_key(file, bucket)
for line in gzip.open(obj.get()['Body']):
rec = json.loads(line)
to_clean = [rec]
while to_clean:
this = to_clean.pop()
for k in list(this.keys()):
v = this[k]
if isinstance(v, dict):
to_clean.append(v)
continue
if v is None:
del this[k]
yield rec
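# Illustrative example of the None-stripping above (made-up record, not real data): a line
# decoded as {'userid': 'u1', 'end_time': None, 'properties': {'python_edit': None, 'plan': 'pro'}}
# is yielded as {'userid': 'u1', 'properties': {'plan': 'pro'}}.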
def load_athena_to_elastic(task_instance, execution_date, **context):
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk
es = Elasticsearch(
cloud_id="metrics:XXXXXXX",
http_auth=("elastic", Variable.get('elastic_password')),
)
def iter():
iter_records = read_s3_json_files('kite-metrics', task_instance.xcom_pull(task_ids='list_mixpanel_json_files'))
for i, rec in enumerate(iter_records):
try:
if sum(rec.get('{}_events'.format(lang), 0) for lang in kite_status_config['languages']) == 0:
continue
if rec['event'] != 'kite_status':
continue
ts = datetime.datetime.fromtimestamp(rec['end_time'])
rec_id_str = '{}::{}'.format(rec.get('userid', ''), ts.strftime('%Y/%m/%d'))
rec_id = hashlib.md5(rec_id_str.encode('utf8')).hexdigest()
rec['timestamp'] = ts
yield {'_index': 'kite_status_1d_{}'.format(execution_date.format('%Y%m')), '_id': rec_id, '_source': rec}
except Exception:
logger.exception("Error processing line {}, content={}".format(i, rec))
raise
bulk(es, iter())
event_names = {
'anon_supported_file_edited': 'anon_supported_file_edited_1d',
'anon_kite_status': 'anon_kite_status_1d',
'kite_status': 'kite_status_1d',
}
def load_athena_to_mixpanel(task_instance, execution_date, dag_run, storage_task_name, **context):
mp_consumer = mixpanel.BufferedConsumer(max_size=100)
mp_client = mixpanel.Mixpanel(Variable.get('mixpanel_credentials', deserialize_json=True)['token'], consumer=mp_consumer)
start_row = task_instance.xcom_pull(task_ids=storage_task_name, key='progress')
iter_records = read_s3_json_files('kite-metrics', task_instance.xcom_pull(task_ids='list_mixpanel_json_files'))
for i, rec in enumerate(iter_records):
if i <= start_row:
continue
try:
insert_id = str(base64.b64encode(
hashlib.md5('{}::{}'.format(
rec['userid'],
execution_date.strftime('%Y/%m/%d')).encode('utf8')
).digest())[:16])
rec.update({
'time': rec['end_time'],
'_group': 'firehose/kite_status/{}/'.format(execution_date.strftime('%Y/%m/%d')),
'_version': '1.0.0',
'$insert_id': insert_id,
})
user_id = rec['userid']
name = event_names.get(rec['event'])
if name is None:
continue
if datetime.datetime.today() - execution_date < datetime.timedelta(days=4):
mp_client.track(user_id, name, rec)
else:
ts = rec.pop('time')
mp_client.import_data(Variable.get('mixpanel_credentials', deserialize_json=True)['api_key'], user_id, name, ts, rec)
if i > 0 and i % 10000 == 0:
logger.info("Processed line {}".format(i))
dag_run.get_task_instance(storage_task_name).xcom_push(key='progress', value=i)
except Exception:
dag_run.get_task_instance(storage_task_name).xcom_push(key='progress', value=i-100)
logger.exception("Error processing line {}, content={}".format(i, rec))
raise
mp_consumer.flush()
def load_athena_to_cio(task_instance, execution_date, dag_run, storage_task_name, **context):
import concurrent.futures
cio_creds = Variable.get('cio_credentials', deserialize_json=True)
start_row = task_instance.xcom_pull(task_ids=storage_task_name, key='progress')
iter_records = read_s3_json_files('kite-metrics', task_instance.xcom_pull(task_ids='list_cio_json_files'))
def iter():
for i, rec in enumerate(iter_records):
if i <= start_row:
continue
if rec['event'] != 'kite_status':
continue
rec.update({
'time': rec['end_time'],
'_group': 'firehose/kite_status/{}/'.format(execution_date.strftime('%Y/%m/%d')),
'_version': '1.0.0',
})
user_id = rec['userid']
if not user_id or not all(ord(c) < 128 for c in user_id):
continue
name = event_names.get(rec['event'])
if name is None:
continue
yield i, (user_id, name, rec['time']), rec
def call_cio(item):
i, args, kwargs = item
customerio.CustomerIO(cio_creds['site_id'], cio_creds['api_key']).backfill(*args, **kwargs)
return i
max_i = 0
with concurrent.futures.ThreadPoolExecutor(max_workers=20) as executor:
try:
for i in executor.map(call_cio, iter()):
if max_i > 0 and (i // 1000) > (max_i // 1000):
logger.info("Processed line {}".format(i))
dag_run.get_task_instance(storage_task_name).xcom_push(key='progress', value=max(max_i, i))
max_i = max(max_i, i)
except Exception:
dag_run.get_task_instance(storage_task_name).xcom_push(key='progress', value=max_i)
raise
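# A note on the checkpointing pattern shared by the mixpanel and cio loaders above: row
# progress is pushed to the XCom of a separate 'progress_storage_*' task (wired up below)
# rather than the loader's own, the usual reason being that Airflow clears a task's own
# XComs when it re-runs; on retry the loader reads the stored row index and skips every row
# at or below it, so Mixpanel / Customer.io are not re-sent rows that already went out.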
for key, group_by, downstreams in [
('mixpanel', 'regexp_replace(kite_metrics.kite_status_normalized.userId, \'\p{Cntrl}\')', [(False, load_athena_to_elastic), (True, load_athena_to_mixpanel)]),
('cio', 'regexp_replace(coalesce(kite_metrics.kite_status_normalized.properties__forgetful_metrics_id, kite_metrics.kite_status_normalized.userId), \'\p{Cntrl}\')', [(True, load_athena_to_cio)])
]:
operator = insert_kite_status_normalized >> AWSAthenaOperator(
aws_conn_id='aws_us_east_1',
task_id='insert_kite_status_1d_{}'.format(key),
query='athena/queries/kite_status_1d.tmpl.sql',
output_location='s3://kite-metrics-test/athena-results/ddl',
database='kite_metrics',
params={
'key': key,
'group_by': group_by,
'languages': kite_status_config['languages'],
'editors': kite_status_config['editors'],
'lexical_providers': kite_status_config['lexical_providers'],
'python_providers': kite_status_config['python_providers']
},
dag=dag,
) >> AWSAthenaOperator(
aws_conn_id='aws_us_east_1',
task_id='generate_{}_json'.format(key),
query='athena/queries/kite_status_1d_json.tmpl.sql',
output_location='s3://kite-metrics-test/athena-results/ddl',
database='kite_metrics',
params={'key': key, 'languages': kite_status_config['languages']},
dag=dag,
)
operator >> AWSAthenaOperator(
aws_conn_id='aws_us_east_1',
task_id='cleanup_{}_table_json'.format(key),
query='DROP TABLE kite_status_1d_{{params.key}}_{{ds_nodash}}_json',
output_location='s3://kite-metrics-test/athena-results/ddl',
database='kite_metrics',
params={'key': key},
dag=dag,
)
operator >> AWSAthenaOperator(
aws_conn_id='aws_us_east_1',
task_id='cleanup_{}_table'.format(key),
query='DROP TABLE kite_status_1d_{{params.key}}_{{ds_nodash}}',
output_location='s3://kite-metrics-test/athena-results/ddl',
database='kite_metrics',
params={'key': key},
dag=dag,
)
operator = operator >> S3ListOperator(
aws_conn_id='aws_us_east_1',
task_id='list_{}_json_files'.format(key),
bucket='kite-metrics',
prefix='athena/kite_status_1d_{{params.key}}/json/{{ds}}/',
delimiter='/',
params={'key': key},
dag=dag,
)
def skip_older(execution_date, **ctx):
return execution_date >= MP_START_DATE or (datetime.datetime(2020, 5, 19) < execution_date < datetime.datetime(2020, 5, 26))
skip_older_operator = ShortCircuitOperator(
task_id='skip_older_{}'.format(key),
python_callable=skip_older,
dag=dag,
provide_context=True
)
for use_skip_older, downstream in downstreams:
progress_operator = PythonOperator(
python_callable=lambda ti, **kwargs: ti.xcom_push(key='progress', value=0),
task_id='progress_storage_{}'.format(downstream.__name__),
dag=dag,
provide_context=True,
)
ds_operator = PythonOperator(
python_callable=downstream,
task_id=downstream.__name__,
dag=dag,
retries=4,
provide_context=True,
op_kwargs={'storage_task_name': 'progress_storage_{}'.format(downstream.__name__)}
)
if use_skip_older:
operator >> skip_older_operator >> progress_operator >> ds_operator
else:
operator >> progress_operator >> ds_operator
insert_kite_status_normalized >> AWSAthenaOperator(
aws_conn_id='aws_us_east_1',
task_id='update_activations_table',
query='athena/queries/insert_activations.tmpl.sql',
output_location='s3://kite-metrics-test/athena-results/ddl',
database='kite_metrics',
depends_on_past=True,
dag=dag,
)
update_schema_dag = DAG(
'update_kite_status_schema',
default_args=default_args,
description='Update the kite_status and kite_status_normalized schemas.',
schedule_interval=None,
)
for table_name in ['kite_status', 'kite_status_segment', 'kite_status_normalized']:
AWSAthenaOperator(
aws_conn_id='aws_us_east_1',
task_id='drop_{}'.format(table_name),
query='DROP TABLE {{params.table_name}}',
output_location='s3://kite-metrics-test/athena-results/ddl',
database='kite_metrics',
dag=update_schema_dag,
params={'table_name': table_name},
) >> AWSAthenaOperator(
aws_conn_id='aws_us_east_1',
task_id='create_{}'.format(table_name),
query='athena/tables/{}.tmpl.sql'.format(table_name),
output_location='s3://kite-metrics-test/athena-results/ddl',
database='kite_metrics',
dag=update_schema_dag,
params={'schema': kite_status_schema, 'table_name': table_name}
)

View File

@ -0,0 +1,53 @@
import datetime
import logging
from airflow import DAG
from airflow.contrib.operators.aws_athena_operator import AWSAthenaOperator
from jinja2 import PackageLoader
import kite_metrics
from kite_airflow.slack_alerts import task_fail_slack_alert
logger = logging.getLogger(__name__)
default_args = {
'owner': 'airflow',
'depends_on_past': False,
'start_date': datetime.datetime(2017, 4, 27),
'end_date': datetime.datetime(2020, 2, 23),
'email_on_failure': False,
'email_on_retry': False,
'retries': 0,
'retry_delay': datetime.timedelta(minutes=5),
'on_failure_callback': task_fail_slack_alert,
}
dag = DAG(
'kite_status_segment',
default_args=default_args,
description='Load Segment data into kite_status_normalized',
schedule_interval='10 0 * * *',
jinja_environment_kwargs={
'loader': PackageLoader('kite_airflow', 'templates')
},
)
kite_status_schema = kite_metrics.load_schema('kite_status')
AWSAthenaOperator(
aws_conn_id='aws_us_east_1',
task_id='insert_kite_status_normalized',
query='athena/queries/kite_status_normalized_segment.tmpl.sql',
output_location='s3://kite-metrics-test/athena-results/ddl',
database='kite_metrics',
dag=dag,
params={'schema': kite_status_schema}
) >> AWSAthenaOperator(
aws_conn_id='aws_us_east_1',
task_id='cleanup_kite_status_normalized_table',
query='DROP TABLE kite_status_normalized_{{ds_nodash}}',
output_location='s3://kite-metrics-test/athena-results/ddl',
database='kite_metrics',
dag=dag,
)

View File

@ -0,0 +1,204 @@
from airflow import DAG
import ipaddress
import datetime
import requests
import io
import os
import csv
import zipfile
from airflow.models import Variable
from airflow.hooks.S3_hook import S3Hook
from airflow.operators.python_operator import PythonOperator
from airflow.contrib.operators.aws_athena_operator import AWSAthenaOperator
from kite_airflow.s3_utils import S3DeletePrefixOperator
from jinja2 import PackageLoader
from kite_airflow.slack_alerts import task_fail_slack_alert
default_args = {
'owner': 'airflow',
'depends_on_past': False,
'start_date': datetime.datetime(2020, 6, 12),
'email_on_failure': False,
'email_on_retry': False,
'retries': 0,
'retry_delay': datetime.timedelta(minutes=5),
'on_failure_callback': task_fail_slack_alert,
}
dag = DAG(
'maxmind_geolite2',
description='Load the Maxmind Geolite2 database.',
default_args=default_args,
schedule_interval='0 0 * * 0',
jinja_environment_kwargs={
'loader': PackageLoader('kite_airflow', 'templates')
},
)
maxmind_url = 'https://download.maxmind.com/app/geoip_download?edition_id=GeoLite2-{}-CSV&license_key={}&suffix=zip'
maxmind_files = [
'GeoLite2-Country-Blocks-IPv4',
'GeoLite2-Country-Blocks-IPv6',
'GeoLite2-Country-Locations-en',
]
bucket_name = 'kite-metrics'
key_prefix_template = 'enrichment/maxmind/{prefix}/{dataset}/{ds}/{filename}/'
key_template = key_prefix_template + '{filename}.csv'
def maxmind_operator_fn(ds, **context):
for dataset in ['city', 'country']:
mm_resp = requests.get(maxmind_url.format(dataset.title(), Variable.get('maxmind_license_key')))
mm_zipfile = zipfile.ZipFile(io.BytesIO(mm_resp.content))
s3 = S3Hook('aws_us_east_1')
for path in mm_zipfile.namelist():
if not path.endswith('.csv'):
continue
mm_file = mm_zipfile.open(path)
filename = os.path.splitext(os.path.basename(path))[0]
s3.load_file_obj(mm_file, key_template.format(prefix='raw', dataset=dataset, ds=ds, filename=filename), bucket_name=bucket_name, replace=True)
ipv4_path = [p for p in mm_zipfile.namelist() if p.endswith('GeoLite2-{}-Blocks-IPv4.csv'.format(dataset.title()))][0]
ipv4_file = io.TextIOWrapper(mm_zipfile.open(ipv4_path, 'r'))
ipv4_reader = csv.DictReader(ipv4_file)
ipv4_output = io.StringIO()
ipv4_writer = csv.DictWriter(ipv4_output, ipv4_reader.fieldnames + ['address', 'mask'])
for rec in ipv4_reader:
net = ipaddress.IPv4Network(rec['network'])
rec['address'] = int(net.network_address)
rec['mask'] = int(net.netmask)
ipv4_writer.writerow(rec)
key = key_template.format(prefix='expanded', dataset=dataset, ds=ds, filename='GeoLite2-{}-Blocks-IPv4'.format(dataset))
s3.load_string(ipv4_output.getvalue(), key, bucket_name=bucket_name, replace=True)
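# Small self-contained sketch (runs only when this module is executed directly) of the
# address/mask expansion above: each CIDR block is stored as two integers so that an
# integer client address can later be matched with a bitwise AND. The network below is an
# arbitrary example, not MaxMind data.
if __name__ == '__main__':
    _net = ipaddress.IPv4Network('10.0.0.0/8')
    assert int(_net.network_address) == 10 * 2 ** 24
    assert int(_net.netmask) == 0xFF000000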
maxmind_operator = PythonOperator(
python_callable=maxmind_operator_fn,
task_id='load_maxmind_to_s3',
dag=dag,
provide_context=True,
)
for dataset in ['city']:
maxmind_operator >> AWSAthenaOperator(
aws_conn_id='aws_us_east_1',
task_id='create_{}_names_table'.format(dataset),
query='''CREATE EXTERNAL TABLE kite_metrics.maxmind_{{params.dataset}}_names_{{ds_nodash}} (
geoname_id string,
locale_code string,
continent_code string,
continent_name string,
country_iso_code string,
country_name string,
subdivision_1_iso_code string,
subdivision_1_name string,
subdivision_2_iso_code string,
subdivision_2_name string,
city_name string,
metro_code string,
time_zone string,
is_in_european_union string)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
LOCATION 's3://{{params.bucket}}/{{params.key_prefix_template.format(ds=ds, dataset=params.dataset, prefix='raw', filename=params.filename)}}'
TBLPROPERTIES ('skip.header.line.count'='1')
''',
output_location='s3://kite-metrics-test/athena-results/ddl',
database='kite_metrics',
dag=dag,
params={
'bucket': bucket_name,
'key_prefix_template': key_prefix_template,
'filename': 'GeoLite2-{}-Locations-en'.format(dataset.title()),
'dataset': dataset},
) >> AWSAthenaOperator(
aws_conn_id='aws_us_east_1',
task_id='create_ipv4_{}_table'.format(dataset),
query='''CREATE EXTERNAL TABLE kite_metrics.maxmind_ipv4_{{params.dataset}}_{{ds_nodash}} (
network string,
geoname_id string,
registered_country_geoname_id string,
represented_country_geoname_id string,
is_anonymous_proxy string,
is_satellite_provider string,
postal_code string,
latitude string,
longitude string,
accuracy_radius string,
address bigint,
mask bigint)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
LOCATION 's3://{{params.bucket}}/{{params.key_prefix_template.format(ds=ds, dataset=params.dataset, prefix='expanded', filename=params.filename)}}'
TBLPROPERTIES ('skip.header.line.count'='1')
''',
output_location='s3://kite-metrics-test/athena-results/ddl',
database='kite_metrics',
dag=dag,
params={
'bucket': bucket_name,
'key_prefix_template': key_prefix_template,
'filename': 'GeoLite2-{}-Blocks-IPv4'.format(dataset.title()),
'dataset': dataset
},
) >> AWSAthenaOperator(
aws_conn_id='aws_us_east_1',
task_id='drop_ipv4_{}_table'.format(dataset),
query='''DROP TABLE kite_metrics.maxmind_{{params.dataset}}_ipv4''',
output_location='s3://kite-metrics-test/athena-results/ddl',
database='kite_metrics',
dag=dag,
params={
'dataset': dataset
},
) >> S3DeletePrefixOperator(
aws_conn_id='aws_us_east_1',
task_id='prepare_ipv4_{}_join_destination'.format(dataset),
bucket='kite-metrics',
keys='enrichment/maxmind/join/{{params.dataset}}/ipv4/',
params={'dataset': dataset},
dag=dag,
) >> AWSAthenaOperator(
aws_conn_id='aws_us_east_1',
task_id='create_ipv4_{}_join_table'.format(dataset),
query='''CREATE TABLE kite_metrics.maxmind_{{params.dataset}}_ipv4
WITH (format='PARQUET',
parquet_compression='SNAPPY',
external_location = 's3://{{params.bucket}}/enrichment/maxmind/join/{{params.dataset}}/ipv4/')
AS SELECT
kite_metrics.maxmind_city_names_{{ds_nodash}}.country_iso_code country_iso_code,
kite_metrics.maxmind_city_names_{{ds_nodash}}.country_name country_name,
kite_metrics.maxmind_city_names_{{ds_nodash}}.subdivision_1_name subdivision_1_name,
kite_metrics.maxmind_city_names_{{ds_nodash}}.city_name city_name,
kite_metrics.maxmind_city_names_{{ds_nodash}}.time_zone time_zone,
kite_metrics.maxmind_ipv4_city_{{ds_nodash}}.address address,
kite_metrics.maxmind_ipv4_city_{{ds_nodash}}.mask mask
FROM kite_metrics.maxmind_ipv4_city_{{ds_nodash}}
JOIN kite_metrics.maxmind_city_names_{{ds_nodash}}
ON kite_metrics.maxmind_ipv4_city_{{ds_nodash}}.geoname_id = kite_metrics.maxmind_city_names_{{ds_nodash}}.geoname_id
''',
output_location='s3://kite-metrics-test/athena-results/ddl',
database='kite_metrics',
dag=dag,
params={'bucket': bucket_name, 'dataset': dataset},
) >> AWSAthenaOperator(
aws_conn_id='aws_us_east_1',
task_id='cleanup_ipv4_{}_table'.format(dataset),
query='''DROP TABLE kite_metrics.maxmind_ipv4_{{params.dataset}}_{{ds_nodash}}''',
output_location='s3://kite-metrics-test/athena-results/ddl',
database='kite_metrics',
dag=dag,
params={
'dataset': dataset
},
) >> AWSAthenaOperator(
aws_conn_id='aws_us_east_1',
task_id='cleanup_{}_names_table'.format(dataset),
query='''DROP TABLE kite_metrics.maxmind_{{params.dataset}}_names_{{ds_nodash}}''',
output_location='s3://kite-metrics-test/athena-results/ddl',
database='kite_metrics',
dag=dag,
params={
'dataset': dataset
},
)

View File

@ -0,0 +1,168 @@
import datetime
import io
import gzip
import json
import time
from airflow import DAG
from airflow.contrib.operators.aws_athena_operator import AWSAthenaOperator
from airflow.hooks.S3_hook import S3Hook
from airflow.operators.python_operator import PythonOperator
from airflow.models import Variable
import pytz
import requests
import yaml
from jinja2 import PackageLoader
import pkg_resources
from kite_airflow.slack_alerts import task_fail_slack_alert
default_args = {
'owner': 'airflow',
'depends_on_past': False,
'start_date': datetime.datetime(2020, 1, 1),
'email_on_failure': False,
'email_on_retry': False,
'retries': 1,
'retry_delay': datetime.timedelta(minutes=5),
'on_failure_callback': task_fail_slack_alert,
}
dag = DAG(
'mixpanel_ingest',
default_args=default_args,
description='Mixpanel data ingest DAG.',
schedule_interval='10 4 * * *',
max_active_runs=1,
jinja_environment_kwargs={
'loader': PackageLoader('kite_airflow', 'templates')
},
)
pacific = pytz.timezone('America/Los_Angeles')
people_schema = yaml.load(pkg_resources.resource_stream('kite_airflow', 'files/mixpanel_people.schema.yaml'), Loader=yaml.FullLoader)
def copy_profile_deltas(task_instance, execution_date, prev_execution_date_success, next_execution_date, **context):
ex_day = execution_date.replace(hour=0, minute=0, second=0, microsecond=0)
if prev_execution_date_success:
ex_day = prev_execution_date_success.replace(hour=0, minute=0, second=0, microsecond=0) + datetime.timedelta(days=1)
next_ex_day = next_execution_date.replace(hour=0, minute=0, second=0, microsecond=0)
chunks = [ex_day]
while chunks[-1] < next_ex_day:
chunks.append(chunks[-1] + datetime.timedelta(hours=4))
gz_file = io.BytesIO()
with gzip.GzipFile(fileobj=gz_file, mode="w") as f:
start_date = chunks.pop(0)
for chunk in chunks:
filters = []
for cmp, dt in [['>=', start_date], ['<', chunk]]:
filters.append('user.time {} {}'.format(cmp, 1000 * int(time.mktime(dt.astimezone(pacific).timetuple()))))
start_date = chunk
print(filters)
script = 'function main() {{ return People().filter(function(user) {{ return {}; }})}}'.format(' && '.join(filters))
res = requests.post('https://mixpanel.com/api/2.0/jql',
auth=(Variable.get('mixpanel_credentials', deserialize_json=True)['secret'], ''),
data={'script': script})
if res.status_code != 200:
raise Exception(res.text)
for line in res.json():
to_scrub = [line]
while to_scrub:
curr = to_scrub.pop(0)
for key, value in list(curr.items()):
if isinstance(value, (dict, list)) and len(value) == 0:
del curr[key]
continue
if isinstance(value, dict):
to_scrub.append(value)
if key.startswith('$'):
curr[key[1:]] = value
del curr[key]
for ts_field in ['last_seen', 'time']:
pacific_ts = datetime.datetime.fromtimestamp(line[ts_field] / 1000).replace(tzinfo=pacific)
line[ts_field] = int(time.mktime(pacific_ts.astimezone(pytz.utc).timetuple()))
f.write(json.dumps(line).encode('utf8'))
f.write(b'\n')
s3 = S3Hook('aws_us_east_1')
key = 'mixpanel/people/raw/year={}/month={}/day={}/deltas.json.gz'.format(
execution_date.year, execution_date.month, execution_date.day
)
s3.load_bytes(gz_file.getvalue(), key, 'kite-metrics')
PythonOperator(
python_callable=copy_profile_deltas,
task_id=copy_profile_deltas.__name__,
dag=dag,
retries=2,
provide_context=True,
) >> AWSAthenaOperator(
aws_conn_id='aws_us_east_1',
task_id='rollup_people',
query='athena/queries/mixpanel_people_rollup.tmpl.sql',
output_location='s3://kite-metrics-test/athena-results/ddl',
database='kite_metrics',
dag=dag,
params={'schema': people_schema},
) >> AWSAthenaOperator(
aws_conn_id='aws_us_east_1',
task_id='cleanup_rollup_table',
query="DROP TABLE mixpanel_people_rollup_{{ds_nodash}}",
output_location='s3://kite-metrics-test/athena-results/ddl',
database='kite_metrics',
dag=dag,
params={'schema': people_schema},
) >> AWSAthenaOperator(
aws_conn_id='aws_us_east_1',
task_id='update_people_table_location',
query="""ALTER TABLE mixpanel_people
SET LOCATION 's3://kite-metrics/mixpanel/people/rollups/year={{execution_date.year}}/month={{execution_date.month}}/day={{execution_date.day}}/'""",
output_location='s3://kite-metrics-test/athena-results/ddl',
database='kite_metrics',
dag=dag,
params={'schema': people_schema},
)
ddl_dag = DAG(
'mixpanel_ingest_schema_update',
default_args=default_args,
description='Mixpanel data schema definition.',
schedule_interval=None,
max_active_runs=1,
)
for table_name, s3_prefix in {'mixpanel_people_raw': 'mixpanel/people/raw', 'mixpanel_people': 'mixpanel/people/rollups'}.items():
AWSAthenaOperator(
aws_conn_id='aws_us_east_1',
task_id='drop_{}'.format(table_name),
query='DROP TABLE {{params.table_name}}',
output_location='s3://kite-metrics-test/athena-results/ddl',
database='kite_metrics',
dag=ddl_dag,
params={'table_name': table_name},
) >> AWSAthenaOperator(
aws_conn_id='aws_us_east_1',
task_id='create_{}'.format(table_name),
query='athena/tables/mixpanel_people.tmpl.sql',
output_location='s3://kite-metrics-test/athena-results/ddl',
database='kite_metrics',
dag=ddl_dag,
params={
'schema': people_schema,
'table_name': table_name,
's3_prefix': s3_prefix,
'partitioned': table_name == 'mixpanel_people_raw',
'json': table_name == 'mixpanel_people_raw',
}
)

View File

@ -0,0 +1,105 @@
import datetime
import io
import gzip
import json
import time
from airflow import DAG
from airflow.hooks.S3_hook import S3Hook
from airflow.operators.python_operator import PythonOperator
from airflow.models import Variable
import pendulum
import requests
from jinja2 import PackageLoader
from kite_airflow.slack_alerts import task_fail_slack_alert
pacific = pendulum.timezone('America/Los_Angeles')
default_args = {
'owner': 'airflow',
'depends_on_past': False,
'start_date': datetime.datetime(2020, 1, 1, tzinfo=pacific),
'email_on_failure': False,
'email_on_retry': False,
'retries': 1,
'retry_delay': datetime.timedelta(minutes=5),
'on_failure_callback': task_fail_slack_alert,
}
dag = DAG(
'mixpanel_ingest_events',
default_args=default_args,
description='Mixpanel events ingest DAG.',
schedule_interval='30 * * * *',
max_active_runs=6,
jinja_environment_kwargs={
'loader': PackageLoader('kite_airflow', 'templates')
},
)
def copy_mp_raw_events(task_instance, execution_date, **context):
pac_date = execution_date.astimezone(pacific)
pac_hour = pac_date.replace(minute=0, second=0, microsecond=0)
script = '''function main() {{
return Events({{from_date: "{date}", to_date: "{date}"}}).filter(function(event) {{
return !event.name.startsWith("kite_status") && event.time >= {start} && event.time < {end};
}});
}}'''.format(
date=pac_date.strftime('%Y-%m-%d'),
start=1000 * int(time.mktime(pac_hour.timetuple())),
end=1000 * int(time.mktime((pac_hour + datetime.timedelta(hours=1)).timetuple())),
)
print(script)
res = requests.post('https://mixpanel.com/api/2.0/jql',
auth=(Variable.get('mixpanel_credentials', deserialize_json=True)['secret'], ''),
data={'script': script},)
if res.status_code != 200:
raise Exception(res.text)
files = {}
for line in res.json():
to_scrub = [line]
while to_scrub:
curr = to_scrub.pop(0)
for key, value in list(curr.items()):
if isinstance(value, (dict, list)) and len(value) == 0:
del curr[key]
continue
if isinstance(value, dict):
to_scrub.append(value)
continue
if key.startswith('$'):
curr[key[1:]] = value
del curr[key]
pacific_ts = datetime.datetime.fromtimestamp(line['time'] / 1000).replace(tzinfo=pacific)
utc_ts = pacific_ts.astimezone(pendulum.timezone('UTC'))
line['time'] = int(time.mktime(utc_ts.timetuple()))
file_key = 'year={}/month={}/day={}/hour={}/event={}'.format(utc_ts.year, utc_ts.month, utc_ts.day, utc_ts.hour, line['name'])
if file_key not in files:
b_io = io.BytesIO()
files[file_key] = (b_io, gzip.GzipFile(fileobj=b_io, mode="w"))
files[file_key][1].write(json.dumps(line).encode('utf8'))
files[file_key][1].write(b'\n')
s3 = S3Hook('aws_us_east_1')
for prefix, (b_io, gz_file) in files.items():
gz_file.close()
s3.load_bytes(b_io.getvalue(), 'mixpanel/events/raw/{}/events.json.gz'.format(prefix), 'kite-metrics', replace=True)
PythonOperator(
python_callable=copy_mp_raw_events,
task_id=copy_mp_raw_events.__name__,
dag=dag,
retries=2,
provide_context=True,
)

View File

@ -0,0 +1 @@
XXXXXXX

View File

@ -0,0 +1,312 @@
from airflow import DAG
import datetime
from airflow.hooks.S3_hook import S3Hook
from airflow.contrib.hooks.aws_sqs_hook import SQSHook
import json
import gzip
import hashlib
import collections
import codecs
import logging
import uuid
import csv
import time
from airflow.operators.python_operator import PythonOperator
import requests
from airflow.contrib.operators.aws_athena_operator import AWSAthenaOperator
from jinja2 import PackageLoader
from kite_airflow.slack_alerts import task_fail_slack_alert
logger = logging.getLogger(__name__)
BUCKET = 'kite-youtube-data'
SCRATCH_SPACE_LOC = 's3://{}/athena-scratch-space/'.format(BUCKET)
def iter_s3_file(s3_hook, bucket, key):
json_file = s3_hook.get_key(key, bucket)
for line in gzip.open(json_file.get()['Body']):
yield json.loads(line)
youtube_search_dag = DAG(
'youtube_search',
description='Find new Youtube channels.',
default_args={
'retries': 1,
'retry_delay': datetime.timedelta(minutes=5),
'start_date': datetime.datetime(2020, 11, 6),
'on_failure_callback': task_fail_slack_alert,
},
schedule_interval='0 4 * * *',
max_active_runs=1,
jinja_environment_kwargs={
'loader': PackageLoader('kite_airflow', 'templates')
},
)
schema_operators = []
for table in ['youtube_queries', 'youtube_searches', 'youtube_channels', 'youtube_channel_details', 'youtube_socialblade_stats']:
drop_op = AWSAthenaOperator(
aws_conn_id='aws_us_east_1',
task_id='drop_table_{}'.format(table),
query='DROP TABLE IF EXISTS {}'.format(table),
output_location='s3://kite-metrics-test/athena-results/ddl',
database='kite_youtube_crawl',
dag=youtube_search_dag
)
create_op = AWSAthenaOperator(
aws_conn_id='aws_us_east_1',
task_id='create_table_{}'.format(table),
query='athena/tables/{}.tmpl.sql'.format(table),
output_location='s3://kite-metrics-test/athena-results/ddl',
database='kite_youtube_crawl',
dag=youtube_search_dag,
)
drop_op >> create_op
schema_operators.append(create_op)
BATCH_SIZE = 100
MAX_RELATED_GENERATION = 1
get_queries_op = AWSAthenaOperator(
aws_conn_id='aws_us_east_1',
task_id='get_queries',
query='SELECT q.*, s.query IS NOT NULL AS searched FROM youtube_queries q LEFT OUTER JOIN youtube_searches s ON (q.query=s.query) ORDER BY q.count DESC',
output_location=SCRATCH_SPACE_LOC,
database='kite_youtube_crawl',
dag=youtube_search_dag,
)
schema_operators >> get_queries_op
get_existing_channels_op = AWSAthenaOperator(
aws_conn_id='aws_us_east_1',
task_id='get_existing_channels',
query='SELECT id FROM youtube_channels',
output_location=SCRATCH_SPACE_LOC,
database='kite_youtube_crawl',
dag=youtube_search_dag,
)
schema_operators >> get_existing_channels_op
def get_scratch_space_csv(s3hook, ti, task_id):
filename = ti.xcom_pull(task_ids=task_id)
s3key = s3hook.get_key('athena-scratch-space/{}.csv'.format(filename), BUCKET)
return csv.DictReader(codecs.getreader("utf-8")(s3key.get()['Body']))
def write_gzip_string_to_s3(s3hook, contents, key, bucket):
s3hook.load_string(gzip.compress(contents.encode('utf8')), key, bucket)
def youtube_crawl(ti, ts_nodash, **kwargs):
s3 = S3Hook('aws_us_east_1')
ex_channels = {c['id'] for c in get_scratch_space_csv(s3, ti, get_existing_channels_op.task_id)}
curr_time = datetime.datetime.now()
queries = get_scratch_space_csv(s3, ti, get_queries_op.task_id)
selected_queries = [q for q in queries if q['searched'] == 'false'][:BATCH_SIZE]
all_queries = {q['query'] for q in queries}
search_records = []
new_channels = []
new_queries = []
try:
for query in selected_queries:
print("Running query {}".format(query['query']))
query_hash = hashlib.md5(query['query'].encode('utf8')).hexdigest()
# resp = requests.get('https://serpapi.com/search.json',
# params={'engine': 'youtube', 'search_query': query['query'], 'api_key': 'XXXXXXX'})
resp = requests.get("https://www.googleapis.com/youtube/v3/search", params={
"key": "XXXXXXX",
"q": query['query'],
"part": "snippet",
"maxResults": "50"
}, headers={'content-type': 'application/json'})
if resp.status_code != 200:
print("Error from SerpAPI: {} {}".format(resp.status_code, resp.text))
raise Exception()
resp_json = resp.json()
# if 'video_results' not in resp.json():
if 'items' not in resp_json:
print("No results for {}".format(query['query']))
continue
response_key = 'search_responses/{}/{}.json.gz'.format(query_hash, ts_nodash)
s3.load_bytes(gzip.compress(resp.text.encode('utf8')), response_key, BUCKET, replace=True)
# all_channels = {v['channel']['link'] for v in resp_json['video_results'] if 'link' in v['channel']}
all_channels = {'https://www.youtube.com/channel/{}'.format(v['snippet']['channelId']) for v in resp_json['items']}
n_new_channels = len(all_channels - ex_channels)
for c in all_channels - ex_channels:
new_channels.append({
'id': c,
'query': query['query'],
'timestamp': curr_time.isoformat(),
})
ex_channels.add(c)
for key in resp_json:
if not key.startswith('searches_related_to_'):
continue
for search in resp_json[key]['searches']:
if search['query'] not in all_queries:
new_queries.append({
'query': search['query'],
'seed': False,
'generation': int(query.get('generation') or 0) + 1,
'parent': query['query']
})
search_records.append({
'query': query['query'],
'query_hash': query_hash,
'timestamp': curr_time.isoformat(),
'total': len(all_channels),
'unique': n_new_channels,
})
finally:
for key, objs in [('channels', new_channels), ('search_queries', new_queries), ('searches', search_records)]:
if objs:
contents = gzip.compress('\n'.join([json.dumps(obj) for obj in objs]).encode('utf8'))
s3.load_bytes(contents, '{}/{}-{}.json.gz'.format(key, ts_nodash, uuid.uuid4().hex), BUCKET)
youtube_crawl_op = PythonOperator(
python_callable=youtube_crawl,
task_id=youtube_crawl.__name__,
dag=youtube_search_dag,
provide_context=True,
)
(get_queries_op, get_existing_channels_op) >> youtube_crawl_op
get_new_channels_op = AWSAthenaOperator(
aws_conn_id='aws_us_east_1',
task_id='get_new_channels',
query='''SELECT DISTINCT c.id
FROM youtube_channels c
LEFT OUTER JOIN youtube_channel_details d ON (
concat('https://www.youtube.com/channel/', d.id)=c.id
OR concat('https://www.youtube.com/user/', d.forUsername)=c.id
)
WHERE d.id IS NULL AND d.forUsername IS NULL''',
output_location=SCRATCH_SPACE_LOC,
database='kite_youtube_crawl',
dag=youtube_search_dag,
)
def chunks(lst, n):
"""Yield successive n-sized chunks from lst."""
for i in range(0, len(lst), n):
yield lst[i:i + n]
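# Tiny usage sketch for chunks (runs only when this module is executed directly);
# get_channel_details below relies on it to batch channel ids in groups of 50 per API call.
if __name__ == '__main__':
    assert list(chunks(['a', 'b', 'c', 'd', 'e'], 2)) == [['a', 'b'], ['c', 'd'], ['e']]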
def get_channel_details(ti, ts_nodash, **kwargs):
s3 = S3Hook('aws_us_east_1')
new_channels = {c['id'] for c in get_scratch_space_csv(s3, ti, get_new_channels_op.task_id)}
channels_by_type = collections.defaultdict(list)
for channel in new_channels:
c_parts = channel.split('/')
channels_by_type[c_parts[-2]].append(c_parts[-1])
print("Getting channel details for {} new channels and {} new users".format(
len(channels_by_type['channel']),
len(channels_by_type['user']))
)
c_details = []
url = "https://www.googleapis.com/youtube/v3/channels"
generic_params = {
"part": ["statistics", "snippet", "contentDetails", "status"],
"key": "XXXXXXX",
}
try:
for username in channels_by_type.get('user', []):
params = {'forUsername': username}
params.update(generic_params)
resp = requests.get(url, params=params, headers={'content-type': 'application/json'})
if not resp.json().get('items'):
print("Failed to get user: {}".format(username))
c_details.append({'forUsername': username})
continue
for item in resp.json()['items']:
item['forUsername'] = username
c_details.append(item)
for chunk in chunks(channels_by_type.get('channel', []), 50):
params = {'id': ','.join(chunk)}
params.update(generic_params)
resp = requests.get(url, params=params, headers={'content-type': 'application/json'})
if not resp.json().get('items'):
print("Failed to get channels: {}".format(', '.join(chunk)))
continue
for item in resp.json()['items']:
c_details.append(item)
finally:
print("Loading channel details for {} channels".format(len(c_details)))
contents = gzip.compress('\n'.join([json.dumps(obj) for obj in c_details]).encode('utf8'))
s3.load_bytes(contents, 'channel_details/{}-{}.json.gz'.format(ts_nodash, uuid.uuid4().hex), BUCKET)
get_channel_details_op = PythonOperator(
python_callable=get_channel_details,
task_id=get_channel_details.__name__,
dag=youtube_search_dag,
provide_context=True,
)
youtube_crawl_op >> get_new_channels_op >> get_channel_details_op
get_new_socialblade_channels = AWSAthenaOperator(
aws_conn_id='aws_us_east_1',
task_id='get_new_socialblade_channels',
query='''SELECT DISTINCT c.id
FROM youtube_channel_details c
LEFT OUTER JOIN youtube_socialblade_stats sb ON c.id=sb.id
WHERE sb.id IS NULL AND CAST(c.statistics.viewCount AS bigint) > 100000''',
output_location=SCRATCH_SPACE_LOC,
database='kite_youtube_crawl',
dag=youtube_search_dag,
)
QUEUE_URL = 'https://sqs.us-east-1.amazonaws.com/XXXXXXX/queue-youtube-socialblade'
def enqueue_socialblade_channels(ti, ts_nodash, **kwargs):
s3 = S3Hook('aws_us_east_1')
sqs_hook = SQSHook('aws_us_east_1')
new_channels = {c['id'] for c in get_scratch_space_csv(s3, ti, get_new_socialblade_channels.task_id)}
print('Enqueuing {} channels'.format(len(new_channels)))
sqs_hook.get_conn().purge_queue(QueueUrl=QUEUE_URL)
# Sleep to allow purge to complete
time.sleep(60)
for channel in new_channels:
sqs_hook.send_message(QUEUE_URL, channel)
enqueue_socialblade_channels_op = PythonOperator(
python_callable=enqueue_socialblade_channels,
task_id=enqueue_socialblade_channels.__name__,
dag=youtube_search_dag,
provide_context=True,
)
get_channel_details_op >> get_new_socialblade_channels >> enqueue_socialblade_channels_op

View File

@ -0,0 +1,280 @@
import datetime
from airflow import DAG
from airflow.operators.python_operator import PythonOperator
from airflow.contrib.operators.aws_athena_operator import AWSAthenaOperator
import googleapiclient.discovery
from jinja2 import PackageLoader
from kite_airflow.plugins.google import GoogleSheetsRangeOperator
from kite_airflow.common import configs
from kite_airflow.common import utils as common_utils
from kite_airflow.youtube_dashboard import api
from kite_airflow.youtube_dashboard import files
from kite_airflow.youtube_dashboard import utils
from kite_airflow.slack_alerts import task_fail_slack_alert
BUCKET = 'kite-youtube-data' if common_utils.is_production() else 'kite-metrics-test'
SCRATCH_SPACE_LOC = 's3://{}/athena-scratch-space/'.format(BUCKET)
DATABASE = 'prod_kite_link_stats_youtube' if common_utils.is_production() else 'kite_link_stats_youtube'
TABLE_CHANNELS = {
'name': 'kite_link_stats_youtube_channels',
'data_location': 's3://{}/youtube-dashboard/channels/'.format(BUCKET),
}
TABLE_VIDEOS = {
'name': 'kite_link_stats_youtube_videos',
'data_location': 's3://{}/youtube-dashboard/videos/'.format(BUCKET),
}
default_args = {
'owner': 'airflow',
'depends_on_past': False,
'start_date': datetime.datetime(2020, 11, 21),
'email_on_failure': False,
'email_on_retry': False,
'retries': 0,
'retry_delay': datetime.timedelta(minutes=5),
'on_failure_callback': task_fail_slack_alert,
}
kite_link_stats_dag = DAG(
'youtube_dashboard',
description='Import link stats of sponsored videos for the YouTube dashboard.',
default_args=default_args,
schedule_interval='10 0 * * *',
jinja_environment_kwargs={
'loader': PackageLoader('kite_airflow', 'templates')
},
)
schema_operators = []
for table in [TABLE_CHANNELS, TABLE_VIDEOS]:
drop_op = AWSAthenaOperator(
aws_conn_id=configs.AWS_CONN_ID,
task_id='drop_table_{}'.format(table['name']),
query='DROP TABLE IF EXISTS {}'.format(table['name']),
output_location='s3://kite-metrics-test/athena-results/ddl',
database=DATABASE,
dag=kite_link_stats_dag,
params={'data_location': table['data_location']},
)
create_op = AWSAthenaOperator(
aws_conn_id=configs.AWS_CONN_ID,
task_id='create_table_{}'.format(table['name']),
query='athena/tables/{}.tmpl.sql'.format(table['name']),
output_location='s3://kite-metrics-test/athena-results/ddl',
database=DATABASE,
dag=kite_link_stats_dag,
params={'data_location': table['data_location']},
)
drop_op >> create_op
schema_operators.append(create_op)
get_channels_op = AWSAthenaOperator(
aws_conn_id=configs.AWS_CONN_ID,
task_id='get_channels',
query='SELECT * FROM {}'.format(TABLE_CHANNELS['name']),
output_location=SCRATCH_SPACE_LOC,
database=DATABASE,
dag=kite_link_stats_dag,
)
schema_operators >> get_channels_op
get_videos_op = AWSAthenaOperator(
aws_conn_id=configs.AWS_CONN_ID,
task_id='get_videos',
query='SELECT * FROM {}'.format(TABLE_VIDEOS['name']),
output_location=SCRATCH_SPACE_LOC,
database=DATABASE,
dag=kite_link_stats_dag,
)
schema_operators >> get_videos_op
get_channels_sheet_operator = GoogleSheetsRangeOperator(
gcp_conn_id='google_cloud_kite_dev',
spreadsheet_id='XXXXXXX-J0',
range="'List of Channels'!A:C",
task_id='get_channels_sheet',
dag=kite_link_stats_dag,
)
def update_videos_from_all_channels(ti, yt_client):
'''
Take all given channels and store the list of their videos.
For a new channel we search all of its videos; for an existing
channel we only search for new videos via YouTube activities.
Returns:
list:
new video items which we will use while taking snapshots. We need this
because the Athena queries are evaluated at the start of the run, so we
will not receive these new videos via the get-videos query.
'''
channel_list = files.get_scratch_space_csv(ti, get_channels_op.task_id)
sheet_data = ti.xcom_pull(task_ids='get_channels_sheet')['values']
cid_field = sheet_data[0].index('Channel ID')
sheet_channels = {line[cid_field] for line in sheet_data[1:] if len(line) > cid_field and line[cid_field].strip()}
for new_c in sheet_channels - {c['id'] for c in channel_list}:
channel_list.append({'id': new_c, 'is_backfilled': 'false', 'last_backfill_until': '', 'last_updated': ''})
new_video_list = []
search_budget = 80
exception = None
for channel in channel_list:
channel_id = channel['id']
# indicates a new channel or a channel whose backfill is yet to complete
if channel['is_backfilled'] == 'false':
# if the previous backfill was incomplete, resume where it left off
published_before = channel['last_backfill_until'] if channel['is_backfilled'] == 'false' else None
video_search_list, has_channel_search_remaining, no_of_searches, exception = api.get_all_video_search_list(
yt_client,
channel_id,
published_before,
search_budget,
)
for video_search_item in video_search_list:
new_video_list.append({
'id': utils.get_video_id_of_search_item(video_search_item),
'channel_id': channel_id,
})
# only update channel attributes if videos are found (also handles YT out of quota cases)
if video_search_list:
last_search_item = video_search_list[-1]
channel['last_backfill_until'] = utils.get_published_date_of_search_item(last_search_item)
channel['is_backfilled'] = not has_channel_search_remaining
# update the channel's last_updated, which helps us limit future searches
channel['last_updated'] = common_utils.get_date_time_in_ISO()
search_budget -= no_of_searches
if search_budget <= 0:
break
all_activity_list, exception = api.get_all_activity_list(
yt_client,
channel_id,
channel['last_updated'],
)
if all_activity_list:
files.write_activities_on_file(all_activity_list)
video_activity_list = api.filter_video_activity_from_list(
all_activity_list,
)
for video_activity in video_activity_list:
new_video_list.append({
'id': utils.get_id_of_video_activity(video_activity),
'channel_id': channel_id,
})
# update the channel's last_updated, which helps us limit future searches
channel['last_updated'] = common_utils.get_date_time_in_ISO()
files.write_channels_on_file(channel_list)
if len(new_video_list) > 0:
files.write_videos_on_file(new_video_list)
if exception:
raise exception
return new_video_list
def take_snapshots_and_update_files(video_list_for_snapshots, cached_urls_dict):
snapshot_list = get_snapshots_list(video_list_for_snapshots, cached_urls_dict)
files.write_snapshots_on_file(snapshot_list)
files.write_cached_urls_on_file(cached_urls_dict)
def get_snapshots_list(video_list, cached_urls_dict):
if not video_list:
return []
snapshot_list = []
for video_item in video_list:
snapshot_list.append({
'video_id': utils.get_id_of_video_item(video_item),
'description': utils.get_description_of_video_item(video_item),
'is_link_present': utils.is_link_present_in_description(video_item, cached_urls_dict), # also updates the cache in case of shortened urls
'views': utils.get_views_of_video_item(video_item),
'timestamp': common_utils.get_date_time_in_ISO(),
})
return snapshot_list
def update_snapshots_of_all_videos(ti, yt_client, new_video_list):
'''
Take snapshots of all of the available videos and new videos
'''
video_list_for_snapshots = []
cached_urls_dict = files.get_cached_urls_from_file()
all_videos_list = files.get_scratch_space_csv(ti, get_videos_op.task_id)
all_videos_id_list = [video['id'] for video in all_videos_list]
no_of_batch_requests = 50 # to optimise YouTube quota
# also append the new video ids because the get-videos query doesn't return
# results that were added during the execution of this script
all_videos_id_list.extend(
list(map(lambda video: video['id'], new_video_list))
)
for start_index in range(0, len(all_videos_id_list), no_of_batch_requests):
try:
video_list = []
end_index = start_index + no_of_batch_requests
videos_id_batch_list = all_videos_id_list[start_index:end_index]
video_list = api.get_video_list(yt_client, videos_id_batch_list)
video_list_for_snapshots.extend(video_list)
except Exception:
# store data until now in case of any error or if quota exceeded
take_snapshots_and_update_files(video_list_for_snapshots, cached_urls_dict)
raise
take_snapshots_and_update_files(video_list_for_snapshots, cached_urls_dict)
def get_snaphots_of_videos(ti, **context):
api_service_name = 'youtube'
api_version = 'v3'
api_key = 'XXXXXXX'
yt_client = googleapiclient.discovery.build(
api_service_name, api_version, developerKey=api_key
)
new_video_list = update_videos_from_all_channels(ti, yt_client)
update_snapshots_of_all_videos(ti, yt_client, new_video_list)
get_snaphots_of_videos_operator = PythonOperator(
python_callable=get_snaphots_of_videos,
task_id=get_snaphots_of_videos.__name__,
dag=kite_link_stats_dag,
provide_context=True,
)
(
get_channels_op,
get_videos_op,
get_channels_sheet_operator,
) >> get_snaphots_of_videos_operator
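# The grouped dependency above fans in: with Airflow's default all_success trigger rule,
# get_snaphots_of_videos_operator runs only after get_channels_op, get_videos_op and
# get_channels_sheet_operator have all completed successfully.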

View File

@ -0,0 +1,189 @@
{% for lang in langs %}
- name: user_data_last_{{lang}}_active_date
label: Last {{lang}} active date (user data)
type: datetime
sql:
type: bigint
agg: max
delta: max(to_unixtime(from_iso8601_timestamp(timestamp)) * if(properties__{{lang}}_edit > 0, 1, 0)) * 1000
transform: nullif(user_data_last_{{lang}}_active_date, 0)
{% endfor %}
- name: user_data_last_active_date
label: Last active date (user data)
type: datetime
sql:
type: bigint
delta: max(to_unixtime(from_iso8601_timestamp(timestamp)) * if({% for lang in langs %}properties__{{lang}}_edit{% if not loop.last %} + {% endif %}{% endfor %}> 0, 1, 0)) * 1000
agg: max
transform: nullif(user_data_last_active_date, 0)
- name: user_data_last_kite_alive_date
label: Last Kite alive date (user data)
type: datetime
sql:
type: bigint
agg: max
delta: max(to_unixtime(from_iso8601_timestamp(timestamp))) * 1000
transform: nullif(user_data_last_kite_alive_date, 0)
{% for lang in langs %}
- name: {{lang}}_active_1d
sql:
delta: if(sum(coalesce(properties__{{lang}}_edit, 0)) > 0, 1, 0)
type: bigint
- name: {{lang}}_edit_1d
sql:
delta: sum(coalesce(properties__{{lang}}_edit, 0))
type: bigint
{% for interval in [7, 28] %}
- name: user_data_{{lang}}_active_{{interval}}d
label: Days {{lang|title}} active in last {{interval}} days (user data)
type: number
sql:
type: bigint
delta_field: {{lang}}_active_1d
agg: sum
agg_days: {{interval}}
- name: user_data_{{lang}}_edit_{{interval}}d
label: {{lang|title}} edits in last {{interval}} days (user data)
type: number
sql:
type: bigint
delta_field: {{lang}}_edit_1d
agg: sum
agg_days: {{interval}}
{% endfor %}
{% endfor %}
- name: any_edit_1d
sql:
delta: sum({% for lang in langs %}coalesce(properties__{{lang}}_edit, 0){% if not loop.last %} + {% endif %}{% endfor %})
type: bigint
- name: any_active_1d
sql:
delta: if(sum({% for lang in langs %}coalesce(properties__{{lang}}_edit, 0){% if not loop.last %} + {% endif %}{% endfor %}) > 0, 1, 0)
type: bigint
{% for interval in [7, 28] %}
- name: user_data_any_active_{{interval}}d
label: Days any language active in last {{interval}} days (user data)
type: number
sql:
type: bigint
delta_field: any_active_1d
agg: sum
agg_days: {{interval}}
{% endfor %}
- name: any_edit_12w
sql:
type: bigint
delta_field: any_edit_1d
agg: sum
agg_days: 84
{% for lang in langs %}
- name: {{lang}}_edit_12w
type: number
sql:
type: bigint
delta_field: {{lang}}_edit_1d
agg: sum
agg_days: 84
- name: {{lang}}_percentage
label: {{lang|title}} percentage (user data)
type: number
sql:
type: bigint
transform: cast((cast({{lang}}_edit_12w as double) / nullif(any_edit_12w, 0)) * 100 AS bigint)
{% endfor %}
{% for editor in editors %}
- name: python_edits_in_{{editor}}
sql:
delta: sum(if(properties__python_edit > 0, properties__{{editor}}_events, 0))
type: bigint
agg: sum
agg_days: 84
- name: user_data_{{editor}}_installed
label: Editor {{ editor }} installed (user data)
type: bool
options:
- label: True
value: true
- label: False
value: false
sql:
type: boolean
agg: latest
delta: bool_or(properties__{{editor}}_installed)
{% endfor %}
- name: user_data_plan
label: Plan type (user data)
type: string
sql:
type: varchar(32)
agg: latest
delta: max_by(properties__plan, from_iso8601_timestamp(timestamp))
- name: user_data_server_deployment_id
label: Server deployment ID (user data)
type: string
sql:
type: varchar(64)
agg: latest
delta: max_by(properties__server_deployment_id, from_iso8601_timestamp(timestamp))
- name: user_data_primary_python_editor
label: Primary Python editor (user data)
type: string
sql: {}
- name: user_data_primary_language
label: Primary language (user data)
type: string
sql: {}
{% for prefix, fields in {"properties": ["os"], "maxmind": ["country_name", "city_name", "subdivision_1_name", "time_zone"]}.items() %}
{% for field in fields %}
- name: {{prefix}}__{{field}}_1d
sql:
type: map<string,bigint>
map_delta: {{prefix}}__{{field}}, {% for lang in langs %}coalesce(properties__{{lang}}_edit, 0){% if not loop.last %} + {% endif %}{% endfor %}
- name: user_data_{{field}}
label: Most common {{field}} (user data)
type: string
sql:
type: map<string,bigint>
transform: max_by(k, v)
map_agg: sum
agg_days: 84
delta_field: {{prefix}}__{{field}}_1d
{% endfor %}
{% endfor %}
- name: user_data_paid_jetbrains_installed
label: Editor paid Jetbrains installed (user data)
type: bool
options:
- label: True
value: true
- label: False
value: false
sql:
type: boolean
agg: latest
delta: bool_or({% for prefix in ["IU", "PY", "WS", "GO"] %}properties__intellij_version LIKE '{{prefix}}%'{% if not loop.last %} OR {% endif %}{% endfor %})
- name: user_data_primary_language
label: Primary language (user data)
type: string
sql: {}

View File

@ -0,0 +1 @@
XXXXXXX

View File

@ -0,0 +1,30 @@
editors:
- atom
- intellij
- jupyter
- spyder
- sublime3
- vim
- vscode
languages:
bash: ['lexicaltextshprovider']
c: ['lexicaltexthprovider', 'lexicaltextcprovider']
cpp: ['lexicaltexthprovider', 'lexicaltextcppprovider']
csharp: ['lexicaltextcsprovider']
css: ['lexicaltextcssprovider']
go: ['lexicalgolangprovider', 'lexicaltextgolangprovider']
html: ['lexicaltexthtmlprovider']
java: ['lexicaltextjavaprovider']
javascript: ['lexicaljavascriptprovider', 'lexicaltextjsprovider']
jsx: ['lexicaltextjsxprovider']
kotlin: ['lexicaltextktprovider']
less: ['lexicaltextlessprovider']
objectivec: ['lexicaltexthprovider', 'lexicaltextmprovider']
php: ['lexicaltextphpprovider']
python: ['pythonlexicalprovider']
ruby: ['lexicaltextrbprovider']
scala: ['lexicaltextscalaprovider']
tsx: ['lexicaltexttsxprovider']
typescript: ['lexicaltexttsprovider']
vue: ['lexicaltextvueprovider']

View File

@ -0,0 +1 @@
XXXXXXX

View File

View File

@ -0,0 +1,63 @@
import time
from googleapiclient.discovery import build
from airflow import AirflowException
from airflow.contrib.hooks.gcp_api_base_hook import GoogleCloudBaseHook
from airflow.plugins_manager import AirflowPlugin
from airflow.models.baseoperator import BaseOperator
from airflow.utils.decorators import apply_defaults
class GoogleSheetsHook(GoogleCloudBaseHook):
_conn = None
def __init__(self, api_version="v4", gcp_conn_id="google_cloud_default", delegate_to=None):
super(GoogleSheetsHook, self).__init__(gcp_conn_id, delegate_to)
self.api_version = api_version
def get_conn(self):
"""
Retrieves the connection to Google Sheets.
:return: Google Sheets service object.
"""
if not self._conn:
http_authorized = self._authorize()
self._conn = build('sheets', self.api_version, http=http_authorized, cache_discovery=False)
return self._conn
@GoogleCloudBaseHook.fallback_to_default_project_id
def get_range(self, spreadsheet_id: str, range: str, **kwargs):
conn = self.get_conn()
sheets = conn.spreadsheets().values()
return sheets.get(spreadsheetId=spreadsheet_id, range=range).execute(num_retries=self.num_retries)
class GoogleSheetsRangeOperator(BaseOperator):
@apply_defaults
def __init__(
self,
spreadsheet_id: str,
range: str,
gcp_conn_id: str = 'google_cloud_default',
*args, **kwargs) -> None:
super().__init__(*args, **kwargs)
self.gcp_conn_id = gcp_conn_id
self.spreadsheet_id = spreadsheet_id
self.range = range
def execute(self, context):
hook = GoogleSheetsHook(gcp_conn_id=self.gcp_conn_id)
return hook.get_range(spreadsheet_id=self.spreadsheet_id, range=self.range)
class GoogleSheetsPlugin(AirflowPlugin):
name = 'google_sheets'
operators = [GoogleSheetsRangeOperator]
hooks = [GoogleSheetsHook]
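# A minimal usage sketch of GoogleSheetsRangeOperator (kept commented out so the plugin
# module stays side-effect free). Assumptions: an existing `dag` object, a hypothetical
# spreadsheet ID and A1 range, and a 'google_cloud_default' connection with Sheets scope.
# The operator returns the raw values().get() payload, so it lands in XCom for downstream tasks.
#
# read_channels_sheet = GoogleSheetsRangeOperator(
#     task_id='read_channels_sheet',
#     spreadsheet_id='1AbCdEfHypotheticalSpreadsheetId',
#     range='Channels!A2:B',
#     gcp_conn_id='google_cloud_default',
#     dag=dag,
# )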

View File

@ -0,0 +1,32 @@
import gzip
import json
from airflow.hooks.S3_hook import S3Hook
from airflow.contrib.operators.s3_delete_objects_operator import S3DeleteObjectsOperator
def read_s3_json_files(bucket, file_list):
s3 = S3Hook('aws_us_east_1')
for file in sorted(file_list):
obj = s3.get_key(file, bucket)
for line in gzip.open(obj.get()['Body']):
rec = json.loads(line)
# walk the record and recursively drop keys whose values are None (including nested dicts)
to_clean = [rec]
while to_clean:
this = to_clean.pop()
for k in list(this.keys()):
v = this[k]
if isinstance(v, dict):
to_clean.append(v)
continue
if v is None:
del this[k]
yield rec
class S3DeletePrefixOperator(S3DeleteObjectsOperator):
def execute(self, context):
if isinstance(self.keys, str):
hook = S3Hook(aws_conn_id=self.aws_conn_id, verify=self.verify)
self.keys = hook.list_keys(bucket_name=self.bucket, prefix=self.keys)
return super(S3DeletePrefixOperator, self).execute(context)
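# A usage sketch for the helpers above, under stated assumptions (hypothetical bucket and
# key names; 'aws_us_east_1' is the connection hard-coded in read_s3_json_files):
#
# for record in read_s3_json_files('kite-metrics', ['firehose/kite_status/2021/01/01/00/part-0.gz']):
#     ...  # each record is a dict with None-valued keys stripped, including nested dicts
#
# delete_scratch = S3DeletePrefixOperator(
#     task_id='delete_scratch',
#     bucket='kite-metrics',            # hypothetical bucket
#     keys='athena-scratch-space/',     # a string key is treated as a prefix and expanded via list_keys
#     aws_conn_id='aws_us_east_1',
# )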

View File

@ -0,0 +1,104 @@
from typing import Optional
import boto3
from cached_property import cached_property
from airflow.secrets import BaseSecretsBackend
from airflow.utils.log.logging_mixin import LoggingMixin
class SecretsManagerBackend(BaseSecretsBackend, LoggingMixin):
"""
Retrieves Connection or Variables from AWS Secrets Manager
Configurable via ``airflow.cfg`` like so:
.. code-block:: ini
[secrets]
backend = airflow.providers.amazon.aws.secrets.secrets_manager.SecretsManagerBackend
backend_kwargs = {"connections_prefix": "airflow/connections"}
For example, if secrets prefix is ``airflow/connections/smtp_default``, this would be accessible
if you provide ``{"connections_prefix": "airflow/connections"}`` and request conn_id ``smtp_default``.
And if variables prefix is ``airflow/variables/hello``, this would be accessible
if you provide ``{"variables_prefix": "airflow/variables"}`` and request variable key ``hello``.
You can also pass additional keyword arguments like ``aws_secret_access_key``, ``aws_access_key_id``
or ``region_name`` to this class and they would be passed on to Boto3 client.
:param connections_prefix: Specifies the prefix of the secret to read to get Connections.
:type connections_prefix: str
:param variables_prefix: Specifies the prefix of the secret to read to get Variables.
:type variables_prefix: str
:param profile_name: The name of a profile to use. If not given, then the default profile is used.
:type profile_name: str
:param sep: separator used to concatenate secret_prefix and secret_id. Default: "/"
:type sep: str
"""
def __init__(
self,
connections_prefix: str = 'airflow/connections',
variables_prefix: str = 'airflow/variables',
profile_name: Optional[str] = None,
sep: str = "/",
**kwargs
):
super().__init__(**kwargs)
self.connections_prefix = connections_prefix.rstrip("/")
self.variables_prefix = variables_prefix.rstrip('/')
self.profile_name = profile_name
self.sep = sep
self.kwargs = kwargs
@cached_property
def client(self):
"""
Create a Secrets Manager client
"""
session = boto3.session.Session(
profile_name=self.profile_name,
)
return session.client(service_name="secretsmanager", **self.kwargs)
def get_conn_uri(self, conn_id: str) -> Optional[str]:
"""
Get Connection Value
:param conn_id: connection id
:type conn_id: str
"""
return self._get_secret(self.connections_prefix, conn_id)
def get_variable(self, key: str) -> Optional[str]:
"""
Get Airflow Variable from Secrets Manager
:param key: Variable Key
:return: Variable Value
"""
return self._get_secret(self.variables_prefix, key)
def _get_secret(self, path_prefix: str, secret_id: str) -> Optional[str]:
"""
Get secret value from Secrets Manager
:param path_prefix: Prefix for the Path to get Secret
:type path_prefix: str
:param secret_id: Secret Key
:type secret_id: str
"""
secrets_path = self.build_path(path_prefix, secret_id, self.sep)
try:
response = self.client.get_secret_value(
SecretId=secrets_path,
)
return response.get('SecretString')
except self.client.exceptions.ResourceNotFoundException:
self.log.debug(
"An error occurred (ResourceNotFoundException) when calling the "
"get_secret_value operation: "
"Secret %s not found.", secrets_path
)
return None
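# A resolution sketch, assuming the default prefixes above and AWS credentials available to boto3:
# with connections_prefix='airflow/connections' and sep='/', requesting conn_id 'smtp_default'
# reads the secret named 'airflow/connections/smtp_default'; variables resolve analogously.
#
# backend = SecretsManagerBackend()
# smtp_uri = backend.get_conn_uri('smtp_default')   # -> secret 'airflow/connections/smtp_default'
# env_name = backend.get_variable('env')            # -> secret 'airflow/variables/env'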

View File

@ -0,0 +1,41 @@
from airflow.contrib.operators.slack_webhook_operator import SlackWebhookOperator
from airflow.hooks.base_hook import BaseHook
from airflow.models import Variable
SLACK_CONN_ID = "slack_devops_notifications"
def task_fail_slack_alert(context):
"""
Callback task that can be used in DAG to alert of failure task completion
Args:
context (dict): Context variable passed in from Airflow
Returns:
None: Calls the SlackWebhookOperator execute method internally
"""
if Variable.get('env', 'dev') == 'dev':
return
slack_webhook_token = BaseHook.get_connection(SLACK_CONN_ID).password
slack_msg = """
:red_circle: Task Failed.
*Task*: {task}
*Dag*: {dag} (https://airflow.kite.dev/admin/airflow/tree?dag_id={dag})
*Execution Time*: {exec_date}
""".format(
task=context.get("task_instance").task_id,
dag=context.get("task_instance").dag_id,
exec_date=context.get("execution_date"),
)
failed_alert = SlackWebhookOperator(
task_id="slack_test",
http_conn_id=SLACK_CONN_ID,
webhook_token=slack_webhook_token,
message=slack_msg,
username="airflow",
)
return failed_alert.execute(context=context)
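# A wiring sketch, assuming a DAG defined elsewhere in this package: the callback fires on any
# task failure (and returns early in the 'dev' environment, per the Variable check above).
#
# default_args = {
#     'on_failure_callback': task_fail_slack_alert,
# }
# example_dag = DAG('example_dag', default_args=default_args, schedule_interval='@daily')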

View File

@ -0,0 +1,38 @@
-- Week range is Sunday - Saturday
{% set start_date=execution_date %}
{% set end_date=execution_date.add(days=6) %}
WITH any_edit AS (
SELECT
userid,
CAST(
COUNT_IF(
{% for language in params.languages %}
properties__{{language}}_edit > 0 {% if not loop.last -%} OR {%- endif -%}
{% endfor %}
) AS double
) / 6 AS edits -- divide by 6 because events are reported every 10 minutes (6 per hour), so the count approximates active hours
FROM
kite_status_normalized
WHERE
(
year > {{ start_date.year }}
OR (year = {{ start_date.year }} AND month > {{ start_date.month }})
OR (year = {{ start_date.year }} AND month = {{ start_date.month }} AND day >= {{ start_date.day }})
)
AND (
year < {{ end_date.year }}
OR (year = {{ end_date.year }} AND month < {{ end_date.month }})
OR (year = {{ end_date.year }} AND month = {{ end_date.month }} AND day <= {{ end_date.day }})
)
GROUP BY
userid
)
SELECT
{% for i in range(1, 100) %}
approx_percentile(edits, {{i/100}}) AS pct_{{i}} {% if i < 99 -%} , {%- endif -%}
{% endfor %}
FROM
any_edit
WHERE
edits > 0;

View File

@ -0,0 +1,80 @@
-- Calculating coding stats of provided NUM_OF_WEEKS.
-- Week range is Sunday - Saturday
{% set start_date=execution_date.subtract(days=(7 * (params.num_of_weeks - 1))) %}
{% set end_date=execution_date.add(days=6) %}
WITH coding_stats AS (
SELECT
userid,
date_diff(
'day',
from_iso8601_timestamp(timestamp),
CAST('{{end_date.to_date_string()}}' AS timestamp)
) / 7 AS week,
SUM(
{% for language in params.languages %}
COALESCE(properties__{{language}}_completions_num_selected, 0) {% if not loop.last -%} + {%- endif -%}
{% endfor %}
) AS completions_selected,
CAST(
COUNT_IF(
{% for language in params.languages %}
properties__{{language}}_edit > 0 {% if not loop.last -%} OR {%- endif -%}
{% endfor %}
) AS double
) / 6 AS coding_hours,
CAST(
COUNT_IF(properties__python_edit > 0) AS double
) / 6 AS python_hours
FROM
kite_status_normalized
WHERE
(
year > {{ start_date.year }}
OR (year = {{ start_date.year }} AND month > {{ start_date.month }})
OR (year = {{ start_date.year }} AND month = {{ start_date.month }} AND day >= {{ start_date.day }})
)
AND (
year < {{ end_date.year }}
OR (year = {{ end_date.year }} AND month < {{ end_date.month }})
OR (year = {{ end_date.year }} AND month = {{ end_date.month }} AND day <= {{ end_date.day }})
)
AND event = 'kite_status'
AND regexp_like(kite_status_normalized.userid, '\\p{Cc}') = FALSE -- filter user ids that contain control characters
AND regexp_replace(kite_status_normalized.userid, '\x{00}') != '' -- filter user ids that consist only of null bytes
GROUP BY
1,
2
)
SELECT
coding_stats.userid,
map_agg(week, completions_selected) AS completions_selected,
map_agg(week, coding_hours) AS coding_hours,
map_agg(week, python_hours) AS python_hours,
reduce(
array_agg(
from_iso8601_timestamp({{ params.table_daily_active_users }}.timestamp)
ORDER BY
{{ params.table_daily_active_users }}.timestamp DESC
),
0,
(acc, current) -> if(
date_diff('day', current, CAST('{{end_date.to_date_string()}}' AS timestamp)) / 7 - acc < 1,
date_diff('day', current, CAST('{{end_date.to_date_string()}}' AS timestamp)) / 7 + 1,
acc
),
acc -> acc
) AS streak,
COUNT(
DISTINCT date_diff(
'day',
from_iso8601_timestamp({{ params.table_daily_active_users }}.timestamp),
CAST('{{end_date.to_date_string()}}' AS timestamp)
) / 7
) AS total_weeks
FROM
coding_stats
LEFT OUTER JOIN {{ params.table_daily_active_users }} ON coding_stats.userid = {{ params.table_daily_active_users }}.userid
GROUP BY
coding_stats.userid
;

View File

@ -0,0 +1,8 @@
{% if prev_execution_date_success == None -%}
DROP TABLE IF EXISTS {{ params.table_name }};
{%- else -%}
-- no-op query; prevents "ERROR: Parameter validation failed", which occurs when the rendered file contains no query
SELECT *
FROM {{ params.table_name }}
LIMIT 0;
{%- endif -%}

View File

@ -0,0 +1,20 @@
INSERT INTO
{{ params.table_name }} (userid, timestamp)
SELECT
DISTINCT userid,
timestamp
FROM
kite_status_normalized
WHERE
(
{% for language in params.languages %}
properties__{{language}}_events > 0 {% if not loop.last -%} OR {%- endif -%}
{% endfor %}
)
AND regexp_like(userid, '\p{Cc}') = FALSE -- filter user ids that contain control characters
AND regexp_replace(kite_status_normalized.userid, '\x{00}') != '' -- filter user ids that consist only of null bytes
AND year >= {{ execution_date.year }}
AND month >= {{ execution_date.month }}
AND day >= {{ execution_date.day }}
AND hour > {{ execution_date.hour }}
;

View File

@ -0,0 +1,7 @@
CREATE EXTERNAL TABLE IF NOT EXISTS `{{params.table_name}}`(
userid string,
timestamp string
)
STORED AS PARQUET
LOCATION
'{{ params.data_location }}'

View File

@ -0,0 +1,30 @@
CREATE TABLE cio_profile_attrs_{{ds_nodash}}
WITH (
format='JSON',
external_location = 's3://kite-metrics/athena/cio_profile_attrs/{{ds}}'
)
AS
WITH current AS (
SELECT *
FROM hubspot_intermediate
WHERE year = {{execution_date.year}}
AND month = {{execution_date.month}}
AND day = {{execution_date.day}}
AND delta=0
)
SELECT
current.userid id,
{% for prop in params.props -%}
subquery_{{prop}}.value {{prop}}
{%- if not loop.last -%},{% endif %}
{%- endfor %}
FROM current
{% for prop in params.props %}
LEFT JOIN (
SELECT userid, max_by(k, v) value
FROM current
CROSS JOIN unnest(user_data_{{prop}}) AS t(k, v)
GROUP BY userid
) subquery_{{ prop }}
ON current.userid = subquery_{{ prop }}.userid
{%- endfor %}

View File

@ -0,0 +1,21 @@
CREATE TABLE hubspot_delta_{{ds_nodash}}
WITH (
format='PARQUET',
parquet_compression='SNAPPY',
external_location = 's3://kite-metrics/athena/hubspot/intermediate/year={{execution_date.year}}/month={{execution_date.month}}/day={{execution_date.day}}/delta=1'
)
AS
SELECT
{% for prop in params.props %}
{% if prop.sql.delta -%}
CAST({{prop.sql.delta}} AS {{prop.sql.type}}) {{prop.name}},
{%- endif -%}
{% if prop.sql.map_delta -%}
transform_values(multimap_agg({{prop.sql.map_delta}}), (k, v) -> reduce(v, 0, (s, x) -> s + x, (s) -> s)) {{prop.name}},
{%- endif -%}
{% endfor %}
userid
FROM kite_status_normalized
WHERE year={{execution_date.year}} AND month={{execution_date.month}} AND day={{execution_date.day}}
AND regexp_like(userid, '^[0-9]+$')
GROUP BY userid

View File

@ -0,0 +1 @@
XXXXXXX

View File

@ -0,0 +1,80 @@
CREATE TABLE hubspot_rollup_{{ds_nodash}}
WITH (
format='PARQUET',
parquet_compression='SNAPPY',
external_location = 's3://kite-metrics/athena/hubspot/intermediate/year={{execution_date.year}}/month={{execution_date.month}}/day={{execution_date.day}}/delta=0'
)
AS
WITH current AS (
SELECT *
FROM hubspot_intermediate hs
WHERE (
hs.delta=1 AND
hs.year={{execution_date.year}} AND
hs.month={{execution_date.month}} AND
hs.day={{execution_date.day}}
) OR (
hs.delta=0 AND
hs.year={{(execution_date - macros.timedelta(days=1)).year}} AND
hs.month={{(execution_date - macros.timedelta(days=1)).month}} AND
hs.day={{(execution_date - macros.timedelta(days=1)).day}}
)
),
scalar_aggs AS (
SELECT
{% for prop in params.scalar_props %}
{%- if prop.sql.agg == 'latest' -%}
coalesce(max_by({{prop.name}}, delta)) {{prop.name}},
{%- elif prop.sql.delta_field -%}
{{ prop.sql.agg }}(coalesce({{prop.name}}, {{prop.sql.delta_field}})) {{prop.name}},
{%- else -%}
{{ prop.sql.agg }}({{prop.name}}) {{prop.name}},
{%- endif -%}
{% endfor %}
current.userid
FROM current
GROUP BY current.userid
)
SELECT scalar_aggs.userid,
{% for prop in params.scalar_props %}
{%- if prop.sql.agg_days -%}
scalar_aggs.{{prop.name}} - coalesce(scalar_diff_{{ prop.sql.agg_days}}d.{{ prop.sql.delta_field or prop.name }}, 0) {{prop.name}}
{%- else -%}
scalar_aggs.{{prop.name}} {{prop.name}}
{%- endif -%}
{%- if (not loop.last) or params.map_props %},{% endif %}
{% endfor %}
{% for prop in params.map_props %}
{{prop.name}}_aggs.value {{prop.name}}
{%- if not loop.last %},{% endif %}
{% endfor %}
FROM scalar_aggs
{% for tbl in params.scalar_time_rollups %}
LEFT JOIN hubspot_intermediate scalar_diff_{{tbl}}d
ON scalar_aggs.userid = scalar_diff_{{tbl}}d.userid
AND scalar_diff_{{tbl}}d.delta = 1
AND scalar_diff_{{tbl}}d.year={{(execution_date - macros.timedelta(days=tbl)).year}}
AND scalar_diff_{{tbl}}d.month={{(execution_date - macros.timedelta(days=tbl)).month}}
AND scalar_diff_{{tbl}}d.day={{(execution_date - macros.timedelta(days=tbl)).day}}
{% endfor %}
{% for prop in params.map_props %}
LEFT JOIN (
SELECT userid, transform_values(multimap_agg(k, v), (inner_k, inner_v) -> reduce(inner_v, cast(0 as bigint), (s, x) -> s + x, (s) -> s)) value
FROM (
SELECT userid, k, v
FROM current
CROSS JOIN unnest(coalesce({{prop.name}}, {{prop.sql.delta_field}})) as t(k, v)
UNION ALL
SELECT userid, k, v * -1
FROM hubspot_intermediate
CROSS JOIN unnest({{prop.sql.delta_field}}) as t(k, v)
WHERE hubspot_intermediate.delta = 1
AND hubspot_intermediate.year={{(execution_date - macros.timedelta(days=prop.sql.agg_days)).year}}
AND hubspot_intermediate.month={{(execution_date - macros.timedelta(days=prop.sql.agg_days)).month}}
AND hubspot_intermediate.day={{(execution_date - macros.timedelta(days=prop.sql.agg_days)).day}}
)
GROUP BY userid
) {{prop.name}}_aggs
ON scalar_aggs.userid={{prop.name}}_aggs.userid
{% endfor %}

View File

@ -0,0 +1,19 @@
INSERT INTO activations
WITH new_activations as (
SELECT coalesce(properties__user_id, properties__anonymous_id) userid,
min(from_iso8601_timestamp(timestamp)) activation_date,
to_unixtime(min(from_iso8601_timestamp(timestamp))) activation_timestamp
FROM kite_status_normalized
WHERE year = {{execution_date.year}} AND month = {{execution_date.month}} AND day = {{execution_date.day}}
AND (event='ast_node_resolved' OR event='anon_supported_file_edited')
GROUP BY coalesce(properties__user_id, properties__anonymous_id)
)
SELECT new_activations.userid,
new_activations.activation_timestamp,
day(new_activations.activation_date) day,
year(new_activations.activation_date) year,
month(new_activations.activation_date) month
FROM activations
RIGHT OUTER JOIN new_activations ON activations.userid=new_activations.userid
WHERE new_activations.activation_timestamp < activations.activation_timestamp
OR activations.userid IS NULL

View File

@ -0,0 +1 @@
XXXXXXX

View File

@ -0,0 +1,12 @@
CREATE TABLE kite_status_1d_{{params.key}}_{{ds_nodash}}_json
WITH (
format='JSON',
external_location = 's3://kite-metrics/athena/kite_status_1d_{{params.key}}/json/{{ds}}'
)
AS
SELECT *
FROM kite_status_1d_{{params.key}}_{{ds_nodash}}
WHERE ({% for lang in params.languages %}{{lang}}_events > 0{% if not loop.last %} OR {% endif %}{% endfor %})
AND year = {{execution_date.year}}
AND month = {{execution_date.month}}
AND day = {{execution_date.day}}

View File

@ -0,0 +1,81 @@
CREATE table kite_metrics.kite_status_normalized_{{ds_nodash}}
WITH (
format='PARQUET',
parquet_compression='SNAPPY',
partitioned_by = ARRAY['hour'],
external_location = 's3://kite-metrics/athena/kite_status_normalized/year={{execution_date.year}}/month={{execution_date.month}}/day={{execution_date.day}}/'
)
AS
WITH kite_status_normalized_ts AS (
SELECT
{% for field in params.schema['properties'] %}
{%- if field != 'timestamp' %}{{field}},{% endif %}
{%- endfor %}
{#- Normalize older timestamps. Convert to ISO format and reset them based on prefix because they were client-reported and unreliable. #}
if(regexp_like(timestamp, '^[0-9]+$'), to_iso8601(date_add('second', cast(timestamp as bigint) / 1000 - cast(to_unixtime(timestamp '{{execution_date.strftime('%Y-%m-%d %H:00')}}') as bigint), timestamp '{{execution_date.strftime('%Y-%m-%d %H:00')}}')), timestamp) timestamp
FROM kite_metrics.kite_status
WHERE event IS NOT NULL
AND event != ''
AND prefix >= '{{(execution_date.replace(hour=0, minute=0, second=0, microsecond=0) - macros.timedelta(hours=1)).strftime('%Y/%m/%d/%H')}}'
AND prefix <= '{{(execution_date.replace(hour=0, minute=0, second=0, microsecond=0) + macros.timedelta(hours=25)).strftime('%Y/%m/%d/%H')}}'
),
kite_status_filtered AS (
SELECT *,
reduce(zip_with(split(sourceip, '.'),
sequence(3, 0, -1),
(n, p) -> cast(cast(n as bigint) * pow(256, p) as bigint)
),
cast(0 as bigint),
(s, x) -> s + x,
(s)->s
) sourceIpNumber
FROM kite_status_normalized_ts
WHERE timestamp >= '{{execution_date.replace(hour=0, minute=0, second=0, microsecond=0).strftime('%Y-%m-%dT%H:%M:%S')}}'
AND timestamp < '{{(execution_date.replace(hour=0, minute=0, second=0, microsecond=0) + macros.timedelta(days=1)).strftime('%Y-%m-%dT%H:%M:%S')}}'
),
maxmind_masks AS (
SELECT DISTINCT kite_status_filtered.sourceIp sourceip,
bitwise_and(kite_status_filtered.sourceIpNumber, maxmind.mask) maskedSourceIpNumber,
maxmind.mask
FROM kite_status_filtered
CROSS JOIN (SELECT DISTINCT mask FROM maxmind_city_ipv4) maxmind
WHERE regexp_like(kite_status_filtered.sourceIp, '^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+$')
),
maxmind_cities AS (
SELECT sourceip,
arbitrary(maxmind.country_name) country_name,
arbitrary(maxmind.country_iso_code) country_iso_code,
arbitrary(maxmind.subdivision_1_name) subdivision_1_name,
arbitrary(maxmind.city_name) city_name,
arbitrary(maxmind.time_zone) time_zone
FROM maxmind_masks
JOIN maxmind_city_ipv4 maxmind
ON maxmind_masks.mask = maxmind.mask
AND maxmind_masks.maskedSourceIpNumber = maxmind.address
GROUP BY sourceip
)
SELECT
{% for key, value in params.schema['properties'].items()|sort if key != "properties" and value.get("kite_status_normalized") != False %}
kite_status_filtered.{{ key }} {{ key }},
{% endfor %}
{% for key, value in params.schema['properties']['properties']['properties'].items()|sort if value.get("kite_status_normalized") != False %}
{% if value.type.startswith('array') or value.type.startswith('map') -%}
if(cardinality(kite_status_filtered.properties.{{ key }}) > 0, kite_status_filtered.properties.{{ key }}) properties__{{ key }},
{%- else -%}
kite_status_filtered.properties.{{ key }} properties__{{ key }},
{%- endif -%}
{% endfor %}
maxmind_cities.country_name maxmind__country_name,
maxmind_cities.country_iso_code maxmind__country_iso_code,
maxmind_cities.subdivision_1_name maxmind__subdivision_1_name,
maxmind_cities.city_name maxmind__city_name,
maxmind_cities.time_zone maxmind__time_zone,
monetizable_scores.score monetizable_score,
monetizable_scores.model_version monetizable_model_version,
hour(from_iso8601_timestamp(kite_status_filtered.timestamp)) hour
FROM kite_status_filtered
LEFT OUTER JOIN maxmind_cities
ON kite_status_filtered.sourceIp = maxmind_cities.sourceip
LEFT OUTER JOIN monetizable_scores
ON kite_status_filtered.userid = monetizable_scores.userid

View File

@ -0,0 +1,31 @@
{% set execution_day = execution_date.replace(hour=0, minute=0, second=0, microsecond=0) %}
CREATE table kite_metrics.kite_status_normalized_{{ds_nodash}}
WITH (
format='PARQUET',
parquet_compression='SNAPPY',
partitioned_by = ARRAY['hour'],
external_location = 's3://kite-metrics/athena/kite_status_normalized/year={{execution_date.year}}/month={{execution_date.month}}/day={{execution_date.day}}/'
)
AS SELECT
{% for key, value in params.schema.items()|sort if key != "properties" %}
kite_status_segment.{{ key }} {{ key }},
{% endfor %}
{% for key, value in params.schema['properties'].items()|sort %}
{% if value.startswith('array') or value.startswith('map') -%}
if(cardinality(kite_status_segment.properties.{{ key }}) > 0, kite_status_segment.properties.{{ key }}) properties__{{ key }},
{%- else -%}
kite_status_segment.properties.{{ key }} properties__{{ key }},
{%- endif -%}
{% endfor %}
hour(from_iso8601_timestamp(kite_status_segment.timestamp)) hour
FROM kite_metrics.kite_status_segment
WHERE event IS NOT NULL
AND event != ''
AND prefix IN (
'{{1000 * (execution_day - macros.timedelta(days=1)).int_timestamp}}',
'{{1000 * execution_day.int_timestamp}}',
'{{1000 * (execution_day + macros.timedelta(days=1)).int_timestamp}}'
)
AND timestamp >= '{{execution_day.strftime('%Y-%m-%dT%H:%M:%S')}}'
AND timestamp < '{{(execution_day + macros.timedelta(days=1)).strftime('%Y-%m-%dT%H:%M:%S')}}'

View File

@ -0,0 +1,22 @@
CREATE TABLE mixpanel_people_rollup_{{ds_nodash}}
WITH (
format='PARQUET',
parquet_compression='SNAPPY',
external_location = 's3://kite-metrics/mixpanel/people/rollups/year={{execution_date.year}}/month={{execution_date.month}}/day={{execution_date.day}}/'
)
AS
WITH candidates AS (
SELECT {% for key in params.schema|sort %}{{ key }}{% if not loop.last %}, {% endif %}{% endfor %}
FROM mixpanel_people_raw
{% if prev_execution_date_success %}WHERE year > {{ prev_execution_date_success.year }} OR (year = {{ prev_execution_date_success.year }} AND month > {{ prev_execution_date_success.month }} ) OR (year = {{ prev_execution_date_success.year }} AND month = {{ prev_execution_date_success.month }} AND day > {{ prev_execution_date_success.day }}) {% endif %}
UNION ALL
SELECT {% for key in params.schema|sort %}{{ key }}{% if not loop.last %}, {% endif %}{% endfor %}
FROM mixpanel_people
)
SELECT
distinct_id,
{% for key in params.schema if key != 'distinct_id' %}
max_by({{ key}}, time) {{ key }}{% if not loop.last %},{% endif %}
{% endfor %}
FROM candidates
GROUP BY distinct_id

View File

@ -0,0 +1,18 @@
CREATE TABLE monetizable_scores_{{ds_nodash}}
WITH (
format='PARQUET',
parquet_compression='SNAPPY',
external_location = 's3://{{params.bucket}}/monetizable/final_users/{{ds_nodash}}'
)
AS SELECT
userid,
max_by(score, timestamp) score,
max_by(model_version, timestamp) model_version,
max(timestamp) timestamp
FROM
(
SELECT userid, score, model_version, timestamp FROM monetizable_scores
UNION ALL
SELECT userid, score, model_version, {{ execution_date.int_timestamp }} FROM monetizable_inf_results_{{ds_nodash}}
) AS subq
GROUP BY userid

View File

@ -0,0 +1,73 @@
CREATE TABLE monetizable_new_users_{{ds_nodash}}
WITH (
external_location ='s3://{{params.bucket}}/monetizable/new_users/{{ds_nodash}}',
format='JSON'
)
AS WITH
people AS (
SELECT
CAST(properties.user_id AS VARCHAR) AS userid,
BOOL_OR(properties.windows_domain_membership) AS windows_domain_membership,
ARBITRARY(properties.cio_experiment_trial_end_v1) AS discount
FROM mixpanel_people
GROUP BY 1
),
status AS (
SELECT
kite_status_normalized.userid,
MIN(month) AS activation_month,
ARBITRARY(properties__os) AS os,
ARBITRARY(maxmind__country_iso_code) AS country_iso_code,
ARBITRARY(properties__cpu_threads) AS cpu_threads,
BOOL_OR(properties__git_found) AS git_found,
BOOL_OR(properties__atom_installed) AS atom_installed,
BOOL_OR(properties__intellij_installed) AS intellij_installed,
false AS pycharm_installed,
BOOL_OR(properties__sublime3_installed) AS sublime3_installed,
BOOL_OR(properties__vim_installed) AS vim_installed,
BOOL_OR(properties__vscode_installed) AS vscode_installed,
BOOL_OR(SUBSTR(properties__intellij_version, 1, 2) NOT IN ('IC', 'PC')) AS intellij_paid,
BOOL_OR(properties__plan IN ('pro_yearly', 'pro_monthly', 'pro_trial')) AS trial_or_converted,
BOOL_OR(properties__plan IN ('pro_yearly', 'pro_monthly')) AS converted
FROM kite_status_normalized
LEFT OUTER JOIN monetizable_scores ON kite_status_normalized.userid = monetizable_scores.userid
WHERE
event = 'kite_status'
AND (
year > {{ prev_execution_date.year }}
OR (year = {{ prev_execution_date.year }} AND month > {{ prev_execution_date.month }})
OR (year = {{ prev_execution_date.year }} AND month = {{ prev_execution_date.month }} AND day = {{ prev_execution_date.day }})
)
AND (
year < {{ next_execution_date.year }}
OR (year = {{ next_execution_date.year }} AND month < {{ next_execution_date.month }})
OR (year = {{ next_execution_date.year }} AND month = {{ next_execution_date.month }} AND day <= {{ next_execution_date.day }})
)
AND kite_status_normalized.userid IS NOT NULL
AND kite_status_normalized.userid != '0'
AND (monetizable_scores.timestamp IS NULL OR date_diff('day', from_unixtime(monetizable_scores.timestamp, 'utc'), now()) >= 7)
GROUP BY 1
)
SELECT
status.userid,
activation_month,
cast(to_unixtime(current_timestamp) as bigint) timestamp,
COALESCE(os, '{unknown}') AS os,
COALESCE(country_iso_code, '{unknown}') AS country_iso_code,
COALESCE(cpu_threads, 0) cpu_threads,
COALESCE(git_found, FALSE) AS git_found,
COALESCE(atom_installed, FALSE) AS atom_installed,
COALESCE(intellij_installed, FALSE) AS intellij_installed,
COALESCE(pycharm_installed, FALSE) AS pycharm_installed,
COALESCE(sublime3_installed, FALSE) AS sublime3_installed,
COALESCE(vim_installed, FALSE) AS vim_installed,
COALESCE(vscode_installed, FALSE) AS vscode_installed,
COALESCE(intellij_paid, FALSE) AS intellij_paid,
COALESCE(windows_domain_membership, FALSE) AS windows_domain_membership,
COALESCE(discount, 'no discount') AS discount,
COALESCE(trial_or_converted, FALSE) AS trial_or_converted,
COALESCE(converted, FALSE) AS converted
FROM status
LEFT JOIN people
ON status.userid = people.userid

View File

@ -0,0 +1,26 @@
CREATE EXTERNAL TABLE IF NOT EXISTS `hubspot_intermediate` (
userid string,
{% for prop in params.props if prop.sql.type %}
{{ prop.name }} {{ prop.sql.type }}{% if not loop.last %},{% endif %}
{% endfor %}
)
PARTITIONED BY (
`year` int,
`month` int,
`day` int,
`delta` int
)
STORED AS PARQUET
LOCATION 's3://kite-metrics/athena/hubspot/intermediate/'
TBLPROPERTIES (
'projection.enabled'='true',
'projection.year.type'='integer',
'projection.year.range'='2010,2100',
'projection.month.type'='integer',
'projection.month.range'='1,12',
'projection.day.type'='integer',
'projection.day.range'='1,31',
'projection.delta.type'='integer',
'projection.delta.range'='0,1',
'storage.location.template'='s3://kite-metrics/athena/hubspot/intermediate/year=${year}/month=${month}/day=${day}/delta=${delta}'
);

View File

@ -0,0 +1,15 @@
CREATE EXTERNAL TABLE `kite_link_stats_youtube_channels`(
id string,
name string,
last_updated string,
is_backfilled boolean,
last_backfill_until string
)
ROW FORMAT SERDE
'org.openx.data.jsonserde.JsonSerDe'
STORED AS INPUTFORMAT
'org.apache.hadoop.mapred.TextInputFormat'
OUTPUTFORMAT
'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION
'{{ params.data_location }}'

View File

@ -0,0 +1,12 @@
CREATE EXTERNAL TABLE `kite_link_stats_youtube_videos`(
id string,
channel_id string
)
ROW FORMAT SERDE
'org.openx.data.jsonserde.JsonSerDe'
STORED AS INPUTFORMAT
'org.apache.hadoop.mapred.TextInputFormat'
OUTPUTFORMAT
'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION
'{{ params.data_location }}'

View File

@ -0,0 +1,29 @@
CREATE EXTERNAL TABLE `kite_status`(
{%- for key, value in params.schema['properties'].items()|sort if key != "properties" and value.get("kite_status_normalized") != False %}
`{{ key }}` {{ value.type|safe }},
{%- endfor %}
`properties` struct<
{% for key, value in params.schema['properties']['properties']['properties'].items()|sort if value.get("kite_status_normalized") != False -%}
{{ key }}:{{ value.type|safe }}{% if not loop.last %},{% endif %}
{%- endfor %}
>
)
PARTITIONED BY (
`prefix` string)
ROW FORMAT SERDE
'org.openx.data.jsonserde.JsonSerDe'
STORED AS INPUTFORMAT
'org.apache.hadoop.mapred.TextInputFormat'
OUTPUTFORMAT
'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION
's3://kite-metrics/firehose/kite_status'
TBLPROPERTIES (
'projection.enabled'='true',
'projection.prefix.format'='yyyy/MM/dd/HH',
'projection.prefix.interval'='1',
'projection.prefix.interval.unit'='HOURS',
'projection.prefix.range'='2018/01/01/00,NOW',
'projection.prefix.type'='date',
'storage.location.template'='s3://kite-metrics/firehose/kite_status/${prefix}'
)

View File

@ -0,0 +1,37 @@
CREATE EXTERNAL TABLE `kite_status_normalized`(
{% for key, value in params.schema['properties'].items()|sort if key != "properties" and value.get("kite_status_normalized") != False %}
`{{ key }}` {{ value.type|safe }},
{% endfor %}
{% for key, value in params.schema['properties']['properties']['properties'].items()|sort if value.get("kite_status_normalized") != False %}
`properties__{{ key }}` {{ value.type|safe }},
{% endfor %}
`maxmind__country_name` string,
`maxmind__country_iso_code` string,
`maxmind__subdivision_1_name` string,
`maxmind__city_name` string,
`maxmind__time_zone` string,
`monetizable_score` double,
`monetizable_model_version` string
)
PARTITIONED BY (
`year` int,
`month` int,
`day` int,
`hour` int
)
STORED AS PARQUET
LOCATION 's3://kite-metrics/athena/kite_status_normalized/'
TBLPROPERTIES (
'projection.enabled'='true',
'projection.year.type'='integer',
'projection.year.range'='2010,2100',
'projection.month.type'='integer',
'projection.month.range'='1,12',
'projection.day.type'='integer',
'projection.day.range'='1,31',
'projection.hour.type'='integer',
'projection.hour.range'='0,23',
'storage.location.template'='s3://kite-metrics/athena/kite_status_normalized/year=${year}/month=${month}/day=${day}/hour=${hour}'
);

View File

@ -0,0 +1,27 @@
CREATE EXTERNAL TABLE `kite_status_segment`(
{%- for key, value in params.schema['properties'].items()|sort if key != "properties" and value.get("kite_status_normalized") != False %}
`{{ key }}` {{ value.type|safe }},
{%- endfor %}
`properties` struct<
{% for key, value in params.schema['properties']['properties']['properties'].items()|sort if value.get("kite_status_normalized") != False -%}
{{ key }}:{{ value.type|safe }}{% if not loop.last %},{% endif %}
{%- endfor %}
>
)
PARTITIONED BY (
`prefix` string)
ROW FORMAT SERDE
'org.openx.data.jsonserde.JsonSerDe'
STORED AS INPUTFORMAT
'org.apache.hadoop.mapred.TextInputFormat'
OUTPUTFORMAT
'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION
's3://kite-metrics/segment-logs/XXXXXXX'
TBLPROPERTIES (
'projection.enabled'='true',
'projection.prefix.interval'='XXXXXXX',
'projection.prefix.range'='XXXXXXX,XXXXXXX',
'projection.prefix.type'='integer',
'storage.location.template'='s3://kite-metrics/segment-logs/XXXXXXX/${prefix}'
)

View File

@ -0,0 +1,44 @@
{% macro struct(dct) -%}
struct<
{% for key, value in dct.items() %}
{{ key }}: {% if value is mapping %}{{ struct(value) }}{% else %}{{ value }}{% endif %}{% if not loop.last %},{% endif %}
{% endfor %}
>
{%- endmacro %}
CREATE EXTERNAL TABLE `{{ params.table_name }}` (
{% for key, value in params.schema.items() %}
{{ key }} {% if value is mapping %}{{ struct(value) }}{% else %}{{ value }}{% endif %}{% if not loop.last %},{% endif %}
{% endfor %}
)
{% if params.partitioned %}
PARTITIONED BY (
`year` int,
`month` int,
`day` int
)
{% endif %}
{% if params.json %}
ROW FORMAT SERDE
'org.openx.data.jsonserde.JsonSerDe'
STORED AS INPUTFORMAT
'org.apache.hadoop.mapred.TextInputFormat'
OUTPUTFORMAT
'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION
's3://kite-metrics/{{ params.s3_prefix }}'
TBLPROPERTIES (
'projection.enabled'='true',
'projection.year.type'='integer',
'projection.year.range'='2020,2100',
'projection.month.type'='integer',
'projection.month.range'='1,12',
'projection.day.type'='integer',
'projection.day.range'='1,31',
'storage.location.template'='s3://kite-metrics/{{ params.s3_prefix }}/year=${year}/month=${month}/day=${day}'
)
{% else %}
STORED AS PARQUET
LOCATION
's3://kite-metrics/{{ params.s3_prefix }}'
{% endif %}

View File

@ -0,0 +1,10 @@
CREATE EXTERNAL TABLE monetizable_inf_results_{{ds_nodash}} (
userid string,
score double,
model_version string,
timestamp bigint
)
ROW FORMAT SERDE
'org.openx.data.jsonserde.JsonSerDe'
LOCATION
's3://{{params.bucket}}/monetizable/inf_results/{{ds_nodash}}/'

View File

@ -0,0 +1,21 @@
CREATE EXTERNAL TABLE `youtube_channel_details`(
id string,
forUsername string,
snippet struct<
title: string,
customUrl: string
>,
statistics struct<
viewCount: string,
subscriberCount: string,
videoCount: string
>
)
ROW FORMAT SERDE
'org.openx.data.jsonserde.JsonSerDe'
STORED AS INPUTFORMAT
'org.apache.hadoop.mapred.TextInputFormat'
OUTPUTFORMAT
'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION
's3://kite-youtube-data/channel_details/'

View File

@ -0,0 +1,11 @@
CREATE EXTERNAL TABLE `youtube_channels`(
id string
)
ROW FORMAT SERDE
'org.openx.data.jsonserde.JsonSerDe'
STORED AS INPUTFORMAT
'org.apache.hadoop.mapred.TextInputFormat'
OUTPUTFORMAT
'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION
's3://kite-youtube-data/channels/'

View File

@ -0,0 +1,15 @@
CREATE EXTERNAL TABLE `youtube_queries`(
tagname string,
count bigint,
query string,
seed boolean,
generation int
)
ROW FORMAT SERDE
'org.openx.data.jsonserde.JsonSerDe'
STORED AS INPUTFORMAT
'org.apache.hadoop.mapred.TextInputFormat'
OUTPUTFORMAT
'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION
's3://kite-youtube-data/search_queries/'

View File

@ -0,0 +1,15 @@
CREATE EXTERNAL TABLE `youtube_searches`(
query string,
query_hash string,
timestamp string,
total int,
unique int
)
ROW FORMAT SERDE
'org.openx.data.jsonserde.JsonSerDe'
STORED AS INPUTFORMAT
'org.apache.hadoop.mapred.TextInputFormat'
OUTPUTFORMAT
'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION
's3://kite-youtube-data/searches/'

View File

@ -0,0 +1,14 @@
CREATE EXTERNAL TABLE `youtube_socialblade_stats`(
id string,
timestamp timestamp,
success boolean,
monthlyViews string
)
ROW FORMAT SERDE
'org.openx.data.jsonserde.JsonSerDe'
STORED AS INPUTFORMAT
'org.apache.hadoop.mapred.TextInputFormat'
OUTPUTFORMAT
'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION
's3://kite-youtube-data/socialblade_stats/'

View File

@ -0,0 +1,178 @@
import time
from kite_airflow.common import utils as common_utils
from kite_airflow.youtube_dashboard import utils
def get_activity_list(yt_client, channel_id, published_after=None, page_token=None):
'''
Uses YouTube Activity List API to get activities.
Returns:\n
list: activity items
string: token which we can use to request next page
'''
request = yt_client.activities().list(
part='id,snippet,contentDetails',
channelId=channel_id,
maxResults=50,
publishedAfter=published_after if published_after else common_utils.get_date_time_in_ISO(),
pageToken=page_token,
)
activity_list_response = request.execute()
return activity_list_response['items'], activity_list_response.get('nextPageToken')
def get_all_activity_list(yt_client, channel_id, published_after=None):
'''
Uses YouTube Activity List API to get the list of all activities from given date.
Returns:\n
list: all activities found
'''
all_activities = []
next_page_token = None
exception = None
try:
while True:
activity_list, next_page_token = get_activity_list(
yt_client,
channel_id,
published_after,
next_page_token,
)
if activity_list:
all_activities.extend(activity_list)
if not next_page_token:
break
except Exception as e:
exception = e
finally:
return all_activities, exception
def filter_video_activity_from_list(activity_list):
'''
Filters upload video activities from all activities
'''
new_upload_video_activity_list = []
for activity in activity_list:
if activity['snippet']['type'] == 'upload':
new_upload_video_activity_list.append(activity)
return new_upload_video_activity_list
def get_unique_upload_video_activity_list(video_activity_list):
'''
Filters duplicated upload video activities.
The YouTube Activity API can send the same upload video activity twice
(the exact reason is unknown) and there is no easy way to filter these
upstream, so this function deduplicates activities based on their video IDs.
'''
video_ids = set() # using it to filter videos
unique_video_activity_list = []
for video_activity in video_activity_list:
video_id = utils.get_id_of_video_activity(video_activity)
if video_id not in video_ids:
video_ids.add(video_id)
unique_video_activity_list.append(video_activity)
return unique_video_activity_list
def get_video_search_list(yt_client, channel_id, published_before=None, page_token=None):
'''
Uses YouTube Search List API to get recent videos.
Returns:\n
list: searched videos items
string: token which we can use to request next page
'''
request = yt_client.search().list(
part='snippet',
channelId=channel_id,
maxResults=50,
publishedBefore=published_before if published_before else common_utils.get_date_time_in_ISO(),
type='video',
order='date',
pageToken=page_token,
)
video_search_list_response = request.execute()
return video_search_list_response['items'], video_search_list_response.get('nextPageToken')
def get_all_video_search_list(yt_client, channel_id, published_before, search_budget):
'''
Uses YouTube Search List API to get all available videos of a channel
Returns:\n
list: all videos of channel
'''
no_of_searches = 0
all_video_searches = []
next_page_token = None
has_channel_search_remaining = True
exception = None
try:
while True:
video_search_list, next_page_token = get_video_search_list(
yt_client,
channel_id,
published_before,
next_page_token
)
has_channel_search_remaining = bool(next_page_token)
if video_search_list:
all_video_searches.extend(video_search_list)
if not next_page_token:
break
no_of_searches += 1
if search_budget - no_of_searches <= 0:
break
except Exception as e:
exception = e
finally:
return all_video_searches, bool(has_channel_search_remaining), no_of_searches, exception
def get_video_list(yt_client, videos_id_list):
'''
Uses YouTube Video List API to get details about the video
Returns:\n
list: detailed info of videos
'''
request = yt_client.videos().list(
part='snippet,statistics',
id=','.join(videos_id_list)
)
video_list_response = request.execute()
return video_list_response['items']
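# A call-pattern sketch for the search helpers above (assumes a yt_client built with
# googleapiclient.discovery.build and a hypothetical channel ID): partial results are always
# returned and the exception is handed back separately, so the caller can persist what was
# fetched before deciding to re-raise.
#
# videos, has_more, searches_used, err = get_all_video_search_list(
#     yt_client, 'UC_hypothetical_channel', None, search_budget=5)
# persist_somewhere(videos)  # hypothetical persistence step
# if err:
#     raise err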

View File

@ -0,0 +1,106 @@
import datetime
import json
import csv
import codecs
from airflow.hooks.S3_hook import S3Hook
from kite_airflow.common import configs
from kite_airflow.common import utils
from kite_airflow.common import files
BUCKET = 'kite-youtube-data' if utils.is_production() else 'kite-metrics-test'
DIR_PROJECT = 'youtube-dashboard'
DIR_SCRATCH = 'athena-scratch-space'
DIR_CHANNELS = '{}/channels'.format(DIR_PROJECT)
DIR_VIDEOS = '{}/videos'.format(DIR_PROJECT)
DIR_ACTIVITIES = '{}/activities'.format(DIR_PROJECT)
DIR_SNAPSHOTS = '{}/snapshots'.format(DIR_PROJECT)
FILE_CACHED_URLS = '{}/cached_urls.csv'.format(DIR_PROJECT)
def get_scratch_space_csv(ti, task_id):
'''
Get the contents of a scratch-space CSV file as a list of dicts
'''
s3 = S3Hook(configs.AWS_CONN_ID)
filename = ti.xcom_pull(task_ids=task_id)
s3key = s3.get_key(
'{}/{}.csv'.format(DIR_SCRATCH, filename),
BUCKET,
)
json_list = []
reader = csv.DictReader(
codecs.getreader("utf-8")(s3key.get()['Body'])
)
for row in reader:
json_list.append(row)
return json_list
def write_json_list_on_file(file_path, json_list):
s3_hook = S3Hook(configs.AWS_CONN_ID)
data = []
for json_obj in json_list:
data.append(json.dumps(json_obj))
s3_hook.load_bytes(
'\n'.join(data).encode('utf-8'),
file_path,
BUCKET,
replace=True,
)
def get_cached_urls_from_file():
try:
cached_urls_list = files.get_csv_file_as_dict(BUCKET, FILE_CACHED_URLS)
except Exception:
cached_urls_list = []
cached_urls_dict = {}
for cached_url in cached_urls_list:
cached_urls_dict[cached_url['url']] = bool(cached_url['is_a_kite_redirect'])
return cached_urls_dict
def write_cached_urls_on_file(cached_urls_dict):
cached_urls_list = []
for url, is_kite_redirect in cached_urls_dict.items():
cached_urls_list.append(
{
'url': url,
'is_a_kite_redirect': 'True' if is_kite_redirect else '' # empty string represents false
}
)
files.write_dict_on_csv_file(BUCKET, FILE_CACHED_URLS, cached_urls_list)
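# A round-trip sketch of the cache encoding used above: the CSV stores 'True' for a Kite
# redirect and '' otherwise, and get_cached_urls_from_file() applies bool() to the stored
# value, so the flag should be recovered on the next run (assuming get_csv_file_as_dict
# preserves the column values as written).
#
# cached = {'https://bit.ly/hypothetical': True, 'https://example.com/other': False}
# write_cached_urls_on_file(cached)            # persists is_a_kite_redirect as 'True' and ''
# cached_again = get_cached_urls_from_file()   # {'https://bit.ly/hypothetical': True, ...}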
def write_channels_on_file(channel_list):
write_json_list_on_file(DIR_CHANNELS + '/channels.json', channel_list)
def write_activities_on_file(activity_list):
file_path = DIR_ACTIVITIES + '/activities' + utils.get_unique_suffix()
write_json_list_on_file(file_path, activity_list)
def write_videos_on_file(video_list):
file_path = DIR_VIDEOS + '/videos' + utils.get_unique_suffix()
write_json_list_on_file(file_path, video_list)
def write_snapshots_on_file(snapshot_list):
file_path = DIR_SNAPSHOTS + '/snapshots' + utils.get_unique_suffix()
write_json_list_on_file(file_path, snapshot_list)

View File

@ -0,0 +1,75 @@
import re
import requests
def get_video_id_of_search_item(search_item):
return search_item['id']['videoId']
def get_published_date_of_search_item(search_item):
return search_item['snippet']['publishedAt']
def get_id_of_video_activity(video_activity):
return video_activity['contentDetails']['upload']['videoId']
def get_id_of_video_item(video_item):
return video_item['id']
def get_description_of_video_item(video_item):
return video_item['snippet']['description']
def get_views_of_video_item(video_item):
return video_item['statistics'].get('viewCount')
def is_link_present_in_description(video_item, cached_urls_dict):
'''
Looks for a Kite link in the description and, in the case of shortened URLs, also updates
the cache that we use as a performance improvement, i.e. it prevents future requests for the
same URL, because descriptions on the same channel mostly repeat the same links.
Returns:\n
boolean:
indicates whether a Kite link was present
'''
kite_url = 'kite.com'
description = get_description_of_video_item(video_item)
# YouTubers always use the word Kite in the description, so if it's not present
# then no further search is needed
if 'kite' not in description.lower():
return False
if kite_url in description:
return True
# some YouTubers use link shorteners, so for those we use a combination of the cache
# and HEAD requests to check whether Kite redirects are present
urls = re.findall('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', description)
unique_urls = list(dict.fromkeys(urls))
for url in unique_urls:
if url in cached_urls_dict:
if cached_urls_dict[url]:
return True
else:
continue # not returning False because a Kite link can be added after we have taken the snapshot
try:
response = requests.head(url)
location_header = response.headers.get('Location')
is_a_kite_redirect = location_header and kite_url in location_header
cached_urls_dict[url] = 'True' if is_a_kite_redirect else '' # empty string represents false
if is_a_kite_redirect:
return True
except Exception:
cached_urls_dict[url] = ''
return False
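# A minimal sketch of the check above, with a hypothetical video item and an empty cache;
# the cache dict is only mutated when a shortened URL has to be resolved via a HEAD request.
#
# video_item = {
#     'id': 'hypothetical_video_id',
#     'snippet': {'description': 'Code faster with Kite: https://kite.com/download'},
#     'statistics': {'viewCount': '100'},
# }
# cache = {}
# assert is_link_present_in_description(video_item, cache) is True   # direct kite.com link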

9
airflow/requirements.txt Normal file
View File

@ -0,0 +1,9 @@
elasticsearch==7.7.0
gevent
mixpanel
customerio
requests
sagemaker
google-api-python-client
google-auth-httplib2
google-auth-oauthlib

17
airflow/setup.py Normal file
View File

@ -0,0 +1,17 @@
import setuptools
setuptools.setup(
name="kite-airflow-dags", # Replace with your own username
version="0.0.1",
author="Kite Team",
description="Kite Airflow codes.",
packages=setuptools.find_packages(),
python_requires='>=3.6',
include_package_data = True,
entry_points = {
'airflow.plugins': [
'google_plugin = kite_airflow.plugins.google:GoogleSheetsPlugin'
]
}
)

1
airflow/terraform/.gitignore vendored Normal file
View File

@ -0,0 +1 @@
.terraform

383
airflow/terraform/main.tf Normal file
View File

@ -0,0 +1,383 @@
terraform {
backend "s3" {
bucket = "kite-terraform-state"
workspace_key_prefix = "deployments/airflow"
key = "terraform.tfstate"
region = "us-west-1"
}
}
provider "aws" {
region = var.region
}
provider "aws" {
region = "us-west-1"
alias = "uswest1"
}
resource "aws_ecs_cluster" "airflow" {
name = var.service_name
capacity_providers = ["FARGATE"]
}
resource "aws_iam_role" "airflow_task_execution" {
name = "instance_role_airflow"
assume_role_policy = <<EOF
{
"Version": "2012-10-17",
"Statement": [
{
"Action": "sts:AssumeRole",
"Principal": {
"Service": "ecs-tasks.amazonaws.com"
},
"Effect": "Allow",
"Sid": ""
}
]
}
EOF
}
resource "aws_iam_role_policy" "airflow_task_execution" {
name = "airflow-execution-policy"
role = aws_iam_role.airflow_task_execution.id
policy = <<-EOF
{
"Version": "2012-10-17",
"Statement": [
{
"Effect": "Allow",
"Action": [
"ecr:GetAuthorizationToken",
"ecr:BatchCheckLayerAvailability",
"ecr:GetDownloadUrlForLayer",
"ecr:BatchGetImage",
"logs:CreateLogGroup",
"logs:CreateLogStream",
"logs:PutLogEvents"
],
"Resource": "*"
},
{
"Effect": "Allow",
"Action": [
"secretsmanager:GetResourcePolicy",
"secretsmanager:GetSecretValue",
"secretsmanager:DescribeSecret",
"secretsmanager:ListSecretVersionIds"
],
"Resource": [
"${data.aws_secretsmanager_secret.sql_alchemy_conn_str.arn}",
"${data.aws_secretsmanager_secret.result_db_uri.arn}"
]
}
]
}
EOF
}
resource "aws_iam_role" "airflow_task" {
name = "airflow-container-role"
assume_role_policy = <<EOF
{
"Version": "2012-10-17",
"Statement": [
{
"Action": "sts:AssumeRole",
"Principal": {
"Service": "ecs-tasks.amazonaws.com"
},
"Effect": "Allow",
"Sid": ""
}
]
}
EOF
}
resource "aws_iam_role_policy_attachment" "airflow-ecr" {
role = aws_iam_role.airflow_task.name
policy_arn = "arn:aws:iam::aws:policy/AmazonEC2ContainerRegistryReadOnly"
}
resource "aws_iam_role_policy_attachment" "airflow-sm" {
role = aws_iam_role.airflow_task.name
policy_arn = "arn:aws:iam::aws:policy/SecretsManagerReadWrite"
}
resource "aws_iam_role_policy_attachment" "airflow-s3" {
role = aws_iam_role.airflow_task.name
policy_arn = "arn:aws:iam::aws:policy/AmazonS3FullAccess"
}
resource "aws_iam_role_policy_attachment" "airflow-athena" {
role = aws_iam_role.airflow_task.name
policy_arn = "arn:aws:iam::aws:policy/AmazonAthenaFullAccess"
}
resource "aws_iam_role_policy_attachment" "airflow-ecs" {
role = aws_iam_role.airflow_task.name
policy_arn = "arn:aws:iam::aws:policy/AmazonECS_FullAccess"
}
resource "aws_iam_role_policy" "airflow-cloudwatch" {
name = "airflow-cloudwatch-policy"
role = aws_iam_role.airflow_task.id
policy = <<-EOF
{
"Version": "2012-10-17",
"Statement": [
{
"Effect": "Allow",
"Action": [
"logs:CreateLogGroup",
"logs:CreateLogStream",
"logs:PutLogEvents",
"logs:DescribeLogStreams"
],
"Resource": [
"arn:aws:logs:*:*:*"
]
}
]
}
EOF
}
data "aws_vpc" "kite_prod" {
filter {
name = "tag:Name"
values = ["kite-prod"]
}
}
data "aws_subnet" "private1" {
vpc_id = data.aws_vpc.kite_prod.id
filter {
name = "tag:Name"
values = ["az1-private"]
}
}
data "aws_subnet" "private2" {
vpc_id = data.aws_vpc.kite_prod.id
filter {
name = "tag:Name"
values = ["az2-private"]
}
}
data "aws_security_group" "vpn" {
name = "all-vpn"
vpc_id = data.aws_vpc.kite_prod.id
}
resource "aws_security_group" "airflow" {
name = "Airflow"
description = "Airflow test security group"
vpc_id = data.aws_vpc.kite_prod.id
ingress {
from_port = 8080
to_port = 8080
protocol = "TCP"
self = true
security_groups = [data.aws_security_group.vpn.id]
}
egress {
from_port = 0
to_port = 0
protocol = "-1"
cidr_blocks = ["0.0.0.0/0"]
}
}
data "aws_secretsmanager_secret" "sql_alchemy_conn_str" {
provider = aws.uswest1
name = "airflow/db_uri"
}
data "aws_secretsmanager_secret" "result_db_uri" {
provider = aws.uswest1
name = "airflow/result_db_uri"
}
# Verify the image is published
data "aws_ecr_image" "airflow" {
provider = aws.uswest1
repository_name = var.repository_name
image_tag = var.tag
}
resource "aws_ecs_task_definition" "airflow" {
for_each = var.tasks
family = each.key
container_definitions = jsonencode(
[
{
"name" = each.key,
"image" = "${data.aws_ecr_image.airflow.registry_id}.dkr.ecr.us-west-1.amazonaws.com/${var.repository_name}:${var.tag}",
"portMappings" = [
{
"containerPort" = each.value.port,
"protocol" = "tcp"
}
],
"essential" = true,
"entryPoint" = ["airflow", each.key],
"environment" = [
{ "name" = "AIRFLOW_VAR_ENV", "value" = "production" },
],
"secrets" = [
{ "name" = "AIRFLOW__CORE__SQL_ALCHEMY_CONN", "valueFrom" = data.aws_secretsmanager_secret.sql_alchemy_conn_str.arn },
{ "name" = "AIRFLOW__CELERY__RESULT_BACKEND", "valueFrom" = data.aws_secretsmanager_secret.result_db_uri.arn }
],
"logConfiguration" = {
"logDriver" = "awslogs",
"options" = {
"awslogs-create-group" = "true",
"awslogs-region" = var.region,
"awslogs-group" = "/ecs/airflow/${each.key}",
"awslogs-stream-prefix" = "ecs"
}
}
}
]
)
requires_compatibilities = ["FARGATE"]
network_mode = "awsvpc"
execution_role_arn = aws_iam_role.airflow_task_execution.arn
task_role_arn = aws_iam_role.airflow_task.arn
cpu = each.value.cpu
memory = each.value.memory
}
resource "aws_appautoscaling_target" "worker" {
max_capacity = 8
min_capacity = 1
resource_id = "service/${aws_ecs_cluster.airflow.name}/${aws_ecs_service.airflow["worker"].name}"
scalable_dimension = "ecs:service:DesiredCount"
service_namespace = "ecs"
}
resource "aws_appautoscaling_policy" "worker_policy_memory" {
name = "memory-autoscaling"
policy_type = "TargetTrackingScaling"
resource_id = aws_appautoscaling_target.worker.resource_id
scalable_dimension = aws_appautoscaling_target.worker.scalable_dimension
service_namespace = aws_appautoscaling_target.worker.service_namespace
target_tracking_scaling_policy_configuration {
predefined_metric_specification {
predefined_metric_type = "ECSServiceAverageMemoryUtilization"
}
target_value = 70
scale_in_cooldown = 60
scale_out_cooldown = 120
}
}
resource "aws_appautoscaling_policy" "worker_policy_cpu" {
name = "cpu-autoscaling"
policy_type = "TargetTrackingScaling"
resource_id = aws_appautoscaling_target.worker.resource_id
scalable_dimension = aws_appautoscaling_target.worker.scalable_dimension
service_namespace = aws_appautoscaling_target.worker.service_namespace
target_tracking_scaling_policy_configuration {
predefined_metric_specification {
predefined_metric_type = "ECSServiceAverageCPUUtilization"
}
target_value = 70
scale_in_cooldown = 60
scale_out_cooldown = 120
}
}
resource "aws_ecs_service" "airflow" {
for_each = var.tasks
name = each.key
cluster = aws_ecs_cluster.airflow.arn
launch_type = "FARGATE"
platform_version = "LATEST"
task_definition = aws_ecs_task_definition.airflow[each.key].arn
desired_count = 1
network_configuration {
subnets = [data.aws_subnet.private1.id]
security_groups = [aws_security_group.airflow.id]
}
dynamic "load_balancer" {
for_each = each.value.load_balancer ? [1] : []
content {
target_group_arn = aws_lb_target_group.airflow.arn
container_name = each.key
container_port = each.value.port
}
}
enable_ecs_managed_tags = true
propagate_tags = "TASK_DEFINITION"
lifecycle {
ignore_changes = [desired_count]
}
}
resource "aws_lb" "airflow" {
name = "airflow"
subnets = [data.aws_subnet.private1.id, data.aws_subnet.private2.id]
load_balancer_type = "application"
internal = true
security_groups = [data.aws_security_group.vpn.id]
}
resource "aws_lb_target_group" "airflow" {
name = "airflow"
port = var.webserver_port
protocol = "HTTP"
vpc_id = data.aws_vpc.kite_prod.id
target_type = "ip"
health_check {
path = "/health"
matcher = "200"
interval = 300
}
}
data "aws_acm_certificate" "kite_dev" {
domain = "*.kite.dev"
statuses = ["ISSUED"]
}
resource "aws_lb_listener" "airflow" {
load_balancer_arn = aws_lb.airflow.arn
port = 443
protocol = "HTTPS"
default_action {
target_group_arn = aws_lb_target_group.airflow.arn
type = "forward"
}
certificate_arn = data.aws_acm_certificate.kite_dev.arn
}

View File

@ -0,0 +1,30 @@
resource "aws_ecs_task_definition" "monetizable" {
family = "monetizable"
container_definitions = jsonencode(
[
{
"name" = "monetizable",
"image" = "${data.aws_ecr_image.airflow.registry_id}.dkr.ecr.us-west-1.amazonaws.com/kite-airflow-monetizable:${var.tag}",
"essential" = true,
"logConfiguration" = {
"logDriver" = "awslogs",
"options" = {
"awslogs-create-group" = "true",
"awslogs-region" = var.region,
"awslogs-group" = "/ecs/airflow/monetizable",
"awslogs-stream-prefix" = "ecs"
}
}
}
]
)
requires_compatibilities = ["FARGATE"]
network_mode = "awsvpc"
execution_role_arn = aws_iam_role.airflow_task_execution.arn
task_role_arn = aws_iam_role.airflow_task.arn
cpu = 1 * 1024.0
memory = 2 * 1024.0
}

View File

@ -0,0 +1,4 @@
output "tag" {
value = var.tag
description = "The currently-deployed tag."
}

View File

@ -0,0 +1,43 @@
variable "service_name" {
default = "airflow"
}
variable "region" {
default = "us-east-1"
}
variable "webserver_port" {
default = 8080
}
variable "repository_name" {
type = string
default = "kite-airflow"
}
variable "tag" {
type = string
}
variable "tasks" {
default = {
webserver = {
port = 8080
cpu = 0.5 * 1024.0
memory = 1 * 1024.0
load_balancer = true
},
scheduler = {
port = 8793
cpu = 1 * 1024.0
memory = 2 * 1024.0
load_balancer = false
},
worker = {
port = 8793
cpu = 2 * 1024.0
memory = 4 * 1024.0
load_balancer = false
}
}
}

105
airflow/unittests.cfg Normal file
View File

@ -0,0 +1,105 @@
[core]
unit_test_mode = True
dags_folder = /opt/airflow/dags
plugins_folder = /opt/airflow/plugins
base_log_folder = /opt/airflow//logs
logging_level = INFO
fab_logging_level = WARN
log_filename_template = {{ ti.dag_id }}/{{ ti.task_id }}/{{ ts }}/{{ try_number }}.log
log_processor_filename_template = {{ filename }}.log
dag_processor_manager_log_location = /opt/airflow//logs/dag_processor_manager/dag_processor_manager.log
executor = SequentialExecutor
sql_alchemy_conn = sqlite:////opt/airflow//unittests.db
load_examples = True
donot_pickle = False
load_default_connections = True
dag_concurrency = 16
dags_are_paused_at_creation = False
fernet_key = XXXXXXX
enable_xcom_pickling = False
killed_task_cleanup_time = 5
secure_mode = False
hostname_callable = socket:getfqdn
worker_precheck = False
default_task_retries = 0
[cli]
api_client = airflow.api.client.local_client
endpoint_url = http://localhost:8080
[api]
auth_backend = airflow.api.auth.backend.default
[operators]
default_owner = airflow
[hive]
default_hive_mapred_queue = airflow
[webserver]
base_url = http://localhost:8080
web_server_host = 0.0.0.0
web_server_port = 8080
dag_orientation = LR
dag_default_view = tree
log_fetch_timeout_sec = 5
hide_paused_dags_by_default = False
page_size = 100
rbac = False
[email]
email_backend = airflow.utils.email.send_email_smtp
[smtp]
smtp_host = localhost
smtp_user = airflow
smtp_port = 25
smtp_password = airflow
smtp_mail_from = airflow@example.com
[celery]
celery_app_name = airflow.executors.celery_executor
worker_concurrency = 16
worker_log_server_port = 8793
broker_url = sqla+mysql://airflow:airflow@localhost:3306/airflow
result_backend = db+mysql://airflow:airflow@localhost:3306/airflow
flower_host = 0.0.0.0
flower_port = 5555
default_queue = default
sync_parallelism = 0
[mesos]
master = localhost:5050
framework_name = Airflow
task_cpu = 1
task_memory = 256
checkpoint = False
authenticate = False
docker_image_slave = test/docker-airflow
[scheduler]
job_heartbeat_sec = 1
scheduler_heartbeat_sec = 5
scheduler_health_check_threshold = 30
authenticate = true
max_threads = 2
catchup_by_default = True
scheduler_zombie_task_threshold = 300
dag_dir_list_interval = 0
max_tis_per_query = 512
[admin]
hide_sensitive_variable_fields = True
[elasticsearch]
host =
log_id_template = {dag_id}-{task_id}-{execution_date}-{try_number}
end_of_log_mark = end_of_log
[elasticsearch_configs]
use_ssl = False
verify_certs = True
[kubernetes]
dags_volume_claim = default

11
codecov.yml Normal file
View File

@ -0,0 +1,11 @@
codecov:
allow_coverage_offsets: true
coverage:
status:
project: off
patch:
default:
target: 95%
ignore:
- "**/cmd/*"
- "**/cmd/**/*"

1
concourse/.gitignore vendored Normal file
View File

@ -0,0 +1 @@
secrets.yml

28
concourse/Makefile Normal file
View File

@ -0,0 +1,28 @@
all:
images/docker/image:
cd images/docker && docker build -t kiteco/concourse .
images/docker/push: images/docker/image
docker push kiteco/concourse
pipelines/bundle-plugins/set:
fly -t kite sp -p bundle-plugins -c pipelines/bundle-plugins/pipeline.yml
YTT_ARGS=''
ifneq ($(BRANCH),)
YTT_ARGS="--data-value dev_branch=$(BRANCH)"
endif
BE_SVCS_DIR=pipelines/deploy-backend-services
$(BE_SVCS_DIR)/%/set: PIPELINE=$*
$(BE_SVCS_DIR)/%/set:
ytt "$(YTT_ARGS)" -f $(BE_SVCS_DIR)/pipeline-template.ytt.yml -f $(BE_SVCS_DIR)/data-defaults.ytt.yml -f $(BE_SVCS_DIR)/$(PIPELINE)/data.ytt.yml > $(BE_SVCS_DIR)/$(PIPELINE)/pipeline.yml
fly -t kite sp -l secrets.yml -p $(PIPELINE) -c $(BE_SVCS_DIR)/$(PIPELINE)/pipeline.yml
rm $(BE_SVCS_DIR)/$(PIPELINE)/pipeline.yml
pipelines/%/set: PIPELINE=$*
pipelines/%/set:
ytt -f pipelines/$(PIPELINE)/pipeline.ytt.yml > pipelines/$(PIPELINE)/pipeline.yml
fly -t kite sp -p $(PIPELINE) -c pipelines/$(PIPELINE)/pipeline.yml
rm pipelines/$(PIPELINE)/pipeline.yml

92
concourse/README.md Normal file
View File

@ -0,0 +1,92 @@
> [Concourse](https://concourse-ci.org) is an open-source continuous thing-doer.
At Kite, we use Concourse for parts of our build/deploy pipelines.
The goal is to incrementally port all deployment jobs to Concourse;
in the meantime, our prior build system (Solness) will trigger Concourse jobs as needed.
To run the pipelines manually, log in at [concourse.kite.com](http://concourse.kite.com)
from within the AWS dev VPN. Find credentials in Quip.
For now we *do not* intend to move developer CI onto Concourse,
since scaling up a self-hosted CI system comes with its own set of challenges,
and our current solution (Travis) is "good enough." This is purely for deployments.
## Development
Read the Concourse docs!
Pipelines are composed of jobs, which are in turn composed of tasks.
We have a pipeline called "release" defined in `pipelines/release/pipeline.ytt`.
In order to develop this pipeline, you need the Concourse `fly` tool,
as well as the YAML templating tool `ytt`.
This pipeline can be updated using the `fly` CLI tool, or with the `make` command:
```
make pipelines/deploy/set
```
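If you want to run the same steps by hand, the commands below are a rough sketch of what the `pipelines/%/set` rule in `concourse/Makefile` does; the `deploy` name is just the example pipeline from above.
```
# Render the ytt template to a concrete pipeline.yml, push it with fly
# ("sp" in the Makefile is the short alias for set-pipeline), then clean up.
ytt -f pipelines/deploy/pipeline.ytt.yml > pipelines/deploy/pipeline.yml
fly -t kite set-pipeline -p deploy -c pipelines/deploy/pipeline.yml
rm pipelines/deploy/pipeline.yml
```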
### Secrets
All secrets are currently stored in AWS Systems Manager Parameter Store in us-west-1.
The Concourse Web node is configured to look up secrets from SSM.
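As a minimal sketch, adding a new secret looks something like the following, assuming the web node uses Concourse's default SSM path template (`/concourse/<team>/<secret>`) and the `main` team; the parameter name is hypothetical, so check the web node's SSM configuration for the real prefix.
```
# Store a pipeline secret so that ((my_secret)) resolves for the "main" team.
# The /concourse/main/... prefix is an assumption based on Concourse's default
# SSM lookup template.
aws ssm put-parameter \
  --region us-west-1 \
  --name "/concourse/main/my_secret" \
  --type SecureString \
  --value "s3cr3t"
```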
## Provisioning a Worker
Eventually, we should use Packer to provision worker AMIs, but for now
workers must be manually configured.
### Windows
1. Start with a "Windows Server 2019 with Containers" machine image.
2. Provision all the tools needed for building Kite,
as per the Windows [README](../windows/README.md).
* also `choco install windows-sdk-10.0`
3. Allocate and mount a separate disk (100G) for all Concourse-related data
* Below, we assume it's mounted at `D:`.
* `mkdir D:\containers`, `mkdir D:\concourse`
4. Enable long paths using registry editor.
* set `HKEY_LOCAL_MACHINE\SYSTEM\CurrentControlSet\Control\FileSystem\LongPathsEnabled` to 1.
5. Download [WinSW](https://github.com/kohsuke/winsw)
and [Concourse](https://github.com/concourse/concourse/).
* Move Concourse binary to `D:\concourse\concourse-bin.exe`
* Move the WinSW binary to `D:\concourse\concourse.exe`
6. Provision a worker key on the Windows machine.
```
cd D:\concourse
ssh-keygen -t rsa -b 4096 -f tsa-worker-key
...
cat D:\concourse\tsa-worker-key.pub
```
* add the public key to `authorized_keys` on the Concourse web node.
* restart the web node.
7. Create `D:\concourse\concourse.xml` to configure all the Concourse options.
```
<service>
<id>concourse</id>
<name>Concourse</name>
<description>Concourse Windows worker.</description>
<startmode>Automatic</startmode>
<executable>D:\concourse\concourse-bin.exe</executable>
<argument>worker</argument>
<argument>/work-dir</argument>
<argument>D:\containers</argument>
<argument>/tsa-worker-private-key</argument>
<argument>D:\concourse\tsa-worker-key</argument>
<argument>/tsa-public-key</argument>
<argument>D:\concourse\tsa-host-key.pub</argument>
<argument>/tsa-host</argument>
<argument>10.86.0.122:2222</argument>
<onfailure action="restart" delay="10 sec"/>
<onfailure action="restart" delay="20 sec"/>
<logmode>rotate</logmode>
</service>
```
8. Install and start the Concourse service (a quick registration check is shown after this list)
```
D:\concourse\concourse.exe install
D:\concourse\concourse.exe start
```
9. License VS Community 2019 under the system user.
* Download [`PsExec.exe`](https://docs.microsoft.com/en-us/sysinternals/downloads/psexec)
* Start VS under the system user: `PsExec.exe -sid "C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\Common7\IDE\devenv.com"`
* Log in to license the software.
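Once the service is up, a quick sanity check that the new worker registered with the web node (assuming you are on the dev VPN and already logged in with `fly`):
```
# The new Windows box should appear in the worker list with platform "windows"
fly -t kite workers
```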

View File

@ -0,0 +1,50 @@
FROM ubuntu:bionic
ARG GO_VERSION=1.15.3
RUN apt-get update \
&& apt-get install -y --no-install-recommends \
curl \
wget \
gzip \
zip unzip \
openssl \
libssl-dev \
make \
openssh-client \
libstdc++6 \
software-properties-common \
openjdk-11-jre-headless \
openjdk-8-jdk \
makeself \
chrpath \
gcc \
build-essential \
gpg-agent \
jq
RUN curl -sL https://deb.nodesource.com/setup_12.x | bash -
RUN apt-get install -y nodejs
RUN npm install -g n
RUN n 11.12.0
RUN apt-add-repository ppa:git-core/ppa \
&& apt-get update \
&& apt-get install -y git \
&& rm -rf /var/lib/apt/lists/*
RUN curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | bash
RUN apt-get install -y git-lfs
RUN wget https://dl.google.com/go/go$GO_VERSION.linux-amd64.tar.gz
RUN tar -C /usr/local -xzf go$GO_VERSION.linux-amd64.tar.gz
RUN rm go$GO_VERSION.linux-amd64.tar.gz
ENV PATH=/usr/local/go/bin:$PATH
RUN apt-get install -y --no-install-recommends python3.7 python3-pip python3-setuptools
RUN pip3 install awscli wheel pipenv
RUN update-alternatives --set java /usr/lib/jvm/java-8-openjdk-amd64/jre/bin/java
RUN git config --global user.email "ops@kite.com"
RUN git config --global user.name "Kite Concourse"

View File

@ -0,0 +1,85 @@
resource_types:
- name: slack-notification
type: docker-image
source:
repository: cfcommunity/slack-notification-resource
resources:
- name: time.8am
type: time
source:
start: 7:00 AM
stop: 8:00 AM
location: America/Los_Angeles
days: [Monday, Tuesday, Wednesday, Thursday, Friday]
- name: git.kiteco.intellij-plugin-private
type: git
source:
uri: git@github.com:kiteco/intellij-plugin-private.git
branch: master
private_key: ((ssh_private))
disable_ci_skip: true
- name: image-build
type: docker-image
source: {repository: kiteco/concourse}
- name: slack-deep-intellij
type: slack-notification
source:
url: ((slack_deep-intellij))
jobs:
- name: stage-intellij-release
plan:
- get: time.8am
trigger: true
- get: intellij-plugin-private
resource: git.kiteco.intellij-plugin-private
- get: image-build
- task: version-bump-intellij
image: image-build
config:
platform: linux
inputs:
- name: intellij-plugin-private
outputs:
- name: intellij-plugin-private
run:
path: intellij-plugin-private/release_version.bash
- put: intellij-plugin-private
resource: git.kiteco.intellij-plugin-private
params:
repository: intellij-plugin-private
tag: intellij-plugin-private/pluginVersion.txt
tag_prefix: v
- task: intellij-build-binaries
image: image-build
config:
platform: linux
params:
AWS_ACCESS_KEY_ID: ((aws_id))
AWS_SECRET_ACCESS_KEY: ((aws_secret))
inputs:
- name: intellij-plugin-private
run:
path: intellij-plugin-private/concourse/stage-plugin.bash
on_failure:
put: slack-deep-intellij
params:
text: "IntelliJ <http://concourse.kite.com/builds/$BUILD_ID|build> failed! <@XXXXXXX> <@XXXXXXX>"
on_success:
put: slack-deep-intellij
params:
text: "IntelliJ <http://concourse.kite.com/builds/$BUILD_ID|build> succeeded!"

View File

@ -0,0 +1,5 @@
#@data/values
---
dev_branch: ""
package_regexp: ""
terraform_location: ""

View File

@ -0,0 +1,4 @@
#@data/values
---
package_regexp: kite-server.tgz
terraform_location: kiteserver

View File

@ -0,0 +1,166 @@
#@ load("@ytt:data", "data")
resource_types:
- name: terraform
type: docker-image
source:
repository: ljfranklin/terraform-resource
tag: latest
#@yaml/text-templated-strings
resources:
- name: kiteco
type: git
source:
uri: git@github.com:kiteco/kiteco.git
private_key: ((ssh.private))
disable_ci_skip: true
#@ if data.values.dev_branch != '':
branch: (@= data.values.dev_branch @)
#@ else:
branch: release
fetch_tags: true
tag_filter: v2*
#@ end
- name: kite-deploy-package
type: s3
source:
bucket: kite-deploys
regexp: v(.*)/(@= data.values.package_regexp @)
region_name: us-west-1
access_key_id: ((aws.id))
secret_access_key: ((aws.secret))
- name: terraform
type: terraform
source:
backend_type: s3
env_name: production
terraform_source: "kiteco/devops/terraform/cloud/deployments/(@= data.values.terraform_location @)/"
backend_config:
access_key: ((aws.id))
bucket: kite-terraform-state
key: terraform.tfstate
region: us-west-1
secret_key: ((aws.secret))
workspace_key_prefix: deployments/(@= data.values.terraform_location @)
env:
AWS_ACCESS_KEY_ID: ((aws.id))
AWS_SECRET_ACCESS_KEY: ((aws.secret))
GOOGLE_CREDENTIALS: ((gcloud))
jobs:
- name: stage-jump-instances-plan
plan:
- get: kiteco
params: &kiteco_get_params
depth: 10
submodules: none
disable_git_lfs: true
- get: kite-deploy-package
trigger: true
- load_var: version-tag
file: kite-deploy-package/version
- put: terraform
params:
plan_only: true
vars:
versions:
gray: ((.:version-tag))
blue: blue
- name: stage-jump-instances-apply
plan:
- get: kiteco
params: *kiteco_get_params
passed: [stage-jump-instances-plan]
- put: terraform
params:
plan_run: true
- name: stage-add-to-lb-plan
plan:
- get: kiteco
params: *kiteco_get_params
passed: [stage-jump-instances-apply]
trigger: true
- put: terraform
params:
plan_only: true
vars:
versions:
blue: blue
green: gray
- name: stage-add-to-lb-apply
plan:
- get: kiteco
params: *kiteco_get_params
passed: [stage-add-to-lb-plan]
- put: terraform
params:
plan_run: true
- name: switch-plan
plan:
- get: kiteco
params: *kiteco_get_params
passed: [stage-add-to-lb-apply]
trigger: true
- put: terraform
params:
plan_only: true
vars:
versions:
green: blue
blue: green
- name: switch-apply
plan:
- get: kiteco
params: *kiteco_get_params
passed: [switch-plan]
- put: terraform
params:
plan_run: true
- name: retire-remove-lb-plan
plan:
- get: kiteco
params: *kiteco_get_params
- put: terraform
params:
plan_only: true
vars:
versions:
blue: blue
gray: green
- name: retire-remove-lb-apply
plan:
- get: kiteco
params: *kiteco_get_params
passed: [retire-remove-lb-plan]
- put: terraform
params:
plan_run: true
- name: retire-terminate-plan
plan:
- get: kiteco
params: *kiteco_get_params
passed: [retire-remove-lb-apply]
trigger: true
- put: terraform
params:
plan_only: true
vars:
versions:
blue: blue
- name: retire-terminate-apply
plan:
- get: kiteco
params: *kiteco_get_params
passed: [retire-terminate-plan]
- put: terraform
params:
plan_run: true

View File

@ -0,0 +1,114 @@
resource_types:
- name: terraform
type: docker-image
source:
repository: ljfranklin/terraform-resource
tag: latest
resources:
- name: kiteco
type: git
source:
uri: git@github.com:kiteco/kiteco.git
branch: release
private_key: ((ssh_private))
disable_ci_skip: true
fetch_tags: true
tag_filter: v2* #! this'll last for the millennium
- name: puppet
type: s3
source:
bucket: kite-deploys
regexp: puppet/puppet-v(.*).tar.gz
region_name: us-west-1
access_key_id: ((aws_id))
secret_access_key: ((aws_secret))
- name: terraform
type: terraform
source:
backend_type: s3
env_name: us-east-1
terraform_source: kiteco/devops/terraform/cloud/deployments/metrics/
vars:
region: us-east-1
backend_config:
access_key: ((aws_id))
bucket: kite-terraform-state
key: terraform.tfstate
region: us-west-1
secret_key: ((aws_secret))
workspace_key_prefix: deployments/metrics-collector
env:
AWS_ACCESS_KEY_ID: ((aws_id))
AWS_SECRET_ACCESS_KEY: ((aws_secret))
jobs:
- name: stage-plan
plan:
- get: kiteco
params: &kiteco_get_params
depth: 10
submodules: none
disable_git_lfs: true
- get: puppet
- task: tfvars
file: kiteco/concourse/tasks/tf-vars/task.yml
vars:
build: puppet
versions: '{"green": "VERSION", "blue": "blue"}'
- put: terraform
params:
plan_only: true
var_files: [tfvars/terraform.tfvars]
- name: stage-apply
plan:
- get: kiteco
params: *kiteco_get_params
passed: [stage-plan]
- put: terraform
params:
plan_run: true
- name: switch-plan
plan:
- get: kiteco
params: *kiteco_get_params
passed: [stage-apply]
trigger: true
- put: terraform
params:
plan_only: true
vars:
versions:
green: blue
blue: green
- name: switch-apply
plan:
- get: kiteco
params: *kiteco_get_params
passed: [switch-plan]
- put: terraform
params:
plan_run: true
- name: cleanup-plan
plan:
- get: kiteco
params: *kiteco_get_params
- put: terraform
params:
plan_only: true
vars:
versions:
blue: blue
- name: cleanup-apply
plan:
- get: kiteco
params: *kiteco_get_params
passed: [cleanup-plan]
- put: terraform
params:
plan_run: true

View File

@ -0,0 +1,114 @@
resource_types:
- name: terraform
type: docker-image
source:
repository: ljfranklin/terraform-resource
tag: latest
resources:
- name: kiteco
type: git
source:
branch: release
disable_ci_skip: true
fetch_tags: true
private_key: ((ssh_private))
uri: git@github.com:kiteco/kiteco.git
tag_filter: v2*
- name: convcohort
type: s3
source:
bucket: kite-deploys
regexp: v(.*)/convcohort
region_name: us-west-1
access_key_id: ((aws_id))
secret_access_key: ((aws_secret))
- name: terraform
type: terraform
source:
backend_type: s3
env_name: production
terraform_source: kiteco/devops/terraform/cloud/deployments/nchan/
backend_config:
access_key: ((aws_id))
bucket: kite-terraform-state
key: terraform.tfstate
region: us-west-1
secret_key: ((aws_secret))
workspace_key_prefix: deployments/nchan
env:
AWS_ACCESS_KEY_ID: ((aws_id))
AWS_SECRET_ACCESS_KEY: ((aws_secret))
GOOGLE_CREDENTIALS: ((gcloud))
jobs:
- name: stage-plan
plan:
- get: kiteco
params: &kiteco_get_params
depth: 10
submodules: none
disable_git_lfs: true
- get: convcohort
trigger: true
- task: tfvars
file: kiteco/concourse/tasks/tf-vars/task.yml
vars:
build: convcohort
versions: '{"green": "VERSION", "blue": "blue"}'
- put: terraform
params:
plan_only: true
var_files: [tfvars/terraform.tfvars]
- name: stage-apply
plan:
- get: kiteco
params: *kiteco_get_params
passed: [stage-plan]
- put: terraform
params:
plan_run: true
- name: switch-plan
plan:
- get: kiteco
params: *kiteco_get_params
passed: [stage-apply]
trigger: true
- put: terraform
params:
plan_only: true
vars:
versions:
green: blue
blue: green
- name: switch-apply
plan:
- get: kiteco
params: *kiteco_get_params
passed: [switch-plan]
- put: terraform
params:
plan_run: true
- name: cleanup-plan
plan:
- get: kiteco
params: *kiteco_get_params
- put: terraform
params:
plan_only: true
vars:
versions:
blue: blue
- name: cleanup-apply
plan:
- get: kiteco
params: *kiteco_get_params
passed: [cleanup-plan]
- put: terraform
params:
plan_run: true

View File

@ -0,0 +1,115 @@
resource_types:
- name: terraform
type: docker-image
source:
repository: ljfranklin/terraform-resource
tag: latest
resources:
- name: kiteco
type: git
source:
branch: release
disable_ci_skip: true
fetch_tags: true
private_key: ((ssh_private))
uri: git@github.com:kiteco/kiteco.git
tag_filter: v2*
- name: release
type: s3
source:
bucket: kite-deploys
regexp: v(.*)/release
region_name: us-west-1
access_key_id: ((aws_id))
secret_access_key: ((aws_secret))
- name: terraform
type: terraform
source:
backend_type: s3
env_name: production
terraform_source: kiteco/devops/terraform/cloud/deployments/release/
backend_config:
access_key: ((aws_id))
bucket: kite-terraform-state
key: terraform.tfstate
region: us-west-1
secret_key: ((aws_secret))
workspace_key_prefix: deployments/release
env:
AWS_ACCESS_KEY_ID: ((aws_id))
AWS_SECRET_ACCESS_KEY: ((aws_secret))
GOOGLE_CREDENTIALS: ((gcloud))
jobs:
- name: stage-plan
plan:
- get: kiteco
params: &kiteco_get_params
depth: 10
submodules: none
disable_git_lfs: true
- get: release
trigger: true
- task: tfvars
file: kiteco/concourse/tasks/tf-vars/task.yml
vars:
build: release
versions: '{"green": "VERSION", "blue": "blue"}'
- put: terraform
params:
plan_only: true
var_files: [tfvars/terraform.tfvars]
- name: stage-apply
plan:
- get: kiteco
params: *kiteco_get_params
passed: [stage-plan]
- put: terraform
params:
plan_run: true
- name: switch-plan
plan:
- get: kiteco
params: *kiteco_get_params
passed: [stage-apply]
trigger: true
- put: terraform
params:
plan_only: true
vars:
versions:
green: blue
blue: green
- name: switch-apply
plan:
- get: kiteco
params: *kiteco_get_params
passed: [switch-plan]
- put: terraform
params:
plan_run: true
- name: cleanup-plan
plan:
- get: kiteco
params: *kiteco_get_params
- put: terraform
params:
plan_only: true
vars:
versions:
blue: blue
- name: cleanup-apply
plan:
- get: kiteco
params: *kiteco_get_params
passed: [cleanup-plan]
- put: terraform
params:
plan_run: true

View File

@ -0,0 +1,148 @@
resource_types:
- name: slack-notification
type: docker-image
source:
repository: cfcommunity/slack-notification-resource
resources:
- name: image-lfs-pull
type: docker-image
source: {repository: kiteco/concourse.lfs-pull}
- name: image-build
type: docker-image
source: {repository: kiteco/concourse}
- name: kiteco
type: git
source:
uri: git@github.com:kiteco/kiteco.git
branch: release
private_key: ((ssh_private))
disable_ci_skip: true
tag_filter: v2* #! this'll last for the millennium
- name: slack-release-notifications
type: slack-notification
source:
url: ((slack_release-notifications))
jobs:
#@ platforms = ["windows", "linux"]
#@ for platform in platforms:
- name: #@ "stage-{}-release".format(platform)
plan:
- get: image-lfs-pull
- get: image-build
- get: kiteco
trigger: true
params: &kiteco_get_params
depth: 10
submodules: none
disable_git_lfs: true
- in_parallel:
- task: kiteco-lfs-pull
file: kiteco/concourse/tasks/lfs-pull/task.yml
image: image-lfs-pull
input_mapping: {repo: kiteco}
output_mapping: {repo: kiteco}
vars:
private_key: ((ssh_private))
- do:
- task: build-release-binary
file: kiteco/concourse/tasks/build-release-binary/task.yml
image: image-build
- task: prepare-release
file: kiteco/concourse/tasks/prepare-release/task.yml
image: image-build
on_success:
put: slack-release-notifications
params:
text_file: slack/message
vars:
platform: #@ platform
release_db_uri: ((release_gcp-db-uri))
- task: build-client
file: #@ "kiteco/concourse/tasks/build-{}-client/task.yml".format(platform)
#@ if platform == "linux":
image: image-build
#@ end
vars:
#@ if platform == "linux":
private_key: ((linux_update-signing-key))
aws_access_key_id: ((aws_id))
aws_access_key_secret: ((aws_secret))
#@ elif platform == "windows":
update_signing: ((windows_update-signing-key-password))
#@ end
- task: upload-client
file: kiteco/concourse/tasks/upload-client-build/task.yml
image: image-build
vars:
aws_access_key_id: ((aws_id))
aws_access_key_secret: ((aws_secret))
- task: stage-client
file: kiteco/concourse/tasks/release-client-build/task.yml
image: image-build
input_mapping: {meta: build}
vars:
release_db_uri: ((release_gcp-db-uri))
percentage: 100
on_failure: &fail_notif
put: slack-release-notifications
params:
text: #@ "{} client release <http://concourse.kite.com/builds/$BUILD_ID|build> failed <@XXXXXXX>".format(platform.capitalize())
on_success:
put: slack-release-notifications
params:
text: #@ "{} client release available on staging <@XXXXXXX>".format(platform.capitalize())
#@ end
- name: stage-backend-release
plan:
- get: image-lfs-pull
- get: image-build
- get: kiteco
trigger: true
params: *kiteco_get_params
- task: kiteco-lfs-pull
file: kiteco/concourse/tasks/lfs-pull/task.yml
image: image-lfs-pull
input_mapping: {repo: kiteco}
output_mapping: {repo: kiteco}
vars:
private_key: ((ssh_private))
- task: build-backend
file: kiteco/concourse/tasks/build-backend/task.yml
image: image-build
- task: upload-backend
file: kiteco/concourse/tasks/upload-backend-build/task.yml
image: image-build
vars:
aws_access_key_id: ((aws_id))
aws_access_key_secret: ((aws_secret))
on_failure:
put: slack-release-notifications
params:
text: "Backend build <http://concourse.kite.com/builds/$BUILD_ID|build> failed <@XXXXXXX>"
on_success:
put: slack-release-notifications
params:
text: "Backend build <http://concourse.kite.com/builds/$BUILD_ID|build> succeeded <@XXXXXXX>"
- name: puppet
plan:
- get: image-build
- get: kiteco
params: *kiteco_get_params
- task: build-puppet
file: kiteco/concourse/tasks/build-puppet/task.yml
- task: upload-puppet
file: kiteco/concourse/tasks/upload-puppet/task.yml
image: image-build
vars:
aws_access_key_id: ((aws_id))
aws_access_key_secret: ((aws_secret))

Some files were not shown because too many files have changed in this diff.