Mirror of https://github.com/kiteco/kiteco-public.git, synced 2024-10-05 17:49:06 +03:00.
Commit 70fa808fcd: "happy new year everyone"
4
LICENSES.md
Normal file
@ -0,0 +1,4 @@
|
||||
## [Android Robot icon](./sidebar/src/assets/editorIcons/android-studio@2x.png)
|
||||
- License
|
||||
> The Android robot is reproduced or modified from work created and shared by Google and used according to terms described in the Creative Commons 3.0 Attribution License.
|
||||
- Guidelines at https://developer.android.com/distribute/marketing-tools/brand-guidelines#android_robot
|
195
Makefile
Executable file
@ -0,0 +1,195 @@
|
||||
#!/usr/bin/env make
|
||||
|
||||
# This variable is used to set the environment variable UIDEBUG
|
||||
# in sidebar-bindata. It is initialized with the value of UIDEBUG
|
||||
# from the environment but is explicitly set to 1 when
|
||||
# usernode-debug-ui is run.
|
||||
UI_DEBUG := $(UIDEBUG)
|
||||
|
||||
# Allow for a test backend to be passed into the makefile, but make sure that we have
|
||||
# a sensible default too
|
||||
REACT_APP_TEST_BACKEND := $(if $(REACT_APP_TEST_BACKEND),$(REACT_APP_TEST_BACKEND),https://staging.kite.com)
|
||||
|
||||
GITCOMMIT := $(shell git rev-parse HEAD)
|
||||
|
||||
.PHONY: run-standalone
|
||||
|
||||
default: install-standalone
|
||||
|
||||
#####################################
|
||||
# Go build and verification tools #
|
||||
#####################################
|
||||
|
||||
install-ci-deps:
|
||||
# This target contains a minimal set of tools needed by CI.
|
||||
# Do not add things here lightly!
|
||||
go get -u golang.org/x/lint/golint
|
||||
go get -u golang.org/x/tools/cmd/goimports
|
||||
go get github.com/jteeuwen/go-bindata/...
|
||||
go get gotest.tools/gotestsum
|
||||
|
||||
install-deps: install-ci-deps
|
||||
# Protocol buffers
|
||||
go get github.com/golang/protobuf/proto
|
||||
go get github.com/golang/protobuf/protoc-gen-go
|
||||
|
||||
# Install some utilities
|
||||
go install github.com/kiteco/kiteco/kite-go/cmds/printjson
|
||||
go install github.com/kiteco/kiteco/kite-go/cmds/importchanged
|
||||
|
||||
datadeps-bindata:
|
||||
go install github.com/kiteco/kiteco/kite-go/client/internal/kitelocal/cmds/datadeps-bindata
|
||||
|
||||
build-datadeps:
|
||||
./scripts/build_datadeps.sh
|
||||
|
||||
generate:
|
||||
go generate ./...
|
||||
|
||||
test:
|
||||
# Run gotestsum with codecov reports for kite-go and kite-golib
|
||||
gotestsum --raw-command scripts/go_test_coverage ./kite-go/... ./kite-golib/...
|
||||
# Run gotestsum for checking build & test for local-pipelines (not part of codecov)
|
||||
gotestsum ./local-pipelines/...
|
||||
# Only run the data race checker on goroutine-heavy packages
|
||||
go test -race \
|
||||
./kite-go/sandbox \
|
||||
./kite-go/client/internal/client \
|
||||
./kite-go/client/internal/clientapp \
|
||||
./kite-go/health/cmds/healthd \
|
||||
./kite-go/core \
|
||||
./kite-go/lang/python/pythonlocal
|
||||
|
||||
# Linux only, run tests with libtcmalloc overriding malloc, free, ...
|
||||
test-tcmalloc:
|
||||
LD_PRELOAD="${PWD}/linux/tcmalloc/libtcmalloc_minimal_debug.so" ${MAKE} test
|
||||
|
||||
build:
|
||||
go build -v ./kite-go/... ./kite-golib/... ./local-pipelines/... ./kite-answers/...
|
||||
|
||||
vet:
|
||||
# Run go-vet on all directories
|
||||
go vet ./kite-go/... ./kite-golib/... ./local-pipelines/... ./kite-answers/...
|
||||
|
||||
lint:
|
||||
true ./scripts/custom_lint.sh
|
||||
# Run golint only on files that are not auto-generated
|
||||
find kite-go kite-golib local-pipelines kite-answers -name "*.go" | grep -v ".pb.go" | grep -v "bindata.go" | grep -v "stackoverflow-xml.go" | grep -v "lsp/types/protocol.go" | xargs -I file golint file > /tmp/golint.test 2>&1
|
||||
cat /tmp/golint.test
|
||||
! test -s /tmp/golint.test
|
||||
|
||||
fmt:
|
||||
find kite-go kite-golib local-pipelines kite-answers -name "*.go" | grep -v "bindata.go" | grep -v ".*.pb.go" | grep -v "/corpus/go/.*.go" | xargs -I file goimports -l=true file > /tmp/gofmt.test 2>/dev/null
|
||||
cat /tmp/gofmt.test
|
||||
! test -s /tmp/gofmt.test
|
||||
|
||||
check-client-fatal:
|
||||
true git grep 'log.Fatal' ./kite-go/client/internal/ ':(exclude)*_test.go' ':(exclude)*/cmds/*' > /tmp/fatal.test 2>&1
|
||||
cat /tmp/fatal.test
|
||||
! test -s /tmp/fatal.test
|
||||
|
||||
bin-check:
|
||||
! git status --porcelain --untracked-files=no | sed s/".* "// | xargs -I f file ../f | grep -E '(ELF|x86)'
|
||||
|
||||
verify: fmt lint vet bin-check build test
|
||||
|
||||
pull-frontend-docker:
|
||||
docker pull kiteco/build-frontend
|
||||
|
||||
install-libtensorflow:
|
||||
sudo rm -f /usr/local/lib/libtensorflow* || true
|
||||
curl -L "https://s3-us-west-1.amazonaws.com/kite-data/tensorflow/libtensorflow-cpu-`go env GOOS`-x86_64-1.15.0.tar.gz" | sudo tar -C /usr/local -xz
|
||||
|
||||
install-libtensorflow-avx2:
|
||||
sudo rm -f /usr/local/lib/libtensorflow* || true
|
||||
curl -L "https://s3-us-west-1.amazonaws.com/kite-data/tensorflow/libtensorflow-cpu-`go env GOOS`-x86_64-avx2-1.15.0.tar.gz" | sudo tar -C /usr/local -xz
|
||||
|
||||
|
||||
#######################################
|
||||
# Webapp assets/bindata generation #
|
||||
#######################################
|
||||
|
||||
# Ref for seemingly extravagant npm invocations: https://github.com/imagemin/pngquant-bin/issues/52#issuecomment-260247356
|
||||
webapp-deps: pull-frontend-docker
|
||||
docker run --rm -v "$(PWD)":/kiteco -w /kiteco/web/app\
|
||||
-t kiteco/build-frontend\
|
||||
/bin/bash -c "npm config set //registry.npmjs.org/:_authToken=$(NPM_TOKEN); npm rebuild --quiet; npm uninstall --quiet; npm install --quiet"
|
||||
|
||||
webapp-tests: webapp-deps
|
||||
# TODO(tarak): Use the right commands to run the tests here?
|
||||
docker run --rm -v "$(PWD)":/kiteco -w /kiteco/web/app\
|
||||
-t kiteco/build-frontend\
|
||||
/bin/bash -c "npm config set //registry.npmjs.org/:_authToken=$(NPM_TOKEN); npm run build-test"
|
||||
docker run --rm -v "$(PWD)":/kiteco -w /kiteco/web/app -t kiteco/build-frontend npm test
|
||||
|
||||
webapp-build: webapp-deps
|
||||
docker run --rm -v "$(PWD)":/kiteco -w /kiteco/web/app\
|
||||
-t kiteco/build-frontend\
|
||||
/bin/bash -c "npm config set //registry.npmjs.org/:_authToken=$(NPM_TOKEN); npm run build"
|
||||
|
||||
webapp-build-dev: webapp-deps
|
||||
docker run --rm -v "$(PWD)":/kiteco -w /kiteco/web/app\
|
||||
-e "REACT_APP_ENV=development"\
|
||||
-t kiteco/build-frontend\
|
||||
/bin/bash -c "npm config set //registry.npmjs.org/:_authToken=$(NPM_TOKEN); npm run build"
|
||||
|
||||
webapp-build-staging: webapp-deps
|
||||
docker run --rm -v "$(PWD)":/kiteco -w /kiteco/web/app\
|
||||
-e "REACT_APP_BACKEND=https://staging.kite.com" -e "REACT_APP_ENV=staging"\
|
||||
-t kiteco/build-frontend\
|
||||
/bin/bash -c "npm config set //registry.npmjs.org/:_authToken=$(NPM_TOKEN); npm run build"
|
||||
|
||||
webapp-build-prod: webapp-deps
|
||||
docker run --rm -v "$(PWD)":/kiteco -w /kiteco/web/app\
|
||||
-e "REACT_APP_BACKEND=https://alpha.kite.com" -e "REACT_APP_ENV=production"\
|
||||
-t kiteco/build-frontend\
|
||||
/bin/bash -c "npm config set //registry.npmjs.org/:_authToken=$(NPM_TOKEN); npm run build"
|
||||
|
||||
webapp-build-testing: webapp-deps
|
||||
docker run --rm -v "$(PWD)":/kiteco -w /kiteco/web/app\
|
||||
-e "REACT_APP_BACKEND=$(REACT_APP_TEST_BACKEND)" -e "REACT_APP_ENV=development"\
|
||||
-t kiteco/build-frontend\
|
||||
/bin/bash -c "npm config set //registry.npmjs.org/:_authToken=$(NPM_TOKEN); npm run build"
|
||||
|
||||
#######################################
|
||||
# kited.exe: windows #
|
||||
#######################################
|
||||
|
||||
force:
|
||||
|
||||
kited.exe: force
|
||||
go build -buildmode=exe \
|
||||
-ldflags "-H windowsgui -X github.com/kiteco/kiteco/kite-go/client/internal/clientapp.gitCommit=$(GITCOMMIT)" \
|
||||
github.com/kiteco/kiteco/kite-go/client/cmds/kited
|
||||
|
||||
WINDOWS_BUILD_VERSION ?= "9.9.9.9"
|
||||
|
||||
KiteSetup.exe: kited.exe kite-lsp.exe
|
||||
mv kited.exe windows/
|
||||
mv kite-lsp.exe windows/
|
||||
mkdir -p windows/installer/current_build_bin/out
|
||||
cd windows/installer && ./nant.bat -D:prevPatchVersion="${WINDOWS_PATCH_BASE}" -D:buildnumstring="${WINDOWS_BUILD_VERSION}" build
|
||||
|
||||
KiteUpdateInfo.xml: KiteSetup.exe
|
||||
@cd windows/tools/kite_update_signer_cmd/bin/Debug && ./KiteUpdateSignerCmd.exe ${WINDOWS_PASS}
|
||||
|
||||
KitePatchUpdateInfo.xml: KiteSetup.exe
|
||||
@[[ -n "${WINDOWS_PATCH_BASE}" ]] && cd windows/tools/kite_patch_update_signer_cmd/bin/Debug && ./KitePatchUpdateSignerCmd.exe ${WINDOWS_PASS}
|
||||
|
||||
kite-lsp.exe: force
|
||||
go build \
|
||||
-ldflags "-H windowsgui" \
|
||||
github.com/kiteco/kiteco/kite-go/lsp/cmds/kite-lsp
|
||||
|
||||
kite-windows: KiteSetup.exe KiteUpdateInfo.xml KitePatchUpdateInfo.xml
|
||||
|
||||
#######################################
|
||||
|
||||
install-standalone:
|
||||
./scripts/standalone.sh install
|
||||
|
||||
run-standalone:
|
||||
./scripts/standalone.sh run
|
||||
|
||||
run-web-node:
|
||||
go run github.com/kiteco/kiteco/kite-go/cmds/web-node/
|
233
README.md
Normal file
@ -0,0 +1,233 @@
|
||||
Getting started with the codebase
|
||||
=================================
|
||||
|
||||
Our codebase is primarily located at github.com/kiteco/kiteco (http://github.com/kiteco/kiteco). There are a few auxiliary repositories that host very experimental code, but the goal is to make the “kiteco” repository the point of truth for all of our services.
|
||||
|
||||
|
||||
Summary (TL;DR)
|
||||
---------------
|
||||
|
||||
* Our codebase is primarily Go. (`kite-go`, `kite-golib` directories)
|
||||
* Infrastructure uses Terraform (for AWS) provisioning, and Fabric/shell scripts for deployment and management of remote hosts (`devops` directory)
|
||||
* You need VPN credentials to access any of our remote AWS (or Azure) hosts.
|
||||
* Platform-specific logic & instructions live in subdirectories `osx`, `windows`, `linux`. You probably don't need these.
|
||||
|
||||
Git LFS
|
||||
--
|
||||
We use [Git LFS](https://git-lfs.github.com/) to store our various `bindata.go` files. You will need to install the command line tool to get the contents of those files when you pull the repository. Installation instructions are on their website, but for MacOS you can install it by running (from inside the `kiteco` repository)
|
||||
```
|
||||
brew update
|
||||
brew install git-lfs
|
||||
git lfs install
|
||||
```
|
||||
Then do a `git pull` to get the bindata.go files. If they do not download from LFS, try running `git lfs pull` (you should only need to do this once - subsequent `git pull`s should update the bindata correctly).
|
||||
|
||||
### Optional: Improving Performance
|
||||
|
||||
`git lfs install` installs a [smudge filter](https://git-scm.com/docs/gitattributes) that automatically downloads and replaces the contents of newly checked out "pointer files" with their content.
|
||||
By default smudge filters operate on checked out blobs in sequence, so cannot download in batch as would typically happen when running `git lfs pull`.
|
||||
Furthermore, by default, git checkouts will block on downloading the new LFS files which can be annoying.
|
||||
You might prefer to disable the smudge filter (this can be run even if you've already run the regular `git lfs install`):
|
||||
```
|
||||
git lfs install --skip-smudge
|
||||
git lfs pull
|
||||
```
|
||||
|
||||
Then, when building after a new checkout, you may see an error of the form "expected package got ident."
|
||||
This occurs because `go` reads some Go files and sees the Git LFS pointers instead of the actual data file.
|
||||
At this point, you can download the latest files with `git lfs pull` and rebuilding should work.
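If you want to confirm that a file is indeed still an LFS pointer before rebuilding, a minimal check like the following works (the path is a placeholder for whichever `bindata.go` the build complained about):

```sh
# List a few files managed by Git LFS.
git lfs ls-files | head
# Inspect the suspect file: an unfetched pointer starts with
# "version https://git-lfs.github.com/spec/v1" instead of Go source.
head -c 200 path/to/some/bindata.go
# If it is still a pointer, fetch the real contents and rebuild.
git lfs pull
```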
|
||||
|
||||
Nothing needs to be done when pushing LFS blobs. That will still happen automatically.
|
||||
|
||||
Go
|
||||
--
|
||||
|
||||
The bulk of our code is currently in Go.
|
||||
This can be found at github.com/kiteco/kiteco/kite-go (http://github.com/kiteco/kiteco/kite-go).
|
||||
To get started working in this part of the codebase, first make sure your Go environment is set up correctly (i.e. Go is installed, `$GOPATH` is set, etc.).
|
||||
|
||||
Locally, however, you will need to install Go 1.15.3. The following steps will get you going.
|
||||
|
||||
Set `$GOPATH` in your .profile / .bashrc / .bash_profile / .zshrc, e.g.:
|
||||
|
||||
```sh
|
||||
export GOROOT=/usr/local/go
|
||||
export GOPATH=$HOME/go
|
||||
export PATH=$PATH:$GOROOT/bin:$GOPATH/bin
|
||||
```
|
||||
|
||||
Make sure to create these directories as well:
|
||||
|
||||
```sh
|
||||
mkdir $HOME/go
|
||||
mkdir $HOME/go/src $HOME/go/bin $HOME/go/pkg
|
||||
```
|
||||
|
||||
If you are on a Mac and set the above in either .bashrc or .zshrc, make sure to load it in either your .profile or .bash_profile.
|
||||
See [this](http://www.joshstaiger.org/archives/2005/07/bash_profile_vs.html) for an explanation.
|
||||
|
||||
It would be useful to become familiar with how `go` code is organized. Check out https://golang.org/doc/code.html for more on this topic.
|
||||
|
||||
Navigate to where the `kiteco` repo will live in your `GOPATH`, and clone the repo.
|
||||
|
||||
```sh
|
||||
# Create kiteco directory within GOPATH, and clone the repo there
|
||||
mkdir -p ~/go/src/github.com/kiteco
|
||||
cd ~/go/src/github.com/kiteco
|
||||
git clone git@github.com:kiteco/kiteco
|
||||
```
|
||||
|
||||
To install the latest version of Go that's compatible with our codebase, run:
|
||||
|
||||
```sh
|
||||
cd ~/go/src/github.com/kiteco/kiteco
|
||||
cd devops/scripts
|
||||
./install-golang.sh
|
||||
```
|
||||
|
||||
From here, just run `make install-deps` from the root of the `kiteco` repo to get basic utilities installed.
|
||||
|
||||
```sh
|
||||
# Install dependencies
|
||||
make install-deps
|
||||
```
|
||||
|
||||
Use `./scripts/update-golang-version.sh` if you'd like to make Kite require a newer version of Golang.
|
||||
|
||||
### Tensorflow
|
||||
|
||||
For development builds (see below), you may need to have Tensorflow installed globally on your system.
|
||||
|
||||
```bash
|
||||
make install-libtensorflow
|
||||
```
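If you are unsure whether the library is already installed, you can check the location the `install-libtensorflow` target writes to (see the Makefile above); this is only a sanity check, not a required step:

```sh
# install-libtensorflow unpacks the shared libraries under /usr/local.
ls /usr/local/lib/libtensorflow*
```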
|
||||
|
||||
Building Kite
|
||||
-------------
|
||||
|
||||
You're now ready to build Kite! First, build the sidebar for your platform:
|
||||
|
||||
```bash
|
||||
./osx/build_electron.sh force
|
||||
# ./linux/build_electron.sh force
|
||||
# ./windows/build_electron.sh force
|
||||
```
|
||||
|
||||
The sidebar build is separate from the Kite daemon build, so you must rebuild the sidebar manually as needed.
|
||||
|
||||
Now build and run Kite:
|
||||
|
||||
```bash
|
||||
make run-standalone
|
||||
```
|
||||
|
||||
Note that this is not a full Kite build, but is the recommended approach for development, as it is much faster.
|
||||
Some functionality is disabled in the development build (depending on the platform):
|
||||
|
||||
- Kite system tray icon
|
||||
- Updater service
|
||||
|
||||
|
||||
Development
|
||||
-----------
|
||||
|
||||
You should be able to develop, build, and test Kite entirely on your local machine.
|
||||
However, we do have cloud instances & VMs available for running larger jobs and for
|
||||
[testing our cloud services](VAGRANT.md).
|
||||
|
||||
### Dependency Management with Go Modules
|
||||
We use the [Go Modules](https://blog.golang.org/using-go-modules) system for dependency management.
|
||||
|
||||
General tips:
|
||||
- make sure you are working in `~/go/src/github.com/kiteco/kiteco` itself and not through a symlink
|
||||
- make sure deps are updated to the versions in `go.mod`: `go mod download`
|
||||
- Set `$GOPRIVATE` in your .profile / .bashrc / .bash_profile / .zshrc, e.g.: `export GOPRIVATE=github.com/kiteco/*`. (A quick sanity check for these tips is sketched below.)
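A minimal sanity check for these tips (output will vary by machine):

```sh
# Confirm you are in the real checkout, not a symlinked path.
pwd -P
# Confirm private modules are excluded from the public module proxy and checksum DB.
go env GOPRIVATE
# Download the dependency versions pinned in go.mod.
go mod download
```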
|
||||
|
||||
To add or update a dependency, all you need to do is `go get` it, which
|
||||
will automatically update the `go.mod` and `go.sum` files. To remove a dependency,
|
||||
remove references to it in the code and run `go mod tidy`. In general, make sure to
|
||||
run `go mod tidy` to make sure all new dependencies have been added and unused ones
|
||||
have been removed before committing any dependency changes.
|
||||
|
||||
The process for updating a dependency is:
|
||||
- `go get -u github.com/foo/bar`
|
||||
- (optional) run any `go` command, such as `go build`, `go test`
|
||||
- `go mod tidy`
|
||||
- `git add go.mod go.sum`
|
||||
- `git commit ...`
|
||||
|
||||
The process for adding a dependency is:
|
||||
- `go get github.com/foo/bar`
|
||||
- edit code to import "github.com/foo/bar"
|
||||
- `go mod tidy`
|
||||
- `git add go.mod go.sum`
|
||||
- `git commit ...`
|
||||
|
||||
#### HTTPS Auth
|
||||
The Go tooling may attempt to clone private repositories via HTTPS, requiring manual authentication.
|
||||
Instead, you can add the following section to your `~/.gitconfig` in order to force SSH authentication:
|
||||
|
||||
```
|
||||
[url "git@github.com:"]
|
||||
insteadOf = https://github.com/
|
||||
```
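Equivalently, the same rewrite can be added from the command line; this writes the section above into your global gitconfig:

```sh
git config --global url."git@github.com:".insteadOf "https://github.com/"
```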
|
||||
|
||||
### Datasets, Datadeps
|
||||
|
||||
We bundle a lot of pre-computed datasets & machine learning models into the Kite app
|
||||
through the use of a custom filemap & encoding on top of [go-bindata](https://github.com/jteeuwen/go-bindata).
|
||||
The data, located in `kite-go/client/datadeps`, is kept in Git-LFS.
|
||||
|
||||
All needed data files are first stored on S3.
|
||||
There are pointers at various places in our codebase to S3 URIs.
|
||||
After updating references to these datasets, the datadeps file must be manually rebuilt:
|
||||
|
||||
```
|
||||
$ ./scripts/build_datadeps.sh
|
||||
```
|
||||
|
||||
This will bundle all data that is loaded at Kite initialization time.
|
||||
You must ensure the needed data is loaded at initialization, otherwise it will not be included!
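A rough sketch of that workflow, assuming the dataset reference you changed is an S3 URI loaded at initialization (the grep pattern below is only illustrative):

```sh
# Find code that points at datasets on S3.
git grep -n "s3://" -- kite-go | head
# After updating a reference, rebuild the bundled datadeps.
./scripts/build_datadeps.sh
# The regenerated data under kite-go/client/datadeps is tracked by Git LFS.
git lfs status
```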
|
||||
|
||||
|
||||
### Logs
|
||||
|
||||
Some logs are displayed in Xcode, but most are written to a log file:
|
||||
|
||||
```shell
|
||||
tail -F ~/.kite/logs/client.log
|
||||
```
|
||||
|
||||
### Testing and Continuous Integration
|
||||
|
||||
Your Go code should pass several quality criteria before being allowed into the master branch. Travis CI (https://travis-ci.org/) acts as the gatekeeper between pull requests and merging. You can test your code before pushing to a pull request to speed up the process by navigating to the `kite-go` directory and running `make *` commands directly (any of `make (fmt|lint|vet|bin-check|build|test)`).
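For example, the same checks CI runs can be invoked locally from the repository root; the top-level Makefile's `verify` target chains all of them (this assumes `make install-deps` has already been run):

```sh
# Run individual checks...
make fmt
make lint
make vet
make build
make test
# ...or everything at once, as CI does.
make verify
```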
|
||||
|
||||
### VPN Access
|
||||
|
||||
You will need access to our VPN to connect to our backend hosts.
|
||||
|
||||
* Get VPN credentials (*.ovpn file) from @tarak (You will need to type in a password IRL - don't IM/chat it)
|
||||
* Install Tunnelblick for OS X (https://code.google.com/p/tunnelblick/)
|
||||
* Double click on the “.ovpn” file that contains your credentials.
|
||||
* Tunnelblick should automatically apply the configuration; look for the icon on the OS X status bar
|
||||
* Click on the Tunnelblick icon, select your config, and enter your VPN password. (**NOTE**: Tunnelblick will complain saying the IP hasn't changed. Check the box to disable the message and continue.)
|
||||
* Ping 'test-0.kite.com' and make sure it resolves (see the example below). It's okay if the pings time out; ICMP is disabled by default on AWS instances.
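For example (name resolution is what matters; the pings themselves may time out):

```sh
# Resolve the test host through the VPN's DNS.
dig +short test-0.kite.com
# Pings may time out because ICMP is disabled on the instances; that is expected.
ping -c 3 test-0.kite.com
```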
|
||||
|
||||
### SSH Access
|
||||
|
||||
Kite's Dropbox has ssh credentials for all the machines on AWS and Azure under Shared > Engineering > keys > kite-dev.pem and Shared > Engineering > keys > kite-dev-azure. Place both of these in your .ssh directory, i.e. ~/.ssh/kite-dev.pem. As a convenience, you should add the following to your `~/.ssh/config`:
|
||||
|
||||
```
|
||||
Host *.kite.com
|
||||
ForwardAgent yes
|
||||
IdentityFile ~/.ssh/kite-dev.pem
|
||||
User ubuntu
|
||||
|
||||
# Test instances are on Azure
|
||||
Host test-*.kite.com
|
||||
User ubuntu
|
||||
IdentityFile ~/.ssh/kite-dev-azure
|
||||
```
|
||||
|
||||
Don't forget to set appropriate permissions on the credential files (e.g. 700).
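For example (the host name is just the test instance mentioned earlier; use whichever host you actually need):

```sh
# Restrict the key files as suggested above.
chmod 700 ~/.ssh/kite-dev.pem ~/.ssh/kite-dev-azure
# With the config block above in place, connecting is simply:
ssh test-0.kite.com
```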
|
46
VAGRANT.md
Normal file
@ -0,0 +1,46 @@
|
||||
Cloud Development
|
||||
=================
|
||||
|
||||
In the past, Kite's language analysis facilities ran on an AWS/Azure backend instead of the user's machine.
|
||||
There are still several backend components, including the symbol API that serves web docs and the service backing the web sandbox.
|
||||
|
||||
|
||||
### Vagrant
|
||||
|
||||
We use VMs for backend development to guarantee a consistent environment between development and production.
|
||||
To get this set up, first [set up Vagrant](vagrant-boxes/kite-dev/README.md).
|
||||
|
||||
Once you have a shell in the virtual machine, the kiteco repo's working directory should be at:
|
||||
|
||||
```sh
|
||||
$HOME/go/src/github.com/kiteco/kiteco
|
||||
```
|
||||
|
||||
NOTE: This is a symlink to `/kiteco`, mounted as a NFS share in the `Vagrantfile`
|
||||
|
||||
All commands (`make *`, `go build`, etc) must be run from the full `$HOME/go/src/github.com/kiteco/kiteco` path (not a symlinked directory).
|
||||
From here, you may need to repeat some of the steps from the original dev setup, e.g:
|
||||
|
||||
```sh
|
||||
# Install dependencies
|
||||
make install-deps
|
||||
```
|
||||
|
||||
Because `user-node` requires too many resources to run locally, there are test instances available on AWS/Azure for you to run and test your development changes to `user-node`.
|
||||
Please see https://kite.quip.com/Phk4AB8lLqh9 for a list of test instances; we no longer have per-developer test instances,
|
||||
so please notify others before deploying the backend or otherwise running resource intensive processes.
|
||||
|
||||
Once you have your test instance, you can deploy your local changes to it by running:
|
||||
|
||||
```sh
|
||||
cd ~/go/src/github.com/kiteco/kiteco
|
||||
./scripts/deploy_test.sh test-N.kite.com
|
||||
```
|
||||
|
||||
|
||||
#### Infrastructure
|
||||
|
||||
Our AWS infrastructure makes use of Terraform (http://www.terraform.io). Terraform helps us manage our AWS topology. Please do not modify this unless you know what you are doing :). Our terraform configuration files can be found in github.com/kiteco/kiteco/devops/terraform (http://github.com/kiteco/kiteco/devops/terraform).
|
||||
|
||||
We use Fabric to execute some commands on remote hosts (others are simply shell scripts that invoke SSH). The fabric scripts can be found at github.com/kiteco/kiteco/devops/fabric (http://github.com/kiteco/kiteco/devops/fabric).
|
1
airflow/.dockerignore
Normal file
@ -0,0 +1 @@
|
||||
dev/**
|
2
airflow/.gitignore
vendored
Normal file
@ -0,0 +1,2 @@
|
||||
dev/
|
||||
terraform/.terraform.lock.hcl
|
2
airflow/.pep8
Normal file
@ -0,0 +1,2 @@
|
||||
[flake8]
|
||||
max-line-length = 160
|
25
airflow/Dockerfile
Normal file
@ -0,0 +1,25 @@
|
||||
FROM apache/airflow:1.10.12
|
||||
|
||||
USER airflow
|
||||
|
||||
ARG KITECO=${AIRFLOW_HOME}/kiteco
|
||||
WORKDIR ${KITECO}
|
||||
|
||||
COPY --chown=airflow:root airflow/requirements.txt airflow/MANIFEST.in airflow/setup.py ./airflow/
|
||||
COPY --chown=airflow:root airflow/kite_airflow ./airflow/kite_airflow
|
||||
COPY --chown=airflow:root kite-python/metrics ./kite-python/metrics
|
||||
|
||||
RUN python -m pip install --user --upgrade pip && \
|
||||
python -m pip install --user --no-cache-dir -r airflow/requirements.txt && \
|
||||
python -m pip install --user ./airflow/ && \
|
||||
python -m pip install --user ./kite-python/metrics/
|
||||
|
||||
WORKDIR ${AIRFLOW_HOME}
|
||||
|
||||
RUN rm -rf dags
|
||||
RUN ln -s ${KITECO}/airflow/kite_airflow/dags dags
|
||||
|
||||
RUN mkdir conf
|
||||
COPY airflow/conf/prod/airflow.cfg conf/
|
||||
VOLUME /opt/airflow/conf
|
||||
RUN ln -s conf/airflow.cfg .
|
2
airflow/MANIFEST.in
Normal file
@ -0,0 +1,2 @@
|
||||
graft kite_airflow/templates
|
||||
graft kite_airflow/files
|
32
airflow/Makefile
Normal file
@ -0,0 +1,32 @@
|
||||
ECR_REPO_URL=XXXXXXX.dkr.ecr.us-west-1.amazonaws.com
|
||||
ECR_PACKAGE_NAME=kite-airflow
|
||||
TAG=$(shell git rev-parse --short HEAD)
|
||||
|
||||
docker.login:
|
||||
aws ecr get-login-password --region us-west-1 | docker login --username AWS --password-stdin $(ECR_REPO_URL)
|
||||
|
||||
docker.build:
|
||||
docker build -t $(ECR_PACKAGE_NAME):$(TAG) ../ -f Dockerfile
|
||||
cd containers/monetizable && make TAG=$(TAG) docker.build
|
||||
|
||||
docker.tag:
|
||||
docker tag $(ECR_PACKAGE_NAME):$(TAG) $(ECR_REPO_URL)/$(ECR_PACKAGE_NAME):$(TAG)
|
||||
cd containers/monetizable && make TAG=$(TAG) docker.tag
|
||||
|
||||
docker.push:
|
||||
docker push $(ECR_REPO_URL)/$(ECR_PACKAGE_NAME):$(TAG)
|
||||
cd containers/monetizable && make TAG=$(TAG) docker.push
|
||||
|
||||
docker.all: docker.login docker.build docker.tag docker.push
|
||||
|
||||
deployment.apply:
|
||||
docker run --rm -w /opt/terraform -v $(PWD)/terraform:/opt/terraform -v $(HOME)/.gcloud/:/root/.gcloud -v $(HOME)/.aws/:/root/.aws -v $(HOME)/.config/gcloud/:/root/.config/gcloud -it ljfranklin/terraform-resource:latest terraform apply -var tag=$(TAG)
|
||||
|
||||
deployment.list:
|
||||
@aws ecs list-tasks --cluster airflow | jq -r '.taskArns[]' | xargs aws ecs describe-tasks --cluster airflow --tasks | jq -r '.tasks[] | {"group": .group, "container": .containers[]} | (.group + " " + .container.name + " " + (.container.image | capture(".*:(?<tag>[0-9a-f]+)$$") | .tag) + " " + .container.lastStatus)'
|
||||
|
||||
deployment.shell:
|
||||
exec docker run --rm -w /opt/terraform -v $(PWD)/terraform:/opt/terraform -v $(HOME)/.gcloud/:/root/.gcloud -v $(HOME)/.aws/:/root/.aws -v $(HOME)/.config/gcloud/:/root/.config/gcloud -it ljfranklin/terraform-resource:latest /bin/bash
|
||||
|
||||
python.lint:
|
||||
flake8 kite_airflow
|
32
airflow/README.md
Normal file
@ -0,0 +1,32 @@
|
||||
Kite Airflow
|
||||
============
|
||||
|
||||
UI
|
||||
-------------
|
||||
|
||||
Airflow is deployed to https://airflow.kite.dev. Access requires the VPN.
|
||||
|
||||
How to Deploy
|
||||
-------------
|
||||
|
||||
Requirements:
|
||||
* AWS CLI
|
||||
* JQ (https://stedolan.github.io/jq/download/)
|
||||
* Docker
|
||||
|
||||
Deployment:
|
||||
* Log in to AWS ECR: `make docker.login`
|
||||
* Build, push, and deploy: `make docker.all deployment.apply` (a full example sequence is shown after this section)
|
||||
* Confirm the Terraform deploy by typing "yes"
|
||||
|
||||
To see deployment status:
|
||||
* `make deployment.list`
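Putting it together, a full deploy from a machine with AWS credentials and Docker configured looks roughly like this (target names come from the Makefile in this directory):

```sh
# Build, tag, and push the images, then apply the Terraform deployment.
make docker.all
make deployment.apply   # confirm the Terraform plan by typing "yes"
# Check which task revisions are running on the ECS cluster.
make deployment.list
```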
|
||||
|
||||
Adding metrics to kite status 1d
|
||||
--------------------------------
|
||||
|
||||
* Ensure the field is in dags/files/kite_status.schema.yaml.
|
||||
* Add the aggregation to dags/templates/athena/queries/kite_status_1d.tmpl.sql.
|
||||
* Deploy.
|
||||
* Manually trigger the DAG `update_kite_status_schema`, either from the web UI (http://XXXXXXX:8080/admin/airflow/tree?dag_id=update_kite_status_schema) or from the CLI as sketched below.
|
||||
* Let the kite_status_1d jobs run at their normally-scheduled time.
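If you prefer the CLI to the web UI for the manual trigger, something like this works from a shell inside a running Airflow 1.10 container (how you get that shell depends on the deployment, so this is only a sketch):

```sh
# Trigger the schema-update DAG once.
airflow trigger_dag update_kite_status_schema
# Confirm the DAG is registered with the scheduler.
airflow list_dags
```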
|
22
airflow/conf/prod/airflow.cfg
Normal file
@ -0,0 +1,22 @@
|
||||
[core]
|
||||
executor = CeleryExecutor
|
||||
load_examples = False
|
||||
remote_logging = True
|
||||
remote_log_conn_id = aws_us_east_1
|
||||
remote_base_log_folder = s3://kite-backend-logs/airflow/logs
|
||||
enable_xcom_pickling = False
|
||||
dag_concurrency = 32
|
||||
max_active_runs_per_dag = 32
|
||||
dag_file_processor_timeout = 6000
|
||||
parallelism = 64
|
||||
|
||||
[scheduler]
|
||||
max_threads = 8
|
||||
|
||||
[celery]
|
||||
broker_url = XXXXXXX
|
||||
worker_concurrency = 16
|
||||
|
||||
[secrets]
|
||||
backend = kite_airflow.secrets_backend.SecretsManagerBackend
|
||||
backend_kwargs = {"connections_prefix": "airflow/connections", "variables_prefix": "airflow/variables", "region_name": "us-west-1"}
|
1
airflow/containers/monetizable/.gitignore
vendored
Normal file
@ -0,0 +1 @@
|
||||
build/
|
13
airflow/containers/monetizable/Dockerfile
Normal file
@ -0,0 +1,13 @@
|
||||
FROM ubuntu:20.04
|
||||
|
||||
# Fixes x509: certificate signed by unknown authority when fetching from AWS
|
||||
RUN apt-get update && apt-get install -y \
|
||||
ca-certificates \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
ARG BUILD_HASH=unset
|
||||
ENV BUILD_HASH=$BUILD_HASH
|
||||
|
||||
WORKDIR /opt/svc/
|
||||
COPY build/monetizable .
|
||||
ENTRYPOINT ["./monetizable"]
|
41
airflow/containers/monetizable/Makefile
Normal file
@ -0,0 +1,41 @@
|
||||
DEV_TAG=airflow_monetizable_dev:latest
|
||||
KITECO=$(PWD)/../../..
|
||||
DOCKER_RUN_CMD=docker run --rm -it -e AWS_ACCESS_KEY_ID=$(AWS_ACCESS_KEY_ID) -e AWS_SECRET_ACCESS_KEY=$(AWS_SECRET_ACCESS_KEY) -v $(KITECO):/go/src/github.com/kiteco/kiteco $(DEV_TAG)
|
||||
|
||||
ECR_REPO_URL=XXXXXXX.dkr.ecr.us-west-1.amazonaws.com
|
||||
ECR_PACKAGE_NAME=kite-airflow-monetizable
|
||||
TAG=$(shell git rev-parse --short HEAD)
|
||||
|
||||
KITECO=$${PWD%/kiteco/**}/kiteco
|
||||
CWD_RELATIVE=$${PWD\#/**/kiteco}
|
||||
GO_IMAGE=golang:1.15.3-buster
|
||||
|
||||
dev.build: build/monetizable
|
||||
docker build --build-arg BUILD_HASH=$(TAG) -t $(DEV_TAG) .
|
||||
|
||||
dev.shell:
|
||||
@exec $(DOCKER_RUN_CMD) /bin/bash
|
||||
|
||||
docker.login:
|
||||
aws ecr get-login-password --region us-west-1 | docker login --username AWS --password-stdin $(ECR_REPO_URL)
|
||||
|
||||
docker.build: build/monetizable
|
||||
docker build --build-arg BUILD_HASH=$(TAG) -t $(ECR_PACKAGE_NAME):$(TAG) .
|
||||
|
||||
docker.tag:
|
||||
docker tag $(ECR_PACKAGE_NAME):$(TAG) $(ECR_REPO_URL)/$(ECR_PACKAGE_NAME):$(TAG)
|
||||
|
||||
docker.push:
|
||||
docker push $(ECR_REPO_URL)/$(ECR_PACKAGE_NAME):$(TAG)
|
||||
|
||||
docker.all: docker.login docker.build docker.tag docker.push
|
||||
|
||||
build/monetizable: build main.go
|
||||
docker run --rm -e "GOPRIVATE=github.com/kiteco/*" \
|
||||
-v $(KITECO):/go/src/github.com/kiteco/kiteco \
|
||||
-v $(PWD)/build:/build \
|
||||
-w /go/src/github.com/kiteco/$(CWD_RELATIVE) \
|
||||
$(GO_IMAGE) go build -o /build/monetizable .
|
||||
|
||||
build:
|
||||
mkdir -p build
|
112
airflow/containers/monetizable/main.go
Normal file
@ -0,0 +1,112 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"compress/gzip"
|
||||
"encoding/json"
|
||||
"flag"
|
||||
"fmt"
|
||||
"log"
|
||||
"net/url"
|
||||
"os"
|
||||
"path"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/kiteco/kiteco/kite-golib/awsutil"
|
||||
"github.com/kiteco/kiteco/kite-golib/conversion/monetizable"
|
||||
"github.com/kiteco/kiteco/kite-golib/fileutil"
|
||||
)
|
||||
|
||||
// Result type
|
||||
type Result struct {
|
||||
Score float64 `json:"score"`
|
||||
Timestamp int64 `json:"timestamp"`
|
||||
Userid string `json:"userid"`
|
||||
ModelVersion string `json:"model_version"`
|
||||
}
|
||||
|
||||
// Inputs alias
|
||||
type Inputs = monetizable.Inputs
|
||||
|
||||
// Input type
|
||||
type Input struct {
|
||||
Userid string `json:"userid"`
|
||||
Inputs
|
||||
}
|
||||
|
||||
func main() {
|
||||
var dataPath string
|
||||
flag.StringVar(&dataPath, "data", "", "path to data directory")
|
||||
|
||||
var region string
|
||||
flag.StringVar(®ion, "region", "us-east-1", "AWS region of source data path")
|
||||
|
||||
var destPath string
|
||||
flag.StringVar(&destPath, "dest", "", "path to destination directory")
|
||||
|
||||
flag.Parse()
|
||||
|
||||
buildHash := os.Getenv("BUILD_HASH")
|
||||
runTS := time.Now().Unix()
|
||||
|
||||
dataURL, err := url.Parse(dataPath)
|
||||
if err != nil {
|
||||
log.Fatalf("Error parsing data path, %v", err)
|
||||
}
|
||||
|
||||
keys, err := awsutil.S3ListObjects(region, dataURL.Hostname(), strings.TrimPrefix(dataURL.Path, "/"))
|
||||
if err != nil {
|
||||
log.Fatalf("Error listing data directory, %v", err)
|
||||
}
|
||||
for _, key := range keys {
|
||||
srcFilename := fmt.Sprintf("s3://%s", path.Join(dataURL.Hostname(), key))
|
||||
dstFilename := fmt.Sprintf("%s/%s", destPath, fmt.Sprintf("%s.json", strings.TrimSuffix(path.Base(key), ".gz")))
|
||||
log.Printf("Processing file %s, destination=%s", srcFilename, dstFilename)
|
||||
handleFile(srcFilename, dstFilename, buildHash, runTS)
|
||||
}
|
||||
}
|
||||
|
||||
func handleFile(srcFilename string, dstFilename string, buildHash string, runTS int64) {
|
||||
zReader, err := fileutil.NewReader(srcFilename)
|
||||
if err != nil {
|
||||
log.Fatalf("Error reading data file %s", srcFilename)
|
||||
}
|
||||
defer zReader.Close()
|
||||
gr, err := gzip.NewReader(zReader)
|
||||
if err != nil {
|
||||
log.Fatalf("Error reading gzip data in file %s", srcFilename)
|
||||
}
|
||||
defer gr.Close()
|
||||
|
||||
outf, err := fileutil.NewBufferedWriter(dstFilename)
|
||||
if err != nil {
|
||||
log.Fatalf("Error opening file %s for writing, %v", dstFilename, err)
|
||||
}
|
||||
defer outf.Close()
|
||||
writer := json.NewEncoder(outf)
|
||||
|
||||
scanner := bufio.NewScanner(gr)
|
||||
|
||||
for scanner.Scan() {
|
||||
var input Input
|
||||
err = json.Unmarshal(scanner.Bytes(), &input)
|
||||
if err != nil {
|
||||
log.Fatalf("Error parsing JSON from %s, %v", srcFilename, err)
|
||||
}
|
||||
score, err := monetizable.Score(input.Inputs)
|
||||
if err != nil {
|
||||
log.Fatalf("Error computing score, %v", err)
|
||||
}
|
||||
|
||||
var result = Result{
|
||||
Score: score,
|
||||
Timestamp: runTS,
|
||||
Userid: input.Userid,
|
||||
ModelVersion: buildHash,
|
||||
}
|
||||
if err := writer.Encode(result); err != nil {
|
||||
log.Fatalf("Error writing result data file %s, %v", dstFilename, err)
|
||||
}
|
||||
}
|
||||
}
|
71
airflow/docker-compose.yaml
Normal file
@ -0,0 +1,71 @@
|
||||
version: "3"
|
||||
services:
|
||||
postgres:
|
||||
image: "postgres:9.6"
|
||||
container_name: "postgres"
|
||||
environment:
|
||||
- POSTGRES_USER=airflow
|
||||
- POSTGRES_PASSWORD=XXXXXXX
|
||||
- POSTGRES_DB=airflow
|
||||
ports:
|
||||
- "5432:5432"
|
||||
volumes:
|
||||
- ./dev/data/postgres:/var/lib/postgresql/data
|
||||
initdb:
|
||||
build:
|
||||
context: ../
|
||||
dockerfile: ./airflow/Dockerfile
|
||||
entrypoint: airflow initdb
|
||||
depends_on:
|
||||
- postgres
|
||||
volumes:
|
||||
- ./conf/dev:/opt/airflow/conf
|
||||
- $HOME/.aws/:/home/airflow/.aws
|
||||
webserver:
|
||||
build:
|
||||
context: ../
|
||||
dockerfile: ./airflow/Dockerfile
|
||||
restart: always
|
||||
depends_on:
|
||||
- initdb
|
||||
environment:
|
||||
- AWS_ACCESS_KEY_ID
|
||||
- AWS_SECRET_ACCESS_KEY
|
||||
- AWS_SESSION_TOKEN
|
||||
volumes:
|
||||
- ../:/opt/airflow/kiteco
|
||||
- ./conf/dev:/opt/airflow/conf
|
||||
- ./dev/logs:/opt/airflow/logs
|
||||
- $HOME/.aws/:/home/airflow/.aws
|
||||
ports:
|
||||
- "8080:8080"
|
||||
entrypoint: airflow webserver
|
||||
healthcheck:
|
||||
test: ["CMD-SHELL", "[ -f /opt/airflow/airflow-webserver.pid ]"]
|
||||
interval: 30s
|
||||
timeout: 30s
|
||||
retries: 3
|
||||
scheduler:
|
||||
build:
|
||||
context: ../
|
||||
dockerfile: ./airflow/Dockerfile
|
||||
restart: always
|
||||
depends_on:
|
||||
- initdb
|
||||
environment:
|
||||
- AWS_ACCESS_KEY_ID
|
||||
- AWS_SECRET_ACCESS_KEY
|
||||
- AWS_SESSION_TOKEN
|
||||
volumes:
|
||||
- ../:/opt/airflow/kiteco
|
||||
- ./conf/dev:/opt/airflow/conf
|
||||
- ./dev/logs:/opt/airflow/logs
|
||||
- $HOME/.aws/:/home/airflow/.aws
|
||||
ports:
|
||||
- "8793:8793"
|
||||
entrypoint: airflow scheduler
|
||||
healthcheck:
|
||||
test: ["CMD-SHELL", "[ -f /opt/airflow/airflow-scheduler.pid ]"]
|
||||
interval: 30s
|
||||
timeout: 30s
|
||||
retries: 3
|
0
airflow/kite_airflow/__init__.py
Normal file
0
airflow/kite_airflow/common/__init__.py
Normal file
18
airflow/kite_airflow/common/configs.py
Normal file
@ -0,0 +1,18 @@
|
||||
from airflow.models import Variable
|
||||
from kite_airflow.common import utils
|
||||
|
||||
|
||||
CIO_CREDENTIALS = Variable.get('cio_credentials' if utils.is_production() else 'cio_credentials_dev', deserialize_json=True)
|
||||
CIO_MAX_CONCURRENT_REQUESTS = 50
|
||||
|
||||
MP_CREDENTIALS = Variable.get('mixpanel_credentials' if utils.is_production() else 'mixpanel_credentials_dev', deserialize_json=True)
|
||||
MP_MAX_CONCURRENT_REQUESTS = 100
|
||||
|
||||
# S3
|
||||
AWS_CONN_ID = 'aws_us_east_1'
|
||||
BUCKET = 'kite-metrics' if utils.is_production() else 'kite-metrics-test'
|
||||
DIR_SCRATCH_SPACE = 'athena-scratch-space'
|
||||
DIR_SCRATCH_URI = 's3://{}/{}'.format(BUCKET, DIR_SCRATCH_SPACE)
|
||||
|
||||
# Athena
|
||||
DB_KITE_METRICS = 'kite_metrics'
|
73
airflow/kite_airflow/common/files.py
Normal file
@ -0,0 +1,73 @@
|
||||
import csv
|
||||
import codecs
|
||||
|
||||
from airflow.hooks.S3_hook import S3Hook
|
||||
|
||||
from kite_airflow.common import utils
|
||||
from kite_airflow.common import configs
|
||||
|
||||
|
||||
def get_scratch_csv_dict_reader(ti, task_id, sub_directory):
|
||||
s3 = S3Hook(configs.AWS_CONN_ID)
|
||||
filename = ti.xcom_pull(task_ids=task_id)
|
||||
s3key = s3.get_key(
|
||||
'{}/{}/{}.csv'.format(configs.DIR_SCRATCH_SPACE, sub_directory, filename),
|
||||
configs.BUCKET,
|
||||
)
|
||||
|
||||
return csv.DictReader(
|
||||
codecs.getreader("utf-8")(s3key.get()['Body'])
|
||||
)
|
||||
|
||||
|
||||
def get_full_scratch_space_csv(ti, task_id, sub_directory):
|
||||
reader = get_scratch_csv_dict_reader(ti, task_id, sub_directory)
|
||||
row_list = []
|
||||
|
||||
for row in reader:
|
||||
row_list.append(row)
|
||||
|
||||
return row_list
|
||||
|
||||
|
||||
def get_line_of_scratch_space_csv(ti, task_id, sub_directory):
|
||||
reader = get_scratch_csv_dict_reader(ti, task_id, sub_directory)
|
||||
i = 0
|
||||
|
||||
for row in reader:
|
||||
i += 1
|
||||
yield i, row
|
||||
|
||||
|
||||
def get_csv_file_as_dict(bucket, file_path):
|
||||
s3 = S3Hook(configs.AWS_CONN_ID)
|
||||
s3key = s3.get_key(file_path, bucket)
|
||||
reader = csv.DictReader(codecs.getreader("utf-8")(s3key.get()['Body']))
|
||||
row_list = []
|
||||
|
||||
for row in reader:
|
||||
row_list.append(row)
|
||||
|
||||
return row_list
|
||||
|
||||
|
||||
def write_dict_on_csv_file(bucket, file_path, data_list):
|
||||
if not data_list:
|
||||
return
|
||||
|
||||
s3_hook = S3Hook(configs.AWS_CONN_ID)
|
||||
upload_data_list = []
|
||||
|
||||
keys = data_list[0].keys()
|
||||
upload_data_list.append(','.join(keys))
|
||||
|
||||
for item in data_list:
|
||||
values = item.values()
|
||||
upload_data_list.append(','.join(values))
|
||||
|
||||
s3_hook.load_bytes(
|
||||
'\n'.join(upload_data_list).encode('utf-8'),
|
||||
file_path,
|
||||
bucket,
|
||||
replace=True,
|
||||
)
|
35
airflow/kite_airflow/common/utils.py
Normal file
@ -0,0 +1,35 @@
|
||||
import ast
|
||||
import datetime
|
||||
import time
|
||||
import uuid
|
||||
import json
|
||||
|
||||
from airflow.models import Variable
|
||||
import kite_metrics
|
||||
|
||||
|
||||
kite_status_config = kite_metrics.load_context('kite_status')
|
||||
|
||||
|
||||
def is_production():
|
||||
return Variable.get('env', 'dev') == 'production'
|
||||
|
||||
|
||||
def get_supported_languages():
|
||||
return kite_status_config['languages']
|
||||
|
||||
|
||||
def get_unique_suffix():
|
||||
return '-{}-{}.json'.format(
|
||||
get_date_time_in_ISO(),
|
||||
uuid.uuid4().hex,
|
||||
)
|
||||
|
||||
|
||||
def get_date_time_in_ISO():
|
||||
date_time = datetime.datetime.fromtimestamp(time.time())
|
||||
return date_time.isoformat() + 'Z'
|
||||
|
||||
|
||||
def string_to_dict(string):
|
||||
return ast.literal_eval(string.replace('=', ':'))
|
0
airflow/kite_airflow/dags/__init__.py
Normal file
287
airflow/kite_airflow/dags/coding_stats_mail.py
Normal file
@ -0,0 +1,287 @@
|
||||
import concurrent.futures
|
||||
import datetime
|
||||
import threading
|
||||
|
||||
from airflow import DAG
|
||||
from airflow.operators.python_operator import PythonOperator
|
||||
from airflow.contrib.operators.aws_athena_operator import AWSAthenaOperator
|
||||
from jinja2 import PackageLoader
|
||||
|
||||
from customerio import CustomerIO
|
||||
from kite_airflow.common import configs
|
||||
from kite_airflow.common import utils
|
||||
from kite_airflow.common import files
|
||||
from kite_airflow.slack_alerts import task_fail_slack_alert
|
||||
|
||||
|
||||
DIR_BASE_URI = 's3://{}/{}'.format(configs.BUCKET, 'coding-stats-mail')
|
||||
DIR_APPROX_PERCENTILES = 'approx_percentiles'
|
||||
DIR_DAILY_ACTIVE_USERS = 'daily_active_users'
|
||||
DIR_CODING_STATS = 'coding_stats'
|
||||
|
||||
TABLE_DAILY_ACTIVE_USERS = 'kite_daily_active_users' if utils.is_production() else 'kite_daily_active_users_dev'
|
||||
|
||||
USER_LIMIT = -1 # helpful to reduce time during development
|
||||
NUM_OF_WEEKS = 6
|
||||
EVENT_STATS_EMAIL = 'send_stats_email_weekly'
|
||||
|
||||
cio_local = threading.local()
|
||||
|
||||
default_args = {
|
||||
'owner': 'airflow',
|
||||
'depends_on_past': False,
|
||||
'start_date': datetime.datetime(2021, 1, 24),
|
||||
'email_on_failure': False,
|
||||
'email_on_retry': False,
|
||||
'retries': 0,
|
||||
'retry_delay': datetime.timedelta(minutes=5),
|
||||
'on_failure_callback': task_fail_slack_alert,
|
||||
}
|
||||
|
||||
kite_coding_stats_email_dag = DAG(
|
||||
'kite_coding_stats_mail',
|
||||
description='Weekly coding stats emails to users active in the last 2 weeks.',
|
||||
default_args=default_args,
|
||||
schedule_interval='0 20 * * SUN', # Every Sunday 20:00
|
||||
jinja_environment_kwargs={
|
||||
'loader': PackageLoader('kite_airflow', 'templates')
|
||||
},
|
||||
)
|
||||
|
||||
approx_percentiles_op = AWSAthenaOperator(
|
||||
aws_conn_id=configs.AWS_CONN_ID,
|
||||
task_id='get_approx_percentiles',
|
||||
query='athena/coding_stats_mail/queries/approx_percentiles.sql',
|
||||
params={
|
||||
'languages': utils.get_supported_languages(),
|
||||
},
|
||||
output_location='{}/{}/'.format(configs.DIR_SCRATCH_URI, DIR_APPROX_PERCENTILES),
|
||||
database=configs.DB_KITE_METRICS,
|
||||
dag=kite_coding_stats_email_dag,
|
||||
)
|
||||
|
||||
drop_daily_active_users_op = AWSAthenaOperator(
|
||||
aws_conn_id=configs.AWS_CONN_ID,
|
||||
task_id='drop_daily_active_users',
|
||||
query='athena/coding_stats_mail/queries/drop_daily_active_users.sql',
|
||||
params={
|
||||
'table_name': TABLE_DAILY_ACTIVE_USERS,
|
||||
},
|
||||
output_location='{}/{}/'.format(configs.DIR_SCRATCH_URI, DIR_DAILY_ACTIVE_USERS),
|
||||
database=configs.DB_KITE_METRICS,
|
||||
dag=kite_coding_stats_email_dag,
|
||||
)
|
||||
|
||||
create_daily_active_users_op = AWSAthenaOperator(
|
||||
aws_conn_id=configs.AWS_CONN_ID,
|
||||
task_id='create_daily_active_users',
|
||||
query='athena/coding_stats_mail/tables/kite_daily_active_users.sql',
|
||||
params={
|
||||
'table_name': TABLE_DAILY_ACTIVE_USERS,
|
||||
'data_location': '{}/{}/'.format(DIR_BASE_URI, DIR_DAILY_ACTIVE_USERS),
|
||||
},
|
||||
output_location='{}/{}/'.format(configs.DIR_SCRATCH_URI, DIR_DAILY_ACTIVE_USERS),
|
||||
database=configs.DB_KITE_METRICS,
|
||||
dag=kite_coding_stats_email_dag,
|
||||
)
|
||||
|
||||
update_daily_active_users_op = AWSAthenaOperator(
|
||||
aws_conn_id=configs.AWS_CONN_ID,
|
||||
task_id='update_daily_active_users',
|
||||
query='athena/coding_stats_mail/queries/update_daily_active_users.sql',
|
||||
params={
|
||||
'table_name': TABLE_DAILY_ACTIVE_USERS,
|
||||
'languages': utils.get_supported_languages(),
|
||||
},
|
||||
output_location='{}/{}/'.format(configs.DIR_SCRATCH_URI, DIR_DAILY_ACTIVE_USERS),
|
||||
database=configs.DB_KITE_METRICS,
|
||||
dag=kite_coding_stats_email_dag,
|
||||
)
|
||||
|
||||
coding_stats_op = AWSAthenaOperator(
|
||||
aws_conn_id=configs.AWS_CONN_ID,
|
||||
task_id='coding_stats',
|
||||
query='athena/coding_stats_mail/queries/coding_stats.sql',
|
||||
params={
|
||||
'table_daily_active_users': TABLE_DAILY_ACTIVE_USERS,
|
||||
'languages': utils.get_supported_languages(),
|
||||
'num_of_weeks': NUM_OF_WEEKS,
|
||||
},
|
||||
output_location='{}/{}/'.format(configs.DIR_SCRATCH_URI, DIR_CODING_STATS),
|
||||
database=configs.DB_KITE_METRICS,
|
||||
dag=kite_coding_stats_email_dag,
|
||||
)
|
||||
|
||||
|
||||
def get_approx_percentiles(ti):
|
||||
percentiles_list = files.get_full_scratch_space_csv(
|
||||
ti,
|
||||
approx_percentiles_op.task_id,
|
||||
DIR_APPROX_PERCENTILES,
|
||||
)[0]
|
||||
|
||||
approx_percentiles = []
|
||||
for percentile_index in range(1, 100):
|
||||
approx_percentiles.append({
|
||||
"percentile": percentile_index,
|
||||
"value": float(percentiles_list[f'pct_{percentile_index}']),
|
||||
})
|
||||
|
||||
return approx_percentiles
|
||||
|
||||
|
||||
def get_coding_time_percentile(coding_hours, percentiles):
|
||||
max_coding_time_percentile = 0
|
||||
|
||||
for index in range(len(percentiles)):
|
||||
if percentiles[index]["value"] <= coding_hours:
|
||||
max_coding_time_percentile = percentiles[index]["percentile"]
|
||||
|
||||
return max_coding_time_percentile
|
||||
|
||||
|
||||
def is_inactive_user(record):
|
||||
'''
|
||||
Checks whether a user is inactive by looking at the first two weeks of data.

Sums the first two weeks of coding_hours and completions_selected; if both
sums are zero, returns True.
|
||||
'''
|
||||
return (
|
||||
(record['coding_hours'].get(0, 0) + record['coding_hours'].get(1, 0) == 0) and
|
||||
(record['completions_selected'].get(0, 0) + record['completions_selected'].get(1, 0) == 0)
|
||||
)
|
||||
|
||||
|
||||
def get_track_object(coding_stat_row, execution_date, all_percentiles):
|
||||
'''Returns the track_object OR None in case of inactive user'''
|
||||
|
||||
# transforms coding stat data to their respective types
|
||||
coding_stat_row['total_weeks'] = int(coding_stat_row['total_weeks'])
|
||||
coding_stat_row['streak'] = int(coding_stat_row['streak'])
|
||||
coding_stat_row['completions_selected'] = utils.string_to_dict(coding_stat_row['completions_selected'])
|
||||
coding_stat_row['coding_hours'] = utils.string_to_dict(coding_stat_row['coding_hours'])
|
||||
coding_stat_row['python_hours'] = utils.string_to_dict(coding_stat_row['python_hours'])
|
||||
|
||||
if is_inactive_user(coding_stat_row):
|
||||
return None
|
||||
|
||||
coding_time_graph = []
|
||||
max_coding_hours = max(coding_stat_row['coding_hours'].values())
|
||||
max_python_hours = max(coding_stat_row['python_hours'].values())
|
||||
|
||||
exec_date_end = datetime.datetime(execution_date.year, execution_date.month, execution_date.day) + datetime.timedelta(days=7)
|
||||
sat_offset = (exec_date_end.weekday() - 5) % 7
|
||||
sun_offset = (exec_date_end.weekday() - 6) % 7
|
||||
|
||||
for week_index in range(NUM_OF_WEEKS - 1, -1, -1):
|
||||
start_date = exec_date_end.replace(hour=0, minute=0, second=0) - datetime.timedelta(days=7 * (week_index + 1) + sun_offset) # Sunday 12:00am
|
||||
end_date = exec_date_end.replace(hour=23, minute=59, second=59) - datetime.timedelta(days=7 * week_index + sat_offset) # Saturday 11:59:59pm
|
||||
|
||||
coding_hours = coding_stat_row['coding_hours'].get(week_index, 0)
|
||||
python_hours = coding_stat_row['python_hours'].get(week_index, 0)
|
||||
completions_selected = coding_stat_row['completions_selected'].get(week_index, 0)
|
||||
|
||||
coding_time_graph.append({
|
||||
'start_date': int(start_date.timestamp()),
|
||||
'end_date': int(end_date.timestamp()),
|
||||
'coding_hours': coding_hours,
|
||||
'scaled_coding_hours': coding_hours / max_coding_hours if max_coding_hours > 0 else 0,
|
||||
'py_hours': python_hours,
|
||||
'scaled_py_hours': python_hours / max_python_hours if max_python_hours > 0 else 0,
|
||||
'completions_used': completions_selected,
|
||||
'time_saved': python_hours * 0.18,
|
||||
})
|
||||
|
||||
return dict(
|
||||
all_time_weeks = coding_stat_row['total_weeks'],
|
||||
streak = coding_stat_row['streak'],
|
||||
coding_time_percentile = get_coding_time_percentile(
|
||||
coding_stat_row['coding_hours'].get(week_index, 0),
|
||||
all_percentiles
|
||||
),
|
||||
coding_time_graph = coding_time_graph,
|
||||
)
|
||||
|
||||
|
||||
def iteration(ti, execution_date, storage_task_name):
|
||||
all_percentiles = get_approx_percentiles(ti)
|
||||
start_row = ti.xcom_pull(task_ids=storage_task_name, key='progress')
|
||||
|
||||
for i, coding_stat_row in files.get_line_of_scratch_space_csv(ti, coding_stats_op.task_id, DIR_CODING_STATS):
|
||||
if i <= start_row:
|
||||
continue
|
||||
|
||||
yield (
|
||||
i,
|
||||
coding_stat_row['userid'],
|
||||
get_track_object(coding_stat_row, execution_date, all_percentiles)
|
||||
)
|
||||
|
||||
if i == USER_LIMIT:
|
||||
return
|
||||
|
||||
|
||||
def send_event_to_cio(item):
|
||||
i, userid, track_object = item
|
||||
|
||||
if not hasattr(cio_local, 'client'):
|
||||
cio_local.client = CustomerIO(configs.CIO_CREDENTIALS['site_id'], configs.CIO_CREDENTIALS['api_key'])
|
||||
|
||||
if track_object is not None:
|
||||
cio_local.client.track(customer_id=userid, name=EVENT_STATS_EMAIL, **track_object)
|
||||
|
||||
return i
|
||||
|
||||
|
||||
def submissions_to_cio(ti, execution_date, dag_run, storage_task_name, **context):
|
||||
queue_size = 100
|
||||
futures = []
|
||||
records_iter = iteration(ti, execution_date, storage_task_name)
|
||||
has_values = True
|
||||
|
||||
with concurrent.futures.ThreadPoolExecutor(max_workers=configs.CIO_MAX_CONCURRENT_REQUESTS) as executor:
|
||||
while has_values:
|
||||
while len(futures) < queue_size:
|
||||
try:
|
||||
futures.append(executor.submit(send_event_to_cio, next(records_iter)))
|
||||
except StopIteration:
|
||||
has_values = False
|
||||
break
|
||||
|
||||
mode = concurrent.futures.FIRST_COMPLETED if has_values else concurrent.futures.ALL_COMPLETED
|
||||
done, not_done = concurrent.futures.wait(futures, timeout=6000, return_when=mode)
|
||||
futures = list(not_done)
|
||||
|
||||
for future in done:
|
||||
try:
|
||||
i = future.result()
|
||||
|
||||
except Exception:
|
||||
dag_run.get_task_instance(storage_task_name).xcom_push(
|
||||
key='progress',
|
||||
value=i - configs.CIO_MAX_CONCURRENT_REQUESTS # subtracting because due to threading we can't get the exact index
|
||||
)
|
||||
raise
|
||||
|
||||
|
||||
progress_storage_operator = PythonOperator(
|
||||
python_callable=lambda ti, **kwargs: ti.xcom_push(key='progress', value=0),
|
||||
task_id='progress_storage_{}'.format(submissions_to_cio.__name__),
|
||||
dag=kite_coding_stats_email_dag,
|
||||
provide_context=True,
|
||||
)
|
||||
|
||||
submissions_to_cio_operator = PythonOperator(
|
||||
python_callable=submissions_to_cio,
|
||||
task_id=submissions_to_cio.__name__,
|
||||
dag=kite_coding_stats_email_dag,
|
||||
provide_context=True,
|
||||
op_kwargs={'storage_task_name': 'progress_storage_{}'.format(submissions_to_cio.__name__)}
|
||||
)
|
||||
|
||||
(
|
||||
approx_percentiles_op,
|
||||
drop_daily_active_users_op >> create_daily_active_users_op >> update_daily_active_users_op >> coding_stats_op,
|
||||
progress_storage_operator,
|
||||
) >> submissions_to_cio_operator
|
92
airflow/kite_airflow/dags/dashboards.py
Normal file
@ -0,0 +1,92 @@
|
||||
import requests
|
||||
import time
|
||||
from airflow.models import Variable
|
||||
from airflow import DAG
|
||||
import pendulum
|
||||
import datetime
|
||||
import json
|
||||
from airflow.operators.python_operator import PythonOperator
|
||||
from kite_airflow.slack_alerts import task_fail_slack_alert
|
||||
|
||||
|
||||
KIBANA_VERSION = '7.9.3'
|
||||
KIBANA_URL = XXXXXXX
|
||||
SLACK_URL = 'https://slack.com/api/files.upload'
|
||||
|
||||
|
||||
local_tz = pendulum.timezone('America/Los_Angeles')
|
||||
|
||||
|
||||
default_args = {
|
||||
'owner': 'airflow',
|
||||
'depends_on_past': False,
|
||||
'start_date': datetime.datetime(2020, 10, 27, tzinfo=local_tz),
|
||||
'email_on_failure': False,
|
||||
'email_on_retry': False,
|
||||
'retries': 0,
|
||||
'on_failure_callback': task_fail_slack_alert,
|
||||
}
|
||||
|
||||
dag = DAG(
|
||||
'slack_dashboards',
|
||||
default_args=default_args,
|
||||
description='Render and post dashboards to Slack.',
|
||||
schedule_interval='0 10 * * *',
|
||||
)
|
||||
|
||||
|
||||
def dashboards(conf, **context):
|
||||
import logging
|
||||
logger = logging.getLogger("airflow.task")
|
||||
|
||||
kibana_requests_kwargs = {'headers': {'kbn-version': KIBANA_VERSION}, 'auth': ('elastic', Variable.get('elastic_password'))}
|
||||
|
||||
dashboards = Variable.get("slack_dashboards", deserialize_json=True)
|
||||
enqueued = []
|
||||
for dashboard in dashboards:
|
||||
res = requests.post(dashboard['url'], **kibana_requests_kwargs)
|
||||
if res.status_code != 200:
|
||||
raise Exception("Error requesting dashboard, config={}, code={}, response={}".format(json.dumps(dashboard), res.status_code, res.text))
|
||||
logger.info("ENQUEUE RES={}".format(res.json()))
|
||||
enqueued.append(res.json())
|
||||
|
||||
errors = []
|
||||
for dashboard, rendered_url in zip(dashboards, enqueued):
|
||||
logger.info('Waiting for dashboard "{}"'.format(dashboard['slackParams']['title']))
|
||||
while True:
|
||||
res = requests.get("{}{}".format(KIBANA_URL, rendered_url['path']), **kibana_requests_kwargs)
|
||||
if res.status_code == 503:
|
||||
logger.info('Received 503 response, sleeping.')
|
||||
time.sleep(60)
|
||||
continue
|
||||
elif res.status_code != 200:
|
||||
errors.append('Error fetching rendered dashboard, config={}, code={}, response={}'.format(json.dumps(dashboard), res.status_code, res.text))
|
||||
break
|
||||
|
||||
logger.info('Kibana response: code={}, response={}'.format(res.status_code, res.content))
|
||||
filename = dashboard['slackParams']['filename']
|
||||
logger.info('Slack request: files={}, headers={}, url={}'.format({
|
||||
'file': (filename, res.content),
|
||||
**{k: (None, v) for k, v in dashboard['slackParams'].items()},
|
||||
}, {'Authorization': 'Bearer {}'.format(Variable.get('slack_token'))}, SLACK_URL))
|
||||
slack_res = requests.post(
|
||||
SLACK_URL,
|
||||
files={
|
||||
'file': (filename, res.content),
|
||||
**{k: (None, v) for k, v in dashboard['slackParams'].items()},
|
||||
},
|
||||
headers={'Authorization': 'Bearer {}'.format(Variable.get('slack_token'))}
|
||||
)
|
||||
logger.info('Slack response: code={}, response={}'.format(slack_res.status_code, slack_res.text))
|
||||
break
|
||||
|
||||
if errors:
|
||||
raise Exception('\n'.join(errors))
|
||||
|
||||
|
||||
dashboards_operator = PythonOperator(
|
||||
python_callable=dashboards,
|
||||
task_id='dashboards',
|
||||
dag=dag,
|
||||
provide_context=True,
|
||||
)
|
363
airflow/kite_airflow/dags/hubspot.py
Normal file
@ -0,0 +1,363 @@
|
||||
import logging
|
||||
import datetime
|
||||
import tempfile
|
||||
import requests
|
||||
import yaml
|
||||
import json
|
||||
import gzip
|
||||
import re
|
||||
import time
|
||||
|
||||
from airflow import DAG
|
||||
from jinja2 import Template
|
||||
import customerio
|
||||
|
||||
from airflow.contrib.operators.aws_athena_operator import AWSAthenaOperator
|
||||
from airflow.hooks.S3_hook import S3Hook
|
||||
from airflow.hooks.postgres_hook import PostgresHook
|
||||
from airflow.operators.python_operator import PythonOperator
|
||||
from airflow.models import Variable
|
||||
from airflow.sensors.external_task_sensor import ExternalTaskSensor
|
||||
from airflow.contrib.operators.s3_list_operator import S3ListOperator
|
||||
from kite_airflow.dags.kite_status_1d import dag as kits_status_1d_dag, read_s3_json_files
|
||||
from jinja2 import PackageLoader
|
||||
import concurrent.futures
|
||||
import pkg_resources
|
||||
import kite_metrics
|
||||
from kite_airflow.slack_alerts import task_fail_slack_alert
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
default_args = {
|
||||
'owner': 'airflow',
|
||||
'depends_on_past': True,
|
||||
'start_date': datetime.datetime(2020, 6, 28),
|
||||
'email_on_failure': False,
|
||||
'email_on_retry': False,
|
||||
'retries': 1,
|
||||
'retry_delay': datetime.timedelta(minutes=5),
|
||||
'on_failure_callback': task_fail_slack_alert,
|
||||
}
|
||||
|
||||
|
||||
DATA_LOC = 's3://kite-metrics/firehose/kite_status/'
|
||||
PROD_RESULT_LOC_PREFIX = 's3://kite-metrics/athena-results'
|
||||
|
||||
kite_status_config = kite_metrics.load_context('kite_status')
|
||||
LANGS = kite_status_config['languages']
|
||||
EDITORS = kite_status_config['editors']
|
||||
|
||||
contact_props_tmpl = Template(pkg_resources.resource_string('kite_airflow', 'files/hubspot_contactprops.yaml').decode('utf8'))
|
||||
contact_props_yaml = contact_props_tmpl.render(editors=EDITORS, langs=LANGS)
|
||||
contact_props = yaml.load(contact_props_yaml, Loader=yaml.FullLoader)
|
||||
|
||||
dag = DAG(
|
||||
'hubspot_user_metrics',
|
||||
default_args=default_args,
|
||||
description='Syncs per-user metrics from Athena to HubSpot and Customer.io.',
|
||||
schedule_interval='30 0 * * *',
|
||||
max_active_runs=1,
|
||||
jinja_environment_kwargs={
|
||||
'loader': PackageLoader('kite_airflow', 'templates')
|
||||
},
|
||||
)
|
||||
|
||||
previous_dag_run_sensor = ExternalTaskSensor(
|
||||
task_id='previous_dag_run_sensor',
|
||||
dag=dag,
|
||||
external_dag_id=dag.dag_id,
|
||||
execution_delta=datetime.timedelta(days=1),
|
||||
mode='reschedule',
|
||||
)
|
||||
|
||||
kite_status_dag_run_sensor = ExternalTaskSensor(
|
||||
task_id='kite_status_dag_run_sensor',
|
||||
dag=dag,
|
||||
execution_delta=datetime.timedelta(minutes=20),
|
||||
external_dag_id=kits_status_1d_dag.dag_id,
|
||||
mode='reschedule',
|
||||
)
|
||||
|
||||
drop_intermediate_table = AWSAthenaOperator(
|
||||
aws_conn_id='aws_us_east_1',
|
||||
task_id='drop_intermediate_table',
|
||||
query='DROP TABLE kite_metrics.hubspot_intermediate',
|
||||
output_location='s3://kite-metrics-test/athena-results/ddl',
|
||||
database='kite_metrics',
|
||||
dag=dag,
|
||||
)
|
||||
|
||||
create_intermediate_table = AWSAthenaOperator(
|
||||
aws_conn_id='aws_us_east_1',
|
||||
task_id='create_intermediate_table',
|
||||
query='athena/tables/hubspot_intermediate.tmpl.sql',
|
||||
output_location='s3://kite-metrics-test/athena-results/ddl',
|
||||
database='kite_metrics',
|
||||
dag=dag,
|
||||
params={'props': contact_props},
|
||||
)
|
||||
|
||||
(previous_dag_run_sensor, kite_status_dag_run_sensor) >> drop_intermediate_table >> create_intermediate_table
|
||||
|
||||
insert_deltas = AWSAthenaOperator(
|
||||
aws_conn_id='aws_us_east_1',
|
||||
task_id='insert_deltas',
|
||||
query='athena/queries/hubspot_delta.tmpl.sql',
|
||||
output_location='s3://kite-metrics-test/athena-results/ddl',
|
||||
database='kite_metrics',
|
||||
params={'props': contact_props},
|
||||
dag=dag,
|
||||
)
|
||||
|
||||
insert_deltas >> AWSAthenaOperator(
|
||||
aws_conn_id='aws_us_east_1',
|
||||
task_id='cleanup_delta_table',
|
||||
query='DROP TABLE hubspot_delta_{{ds_nodash}}',
|
||||
output_location='s3://kite-metrics-test/athena-results/ddl',
|
||||
database='kite_metrics',
|
||||
dag=dag,
|
||||
)
|
||||
(previous_dag_run_sensor, kite_status_dag_run_sensor) >> insert_deltas
|
||||
|
||||
EMAIL_RE = re.compile(r'^\s*[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\s*$', re.I)
|
||||
|
||||
|
||||
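# Streams the HubSpot contact-property JSON produced by Athena from S3,
# drops records whose email fails the format check or whose TLD is not in the
# IANA list, derives the primary language / Python editor fields, and pushes
# the contacts to the HubSpot batch API in groups of 100.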
def write_contact_prop_data(ti, **context):
|
||||
props = [p['name'] for p in contact_props if 'label' in p]
|
||||
props.append('user_id')
|
||||
|
||||
s3 = S3Hook('aws_us_east_1')
|
||||
buffer = []
|
||||
|
||||
# HubSpot validates emails against a list of valid top-level domains; fetch the IANA TLD list to replicate that check.
|
||||
domains_resp = requests.get('https://data.iana.org/TLD/tlds-alpha-by-domain.txt')
|
||||
domains = set([d.lower() for d in domains_resp.text.split('\n') if re.match('^[a-z]+$', d.lower())])
|
||||
counter = 0
|
||||
|
||||
for file in sorted(ti.xcom_pull(task_ids='list_hubspot_json_files')):
|
||||
obj = s3.get_key(file, 'kite-metrics')
|
||||
for line in gzip.open(obj.get()['Body']):
|
||||
counter += 1
|
||||
if counter % 1000 == 0:
|
||||
logger.info('Processed {} records'.format(counter))
|
||||
|
||||
rec = json.loads(line)
|
||||
email = rec['email']
|
||||
if not EMAIL_RE.match(email) or email.strip().rsplit('.', 1)[1].lower() not in domains:
|
||||
logger.info('Skipping invalid email address {}'.format(email))
|
||||
continue
|
||||
|
||||
if any([rec.get('{}_percentage'.format(key)) is not None for key in LANGS]):
|
||||
rec['user_data_primary_language'] = max(LANGS, key=lambda x: rec.get('{}_percentage'.format(x)) or 0)
|
||||
|
||||
if any([rec.get('python_edits_in_{}'.format(key)) for key in EDITORS]):
|
||||
rec['user_data_primary_python_editor'] = max(EDITORS, key=lambda x: rec.get('python_edits_in_{}'.format(x)) or 0)
|
||||
|
||||
hs_props = {prop: rec[prop] for prop in props if rec.get(prop) is not None}
|
||||
hs_props['kite_lifecycle_stages'] = 'User' # This property is called 'Source' in HS
|
||||
buffer.append({'email': email, 'properties': [{'property': prop, 'value': value} for prop, value in hs_props.items()]})
|
||||
|
||||
if len(buffer) >= 100:
|
||||
make_hubspot_request('contacts/v1/contact/batch', buffer)
|
||||
buffer = []
|
||||
|
||||
if buffer:
|
||||
make_hubspot_request('contacts/v1/contact/batch', buffer)
|
||||
|
||||
|
||||
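# Dumps (id, name, email) for all users from the community Postgres database
# to a temporary CSV and uploads it to S3 for the enrichment join.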
def copy_kite_users():
|
||||
pg_hook = PostgresHook(postgres_conn_id='community')
|
||||
s3 = S3Hook('aws_us_east_1')
|
||||
tf = tempfile.NamedTemporaryFile()
|
||||
pg_hook.copy_expert("COPY public.user (id, name, email) TO STDOUT WITH (FORMAT csv)", tf.name)
|
||||
s3.load_file(tf.name, 'enrichment/kite/users/users.csv', bucket_name='kite-metrics', replace=True)
|
||||
|
||||
|
||||
copy_kite_users_operator = PythonOperator(
|
||||
python_callable=copy_kite_users,
|
||||
task_id=copy_kite_users.__name__,
|
||||
dag=dag,
|
||||
)
|
||||
|
||||
setup_partitions = AWSAthenaOperator(
|
||||
aws_conn_id='aws_us_east_1',
|
||||
task_id='setup_final_partitions',
|
||||
query='MSCK REPAIR TABLE hubspot_intermediate',
|
||||
output_location='s3://kite-metrics-test/athena-results/ddl',
|
||||
database='kite_metrics',
|
||||
dag=dag,
|
||||
)
|
||||
|
||||
[create_intermediate_table, insert_deltas] >> AWSAthenaOperator(
|
||||
aws_conn_id='aws_us_east_1',
|
||||
task_id='insert_rollups',
|
||||
query='athena/queries/hubspot_rollup.tmpl.sql',
|
||||
output_location='s3://kite-metrics-test/athena-results/ddl',
|
||||
database='kite_metrics',
|
||||
dag=dag,
|
||||
params={
|
||||
'scalar_props': [p for p in contact_props if 'agg' in p['sql']],
|
||||
'map_props': [p for p in contact_props if 'map_agg' in p['sql']],
|
||||
'scalar_time_rollups': set([prop['sql']['agg_days'] for prop in contact_props if 'agg_days' in prop['sql']]),
|
||||
},
|
||||
) >> AWSAthenaOperator(
|
||||
aws_conn_id='aws_us_east_1',
|
||||
task_id='cleanup_rollup_table',
|
||||
query='DROP TABLE hubspot_rollup_{{ds_nodash}}',
|
||||
output_location='s3://kite-metrics-test/athena-results/ddl',
|
||||
database='kite_metrics',
|
||||
dag=dag,
|
||||
) >> setup_partitions
|
||||
|
||||
|
||||
(copy_kite_users_operator, setup_partitions) >> AWSAthenaOperator(
|
||||
aws_conn_id='aws_us_east_1',
|
||||
task_id='create_hubspot_final_table',
|
||||
query='athena/queries/hubspot_final.tmpl.sql',
|
||||
output_location='s3://kite-metrics-test/athena-results/ddl',
|
||||
database='kite_metrics',
|
||||
dag=dag,
|
||||
params={
|
||||
'scalar_props': [p for p in contact_props if 'map_agg' not in p['sql']],
|
||||
'map_props': [p for p in contact_props if 'map_agg' in p['sql']],
|
||||
},
|
||||
) >> S3ListOperator(
|
||||
aws_conn_id='aws_us_east_1',
|
||||
task_id='list_hubspot_json_files',
|
||||
bucket='kite-metrics',
|
||||
prefix='athena/hubspot/final/{{ds}}/',
|
||||
delimiter='/',
|
||||
dag=dag,
|
||||
) >> PythonOperator(
|
||||
python_callable=write_contact_prop_data,
|
||||
task_id=write_contact_prop_data.__name__,
|
||||
dag=dag,
|
||||
provide_context=True,
|
||||
) >> AWSAthenaOperator(
|
||||
aws_conn_id='aws_us_east_1',
|
||||
task_id='cleanup_final_table',
|
||||
query='DROP TABLE hubspot_final_{{ds_nodash}}',
|
||||
output_location='s3://kite-metrics-test/athena-results/ddl',
|
||||
database='kite_metrics',
|
||||
dag=dag,
|
||||
)
|
||||
|
||||
|
||||
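# Streams the Athena-produced profile attribute records from S3 and pushes
# them to Customer.io via identify(), keeping at most queue_size requests
# queued on a 20-worker thread pool.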
def write_cio_profile_attrs(task_instance, execution_date, dag_run, **context):
|
||||
cio_creds = Variable.get('cio_credentials', deserialize_json=True)
|
||||
iter_records = read_s3_json_files('kite-metrics', task_instance.xcom_pull(task_ids='list_cio_json_files'))
|
||||
|
||||
def iter():
|
||||
for i, rec in enumerate(iter_records):
|
||||
if not rec['id'] or not all(ord(c) < 128 for c in rec['id']):
|
||||
continue
|
||||
if 'time_zone' in rec:
|
||||
rec['timezone'] = rec.pop('time_zone')
|
||||
yield i, rec
|
||||
|
||||
def call_cio(item):
|
||||
i, kwargs = item
|
||||
customerio.CustomerIO(cio_creds['site_id'], cio_creds['api_key']).identify(**kwargs)
|
||||
return i
|
||||
|
||||
queue_size = 100
|
||||
pool_size = 20
|
||||
futures = []
|
||||
records_iter = iter()
|
||||
max_i = 0
|
||||
has_values = True
|
||||
with concurrent.futures.ThreadPoolExecutor(max_workers=pool_size) as executor:
|
||||
while has_values:
|
||||
while len(futures) < queue_size:
|
||||
try:
|
||||
futures.append(executor.submit(call_cio, next(records_iter)))
|
||||
except StopIteration:
|
||||
has_values = False
|
||||
break
|
||||
|
||||
mode = concurrent.futures.FIRST_COMPLETED if has_values else concurrent.futures.ALL_COMPLETED
|
||||
done, not_done = concurrent.futures.wait(futures, timeout=6000, return_when=mode)
|
||||
futures = list(not_done)
|
||||
for future in done:
|
||||
i = future.result()
|
||||
if max_i > 0 and (i // 1000) > (max_i // 1000):
|
||||
logger.info("Processed line {}".format(i))
|
||||
max_i = max(max_i, i)
|
||||
|
||||
|
||||
setup_partitions >> AWSAthenaOperator(
|
||||
aws_conn_id='aws_us_east_1',
|
||||
task_id='create_cio_table',
|
||||
query='athena/queries/cio_profile_attrs.tmpl.sql',
|
||||
output_location='s3://kite-metrics-test/athena-results/ddl',
|
||||
database='kite_metrics',
|
||||
dag=dag,
|
||||
params={
|
||||
'props': ["country_name", "city_name", "subdivision_1_name", "time_zone"]
|
||||
},
|
||||
) >> S3ListOperator(
|
||||
aws_conn_id='aws_us_east_1',
|
||||
task_id='list_cio_json_files',
|
||||
bucket='kite-metrics',
|
||||
prefix='athena/cio_profile_attrs/{{ds}}/',
|
||||
delimiter='/',
|
||||
dag=dag,
|
||||
) >> PythonOperator(
|
||||
python_callable=write_cio_profile_attrs,
|
||||
task_id=write_cio_profile_attrs.__name__,
|
||||
dag=dag,
|
||||
provide_context=True,
|
||||
) >> AWSAthenaOperator(
|
||||
aws_conn_id='aws_us_east_1',
|
||||
task_id='cleanup_cio_table',
|
||||
query='DROP TABLE cio_profile_attrs_{{ds_nodash}}',
|
||||
output_location='s3://kite-metrics-test/athena-results/ddl',
|
||||
database='kite_metrics',
|
||||
dag=dag,
|
||||
)
|
||||
|
||||
|
||||
USER_DATA_PROPGROUP_NAME = 'user_data'
|
||||
MAX_TRIES = 3
|
||||
|
||||
|
||||
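# Thin wrapper around the HubSpot API: chooses GET/POST (or an explicit
# method), appends the API key, retries up to MAX_TRIES times on 502 with a
# 60 second sleep, and raises for any other status code >= 300.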
def make_hubspot_request(path, data=None, method=None, tries=0):
|
||||
url = 'https://api.hubapi.com/{}?hapikey={}'.format(path, Variable.get('hubspot_apikey'))
|
||||
req_fn = getattr(requests, method) if method else (requests.post if data else requests.get)
|
||||
resp = req_fn(url, **({'json': data} if data else {}))
|
||||
tries = tries + 1
|
||||
if resp.status_code == 502 and tries < MAX_TRIES:
|
||||
logger.warning('Got 502 from HubSpot API, sleeping 60 seconds before retry.')
|
||||
time.sleep(60)
|
||||
return make_hubspot_request(path, data, method, tries)
|
||||
if resp.status_code >= 300:
|
||||
raise Exception('Error making HubSpot request, code={}, response={}'.format(resp.status_code, resp.text))
|
||||
return resp
|
||||
|
||||
|
||||
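# Reconciles the contact properties in the `user_data` group with the
# definitions rendered from hubspot_contactprops.yaml: creates missing
# properties, PUTs changed ones, and leaves matching ones untouched.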
def update_contact_props():
|
||||
props = make_hubspot_request('properties/v1/contacts/properties').json()
|
||||
props_dict = {prop['name']: prop for prop in props if prop['groupName'] == USER_DATA_PROPGROUP_NAME}
|
||||
|
||||
for prop in contact_props:
|
||||
if 'label' not in prop:
|
||||
continue
|
||||
prop = prop.copy()
|
||||
prop.pop('sql', None)
|
||||
prop['groupName'] = USER_DATA_PROPGROUP_NAME
|
||||
if prop['name'] not in props_dict:
|
||||
make_hubspot_request('properties/v1/contacts/properties', prop)
|
||||
continue
|
||||
if {k: v for k, v in props_dict[prop['name']].items() if k in prop} == prop:
|
||||
continue
|
||||
make_hubspot_request('properties/v1/contacts/properties/named/{}'.format(prop['name']), prop, 'put')
|
||||
|
||||
|
||||
update_contact_props_operator = PythonOperator(
|
||||
python_callable=update_contact_props,
|
||||
task_id=update_contact_props.__name__,
|
||||
dag=dag,
|
||||
)
|
||||
|
||||
previous_dag_run_sensor >> update_contact_props_operator
|
96
airflow/kite_airflow/dags/hubspot_companies.py
Normal file
@ -0,0 +1,96 @@
|
||||
import datetime
|
||||
import logging
|
||||
import time
|
||||
|
||||
from airflow import DAG
|
||||
from airflow.operators.python_operator import PythonOperator
|
||||
from airflow.models import Variable
|
||||
from jinja2 import PackageLoader
|
||||
import mixpanel
|
||||
|
||||
from kite_airflow.dags.hubspot import make_hubspot_request
|
||||
from kite_airflow.plugins.google import GoogleSheetsRangeOperator
|
||||
from kite_airflow.slack_alerts import task_fail_slack_alert
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
default_args = {
|
||||
'owner': 'airflow',
|
||||
'depends_on_past': True,
|
||||
'start_date': datetime.datetime(2021, 1, 7),
|
||||
'email_on_failure': False,
|
||||
'email_on_retry': False,
|
||||
'retries': 1,
|
||||
'retry_delay': datetime.timedelta(minutes=5),
|
||||
'on_failure_callback': task_fail_slack_alert,
|
||||
}
|
||||
|
||||
|
||||
dag = DAG(
|
||||
'hubspot_companies',
|
||||
default_args=default_args,
|
||||
description='Synchronizes user company data from HubSpot to other systems.',
|
||||
schedule_interval='0 12 * * *',
|
||||
max_active_runs=1,
|
||||
jinja_environment_kwargs={
|
||||
'loader': PackageLoader('kite_airflow', 'templates')
|
||||
},
|
||||
)
|
||||
|
||||
|
||||
MP_COMPANY_PROP = 'Company name'
|
||||
|
||||
|
||||
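# For every company listed in the Google Sheet, pages through the HubSpot CRM
# contact search API and tags the matching user_ids in Mixpanel with the
# company name via a buffered people_set() consumer.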
def write_company_assignments(ti, **ctx):
|
||||
mp_consumer = mixpanel.BufferedConsumer(max_size=100)
|
||||
mp_client = mixpanel.Mixpanel(Variable.get('mixpanel_credentials', deserialize_json=True)['token'], consumer=mp_consumer)
|
||||
|
||||
logger.info("Fetching company list")
|
||||
supported_companies = [rec[0] for rec in ti.xcom_pull(task_ids='get_companies_sheet')['values']]
|
||||
for company in supported_companies:
|
||||
logger.info("Starting processing for company {}".format(company))
|
||||
params = {
|
||||
'limit': 100,
|
||||
'filterGroups': [{'filters': [
|
||||
{'propertyName': 'company', 'operator': 'EQ', 'value': company},
|
||||
{'propertyName': 'user_id', 'operator': 'HAS_PROPERTY'}
|
||||
]}],
|
||||
'properties': ['user_id'],
|
||||
}
|
||||
n_done = 0
|
||||
while True:
|
||||
resp = make_hubspot_request('crm/v3/objects/contacts/search', params).json()
|
||||
if resp['total'] == 0:
|
||||
raise Exception('No results for company "{}". Is it misspelled?'.format(company))
|
||||
|
||||
for res in resp['results']:
|
||||
mp_client.people_set(
|
||||
res['properties']['user_id'],
|
||||
{MP_COMPANY_PROP: company},
|
||||
meta={'$ignore_time': 'true', '$ip': 0})
|
||||
n_done += 1
|
||||
|
||||
logger.info(" {} / {} records processed".format(n_done, resp['total']))
|
||||
|
||||
after = resp.get('paging', {}).get('next', {}).get('after')
|
||||
if not after:
|
||||
break
|
||||
params['after'] = after
|
||||
time.sleep(20)
|
||||
mp_consumer.flush()
|
||||
|
||||
|
||||
GoogleSheetsRangeOperator(
|
||||
gcp_conn_id='google_cloud_kite_dev',
|
||||
spreadsheet_id='XXXXXXX',
|
||||
range="'Companies to Import to Mixpanel'!CompanyNames",
|
||||
task_id='get_companies_sheet',
|
||||
dag=dag,
|
||||
) >> PythonOperator(
|
||||
python_callable=write_company_assignments,
|
||||
task_id=write_company_assignments.__name__,
|
||||
dag=dag,
|
||||
provide_context=True,
|
||||
)
|
314
airflow/kite_airflow/dags/kite_status.py
Normal file
@ -0,0 +1,314 @@
|
||||
from airflow import DAG
|
||||
import datetime
|
||||
from airflow.hooks.S3_hook import S3Hook
|
||||
from elasticsearch import Elasticsearch
|
||||
import json
|
||||
import gzip
|
||||
import io
|
||||
import logging
|
||||
import base64
|
||||
from elasticsearch.helpers import bulk
|
||||
from airflow.operators.python_operator import PythonOperator
|
||||
from airflow.models import Variable
|
||||
from airflow.contrib.operators.s3_list_operator import S3ListOperator
|
||||
from airflow.models.xcom import XCom
|
||||
import itertools
|
||||
from airflow.operators.python_operator import ShortCircuitOperator
|
||||
from jinja2 import PackageLoader
|
||||
import time
|
||||
from kite_airflow.plugins.google import GoogleSheetsRangeOperator
|
||||
import kite_metrics
|
||||
from kite_airflow.slack_alerts import task_fail_slack_alert
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
INDEX_GRANULARITY = datetime.timedelta(days=10)
|
||||
BUCKET = 'kite-metrics'
|
||||
KS_INDEX_PREFIX = 'kite_status'
|
||||
|
||||
|
||||
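# Resolves a dotted path such as "cpu_info.sum" inside a nested dict and
# returns (containing dict, final key), or (None, None) if any part is missing.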
def resolve_dotted_path(doc, path):
|
||||
container = doc
|
||||
field_name = path
|
||||
while '.' in field_name:
|
||||
container_name, field_name = field_name.split('.', 1)  # split the remaining path so multi-level paths terminate
|
||||
if container_name not in container:
|
||||
return None, None
|
||||
container = container[container_name]
|
||||
|
||||
if field_name in container:
|
||||
return container, field_name
|
||||
|
||||
return None, None
|
||||
|
||||
|
||||
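# Buckets a date into a fixed-size index shard: rounds the number of days
# since 1970-01-01 down to a multiple of `granularity` and returns that date
# as an ISO string, e.g. 2020-10-14 (day 18549) falls in the shard starting
# 2020-10-05 (day 18540) with the 10-day granularity used here.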
def get_index_shard(dt, granularity, epoch=datetime.date(1970, 1, 1)):
|
||||
date = datetime.date(dt.year, dt.month, dt.day)
|
||||
rounded = epoch + (date - epoch) // granularity * granularity
|
||||
return rounded.isoformat()
|
||||
|
||||
|
||||
def iter_s3_file(s3_hook, bucket, key):
|
||||
json_file = s3_hook.get_key(key, BUCKET)
|
||||
for line in gzip.open(json_file.get()['Body']):
|
||||
yield json.loads(line)
|
||||
|
||||
|
||||
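# Converts raw client event documents into Elasticsearch bulk actions: routes
# "Index Build" and "Completion Stats" events to their own index prefixes,
# strips or casts problematic fields, and fans a gzipped, base64-encoded
# completion_stats payload out into one document per completion stat.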
def client_event_convert_fn(docs, index_date_suffix, deployments):
|
||||
for doc in docs:
|
||||
if 'messageId' not in doc:
|
||||
continue
|
||||
|
||||
if 'properties' not in doc:
|
||||
continue
|
||||
|
||||
event = doc.get('event')
|
||||
if event == 'Index Build':
|
||||
index_prefix = 'index_build'
|
||||
elif event == 'Completion Stats':
|
||||
index_prefix = 'completions_selected'
|
||||
else:
|
||||
continue
|
||||
|
||||
index_name = '{}_{}'.format(index_prefix, index_date_suffix)
|
||||
|
||||
for field in ['originalTimestamp']:
|
||||
if field in doc:
|
||||
del doc[field]
|
||||
|
||||
for field in ['repo_stats', 'receivedAt', 'sentAt', 'sent_at', 'parse_info.parse_errors']:
|
||||
container, field_name = resolve_dotted_path(doc['properties'], field)
|
||||
if container:
|
||||
del container[field_name]
|
||||
|
||||
for field in ['cpu_info.sum', 'lexical_metrics.score']:
|
||||
container, field_name = resolve_dotted_path(doc['properties'], field)
|
||||
if container:
|
||||
container[field_name] = float(container[field_name])
|
||||
|
||||
for field in ['completion_stats']:
|
||||
if field in doc['properties']:
|
||||
# completion_stats is a base64-encoded, gzipped JSON list
|
||||
data = doc['properties'][field]
|
||||
data = base64.b64decode(data)
|
||||
data = gzip.GzipFile(fileobj=io.BytesIO(data)).read()
|
||||
data = json.loads(data)
|
||||
del doc['properties'][field]
|
||||
# create one document per completion stat
|
||||
i = 0
|
||||
for stat in data:
|
||||
i += 1
|
||||
elem = doc
|
||||
for key in stat:
|
||||
elem['properties'][key] = stat[key]
|
||||
yield {
|
||||
'_index': index_name,
|
||||
'_id': doc['messageId'] + "-" + str(i),
|
||||
'_source': elem
|
||||
}
|
||||
|
||||
else:
|
||||
yield {
|
||||
'_index': index_name,
|
||||
'_id': doc['messageId'],
|
||||
'_source': doc
|
||||
}
|
||||
|
||||
|
||||
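# Recursively filters a record against the metrics schema, keeping only the
# fields flagged `elastic` (or nested objects that contain such fields).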
def scrub(a_dict, schema):
|
||||
res = {}
|
||||
for k, v in schema['properties'].items():
|
||||
if k not in a_dict:
|
||||
continue
|
||||
a_val = a_dict[k]
|
||||
elastic = v.get('elastic', False)
|
||||
if isinstance(a_val, dict):
|
||||
if elastic:
|
||||
res[k] = {k1: v1 for k1, v1 in a_val.items() if k1}
|
||||
elif 'properties' in v:
|
||||
res[k] = scrub(a_val, v)
|
||||
continue
|
||||
|
||||
if elastic:
|
||||
res[k] = a_val
|
||||
|
||||
return res
|
||||
|
||||
|
||||
kite_status_config = kite_metrics.load_context('kite_status')
|
||||
kite_status_schema = kite_metrics.load_schema('kite_status')
|
||||
|
||||
|
||||
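# Converts kite_status records into Elasticsearch bulk actions: skips events
# with no language activity, scrubs them against the schema, normalizes CPU
# sample fields and suspicious timestamps, attaches the server deployment
# name, and yields one action per document keyed by messageId.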
def kite_status_convert_fn(docs, index_date_suffix, deployments):
|
||||
total_time = 0
|
||||
for i, doc in enumerate(docs):
|
||||
if i and i % 10000 == 0:
|
||||
logger.info('Done {} records, avg time / record={}'.format(i, total_time / i))
|
||||
start_time = time.perf_counter()
|
||||
if doc.get('event') != 'kite_status':
|
||||
total_time += (time.perf_counter() - start_time)
|
||||
continue
|
||||
|
||||
if not doc.get('messageId'):
|
||||
total_time += (time.perf_counter() - start_time)
|
||||
continue
|
||||
|
||||
if 'properties' not in doc:
|
||||
total_time += (time.perf_counter() - start_time)
|
||||
continue
|
||||
|
||||
if sum(doc['properties'].get('{}_events'.format(lang), 0) for lang in kite_status_config['languages']) == 0:
|
||||
total_time += (time.perf_counter() - start_time)
|
||||
continue
|
||||
|
||||
index_name = '{}_active_{}'.format(KS_INDEX_PREFIX, index_date_suffix)
|
||||
|
||||
doc = scrub(doc, kite_status_schema)
|
||||
|
||||
for field in ['cpu_samples_list', 'active_cpu_samples_list']:
|
||||
if not doc['properties'].get(field):
|
||||
continue
|
||||
p = field.split('_')[:-2]
|
||||
new_field = '_'.join(['max'] + p)
|
||||
doc['properties'][new_field] = max(map(float, doc['properties'][field]))
|
||||
|
||||
# We got some bogus timestamps, TODO: validate and cleanup data
|
||||
for field in ['license_expire', 'plan_end']:
|
||||
if isinstance(doc['properties'].get(field), int):
|
||||
if 0 < doc['properties'][field] < 2524636800:
|
||||
doc['properties'][field] = datetime.datetime.fromtimestamp(doc['properties'][field])
|
||||
else:
|
||||
del doc['properties'][field]
|
||||
|
||||
# Next block is for backwards compatibility only
|
||||
# can be removed once the content of the PR https://github.com/kiteco/kiteco/pull/10638/ has been released to
|
||||
# most of our users
|
||||
for field in ['cpu_samples', 'active_cpu_samples']:
|
||||
if field in doc['properties']:
|
||||
samples_str = doc['properties'].pop(field)
|
||||
if len(samples_str) == 0:
|
||||
continue
|
||||
p = field.split('_')[:-1]
|
||||
new_field = '_'.join(['max'] + p)
|
||||
doc['properties'][new_field] = max(map(float, samples_str.split(',')))
|
||||
|
||||
deployment_id = doc['properties'].get('server_deployment_id')
|
||||
if deployment_id and deployment_id in deployments:
|
||||
doc['properties']['server_deployment_name'] = deployments[deployment_id]
|
||||
|
||||
doc['payload_size'] = len(doc)
|
||||
total_time += (time.perf_counter() - start_time)
|
||||
yield {'_index': index_name, '_id': doc['messageId'], '_source': doc}
|
||||
|
||||
|
||||
kite_status_dag = DAG(
|
||||
'elastic_load_kite_status',
|
||||
description='Load kite_status to Kibana.',
|
||||
default_args={
|
||||
'retries': 1,
|
||||
'retry_delay': datetime.timedelta(minutes=5),
|
||||
'start_date': datetime.datetime(2020, 10, 15),
|
||||
'on_failure_callback': task_fail_slack_alert,
|
||||
},
|
||||
schedule_interval='*/10 * * * *',
|
||||
jinja_environment_kwargs={
|
||||
'loader': PackageLoader('kite_airflow', 'templates')
|
||||
},
|
||||
)
|
||||
|
||||
client_events_dag = DAG(
|
||||
'elastic_load_client_events',
|
||||
description='Load client_events to Kibana.',
|
||||
default_args={
|
||||
'retries': 1,
|
||||
'retry_delay': datetime.timedelta(minutes=5),
|
||||
'start_date': datetime.datetime(2020, 10, 15),
|
||||
'on_failure_callback': task_fail_slack_alert,
|
||||
},
|
||||
schedule_interval='*/10 * * * *',
|
||||
jinja_environment_kwargs={
|
||||
'loader': PackageLoader('kite_airflow', 'templates')
|
||||
},
|
||||
)
|
||||
|
||||
convert_fns = {'kite_status': kite_status_convert_fn, 'client_events': client_event_convert_fn}
|
||||
|
||||
|
||||
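# Streams the given S3 keys through the per-dataset convert function and bulk
# indexes the resulting actions into the Elastic Cloud cluster, sharding index
# names by the granularity derived from each key's date path.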
def bulk_index_metrics(bucket, s3_keys, granularity, key, deployments):
|
||||
s3_hook = S3Hook('aws_us_east_1')
|
||||
es = Elasticsearch(
|
||||
cloud_id="metrics:XXXXXXX",
|
||||
http_auth=("elastic", Variable.get('elastic_password')),
|
||||
)
|
||||
|
||||
def iter():
|
||||
for s3_key in s3_keys:
|
||||
dt = datetime.date(*map(int, s3_key.split('/')[2:5]))
|
||||
index_date_suffix = get_index_shard(dt, granularity)
|
||||
|
||||
for rec in convert_fns[key](iter_s3_file(s3_hook, bucket, s3_key), index_date_suffix, deployments):
|
||||
yield rec
|
||||
|
||||
bulk(es, iter())
|
||||
|
||||
|
||||
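# ShortCircuitOperator callable: diffs the files listed for this run against
# files already seen in prior runs (pulled from this task's earlier XComs),
# pushes the new ones as "curr_files", and skips downstream tasks when there
# is nothing new to load.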
def skip_no_new_files(ti, **kwargs):
|
||||
prev_files = set(itertools.chain(*[result.value for result in XCom.get_many(
|
||||
execution_date=ti.execution_date,
|
||||
dag_ids=ti.dag_id,
|
||||
task_ids=ti.task_id,
|
||||
include_prior_dates=True,
|
||||
limit=100
|
||||
)]))
|
||||
|
||||
all_files = set(ti.xcom_pull(task_ids='list_prev_json_files') + (ti.xcom_pull(task_ids='list_next_json_files') or []))
|
||||
curr_files = list(all_files - prev_files)
|
||||
ti.xcom_push(key='curr_files', value=curr_files)
|
||||
return len(curr_files) > 0
|
||||
|
||||
|
||||
for key, dag in [('kite_status', kite_status_dag), ('client_events', client_events_dag)]:
|
||||
list_ops = [
|
||||
S3ListOperator(
|
||||
aws_conn_id='aws_us_east_1',
|
||||
task_id='list_{}_json_files'.format(k),
|
||||
bucket='kite-metrics',
|
||||
prefix="firehose/{}/{{{{ (execution_date + macros.timedelta(hours={})).format('%Y/%m/%d/%H') }}}}/".format(key, diff),
|
||||
delimiter='/',
|
||||
dag=dag,
|
||||
) for k, diff in [('prev', 0), ('next', 1)]
|
||||
]
|
||||
|
||||
def load_fn(ti, params, **kwargs):
|
||||
s3_keys = ti.xcom_pull(task_ids=skip_no_new_files.__name__, key='curr_files')
|
||||
logger.info("Loading files {}".format(', '.join(s3_keys)))
|
||||
|
||||
deployments_data = ti.xcom_pull(task_ids='copy_server_deployments')['values']
|
||||
id_col = deployments_data[1].index('Deployment ID')
|
||||
name_col = deployments_data[1].index('Name')
|
||||
|
||||
deployments = {d[id_col]: d[name_col] for d in deployments_data[2:] if len(d) > max(id_col, name_col) and d[name_col].strip()}
|
||||
bulk_index_metrics(BUCKET, s3_keys, INDEX_GRANULARITY, params['key'], deployments)
|
||||
return s3_keys
|
||||
|
||||
list_ops >> ShortCircuitOperator(
|
||||
task_id=skip_no_new_files.__name__,
|
||||
python_callable=skip_no_new_files,
|
||||
dag=dag,
|
||||
provide_context=True,
|
||||
depends_on_past=True,
|
||||
) >> GoogleSheetsRangeOperator(
|
||||
gcp_conn_id='google_cloud_kite_dev',
|
||||
spreadsheet_id='1-XXXXXXX',
|
||||
range='A:D',
|
||||
task_id='copy_server_deployments',
|
||||
dag=dag,
|
||||
provide_context=True,
|
||||
) >> PythonOperator(
|
||||
python_callable=load_fn,
|
||||
task_id='load_{}'.format(key),
|
||||
dag=dag,
|
||||
provide_context=True,
|
||||
params={'key': key}
|
||||
)
|
366
airflow/kite_airflow/dags/kite_status_1d.py
Normal file
@ -0,0 +1,366 @@
|
||||
from datetime import timedelta
|
||||
import base64
|
||||
import hashlib
|
||||
import mixpanel
|
||||
import gzip
|
||||
import json
|
||||
import customerio
|
||||
from airflow.contrib.operators.s3_list_operator import S3ListOperator
|
||||
# The DAG object; we'll need this to instantiate a DAG
|
||||
from airflow import DAG
|
||||
# Operators; we need this to operate!
|
||||
from airflow.contrib.operators.aws_athena_operator import AWSAthenaOperator
|
||||
from airflow.models import Variable
|
||||
from airflow.hooks.S3_hook import S3Hook
|
||||
from airflow.operators.python_operator import PythonOperator
|
||||
from airflow.operators.python_operator import ShortCircuitOperator
|
||||
import logging
|
||||
import datetime
|
||||
from jinja2 import PackageLoader
|
||||
import kite_metrics
|
||||
from kite_airflow.slack_alerts import task_fail_slack_alert
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
MP_START_DATE = datetime.datetime(2020, 5, 29)
|
||||
|
||||
default_args = {
|
||||
'owner': 'airflow',
|
||||
'depends_on_past': False,
|
||||
'start_date': datetime.datetime(2020, 5, 24),
|
||||
'email_on_failure': False,
|
||||
'email_on_retry': False,
|
||||
'retries': 0,
|
||||
'retry_delay': timedelta(minutes=5),
|
||||
'on_failure_callback': task_fail_slack_alert,
|
||||
}
|
||||
|
||||
|
||||
DATA_LOC = 's3://kite-metrics/firehose/kite_status/'
|
||||
PROD_RESULT_LOC_PREFIX = 's3://kite-metrics/athena-results'
|
||||
|
||||
dag = DAG(
|
||||
'kite_status_1d',
|
||||
default_args=default_args,
|
||||
description='Daily kite_status rollups exported to Mixpanel, Customer.io and Elasticsearch.',
|
||||
schedule_interval='10 0 * * *',
|
||||
jinja_environment_kwargs={
|
||||
'loader': PackageLoader('kite_airflow', 'templates')
|
||||
},
|
||||
)
|
||||
|
||||
kite_status_config = kite_metrics.load_context('kite_status')
|
||||
kite_status_schema = kite_metrics.load_schema('kite_status')
|
||||
|
||||
schema_reload_ops = []
|
||||
|
||||
for table_name in ['kite_status', 'kite_status_segment', 'kite_status_normalized']:
|
||||
schema_reload_ops.append(AWSAthenaOperator(
|
||||
aws_conn_id='aws_us_east_1',
|
||||
task_id='drop_{}'.format(table_name),
|
||||
query='DROP TABLE {{params.table_name}}',
|
||||
output_location='s3://kite-metrics-test/athena-results/ddl',
|
||||
database='kite_metrics',
|
||||
dag=dag,
|
||||
params={'table_name': table_name},
|
||||
) >> AWSAthenaOperator(
|
||||
aws_conn_id='aws_us_east_1',
|
||||
task_id='create_{}'.format(table_name),
|
||||
query='athena/tables/{}.tmpl.sql'.format(table_name),
|
||||
output_location='s3://kite-metrics-test/athena-results/ddl',
|
||||
database='kite_metrics',
|
||||
dag=dag,
|
||||
params={'schema': kite_status_schema, 'table_name': table_name}
|
||||
))
|
||||
|
||||
insert_kite_status_normalized = AWSAthenaOperator(
|
||||
aws_conn_id='aws_us_east_1',
|
||||
task_id='insert_kite_status_normalized',
|
||||
query='athena/queries/kite_status_normalized.tmpl.sql',
|
||||
output_location='s3://kite-metrics-test/athena-results/ddl',
|
||||
database='kite_metrics',
|
||||
dag=dag,
|
||||
params={'schema': kite_status_schema}
|
||||
)
|
||||
|
||||
cleanup_kite_status_normalized_table = AWSAthenaOperator(
|
||||
aws_conn_id='aws_us_east_1',
|
||||
task_id='cleanup_kite_status_normalized_table',
|
||||
query='DROP TABLE kite_status_normalized_{{ds_nodash}}',
|
||||
output_location='s3://kite-metrics-test/athena-results/ddl',
|
||||
database='kite_metrics',
|
||||
dag=dag,
|
||||
)
|
||||
|
||||
schema_reload_ops >> insert_kite_status_normalized >> cleanup_kite_status_normalized_table
|
||||
|
||||
|
||||
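# Generator over gzipped JSON-lines objects in S3: yields one dict per line,
# with None-valued keys recursively removed from nested dicts.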
def read_s3_json_files(bucket, file_list):
|
||||
s3 = S3Hook('aws_us_east_1')
|
||||
|
||||
for file in sorted(file_list):
|
||||
obj = s3.get_key(file, bucket)
|
||||
for line in gzip.open(obj.get()['Body']):
|
||||
rec = json.loads(line)
|
||||
to_clean = [rec]
|
||||
while to_clean:
|
||||
this = to_clean.pop()
|
||||
for k in list(this.keys()):
|
||||
v = this[k]
|
||||
if isinstance(v, dict):
|
||||
to_clean.append(v)
|
||||
continue
|
||||
if v is None:
|
||||
del this[k]
|
||||
yield rec
|
||||
|
||||
|
||||
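# Bulk indexes the day's kite_status rollups into Elasticsearch, keyed by an
# md5 of (userid, day) so reruns overwrite rather than duplicate documents,
# skipping records with no language activity.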
def load_athena_to_elastic(task_instance, execution_date, **context):
|
||||
from elasticsearch import Elasticsearch
|
||||
from elasticsearch.helpers import bulk
|
||||
es = Elasticsearch(
|
||||
cloud_id="metrics:XXXXXXX",
|
||||
http_auth=("elastic", Variable.get('elastic_password')),
|
||||
)
|
||||
|
||||
def iter():
|
||||
iter_records = read_s3_json_files('kite-metrics', task_instance.xcom_pull(task_ids='list_mixpanel_json_files'))
|
||||
for i, rec in enumerate(iter_records):
|
||||
try:
|
||||
if sum(rec.get('{}_events'.format(lang), 0) for lang in kite_status_config['languages']) == 0:
|
||||
continue
|
||||
|
||||
if rec['event'] != 'kite_status':
|
||||
continue
|
||||
|
||||
ts = datetime.datetime.fromtimestamp(rec['end_time'])
|
||||
|
||||
rec_id_str = '{}::{}'.format(rec.get('userid', ''), ts.strftime('%Y/%m/%d'))
|
||||
rec_id = hashlib.md5(rec_id_str.encode('utf8')).hexdigest()
|
||||
rec['timestamp'] = ts
|
||||
yield {'_index': 'kite_status_1d_{}'.format(execution_date.format('%Y%m')), '_id': rec_id, '_source': rec}
|
||||
except Exception:
|
||||
logger.exception("Error processing line {}, content={}".format(i, rec))
|
||||
raise
|
||||
|
||||
bulk(es, iter())
|
||||
|
||||
|
||||
event_names = {
|
||||
'anon_supported_file_edited': 'anon_supported_file_edited_1d',
|
||||
'anon_kite_status': 'anon_kite_status_1d',
|
||||
'kite_status': 'kite_status_1d',
|
||||
}
|
||||
|
||||
|
||||
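# Replays the day's kite_status rollups into Mixpanel, using track() for
# recent runs and import_data() for backfills older than four days, and
# checkpointing progress to XCom every 10,000 records so retries can resume.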
def load_athena_to_mixpanel(task_instance, execution_date, dag_run, storage_task_name, **context):
|
||||
mp_consumer = mixpanel.BufferedConsumer(max_size=100)
|
||||
mp_client = mixpanel.Mixpanel(Variable.get('mixpanel_credentials', deserialize_json=True)['token'], consumer=mp_consumer)
|
||||
start_row = task_instance.xcom_pull(task_ids=storage_task_name, key='progress')
|
||||
|
||||
iter_records = read_s3_json_files('kite-metrics', task_instance.xcom_pull(task_ids='list_mixpanel_json_files'))
|
||||
for i, rec in enumerate(iter_records):
|
||||
if i <= start_row:
|
||||
continue
|
||||
try:
|
||||
insert_id = str(base64.b64encode(
|
||||
hashlib.md5('{}::{}'.format(
|
||||
rec['userid'],
|
||||
execution_date.strftime('%Y/%m/%d')).encode('utf8')
|
||||
).digest())[:16])
|
||||
rec.update({
|
||||
'time': rec['end_time'],
|
||||
'_group': 'firehose/kite_status/{}/'.format(execution_date.strftime('%Y/%m/%d')),
|
||||
'_version': '1.0.0',
|
||||
'$insert_id': insert_id,
|
||||
})
|
||||
user_id = rec['userid']
|
||||
name = event_names.get(rec['event'])
|
||||
if name is None:
|
||||
continue
|
||||
|
||||
if datetime.datetime.today() - execution_date < datetime.timedelta(days=4):
|
||||
mp_client.track(user_id, name, rec)
|
||||
else:
|
||||
ts = rec.pop('time')
|
||||
mp_client.import_data(Variable.get('mixpanel_credentials', deserialize_json=True)['api_key'], user_id, name, ts, rec)
|
||||
if i > 0 and i % 10000 == 0:
|
||||
logger.info("Processed line {}".format(i))
|
||||
dag_run.get_task_instance(storage_task_name).xcom_push(key='progress', value=i)
|
||||
except Exception:
|
||||
dag_run.get_task_instance(storage_task_name).xcom_push(key='progress', value=i-100)
|
||||
logger.exception("Error processing line {}, content={}".format(i, rec))
|
||||
raise
|
||||
mp_consumer.flush()
|
||||
|
||||
|
||||
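# Backfills the day's kite_status rollups into Customer.io with a 20-worker
# thread pool, resuming from the last checkpointed row and pushing progress
# back to XCom as batches complete.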
def load_athena_to_cio(task_instance, execution_date, dag_run, storage_task_name, **context):
|
||||
import concurrent.futures
|
||||
cio_creds = Variable.get('cio_credentials', deserialize_json=True)
|
||||
start_row = task_instance.xcom_pull(task_ids=storage_task_name, key='progress')
|
||||
iter_records = read_s3_json_files('kite-metrics', task_instance.xcom_pull(task_ids='list_cio_json_files'))
|
||||
|
||||
def iter():
|
||||
for i, rec in enumerate(iter_records):
|
||||
|
||||
if i <= start_row:
|
||||
continue
|
||||
|
||||
if rec['event'] != 'kite_status':
|
||||
continue
|
||||
|
||||
rec.update({
|
||||
'time': rec['end_time'],
|
||||
'_group': 'firehose/kite_status/{}/'.format(execution_date.strftime('%Y/%m/%d')),
|
||||
'_version': '1.0.0',
|
||||
})
|
||||
user_id = rec['userid']
|
||||
|
||||
if not user_id or not all(ord(c) < 128 for c in user_id):
|
||||
continue
|
||||
|
||||
name = event_names.get(rec['event'])
|
||||
if name is None:
|
||||
continue
|
||||
|
||||
yield i, (user_id, name, rec['time']), rec
|
||||
|
||||
def call_cio(item):
|
||||
i, args, kwargs = item
|
||||
customerio.CustomerIO(cio_creds['site_id'], cio_creds['api_key']).backfill(*args, **kwargs)
|
||||
return i
|
||||
|
||||
max_i = 0
|
||||
with concurrent.futures.ThreadPoolExecutor(max_workers=20) as executor:
|
||||
try:
|
||||
for i in executor.map(call_cio, iter()):
|
||||
if max_i > 0 and (i // 1000) > (max_i // 1000):
|
||||
logger.info("Processed line {}".format(i))
|
||||
dag_run.get_task_instance(storage_task_name).xcom_push(key='progress', value=max(max_i, i))
|
||||
max_i = max(max_i, i)
|
||||
except Exception:
|
||||
dag_run.get_task_instance(storage_task_name).xcom_push(key='progress', value=max_i)
|
||||
raise
|
||||
|
||||
|
||||
for key, group_by, downstreams in [
|
||||
('mixpanel', 'regexp_replace(kite_metrics.kite_status_normalized.userId, \'\p{Cntrl}\')', [(False, load_athena_to_elastic), (True, load_athena_to_mixpanel)]),
|
||||
('cio', 'regexp_replace(coalesce(kite_metrics.kite_status_normalized.properties__forgetful_metrics_id, kite_metrics.kite_status_normalized.userId), \'\p{Cntrl}\')', [(True, load_athena_to_cio)])
|
||||
]:
|
||||
operator = insert_kite_status_normalized >> AWSAthenaOperator(
|
||||
aws_conn_id='aws_us_east_1',
|
||||
task_id='insert_kite_status_1d_{}'.format(key),
|
||||
query='athena/queries/kite_status_1d.tmpl.sql',
|
||||
output_location='s3://kite-metrics-test/athena-results/ddl',
|
||||
database='kite_metrics',
|
||||
params={
|
||||
'key': key,
|
||||
'group_by': group_by,
|
||||
'languages': kite_status_config['languages'],
|
||||
'editors': kite_status_config['editors'],
|
||||
'lexical_providers': kite_status_config['lexical_providers'],
|
||||
'python_providers': kite_status_config['python_providers']
|
||||
},
|
||||
dag=dag,
|
||||
) >> AWSAthenaOperator(
|
||||
aws_conn_id='aws_us_east_1',
|
||||
task_id='generate_{}_json'.format(key),
|
||||
query='athena/queries/kite_status_1d_json.tmpl.sql',
|
||||
output_location='s3://kite-metrics-test/athena-results/ddl',
|
||||
database='kite_metrics',
|
||||
params={'key': key, 'languages': kite_status_config['languages']},
|
||||
dag=dag,
|
||||
)
|
||||
operator >> AWSAthenaOperator(
|
||||
aws_conn_id='aws_us_east_1',
|
||||
task_id='cleanup_{}_table_json'.format(key),
|
||||
query='DROP TABLE kite_status_1d_{{params.key}}_{{ds_nodash}}_json',
|
||||
output_location='s3://kite-metrics-test/athena-results/ddl',
|
||||
database='kite_metrics',
|
||||
params={'key': key},
|
||||
dag=dag,
|
||||
)
|
||||
operator >> AWSAthenaOperator(
|
||||
aws_conn_id='aws_us_east_1',
|
||||
task_id='cleanup_{}_table'.format(key),
|
||||
query='DROP TABLE kite_status_1d_{{params.key}}_{{ds_nodash}}',
|
||||
output_location='s3://kite-metrics-test/athena-results/ddl',
|
||||
database='kite_metrics',
|
||||
params={'key': key},
|
||||
dag=dag,
|
||||
)
|
||||
operator = operator >> S3ListOperator(
|
||||
aws_conn_id='aws_us_east_1',
|
||||
task_id='list_{}_json_files'.format(key),
|
||||
bucket='kite-metrics',
|
||||
prefix='athena/kite_status_1d_{{params.key}}/json/{{ds}}/',
|
||||
delimiter='/',
|
||||
params={'key': key},
|
||||
dag=dag,
|
||||
)
|
||||
|
||||
def skip_older(execution_date, **ctx):
|
||||
return execution_date >= MP_START_DATE or (datetime.datetime(2020, 5, 19) < execution_date < datetime.datetime(2020, 5, 26))
|
||||
|
||||
skip_older_operator = ShortCircuitOperator(
|
||||
task_id='skip_older_{}'.format(key),
|
||||
python_callable=skip_older,
|
||||
dag=dag,
|
||||
provide_context=True
|
||||
)
|
||||
|
||||
for skip_older, downstream in downstreams:
|
||||
progress_operator = PythonOperator(
|
||||
python_callable=lambda ti, **kwargs: ti.xcom_push(key='progress', value=0),
|
||||
task_id='progress_storage_{}'.format(downstream.__name__),
|
||||
dag=dag,
|
||||
provide_context=True,
|
||||
)
|
||||
ds_operator = PythonOperator(
|
||||
python_callable=downstream,
|
||||
task_id=downstream.__name__,
|
||||
dag=dag,
|
||||
retries=4,
|
||||
provide_context=True,
|
||||
op_kwargs={'storage_task_name': 'progress_storage_{}'.format(downstream.__name__)}
|
||||
)
|
||||
if skip_older:
|
||||
operator >> skip_older_operator >> progress_operator >> ds_operator
|
||||
else:
|
||||
operator >> progress_operator >> ds_operator
|
||||
|
||||
insert_kite_status_normalized >> AWSAthenaOperator(
|
||||
aws_conn_id='aws_us_east_1',
|
||||
task_id='update_activations_table',
|
||||
query='athena/queries/insert_activations.tmpl.sql',
|
||||
output_location='s3://kite-metrics-test/athena-results/ddl',
|
||||
database='kite_metrics',
|
||||
depends_on_past=True,
|
||||
dag=dag,
|
||||
)
|
||||
|
||||
update_schema_dag = DAG(
|
||||
'update_kite_status_schema',
|
||||
default_args=default_args,
|
||||
description='Update the kite_status and kite_status_normalized schemas.',
|
||||
schedule_interval=None,
|
||||
)
|
||||
|
||||
for table_name in ['kite_status', 'kite_status_segment', 'kite_status_normalized']:
|
||||
AWSAthenaOperator(
|
||||
aws_conn_id='aws_us_east_1',
|
||||
task_id='drop_{}'.format(table_name),
|
||||
query='DROP TABLE {{params.table_name}}',
|
||||
output_location='s3://kite-metrics-test/athena-results/ddl',
|
||||
database='kite_metrics',
|
||||
dag=update_schema_dag,
|
||||
params={'table_name': table_name},
|
||||
) >> AWSAthenaOperator(
|
||||
aws_conn_id='aws_us_east_1',
|
||||
task_id='create_{}'.format(table_name),
|
||||
query='athena/tables/{}.tmpl.sql'.format(table_name),
|
||||
output_location='s3://kite-metrics-test/athena-results/ddl',
|
||||
database='kite_metrics',
|
||||
dag=update_schema_dag,
|
||||
params={'schema': kite_status_schema, 'table_name': table_name}
|
||||
)
|
53
airflow/kite_airflow/dags/kite_status_segment.py
Normal file
@ -0,0 +1,53 @@
|
||||
import datetime
|
||||
import logging
|
||||
|
||||
from airflow import DAG
|
||||
from airflow.contrib.operators.aws_athena_operator import AWSAthenaOperator
|
||||
from jinja2 import PackageLoader
|
||||
import kite_metrics
|
||||
from kite_airflow.slack_alerts import task_fail_slack_alert
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
default_args = {
|
||||
'owner': 'airflow',
|
||||
'depends_on_past': False,
|
||||
'start_date': datetime.datetime(2017, 4, 27),
|
||||
'end_date': datetime.datetime(2020, 2, 23),
|
||||
'email_on_failure': False,
|
||||
'email_on_retry': False,
|
||||
'retries': 0,
|
||||
'retry_delay': datetime.timedelta(minutes=5),
|
||||
'on_failure_callback': task_fail_slack_alert,
|
||||
}
|
||||
|
||||
dag = DAG(
|
||||
'kite_status_segment',
|
||||
default_args=default_args,
|
||||
description='Load Segment data into kite_status_normalized',
|
||||
schedule_interval='10 0 * * *',
|
||||
jinja_environment_kwargs={
|
||||
'loader': PackageLoader('kite_airflow', 'templates')
|
||||
},
|
||||
)
|
||||
|
||||
kite_status_schema = kite_metrics.load_schema('kite_status')
|
||||
|
||||
AWSAthenaOperator(
|
||||
aws_conn_id='aws_us_east_1',
|
||||
task_id='insert_kite_status_normalized',
|
||||
query='athena/queries/kite_status_normalized_segment.tmpl.sql',
|
||||
output_location='s3://kite-metrics-test/athena-results/ddl',
|
||||
database='kite_metrics',
|
||||
dag=dag,
|
||||
params={'schema': kite_status_schema}
|
||||
) >> AWSAthenaOperator(
|
||||
aws_conn_id='aws_us_east_1',
|
||||
task_id='cleanup_kite_status_normalized_table',
|
||||
query='DROP TABLE kite_status_normalized_{{ds_nodash}}',
|
||||
output_location='s3://kite-metrics-test/athena-results/ddl',
|
||||
database='kite_metrics',
|
||||
dag=dag,
|
||||
)
|
204
airflow/kite_airflow/dags/maxmind.py
Normal file
@ -0,0 +1,204 @@
|
||||
from airflow import DAG
|
||||
import ipaddress
|
||||
import datetime
|
||||
import requests
|
||||
import io
|
||||
import os
|
||||
import csv
|
||||
import zipfile
|
||||
from airflow.models import Variable
|
||||
from airflow.hooks.S3_hook import S3Hook
|
||||
from airflow.operators.python_operator import PythonOperator
|
||||
from airflow.contrib.operators.aws_athena_operator import AWSAthenaOperator
|
||||
from kite_airflow.s3_utils import S3DeletePrefixOperator
|
||||
from jinja2 import PackageLoader
|
||||
from kite_airflow.slack_alerts import task_fail_slack_alert
|
||||
|
||||
|
||||
default_args = {
|
||||
'owner': 'airflow',
|
||||
'depends_on_past': False,
|
||||
'start_date': datetime.datetime(2020, 6, 12),
|
||||
'email_on_failure': False,
|
||||
'email_on_retry': False,
|
||||
'retries': 0,
|
||||
'retry_delay': datetime.timedelta(minutes=5),
|
||||
'on_failure_callback': task_fail_slack_alert,
|
||||
}
|
||||
|
||||
dag = DAG(
|
||||
'maxmind_geolite2',
|
||||
description='Load the Maxmind Geolite2 database.',
|
||||
default_args=default_args,
|
||||
schedule_interval='0 0 * * 0',
|
||||
jinja_environment_kwargs={
|
||||
'loader': PackageLoader('kite_airflow', 'templates')
|
||||
},
|
||||
)
|
||||
|
||||
maxmind_url = 'https://download.maxmind.com/app/geoip_download?edition_id=GeoLite2-{}-CSV&license_key={}&suffix=zip'
|
||||
maxmind_files = [
|
||||
'GeoLite2-Country-Blocks-IPv4',
|
||||
'GeoLite2-Country-Blocks-IPv6',
|
||||
'GeoLite2-Country-Locations-en',
|
||||
]
|
||||
bucket_name = 'kite-metrics'
|
||||
key_prefix_template = 'enrichment/maxmind/{prefix}/{dataset}/{ds}/{filename}/'
|
||||
key_template = key_prefix_template + '{filename}.csv'
|
||||
|
||||
|
||||
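# Downloads the MaxMind GeoLite2 city and country CSV bundles, uploads the
# raw CSVs to S3, and also writes an "expanded" copy of the IPv4 blocks file
# with the network column pre-split into integer address and netmask columns
# for Athena joins.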
def maxmind_operator_fn(ds, **context):
|
||||
for dataset in ['city', 'country']:
|
||||
mm_resp = requests.get(maxmind_url.format(dataset.title(), Variable.get('maxmind_license_key')))
|
||||
mm_zipfile = zipfile.ZipFile(io.BytesIO(mm_resp.content))
|
||||
s3 = S3Hook('aws_us_east_1')
|
||||
|
||||
for path in mm_zipfile.namelist():
|
||||
if not path.endswith('.csv'):
|
||||
continue
|
||||
mm_file = mm_zipfile.open(path)
|
||||
filename = os.path.splitext(os.path.basename(path))[0]
|
||||
s3.load_file_obj(mm_file, key_template.format(prefix='raw', dataset=dataset, ds=ds, filename=filename), bucket_name=bucket_name, replace=True)
|
||||
|
||||
ipv4_path = [p for p in mm_zipfile.namelist() if p.endswith('GeoLite2-{}-Blocks-IPv4.csv'.format(dataset.title()))][0]
|
||||
ipv4_file = io.TextIOWrapper(mm_zipfile.open(ipv4_path, 'r'))
|
||||
ipv4_reader = csv.DictReader(ipv4_file)
|
||||
ipv4_output = io.StringIO()
|
||||
ipv4_writer = csv.DictWriter(ipv4_output, ipv4_reader.fieldnames + ['address', 'mask'])
ipv4_writer.writeheader()  # the expanded Athena table skips the first line, so emit a header row
|
||||
for rec in ipv4_reader:
|
||||
net = ipaddress.IPv4Network(rec['network'])
|
||||
rec['address'] = int(net.network_address)
|
||||
rec['mask'] = int(net.netmask)
|
||||
ipv4_writer.writerow(rec)
|
||||
key = key_template.format(prefix='expanded', dataset=dataset, ds=ds, filename='GeoLite2-{}-Blocks-IPv4'.format(dataset))
|
||||
s3.load_string(ipv4_output.getvalue(), key, bucket_name=bucket_name, replace=True)
|
||||
|
||||
|
||||
maxmind_operator = PythonOperator(
|
||||
python_callable=maxmind_operator_fn,
|
||||
task_id='load_maxmind_to_s3',
|
||||
dag=dag,
|
||||
provide_context=True,
|
||||
)
|
||||
|
||||
for dataset in ['city']:
|
||||
maxmind_operator >> AWSAthenaOperator(
|
||||
aws_conn_id='aws_us_east_1',
|
||||
task_id='create_{}_names_table'.format(dataset),
|
||||
query='''CREATE EXTERNAL TABLE kite_metrics.maxmind_{{params.dataset}}_names_{{ds_nodash}} (
|
||||
geoname_id string,
|
||||
locale_code string,
|
||||
continent_code string,
|
||||
continent_name string,
|
||||
country_iso_code string,
|
||||
country_name string,
|
||||
subdivision_1_iso_code string,
|
||||
subdivision_1_name string,
|
||||
subdivision_2_iso_code string,
|
||||
subdivision_2_name string,
|
||||
city_name string,
|
||||
metro_code string,
|
||||
time_zone string,
|
||||
is_in_european_union string)
|
||||
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
|
||||
LOCATION 's3://{{params.bucket}}/{{params.key_prefix_template.format(ds=ds, dataset=params.dataset, prefix='raw', filename=params.filename)}}'
|
||||
TBLPROPERTIES ('skip.header.line.count'='1')
|
||||
''',
|
||||
output_location='s3://kite-metrics-test/athena-results/ddl',
|
||||
database='kite_metrics',
|
||||
dag=dag,
|
||||
params={
|
||||
'bucket': bucket_name,
|
||||
'key_prefix_template': key_prefix_template,
|
||||
'filename': 'GeoLite2-{}-Locations-en'.format(dataset.title()),
|
||||
'dataset': dataset},
|
||||
) >> AWSAthenaOperator(
|
||||
aws_conn_id='aws_us_east_1',
|
||||
task_id='create_ipv4_{}_table'.format(dataset),
|
||||
query='''CREATE EXTERNAL TABLE kite_metrics.maxmind_ipv4_{{params.dataset}}_{{ds_nodash}} (
|
||||
network string,
|
||||
geoname_id string,
|
||||
registered_country_geoname_id string,
|
||||
represented_country_geoname_id string,
|
||||
is_anonymous_proxy string,
|
||||
is_satellite_provider string,
|
||||
postal_code string,
|
||||
latitude string,
|
||||
longitude string,
|
||||
accuracy_radius string,
|
||||
address bigint,
|
||||
mask bigint)
|
||||
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
|
||||
LOCATION 's3://{{params.bucket}}/{{params.key_prefix_template.format(ds=ds, dataset=params.dataset, prefix='expanded', filename=params.filename)}}'
|
||||
TBLPROPERTIES ('skip.header.line.count'='1')
|
||||
''',
|
||||
output_location='s3://kite-metrics-test/athena-results/ddl',
|
||||
database='kite_metrics',
|
||||
dag=dag,
|
||||
params={
|
||||
'bucket': bucket_name,
|
||||
'key_prefix_template': key_prefix_template,
|
||||
'filename': 'GeoLite2-{}-Blocks-IPv4'.format(dataset.title()),
|
||||
'dataset': dataset
|
||||
},
|
||||
) >> AWSAthenaOperator(
|
||||
aws_conn_id='aws_us_east_1',
|
||||
task_id='drop_ipv4_{}_table'.format(dataset),
|
||||
query='''DROP TABLE kite_metrics.maxmind_{{params.dataset}}_ipv4''',
|
||||
output_location='s3://kite-metrics-test/athena-results/ddl',
|
||||
database='kite_metrics',
|
||||
dag=dag,
|
||||
params={
|
||||
'dataset': dataset
|
||||
},
|
||||
) >> S3DeletePrefixOperator(
|
||||
aws_conn_id='aws_us_east_1',
|
||||
task_id='prepare_ipv4_{}_join_destination'.format(dataset),
|
||||
bucket='kite-metrics',
|
||||
keys='enrichment/maxmind/join/{{params.dataset}}/ipv4/',
|
||||
params={'dataset': dataset},
|
||||
dag=dag,
|
||||
) >> AWSAthenaOperator(
|
||||
aws_conn_id='aws_us_east_1',
|
||||
task_id='create_ipv4_{}_join_table'.format(dataset),
|
||||
query='''CREATE TABLE kite_metrics.maxmind_{{params.dataset}}_ipv4
|
||||
WITH (format='PARQUET',
|
||||
parquet_compression='SNAPPY',
|
||||
external_location = 's3://{{params.bucket}}/enrichment/maxmind/join/{{params.dataset}}/ipv4/')
|
||||
AS SELECT
|
||||
kite_metrics.maxmind_city_names_{{ds_nodash}}.country_iso_code country_iso_code,
|
||||
kite_metrics.maxmind_city_names_{{ds_nodash}}.country_name country_name,
|
||||
kite_metrics.maxmind_city_names_{{ds_nodash}}.subdivision_1_name subdivision_1_name,
|
||||
kite_metrics.maxmind_city_names_{{ds_nodash}}.city_name city_name,
|
||||
kite_metrics.maxmind_city_names_{{ds_nodash}}.time_zone time_zone,
|
||||
kite_metrics.maxmind_ipv4_city_{{ds_nodash}}.address address,
|
||||
kite_metrics.maxmind_ipv4_city_{{ds_nodash}}.mask mask
|
||||
FROM kite_metrics.maxmind_ipv4_city_{{ds_nodash}}
|
||||
JOIN kite_metrics.maxmind_city_names_{{ds_nodash}}
|
||||
ON kite_metrics.maxmind_ipv4_city_{{ds_nodash}}.geoname_id = kite_metrics.maxmind_city_names_{{ds_nodash}}.geoname_id
|
||||
''',
|
||||
output_location='s3://kite-metrics-test/athena-results/ddl',
|
||||
database='kite_metrics',
|
||||
dag=dag,
|
||||
params={'bucket': bucket_name, 'dataset': dataset},
|
||||
) >> AWSAthenaOperator(
|
||||
aws_conn_id='aws_us_east_1',
|
||||
task_id='cleanup_ipv4_{}_table'.format(dataset),
|
||||
query='''DROP TABLE kite_metrics.maxmind_ipv4_{{params.dataset}}_{{ds_nodash}}''',
|
||||
output_location='s3://kite-metrics-test/athena-results/ddl',
|
||||
database='kite_metrics',
|
||||
dag=dag,
|
||||
params={
|
||||
'dataset': dataset
|
||||
},
|
||||
) >> AWSAthenaOperator(
|
||||
aws_conn_id='aws_us_east_1',
|
||||
task_id='cleanup_{}_names_table'.format(dataset),
|
||||
query='''DROP TABLE kite_metrics.maxmind_{{params.dataset}}_names_{{ds_nodash}}''',
|
||||
output_location='s3://kite-metrics-test/athena-results/ddl',
|
||||
database='kite_metrics',
|
||||
dag=dag,
|
||||
params={
|
||||
'dataset': dataset
|
||||
},
|
||||
)
|
168
airflow/kite_airflow/dags/mixpanel_ingest.py
Normal file
@ -0,0 +1,168 @@
|
||||
import datetime
|
||||
import io
|
||||
import gzip
|
||||
import json
|
||||
import time
|
||||
|
||||
from airflow import DAG
|
||||
from airflow.contrib.operators.aws_athena_operator import AWSAthenaOperator
|
||||
from airflow.hooks.S3_hook import S3Hook
|
||||
from airflow.operators.python_operator import PythonOperator
|
||||
from airflow.models import Variable
|
||||
import pytz
|
||||
import requests
|
||||
import yaml
|
||||
from jinja2 import PackageLoader
|
||||
import pkg_resources
|
||||
from kite_airflow.slack_alerts import task_fail_slack_alert
|
||||
|
||||
|
||||
default_args = {
|
||||
'owner': 'airflow',
|
||||
'depends_on_past': False,
|
||||
'start_date': datetime.datetime(2020, 1, 1),
|
||||
'email_on_failure': False,
|
||||
'email_on_retry': False,
|
||||
'retries': 1,
|
||||
'retry_delay': datetime.timedelta(minutes=5),
|
||||
'on_failure_callback': task_fail_slack_alert,
|
||||
}
|
||||
|
||||
dag = DAG(
|
||||
'mixpanel_ingest',
|
||||
default_args=default_args,
|
||||
description='Mixpanel data ingest DAG.',
|
||||
schedule_interval='10 4 * * *',
|
||||
max_active_runs=1,
|
||||
jinja_environment_kwargs={
|
||||
'loader': PackageLoader('kite_airflow', 'templates')
|
||||
},
|
||||
)
|
||||
|
||||
|
||||
pacific = pytz.timezone('America/Los_Angeles')
|
||||
people_schema = yaml.load(pkg_resources.resource_stream('kite_airflow', 'files/mixpanel_people.schema.yaml'), Loader=yaml.FullLoader)
|
||||
|
||||
|
||||
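# Pulls Mixpanel people-profile deltas via the JQL API in 4-hour windows since
# the last successful run, scrubs empty values and "$"-prefixed keys, converts
# the Pacific-time millisecond timestamps to UTC epoch seconds, and writes the
# result to S3 as gzipped JSON lines partitioned by execution date.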
def copy_profile_deltas(task_instance, execution_date, prev_execution_date_success, next_execution_date, **context):
|
||||
|
||||
ex_day = execution_date.replace(hour=0, minute=0, second=0, microsecond=0)
|
||||
if prev_execution_date_success:
|
||||
ex_day = prev_execution_date_success.replace(hour=0, minute=0, second=0, microsecond=0) + datetime.timedelta(days=1)
|
||||
|
||||
next_ex_day = next_execution_date.replace(hour=0, minute=0, second=0, microsecond=0)
|
||||
|
||||
chunks = [ex_day]
|
||||
while chunks[-1] < next_ex_day:
|
||||
chunks.append(chunks[-1] + datetime.timedelta(hours=4))
|
||||
|
||||
gz_file = io.BytesIO()
|
||||
|
||||
with gzip.GzipFile(fileobj=gz_file, mode="w") as f:
|
||||
start_date = chunks.pop(0)
|
||||
for chunk in chunks:
|
||||
filters = []
|
||||
for cmp, dt in [['>=', start_date], ['<', chunk]]:
|
||||
filters.append('user.time {} {}'.format(cmp, 1000 * int(time.mktime(dt.astimezone(pacific).timetuple()))))
|
||||
start_date = chunk
|
||||
print(filters)
|
||||
script = 'function main() {{ return People().filter(function(user) {{ return {}; }})}}'.format(' && '.join(filters))
|
||||
res = requests.post('https://mixpanel.com/api/2.0/jql',
|
||||
auth=(Variable.get('mixpanel_credentials', deserialize_json=True)['secret'], ''),
|
||||
data={'script': script})
|
||||
if res.status_code != 200:
|
||||
raise Exception(res.text)
|
||||
|
||||
for line in res.json():
|
||||
to_scrub = [line]
|
||||
while to_scrub:
|
||||
curr = to_scrub.pop(0)
|
||||
for key, value in list(curr.items()):
|
||||
if isinstance(value, (dict, list)) and len(value) == 0:
|
||||
del curr[key]
|
||||
if isinstance(value, dict):
|
||||
to_scrub.append(value)
|
||||
if key.startswith('$'):
|
||||
curr[key[1:]] = value
|
||||
del curr[key]
|
||||
|
||||
for ts_field in ['last_seen', 'time']:
|
||||
pacific_ts = datetime.datetime.fromtimestamp(line[ts_field] / 1000).replace(tzinfo=pacific)
|
||||
line[ts_field] = int(time.mktime(pacific_ts.astimezone(pytz.utc).timetuple()))
|
||||
|
||||
f.write(json.dumps(line).encode('utf8'))
|
||||
f.write(b'\n')
|
||||
|
||||
s3 = S3Hook('aws_us_east_1')
|
||||
key = 'mixpanel/people/raw/year={}/month={}/day={}/deltas.json.gz'.format(
|
||||
execution_date.year, execution_date.month, execution_date.day
|
||||
)
|
||||
s3.load_bytes(gz_file.getvalue(), key, 'kite-metrics')
|
||||
|
||||
|
||||
PythonOperator(
|
||||
python_callable=copy_profile_deltas,
|
||||
task_id=copy_profile_deltas.__name__,
|
||||
dag=dag,
|
||||
retries=2,
|
||||
provide_context=True,
|
||||
) >> AWSAthenaOperator(
|
||||
aws_conn_id='aws_us_east_1',
|
||||
task_id='rollup_people',
|
||||
query='athena/queries/mixpanel_people_rollup.tmpl.sql',
|
||||
output_location='s3://kite-metrics-test/athena-results/ddl',
|
||||
database='kite_metrics',
|
||||
dag=dag,
|
||||
params={'schema': people_schema},
|
||||
) >> AWSAthenaOperator(
|
||||
aws_conn_id='aws_us_east_1',
|
||||
task_id='cleanup_rollup_table',
|
||||
query="DROP TABLE mixpanel_people_rollup_{{ds_nodash}}",
|
||||
output_location='s3://kite-metrics-test/athena-results/ddl',
|
||||
database='kite_metrics',
|
||||
dag=dag,
|
||||
params={'schema': people_schema},
|
||||
) >> AWSAthenaOperator(
|
||||
aws_conn_id='aws_us_east_1',
|
||||
task_id='update_people_table_location',
|
||||
query="""ALTER TABLE mixpanel_people
|
||||
SET LOCATION 's3://kite-metrics/mixpanel/people/rollups/year={{execution_date.year}}/month={{execution_date.month}}/day={{execution_date.day}}/'""",
|
||||
output_location='s3://kite-metrics-test/athena-results/ddl',
|
||||
database='kite_metrics',
|
||||
dag=dag,
|
||||
params={'schema': people_schema},
|
||||
)
|
||||
|
||||
|
||||
ddl_dag = DAG(
|
||||
'mixpanel_ingest_schema_update',
|
||||
default_args=default_args,
|
||||
description='Mixpanel data schema definition.',
|
||||
schedule_interval=None,
|
||||
max_active_runs=1,
|
||||
)
|
||||
|
||||
for table_name, s3_prefix in {'mixpanel_people_raw': 'mixpanel/people/raw', 'mixpanel_people': 'mixpanel/people/rollups'}.items():
|
||||
AWSAthenaOperator(
|
||||
aws_conn_id='aws_us_east_1',
|
||||
task_id='drop_{}'.format(table_name),
|
||||
query='DROP TABLE {{params.table_name}}',
|
||||
output_location='s3://kite-metrics-test/athena-results/ddl',
|
||||
database='kite_metrics',
|
||||
dag=ddl_dag,
|
||||
params={'table_name': table_name},
|
||||
) >> AWSAthenaOperator(
|
||||
aws_conn_id='aws_us_east_1',
|
||||
task_id='create_{}'.format(table_name),
|
||||
query='athena/tables/mixpanel_people.tmpl.sql',
|
||||
output_location='s3://kite-metrics-test/athena-results/ddl',
|
||||
database='kite_metrics',
|
||||
dag=ddl_dag,
|
||||
params={
|
||||
'schema': people_schema,
|
||||
'table_name': table_name,
|
||||
's3_prefix': s3_prefix,
|
||||
'partitioned': table_name == 'mixpanel_people_raw',
|
||||
'json': table_name == 'mixpanel_people_raw',
|
||||
}
|
||||
)
|
105
airflow/kite_airflow/dags/mixpanel_ingest_events.py
Normal file
@ -0,0 +1,105 @@
|
||||
import datetime
|
||||
import io
|
||||
import gzip
|
||||
import json
|
||||
import time
|
||||
|
||||
from airflow import DAG
|
||||
from airflow.hooks.S3_hook import S3Hook
|
||||
from airflow.operators.python_operator import PythonOperator
|
||||
from airflow.models import Variable
|
||||
import pendulum
|
||||
import requests
|
||||
from jinja2 import PackageLoader
|
||||
from kite_airflow.slack_alerts import task_fail_slack_alert
|
||||
|
||||
|
||||
pacific = pendulum.timezone('America/Los_Angeles')
|
||||
|
||||
default_args = {
|
||||
'owner': 'airflow',
|
||||
'depends_on_past': False,
|
||||
'start_date': datetime.datetime(2020, 1, 1, tzinfo=pacific),
|
||||
'email_on_failure': False,
|
||||
'email_on_retry': False,
|
||||
'retries': 1,
|
||||
'retry_delay': datetime.timedelta(minutes=5),
|
||||
'on_failure_callback': task_fail_slack_alert,
|
||||
}
|
||||
|
||||
|
||||
dag = DAG(
|
||||
'mixpanel_ingest_events',
|
||||
default_args=default_args,
|
||||
description='Mixpanel events ingest DAG.',
|
||||
schedule_interval='30 * * * *',
|
||||
max_active_runs=6,
|
||||
jinja_environment_kwargs={
|
||||
'loader': PackageLoader('kite_airflow', 'templates')
|
||||
},
|
||||
)
|
||||
|
||||
|
||||
def copy_mp_raw_events(task_instance, execution_date, **context):
|
||||
pac_date = execution_date.astimezone(pacific)
|
||||
pac_hour = pac_date.replace(minute=0, second=0, microsecond=0)
|
||||
|
||||
script = '''function main() {{
|
||||
return Events({{from_date: "{date}", to_date: "{date}"}}).filter(function(event) {{
|
||||
return !event.name.startsWith("kite_status") && event.time >= {start} && event.time < {end};
|
||||
}});
|
||||
}}'''.format(
|
||||
date=pac_date.strftime('%Y-%m-%d'),
|
||||
start=1000 * int(time.mktime(pac_hour.timetuple())),
|
||||
end=1000 * int(time.mktime((pac_hour + datetime.timedelta(hours=1)).timetuple())),
|
||||
)
|
||||
print(script)
|
||||
res = requests.post('https://mixpanel.com/api/2.0/jql',
|
||||
auth=(Variable.get('mixpanel_credentials', deserialize_json=True)['secret'], ''),
|
||||
data={'script': script},)
|
||||
|
||||
if res.status_code != 200:
|
||||
raise Exception(res.text)
|
||||
|
||||
files = {}
|
||||
|
||||
for line in res.json():
|
||||
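# Same breadth-first scrub as the people ingest: remove empty dicts/lists, queue
# nested dicts, and strip Mixpanel's leading '$' from property names.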
to_scrub = [line]
|
||||
while to_scrub:
|
||||
curr = to_scrub.pop(0)
|
||||
for key, value in list(curr.items()):
|
||||
if isinstance(value, (dict, list)) and len(value) == 0:
|
||||
del curr[key]
|
||||
continue
|
||||
if isinstance(value, dict):
|
||||
to_scrub.append(value)
|
||||
continue
|
||||
if key.startswith('$'):
|
||||
curr[key[1:]] = value
|
||||
del curr[key]
|
||||
|
||||
pacific_ts = datetime.datetime.fromtimestamp(line['time'] / 1000).replace(tzinfo=pacific)
|
||||
utc_ts = pacific_ts.astimezone(pendulum.timezone('UTC'))
|
||||
line['time'] = int(time.mktime(utc_ts.timetuple()))
|
||||
|
||||
file_key = 'year={}/month={}/day={}/hour={}/event={}'.format(utc_ts.year, utc_ts.month, utc_ts.day, utc_ts.hour, line['name'])
|
||||
if file_key not in files:
|
||||
b_io = io.BytesIO()
|
||||
files[file_key] = (b_io, gzip.GzipFile(fileobj=b_io, mode="w"))
|
||||
|
||||
files[file_key][1].write(json.dumps(line).encode('utf8'))
|
||||
files[file_key][1].write(b'\n')
|
||||
|
||||
s3 = S3Hook('aws_us_east_1')
|
||||
for prefix, (b_io, gz_file) in files.items():
|
||||
gz_file.close()
|
||||
s3.load_bytes(b_io.getvalue(), 'mixpanel/events/raw/{}/events.json.gz'.format(prefix), 'kite-metrics', replace=True)
|
||||
|
||||
|
||||
PythonOperator(
|
||||
python_callable=copy_mp_raw_events,
|
||||
task_id=copy_mp_raw_events.__name__,
|
||||
dag=dag,
|
||||
retries=2,
|
||||
provide_context=True,
|
||||
)
|
1
airflow/kite_airflow/dags/monetizability.py
Normal file
@ -0,0 +1 @@
|
||||
XXXXXXX
|
312
airflow/kite_airflow/dags/youtube_crawl.py
Normal file
@ -0,0 +1,312 @@
|
||||
from airflow import DAG
|
||||
import datetime
|
||||
from airflow.hooks.S3_hook import S3Hook
|
||||
from airflow.contrib.hooks.aws_sqs_hook import SQSHook
|
||||
import json
|
||||
import gzip
|
||||
import hashlib
|
||||
import collections
|
||||
import codecs
|
||||
import logging
|
||||
import uuid
|
||||
import csv
|
||||
import time
|
||||
from airflow.operators.python_operator import PythonOperator
|
||||
import requests
|
||||
from airflow.contrib.operators.aws_athena_operator import AWSAthenaOperator
|
||||
from jinja2 import PackageLoader
|
||||
from kite_airflow.slack_alerts import task_fail_slack_alert
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
BUCKET = 'kite-youtube-data'
|
||||
SCRATCH_SPACE_LOC = 's3://{}/athena-scratch-space/'.format(BUCKET)
|
||||
|
||||
|
||||
def iter_s3_file(s3_hook, bucket, key):
|
||||
json_file = s3_hook.get_key(key, BUCKET)
|
||||
for line in gzip.open(json_file.get()['Body']):
|
||||
yield json.loads(line)
|
||||
|
||||
|
||||
youtube_search_dag = DAG(
|
||||
'youtube_search',
|
||||
description='Find new Youtube channels.',
|
||||
default_args={
|
||||
'retries': 1,
|
||||
'retry_delay': datetime.timedelta(minutes=5),
|
||||
'start_date': datetime.datetime(2020, 11, 6),
|
||||
'on_failure_callback': task_fail_slack_alert,
|
||||
},
|
||||
schedule_interval='0 4 * * *',
|
||||
max_active_runs=1,
|
||||
jinja_environment_kwargs={
|
||||
'loader': PackageLoader('kite_airflow', 'templates')
|
||||
},
|
||||
)
|
||||
|
||||
schema_operators = []
|
||||
for table in ['youtube_queries', 'youtube_searches', 'youtube_channels', 'youtube_channel_details', 'youtube_socialblade_stats']:
|
||||
drop_op = AWSAthenaOperator(
|
||||
aws_conn_id='aws_us_east_1',
|
||||
task_id='drop_table_{}'.format(table),
|
||||
query='DROP TABLE IF EXISTS {}'.format(table),
|
||||
output_location='s3://kite-metrics-test/athena-results/ddl',
|
||||
database='kite_youtube_crawl',
|
||||
dag=youtube_search_dag
|
||||
)
|
||||
|
||||
create_op = AWSAthenaOperator(
|
||||
aws_conn_id='aws_us_east_1',
|
||||
task_id='create_table_{}'.format(table),
|
||||
query='athena/tables/{}.tmpl.sql'.format(table),
|
||||
output_location='s3://kite-metrics-test/athena-results/ddl',
|
||||
database='kite_youtube_crawl',
|
||||
dag=youtube_search_dag,
|
||||
)
|
||||
|
||||
drop_op >> create_op
|
||||
schema_operators.append(create_op)
|
||||
|
||||
BATCH_SIZE = 100
|
||||
MAX_RELATED_GENERATION = 1
|
||||
|
||||
get_queries_op = AWSAthenaOperator(
|
||||
aws_conn_id='aws_us_east_1',
|
||||
task_id='get_queries',
|
||||
query='SELECT q.*, s.query IS NOT NULL AS searched FROM youtube_queries q LEFT OUTER JOIN youtube_searches s ON (q.query=s.query) ORDER BY q.count DESC',
|
||||
output_location=SCRATCH_SPACE_LOC,
|
||||
database='kite_youtube_crawl',
|
||||
dag=youtube_search_dag,
|
||||
)
|
||||
schema_operators >> get_queries_op
|
||||
|
||||
get_existing_channels_op = AWSAthenaOperator(
|
||||
aws_conn_id='aws_us_east_1',
|
||||
task_id='get_existing_channels',
|
||||
query='SELECT id FROM youtube_channels',
|
||||
output_location=SCRATCH_SPACE_LOC,
|
||||
database='kite_youtube_crawl',
|
||||
dag=youtube_search_dag,
|
||||
)
|
||||
schema_operators >> get_existing_channels_op
|
||||
|
||||
|
||||
def get_scratch_space_csv(s3hook, ti, task_id):
|
||||
filename = ti.xcom_pull(task_ids=task_id)
|
||||
s3key = s3hook.get_key('athena-scratch-space/{}.csv'.format(filename), BUCKET)
|
||||
return csv.DictReader(codecs.getreader("utf-8")(s3key.get()['Body']))
|
||||
|
||||
|
||||
def write_gzip_string_to_s3(s3hook, contents, key, bucket):
|
||||
s3hook.load_string(gzip.compress(contents.encode('utf8')), key, bucket)
|
||||
|
||||
|
||||
def youtube_crawl(ti, ts_nodash, **kwargs):
|
||||
s3 = S3Hook('aws_us_east_1')
|
||||
ex_channels = {c['id'] for c in get_scratch_space_csv(s3, ti, get_existing_channels_op.task_id)}
|
||||
|
||||
curr_time = datetime.datetime.now()
|
||||
queries = get_scratch_space_csv(s3, ti, get_queries_op.task_id)
|
||||
selected_queries = [q for q in queries if q['searched'] == 'false'][:BATCH_SIZE]
|
||||
all_queries = {q['query'] for q in queries}
|
||||
search_records = []
|
||||
new_channels = []
|
||||
new_queries = []
|
||||
|
||||
try:
|
||||
for query in selected_queries:
|
||||
print("Running query {}".format(query['query']))
|
||||
query_hash = hashlib.md5(query['query'].encode('utf8')).hexdigest()
|
||||
|
||||
# resp = requests.get('https://serpapi.com/search.json',
|
||||
# params={'engine': 'youtube', 'search_query': query['query'], 'api_key': 'XXXXXXX'})
|
||||
resp = requests.get("https://www.googleapis.com/youtube/v3/search", params={
|
||||
"key": "XXXXXXX",
|
||||
"q": query['query'],
|
||||
"part": "snippet",
|
||||
"maxResults": "50"
|
||||
}, headers={'content-type': 'application/json'})
|
||||
|
||||
if resp.status_code != 200:
|
||||
print("Error from SerpAPI: {} {}".format(resp.status_code, resp.text))
|
||||
raise Exception()
|
||||
|
||||
resp_json = resp.json()
|
||||
|
||||
# if 'video_results' not in resp.json():
|
||||
if 'items' not in resp_json:
|
||||
print("No results for {}".format(query['query']))
|
||||
continue
|
||||
|
||||
response_key = 'search_responses/{}/{}.json.gz'.format(query_hash, ts_nodash)
|
||||
s3.load_bytes(gzip.compress(resp.text.encode('utf8')), response_key, BUCKET, replace=True)
|
||||
|
||||
# all_channels = {v['channel']['link'] for v in resp_json['video_results'] if 'link' in v['channel']}
|
||||
all_channels = {'https://www.youtube.com/channel/{}'.format(v['snippet']['channelId']) for v in resp_json['items']}
|
||||
n_new_channels = len(all_channels - ex_channels)
|
||||
|
||||
for c in all_channels - ex_channels:
|
||||
new_channels.append({
|
||||
'id': c,
|
||||
'query': query['query'],
|
||||
'timestamp': curr_time.isoformat(),
|
||||
})
|
||||
ex_channels.add(c)
|
||||
|
||||
for key in resp_json:
|
||||
if not key.startswith('searches_related_to_'):
|
||||
continue
|
||||
for search in resp_json[key]['searches']:
|
||||
if search['query'] not in all_queries:
|
||||
new_queries.append({
|
||||
'query': search['query'],
|
||||
'seed': False,
|
||||
'generation': int(query.get('generation') or 0) + 1,
|
||||
'parent': query['query']
|
||||
})
|
||||
|
||||
search_records.append({
|
||||
'query': query['query'],
|
||||
'query_hash': query_hash,
|
||||
'timestamp': curr_time.isoformat(),
|
||||
'total': len(all_channels),
|
||||
'unique': n_new_channels,
|
||||
})
|
||||
|
||||
finally:
|
||||
for key, objs in [('channels', new_channels), ('search_queries', new_queries), ('searches', search_records)]:
|
||||
if objs:
|
||||
contents = gzip.compress('\n'.join([json.dumps(obj) for obj in objs]).encode('utf8'))
|
||||
s3.load_bytes(contents, '{}/{}-{}.json.gz'.format(key, ts_nodash, uuid.uuid4().hex), BUCKET)
|
||||
|
||||
|
||||
youtube_crawl_op = PythonOperator(
|
||||
python_callable=youtube_crawl,
|
||||
task_id=youtube_crawl.__name__,
|
||||
dag=youtube_search_dag,
|
||||
provide_context=True,
|
||||
)
|
||||
(get_queries_op, get_existing_channels_op) >> youtube_crawl_op
|
||||
|
||||
get_new_channels_op = AWSAthenaOperator(
|
||||
aws_conn_id='aws_us_east_1',
|
||||
task_id='get_new_channels',
|
||||
query='''SELECT DISTINCT c.id
|
||||
FROM youtube_channels c
|
||||
LEFT OUTER JOIN youtube_channel_details d ON (
|
||||
concat('https://www.youtube.com/channel/', d.id)=c.id
|
||||
OR concat('https://www.youtube.com/user/', d.forUsername)=c.id
|
||||
)
|
||||
WHERE d.id IS NULL AND d.forUsername IS NULL''',
|
||||
output_location=SCRATCH_SPACE_LOC,
|
||||
database='kite_youtube_crawl',
|
||||
dag=youtube_search_dag,
|
||||
)
|
||||
|
||||
|
||||
def chunks(lst, n):
|
||||
"""Yield successive n-sized chunks from lst."""
|
||||
for i in range(0, len(lst), n):
|
||||
yield lst[i:i + n]
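# Example (illustrative): list(chunks([1, 2, 3, 4, 5], 2)) -> [[1, 2], [3, 4], [5]].
# Used below to batch channel ids into groups of 50 for the YouTube channels endpoint.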
|
||||
|
||||
|
||||
def get_channel_details(ti, ts_nodash, **kwargs):
|
||||
s3 = S3Hook('aws_us_east_1')
|
||||
new_channels = {c['id'] for c in get_scratch_space_csv(s3, ti, get_new_channels_op.task_id)}
|
||||
|
||||
channels_by_type = collections.defaultdict(list)
|
||||
for channel in new_channels:
|
||||
c_parts = channel.split('/')
|
||||
channels_by_type[c_parts[-2]].append(c_parts[-1])
|
||||
|
||||
print("Getting channel details for {} new channels and {} new users".format(
|
||||
len(channels_by_type['channel']),
|
||||
len(channels_by_type['user']))
|
||||
)
|
||||
|
||||
c_details = []
|
||||
|
||||
url = "https://www.googleapis.com/youtube/v3/channels"
|
||||
generic_params = {
|
||||
"part": ["statistics", "snippet", "contentDetails", "status"],
|
||||
"key": "XXXXXXX",
|
||||
}
|
||||
|
||||
try:
|
||||
for username in channels_by_type.get('user', []):
|
||||
params = {'forUsername': username}
|
||||
params.update(generic_params)
|
||||
|
||||
resp = requests.get(url, params=params, headers={'content-type': 'application/json'})
|
||||
if not resp.json().get('items'):
|
||||
print("Failed to get user: {}".format(username))
|
||||
c_details.append({'forUsername': username})
|
||||
continue
|
||||
|
||||
for item in resp.json()['items']:
|
||||
item['forUsername'] = username
|
||||
c_details.append(item)
|
||||
|
||||
for chunk in chunks(channels_by_type.get('channel', []), 50):
|
||||
params = {'id': ','.join(chunk)}
|
||||
params.update(generic_params)
|
||||
|
||||
resp = requests.get(url, params=params, headers={'content-type': 'application/json'})
|
||||
if not resp.json().get('items'):
|
||||
print("Failed to get channels: {}".format(', '.join(chunk)))
|
||||
continue
|
||||
for item in resp.json()['items']:
|
||||
c_details.append(item)
|
||||
finally:
|
||||
print("Loading channel details for {} channels".format(len(c_details)))
|
||||
contents = gzip.compress('\n'.join([json.dumps(obj) for obj in c_details]).encode('utf8'))
|
||||
s3.load_bytes(contents, 'channel_details/{}-{}.json.gz'.format(ts_nodash, uuid.uuid4().hex), BUCKET)
|
||||
|
||||
|
||||
get_channel_details_op = PythonOperator(
|
||||
python_callable=get_channel_details,
|
||||
task_id=get_channel_details.__name__,
|
||||
dag=youtube_search_dag,
|
||||
provide_context=True,
|
||||
)
|
||||
youtube_crawl_op >> get_new_channels_op >> get_channel_details_op
|
||||
|
||||
get_new_socialblade_channels = AWSAthenaOperator(
|
||||
aws_conn_id='aws_us_east_1',
|
||||
task_id='get_new_socialblade_channels',
|
||||
query='''SELECT DISTINCT c.id
|
||||
FROM youtube_channel_details c
|
||||
LEFT OUTER JOIN youtube_socialblade_stats sb ON c.id=sb.id
|
||||
WHERE sb.id IS NULL AND CAST(c.statistics.viewCount AS bigint) > 100000''',
|
||||
output_location=SCRATCH_SPACE_LOC,
|
||||
database='kite_youtube_crawl',
|
||||
dag=youtube_search_dag,
|
||||
)
|
||||
|
||||
QUEUE_URL = 'https://sqs.us-east-1.amazonaws.com/XXXXXXX/queue-youtube-socialblade'
|
||||
|
||||
|
||||
def enqueue_socialblade_channels(ti, ts_nodash, **kwargs):
|
||||
s3 = S3Hook('aws_us_east_1')
|
||||
sqs_hook = SQSHook('aws_us_east_1')
|
||||
|
||||
new_channels = {c['id'] for c in get_scratch_space_csv(s3, ti, get_new_socialblade_channels.task_id)}
|
||||
print('Enqueuing {} channels'.format(len(new_channels)))
|
||||
sqs_hook.get_conn().purge_queue(QueueUrl=QUEUE_URL)
|
||||
|
||||
# Sleep to allow purge to complete
|
||||
time.sleep(60)
|
||||
for channel in new_channels:
|
||||
sqs_hook.send_message(QUEUE_URL, channel)
|
||||
|
||||
|
||||
enqueue_socialblade_channels_op = PythonOperator(
|
||||
python_callable=enqueue_socialblade_channels,
|
||||
task_id=enqueue_socialblade_channels.__name__,
|
||||
dag=youtube_search_dag,
|
||||
provide_context=True,
|
||||
)
|
||||
|
||||
get_channel_details_op >> get_new_socialblade_channels >> enqueue_socialblade_channels_op
|
280
airflow/kite_airflow/dags/youtube_dashboard.py
Normal file
@ -0,0 +1,280 @@
|
||||
import datetime
|
||||
|
||||
from airflow import DAG
|
||||
from airflow.operators.python_operator import PythonOperator
|
||||
from airflow.contrib.operators.aws_athena_operator import AWSAthenaOperator
|
||||
import googleapiclient.discovery
|
||||
from jinja2 import PackageLoader
|
||||
|
||||
from kite_airflow.plugins.google import GoogleSheetsRangeOperator
|
||||
from kite_airflow.common import configs
|
||||
from kite_airflow.common import utils as common_utils
|
||||
from kite_airflow.youtube_dashboard import api
|
||||
from kite_airflow.youtube_dashboard import files
|
||||
from kite_airflow.youtube_dashboard import utils
|
||||
from kite_airflow.slack_alerts import task_fail_slack_alert
|
||||
|
||||
|
||||
BUCKET = 'kite-youtube-data' if common_utils.is_production() else 'kite-metrics-test'
|
||||
SCRATCH_SPACE_LOC = 's3://{}/athena-scratch-space/'.format(BUCKET)
|
||||
|
||||
DATABASE = 'prod_kite_link_stats_youtube' if common_utils.is_production() else 'kite_link_stats_youtube'
|
||||
TABLE_CHANNELS = {
|
||||
'name': 'kite_link_stats_youtube_channels',
|
||||
'data_location': 's3://{}/youtube-dashboard/channels/'.format(BUCKET),
|
||||
}
|
||||
TABLE_VIDEOS = {
|
||||
'name': 'kite_link_stats_youtube_videos',
|
||||
'data_location': 's3://{}/youtube-dashboard/videos/'.format(BUCKET),
|
||||
}
|
||||
|
||||
default_args = {
|
||||
'owner': 'airflow',
|
||||
'depends_on_past': False,
|
||||
'start_date': datetime.datetime(2020, 11, 21),
|
||||
'email_on_failure': False,
|
||||
'email_on_retry': False,
|
||||
'retries': 0,
|
||||
'retry_delay': datetime.timedelta(minutes=5),
|
||||
'on_failure_callback': task_fail_slack_alert,
|
||||
}
|
||||
|
||||
kite_link_stats_dag = DAG(
|
||||
'youtube_dashboard',
|
||||
description='Import links stats of sponsored videos for the YouTube dashboard.',
|
||||
default_args=default_args,
|
||||
schedule_interval='10 0 * * *',
|
||||
jinja_environment_kwargs={
|
||||
'loader': PackageLoader('kite_airflow', 'templates')
|
||||
},
|
||||
)
|
||||
|
||||
schema_operators = []
|
||||
for table in [TABLE_CHANNELS, TABLE_VIDEOS]:
|
||||
drop_op = AWSAthenaOperator(
|
||||
aws_conn_id=configs.AWS_CONN_ID,
|
||||
task_id='drop_table_{}'.format(table['name']),
|
||||
query='DROP TABLE IF EXISTS {}'.format(table['name']),
|
||||
output_location='s3://kite-metrics-test/athena-results/ddl',
|
||||
database=DATABASE,
|
||||
dag=kite_link_stats_dag,
|
||||
params={'data_location': table['data_location']},
|
||||
)
|
||||
|
||||
create_op = AWSAthenaOperator(
|
||||
aws_conn_id=configs.AWS_CONN_ID,
|
||||
task_id='create_table_{}'.format(table['name']),
|
||||
query='athena/tables/{}.tmpl.sql'.format(table['name']),
|
||||
output_location='s3://kite-metrics-test/athena-results/ddl',
|
||||
database=DATABASE,
|
||||
dag=kite_link_stats_dag,
|
||||
params={'data_location': table['data_location']},
|
||||
)
|
||||
|
||||
drop_op >> create_op
|
||||
schema_operators.append(create_op)
|
||||
|
||||
get_channels_op = AWSAthenaOperator(
|
||||
aws_conn_id=configs.AWS_CONN_ID,
|
||||
task_id='get_channels',
|
||||
query='SELECT * FROM {}'.format(TABLE_CHANNELS['name']),
|
||||
output_location=SCRATCH_SPACE_LOC,
|
||||
database=DATABASE,
|
||||
dag=kite_link_stats_dag,
|
||||
)
|
||||
schema_operators >> get_channels_op
|
||||
|
||||
get_videos_op = AWSAthenaOperator(
|
||||
aws_conn_id=configs.AWS_CONN_ID,
|
||||
task_id='get_videos',
|
||||
query='SELECT * FROM {}'.format(TABLE_VIDEOS['name']),
|
||||
output_location=SCRATCH_SPACE_LOC,
|
||||
database=DATABASE,
|
||||
dag=kite_link_stats_dag,
|
||||
)
|
||||
schema_operators >> get_videos_op
|
||||
|
||||
get_channels_sheet_operator = GoogleSheetsRangeOperator(
|
||||
gcp_conn_id='google_cloud_kite_dev',
|
||||
spreadsheet_id='XXXXXXX-J0',
|
||||
range="'List of Channels'!A:C",
|
||||
task_id='get_channels_sheet',
|
||||
dag=kite_link_stats_dag,
|
||||
)
|
||||
|
||||
|
||||
def update_videos_from_all_channels(ti, yt_client):
|
||||
'''
|
||||
Take all given channels and store the list of their videos.
|
||||
|
||||
For a new channel we search for all of its videos, while for an existing
|
||||
channel we only search for new videos via YouTube activities.
|
||||
|
||||
Returns:\n
|
||||
list:
|
||||
new video items which we will use while taking snapshots. We need this
|
||||
because the Athena queries are evaluated at the start, so we will not receive
|
||||
these new videos via the get-videos query.
|
||||
'''
|
||||
channel_list = files.get_scratch_space_csv(ti, get_channels_op.task_id)
|
||||
|
||||
sheet_data = ti.xcom_pull(task_ids='get_channels_sheet')['values']
|
||||
cid_field = sheet_data[0].index('Channel ID')
|
||||
sheet_channels = {line[cid_field] for line in sheet_data[1:] if len(line) > cid_field and line[cid_field].strip()}
|
||||
|
||||
for new_c in sheet_channels - {c['id'] for c in channel_list}:
|
||||
channel_list.append({'id': new_c, 'is_backfilled': 'false', 'last_backfill_until': '', 'last_updated': ''})
|
||||
|
||||
new_video_list = []
|
||||
search_budget = 80
|
||||
exception = None
|
||||
|
||||
for channel in channel_list:
|
||||
channel_id = channel['id']
|
||||
|
||||
# indicates a new channel or a channel whose backfill is yet to complete
|
||||
if channel['is_backfilled'] == 'false':
|
||||
|
||||
# if a previous backfill was incomplete, resume where it left off
|
||||
published_before = channel['last_backfill_until'] if channel['is_backfilled'] == 'false' else None
|
||||
|
||||
video_search_list, has_channel_search_remaining, no_of_searches, exception = api.get_all_video_search_list(
|
||||
yt_client,
|
||||
channel_id,
|
||||
published_before,
|
||||
search_budget,
|
||||
)
|
||||
|
||||
for video_search_item in video_search_list:
|
||||
new_video_list.append({
|
||||
'id': utils.get_video_id_of_search_item(video_search_item),
|
||||
'channel_id': channel_id,
|
||||
})
|
||||
|
||||
# only update channel attributes if videos are found (this also covers the YouTube out-of-quota case)
|
||||
if(video_search_list):
|
||||
last_search_item = video_search_list[- 1]
|
||||
channel['last_backfill_until'] = utils.get_published_date_of_search_item(last_search_item)
|
||||
channel['is_backfilled'] = not has_channel_search_remaining
|
||||
|
||||
# update the channel's last_updated, which will help us limit future searches
|
||||
channel['last_updated'] = common_utils.get_date_time_in_ISO()
|
||||
|
||||
search_budget -= no_of_searches
|
||||
|
||||
if search_budget <= 0:
|
||||
break
|
||||
|
||||
all_activity_list, exception = api.get_all_activity_list(
|
||||
yt_client,
|
||||
channel_id,
|
||||
channel['last_updated'],
|
||||
)
|
||||
|
||||
if(len(all_activity_list)):
|
||||
files.write_activities_on_file(all_activity_list)
|
||||
|
||||
video_activity_list = api.filter_video_activity_from_list(
|
||||
all_activity_list,
|
||||
)
|
||||
|
||||
for video_activity in video_activity_list:
|
||||
new_video_list.append({
|
||||
'id': utils.get_id_of_video_activity(video_activity),
|
||||
'channel_id': channel_id,
|
||||
})
|
||||
|
||||
# update the channel's last_updated, which will help us limit future searches
|
||||
channel['last_updated'] = common_utils.get_date_time_in_ISO()
|
||||
|
||||
files.write_channels_on_file(channel_list)
|
||||
|
||||
if len(new_video_list) > 0:
|
||||
files.write_videos_on_file(new_video_list)
|
||||
|
||||
if exception:
|
||||
raise exception
|
||||
|
||||
return new_video_list
|
||||
|
||||
|
||||
def take_snapshots_and_update_files(video_list_for_snapshots, cached_urls_dict):
|
||||
snapshot_list = get_snapshots_list(video_list_for_snapshots, cached_urls_dict)
|
||||
files.write_snapshots_on_file(snapshot_list)
|
||||
files.write_cached_urls_on_file(cached_urls_dict)
|
||||
|
||||
|
||||
def get_snapshots_list(video_list, cached_urls_dict):
|
||||
if not video_list:
|
||||
return
|
||||
|
||||
snapshot_list = []
|
||||
for video_item in video_list:
|
||||
snapshot_list.append({
|
||||
'video_id': utils.get_id_of_video_item(video_item),
|
||||
'description': utils.get_description_of_video_item(video_item),
|
||||
'is_link_present': utils.is_link_present_in_description(video_item, cached_urls_dict),  # also updates the cache in the case of shortened URLs
|
||||
'views': utils.get_views_of_video_item(video_item),
|
||||
'timestamp': common_utils.get_date_time_in_ISO(),
|
||||
})
|
||||
|
||||
return snapshot_list
|
||||
|
||||
|
||||
def update_snapshots_of_all_videos(ti, yt_client, new_video_list):
|
||||
'''
|
||||
Take snapshots of all of the available videos and new videos
|
||||
'''
|
||||
|
||||
video_list_for_snapshots = []
|
||||
cached_urls_dict = files.get_cached_urls_from_file()
|
||||
all_videos_list = files.get_scratch_space_csv(ti, get_videos_op.task_id)
|
||||
all_videos_id_list = [video['id'] for video in all_videos_list]
|
||||
no_of_batch_requests = 50 # to optimise YouTube quota
|
||||
|
||||
# also append the new video ids because the get-videos query doesn't return
|
||||
# results that were created during the execution of this script
|
||||
all_videos_id_list.extend(
|
||||
list(map(lambda video: video['id'], new_video_list))
|
||||
)
|
||||
|
||||
for start_index in range(0, len(all_videos_id_list), no_of_batch_requests):
|
||||
try:
|
||||
video_list = []
|
||||
end_index = (start_index) + no_of_batch_requests
|
||||
videos_id_batch_list = all_videos_id_list[start_index:end_index]
|
||||
video_list = api.get_video_list(yt_client, videos_id_batch_list)
|
||||
|
||||
video_list_for_snapshots.extend(video_list)
|
||||
except Exception:
|
||||
# store data until now in case of any error or if quota exceeded
|
||||
take_snapshots_and_update_files(video_list_for_snapshots, cached_urls_dict)
|
||||
raise
|
||||
|
||||
take_snapshots_and_update_files(video_list_for_snapshots, cached_urls_dict)
|
||||
|
||||
|
||||
def get_snaphots_of_videos(ti, **context):
|
||||
api_service_name = 'youtube'
|
||||
api_version = 'v3'
|
||||
api_key = 'XXXXXXX'
|
||||
|
||||
yt_client = googleapiclient.discovery.build(
|
||||
api_service_name, api_version, developerKey=api_key
|
||||
)
|
||||
|
||||
new_video_list = update_videos_from_all_channels(ti, yt_client)
|
||||
update_snapshots_of_all_videos(ti, yt_client, new_video_list)
|
||||
|
||||
|
||||
get_snaphots_of_videos_operator = PythonOperator(
|
||||
python_callable=get_snaphots_of_videos,
|
||||
task_id=get_snaphots_of_videos.__name__,
|
||||
dag=kite_link_stats_dag,
|
||||
provide_context=True,
|
||||
)
|
||||
(
|
||||
get_channels_op,
|
||||
get_videos_op,
|
||||
get_channels_sheet_operator,
|
||||
) >> get_snaphots_of_videos_operator
|
189
airflow/kite_airflow/files/hubspot_contactprops.yaml
Normal file
@ -0,0 +1,189 @@
|
||||
{% for lang in langs %}
|
||||
- name: user_data_last_{{lang}}_active_date
|
||||
label: Last {{lang}} active date (user data)
|
||||
type: datetime
|
||||
sql:
|
||||
type: bigint
|
||||
agg: max
|
||||
delta: max(to_unixtime(from_iso8601_timestamp(timestamp)) * if(properties__{{lang}}_edit > 0, 1, 0)) * 1000
|
||||
transform: nullif(user_data_last_{{lang}}_active_date, 0)
|
||||
{% endfor %}
|
||||
|
||||
- name: user_data_last_active_date
|
||||
label: Last active date (user data)
|
||||
type: datetime
|
||||
sql:
|
||||
type: bigint
|
||||
delta: max(to_unixtime(from_iso8601_timestamp(timestamp)) * if({% for lang in langs %}properties__{{lang}}_edit{% if not loop.last %} + {% endif %}{% endfor %}> 0, 1, 0)) * 1000
|
||||
agg: max
|
||||
transform: nullif(user_data_last_active_date, 0)
|
||||
|
||||
- name: user_data_last_kite_alive_date
|
||||
label: Last Kite alive date (user data)
|
||||
type: datetime
|
||||
sql:
|
||||
type: bigint
|
||||
agg: max
|
||||
delta: max(to_unixtime(from_iso8601_timestamp(timestamp))) * 1000
|
||||
transform: nullif(user_data_last_kite_alive_date, 0)
|
||||
|
||||
{% for lang in langs %}
|
||||
- name: {{lang}}_active_1d
|
||||
sql:
|
||||
delta: if(sum(coalesce(properties__{{lang}}_edit, 0)) > 0, 1, 0)
|
||||
type: bigint
|
||||
- name: {{lang}}_edit_1d
|
||||
sql:
|
||||
delta: sum(coalesce(properties__{{lang}}_edit, 0))
|
||||
type: bigint
|
||||
|
||||
{% for interval in [7, 28] %}
|
||||
- name: user_data_{{lang}}_active_{{interval}}d
|
||||
label: Days {{lang|title}} active in last {{interval}} days (user data)
|
||||
type: number
|
||||
sql:
|
||||
type: bigint
|
||||
delta_field: {{lang}}_active_1d
|
||||
agg: sum
|
||||
agg_days: {{interval}}
|
||||
- name: user_data_{{lang}}_edit_{{interval}}d
|
||||
label: {{lang|title}} edits in last {{interval}} days (user data)
|
||||
type: number
|
||||
sql:
|
||||
type: bigint
|
||||
delta_field: {{lang}}_edit_1d
|
||||
agg: sum
|
||||
agg_days: {{interval}}
|
||||
{% endfor %}
|
||||
|
||||
{% endfor %}
|
||||
|
||||
- name: any_edit_1d
|
||||
sql:
|
||||
delta: sum({% for lang in langs %}coalesce(properties__{{lang}}_edit, 0){% if not loop.last %} + {% endif %}{% endfor %})
|
||||
type: bigint
|
||||
- name: any_active_1d
|
||||
sql:
|
||||
delta: if(sum({% for lang in langs %}coalesce(properties__{{lang}}_edit, 0){% if not loop.last %} + {% endif %}{% endfor %}) > 0, 1, 0)
|
||||
type: bigint
|
||||
|
||||
{% for interval in [7, 28] %}
|
||||
- name: user_data_any_active_{{interval}}d
|
||||
label: Days any language active in last {{interval}} days (user data)
|
||||
type: number
|
||||
sql:
|
||||
type: bigint
|
||||
delta_field: any_active_1d
|
||||
agg: sum
|
||||
agg_days: {{interval}}
|
||||
{% endfor %}
|
||||
|
||||
- name: any_edit_12w
|
||||
sql:
|
||||
type: bigint
|
||||
delta_field: any_edit_1d
|
||||
agg: sum
|
||||
agg_days: 84
|
||||
|
||||
{% for lang in langs %}
|
||||
- name: {{lang}}_edit_12w
|
||||
type: number
|
||||
sql:
|
||||
type: bigint
|
||||
delta_field: {{lang}}_edit_1d
|
||||
agg: sum
|
||||
agg_days: 84
|
||||
- name: {{lang}}_percentage
|
||||
label: {{lang|title}} percentage (user data)
|
||||
type: number
|
||||
sql:
|
||||
type: bigint
|
||||
transform: cast((cast({{lang}}_edit_12w as double) / nullif(any_edit_12w, 0)) * 100 AS bigint)
|
||||
{% endfor %}
|
||||
|
||||
{% for editor in editors %}
|
||||
- name: python_edits_in_{{editor}}
|
||||
sql:
|
||||
delta: sum(if(properties__python_edit > 0, properties__{{editor}}_events, 0))
|
||||
type: bigint
|
||||
agg: sum
|
||||
agg_days: 84
|
||||
|
||||
- name: user_data_{{editor}}_installed
|
||||
label: Editor {{ editor }} installed (user data)
|
||||
type: bool
|
||||
options:
|
||||
- label: True
|
||||
value: true
|
||||
- label: False
|
||||
value: false
|
||||
sql:
|
||||
type: boolean
|
||||
agg: latest
|
||||
delta: bool_or(properties__{{editor}}_installed)
|
||||
{% endfor %}
|
||||
|
||||
- name: user_data_plan
|
||||
label: Plan type (user data)
|
||||
type: string
|
||||
sql:
|
||||
type: varchar(32)
|
||||
agg: latest
|
||||
delta: max_by(properties__plan, from_iso8601_timestamp(timestamp))
|
||||
|
||||
- name: user_data_server_deployment_id
|
||||
label: Server deployment ID (user data)
|
||||
type: string
|
||||
sql:
|
||||
type: varchar(64)
|
||||
agg: latest
|
||||
delta: max_by(properties__server_deployment_id, from_iso8601_timestamp(timestamp))
|
||||
|
||||
- name: user_data_primary_python_editor
|
||||
label: Primary Python editor (user data)
|
||||
type: string
|
||||
sql: {}
|
||||
|
||||
- name: user_data_primary_language
|
||||
label: Primary language (user data)
|
||||
type: string
|
||||
sql: {}
|
||||
|
||||
|
||||
{% for prefix, fields in {"properties": ["os"], "maxmind": ["country_name", "city_name", "subdivision_1_name", "time_zone"]}.items() %}
|
||||
{% for field in fields %}
|
||||
- name: {{prefix}}__{{field}}_1d
|
||||
sql:
|
||||
type: map<string,bigint>
|
||||
map_delta: {{prefix}}__{{field}}, {% for lang in langs %}coalesce(properties__{{lang}}_edit, 0){% if not loop.last %} + {% endif %}{% endfor %}
|
||||
|
||||
- name: user_data_{{field}}
|
||||
label: Most common {{field}} (user data).
|
||||
type: string
|
||||
sql:
|
||||
type: map<string,bigint>
|
||||
transform: max_by(k, v)
|
||||
map_agg: sum
|
||||
agg_days: 84
|
||||
delta_field: {{prefix}}__{{field}}_1d
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
|
||||
|
||||
- name: user_data_paid_jetbrains_installed
|
||||
label: Editor paid Jetbrains installed (user data)
|
||||
type: bool
|
||||
options:
|
||||
- label: True
|
||||
value: true
|
||||
- label: False
|
||||
value: false
|
||||
sql:
|
||||
type: boolean
|
||||
agg: latest
|
||||
delta: bool_or({% for prefix in ["IU", "PY", "WS", "GO"] %}properties__intellij_version LIKE '{{prefix}}%'{% if not loop.last %} OR {% endif %}{% endfor %})
|
||||
|
||||
- name: user_data_primary_language
|
||||
label: Primary language (user data)
|
||||
type: string
|
||||
sql: {}
|
1
airflow/kite_airflow/files/kite_status.schema.yaml
Normal file
@ -0,0 +1 @@
|
||||
XXXXXXX
|
30
airflow/kite_airflow/files/kite_status_config.yaml
Normal file
@ -0,0 +1,30 @@
|
||||
editors:
|
||||
- atom
|
||||
- intellij
|
||||
- jupyter
|
||||
- spyder
|
||||
- sublime3
|
||||
- vim
|
||||
- vscode
|
||||
|
||||
languages:
|
||||
bash: ['lexicaltextshprovider']
|
||||
c: ['lexicaltexthprovider', 'lexicaltextcprovider']
|
||||
cpp: ['lexicaltexthprovider', 'lexicaltextcppprovider']
|
||||
csharp: ['lexicaltextcsprovider']
|
||||
css: ['lexicaltextcssprovider']
|
||||
go: ['lexicalgolangprovider', 'lexicaltextgolangprovider']
|
||||
html: ['lexicaltexthtmlprovider']
|
||||
java: ['lexicaltextjavaprovider']
|
||||
javascript: ['lexicaljavascriptprovider', 'lexicaltextjsprovider']
|
||||
jsx: ['lexicaltextjsxprovider']
|
||||
kotlin: ['lexicaltextktprovider']
|
||||
less: ['lexicaltextlessprovider']
|
||||
objectivec: ['lexicaltexthprovider', 'lexicaltextmprovider']
|
||||
php: ['lexicaltextphpprovider']
|
||||
python: ['pythonlexicalprovider']
|
||||
ruby: ['lexicaltextrbprovider']
|
||||
scala: ['lexicaltextscalaprovider']
|
||||
tsx: ['lexicaltexttsxprovider']
|
||||
typescript: ['lexicaltexttsprovider']
|
||||
vue: ['lexicaltextvueprovider']
|
1
airflow/kite_airflow/files/mixpanel_people.schema.yaml
Normal file
@ -0,0 +1 @@
|
||||
XXXXXXX
|
0
airflow/kite_airflow/plugins/__init__.py
Normal file
63
airflow/kite_airflow/plugins/google.py
Normal file
@ -0,0 +1,63 @@
|
||||
# from airflow.models.baseoperator import BaseOperator
|
||||
# from airflow.utils.decorators import apply_defaults
|
||||
|
||||
import time
|
||||
|
||||
from googleapiclient.discovery import build
|
||||
|
||||
from airflow import AirflowException
|
||||
from airflow.contrib.hooks.gcp_api_base_hook import GoogleCloudBaseHook
|
||||
from airflow.plugins_manager import AirflowPlugin
|
||||
from airflow.models.baseoperator import BaseOperator
|
||||
from airflow.utils.decorators import apply_defaults
|
||||
|
||||
|
||||
class GoogleSheetsHook(GoogleCloudBaseHook):
|
||||
_conn = None
|
||||
|
||||
def __init__(self, api_version="v4", gcp_conn_id="google_cloud_default", delegate_to=None):
|
||||
super(GoogleSheetsHook, self).__init__(gcp_conn_id, delegate_to)
|
||||
self.api_version = api_version
|
||||
|
||||
def get_conn(self):
|
||||
"""
|
||||
Retrieves the connection to the Google Sheets API.
|
||||
|
||||
:return: Google Sheets service object.
|
||||
"""
|
||||
if not self._conn:
|
||||
http_authorized = self._authorize()
|
||||
self._conn = build('sheets', self.api_version, http=http_authorized, cache_discovery=False)
|
||||
return self._conn
|
||||
|
||||
@GoogleCloudBaseHook.fallback_to_default_project_id
|
||||
def get_range(self, spreadsheet_id:str, range:str, **kwargs):
|
||||
conn = self.get_conn()
|
||||
|
||||
sheets = conn.spreadsheets().values()
|
||||
return sheets.get(spreadsheetId=spreadsheet_id, range=range).execute(num_retries=self.num_retries)
|
||||
|
||||
|
||||
class GoogleSheetsRangeOperator(BaseOperator):
|
||||
|
||||
@apply_defaults
|
||||
def __init__(
|
||||
self,
|
||||
spreadsheet_id: str,
|
||||
range: str,
|
||||
gcp_conn_id: str = 'google_cloud_default',
|
||||
*args, **kwargs) -> None:
|
||||
super().__init__(*args, **kwargs)
|
||||
self.gcp_conn_id = gcp_conn_id
|
||||
self.spreadsheet_id = spreadsheet_id
|
||||
self.range=range
|
||||
|
||||
def execute(self, context):
|
||||
hook = GoogleSheetsHook(gcp_conn_id=self.gcp_conn_id)
|
||||
return hook.get_range(spreadsheet_id=self.spreadsheet_id, range=self.range)
|
||||
|
||||
|
||||
class GoogleSheetsPlugin(AirflowPlugin):
|
||||
name = 'google_sheets'
|
||||
operators = [GoogleSheetsRangeOperator]
|
||||
hooks = [GoogleSheetsHook]
|
32
airflow/kite_airflow/s3_utils.py
Normal file
@ -0,0 +1,32 @@
|
||||
import gzip
|
||||
import json
|
||||
from airflow.hooks.S3_hook import S3Hook
|
||||
from airflow.contrib.operators.s3_delete_objects_operator import S3DeleteObjectsOperator
|
||||
|
||||
|
||||
def read_s3_json_files(bucket, file_list):
|
||||
s3 = S3Hook('aws_us_east_1')
|
||||
|
||||
for file in sorted(file_list):
|
||||
obj = s3.get_key(file, bucket)
|
||||
for line in gzip.open(obj.get()['Body']):
|
||||
rec = json.loads(line)
|
||||
to_clean = [rec]
|
||||
while to_clean:
|
||||
this = to_clean.pop()
|
||||
for k in list(this.keys()):
|
||||
v = this[k]
|
||||
if isinstance(v, dict):
|
||||
to_clean.append(v)
|
||||
continue
|
||||
if v is None:
|
||||
del this[k]
|
||||
yield rec
|
||||
|
||||
|
||||
class S3DeletePrefixOperator(S3DeleteObjectsOperator):
|
||||
def execute(self, context):
|
||||
if isinstance(self.keys, str):
|
||||
hook = S3Hook(aws_conn_id=self.aws_conn_id, verify=self.verify)
|
||||
self.keys = hook.list_keys(bucket_name=self.bucket, prefix=self.keys)
|
||||
return super(S3DeletePrefixOperator, self).execute(context)
|
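A minimal usage sketch of S3DeletePrefixOperator (illustrative only; the DAG name, bucket, and prefix below are hypothetical and not taken from the repo). Passing a string as keys makes execute() expand it to every object key under that prefix before delegating to S3DeleteObjectsOperator:

import datetime
from airflow import DAG
from kite_airflow.s3_utils import S3DeletePrefixOperator

example_dag = DAG('example_s3_cleanup', start_date=datetime.datetime(2020, 1, 1), schedule_interval=None)

S3DeletePrefixOperator(
    task_id='delete_scratch_prefix',
    aws_conn_id='aws_us_east_1',      # connection id used elsewhere in this repo
    bucket='kite-metrics-test',       # hypothetical bucket
    keys='athena-scratch-space/',     # a prefix: expanded via S3Hook.list_keys in execute()
    dag=example_dag,
)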
104
airflow/kite_airflow/secrets_backend.py
Normal file
@ -0,0 +1,104 @@
|
||||
from typing import Optional
|
||||
|
||||
import boto3
|
||||
from cached_property import cached_property
|
||||
|
||||
from airflow.secrets import BaseSecretsBackend
|
||||
from airflow.utils.log.logging_mixin import LoggingMixin
|
||||
|
||||
|
||||
class SecretsManagerBackend(BaseSecretsBackend, LoggingMixin):
|
||||
"""
|
||||
Retrieves Connection or Variables from AWS Secrets Manager
|
||||
|
||||
Configurable via ``airflow.cfg`` like so:
|
||||
|
||||
.. code-block:: ini
|
||||
|
||||
[secrets]
|
||||
backend = airflow.providers.amazon.aws.secrets.secrets_manager.SecretsManagerBackend
|
||||
backend_kwargs = {"connections_prefix": "airflow/connections"}
|
||||
|
||||
For example, if secrets prefix is ``airflow/connections/smtp_default``, this would be accessible
|
||||
if you provide ``{"connections_prefix": "airflow/connections"}`` and request conn_id ``smtp_default``.
|
||||
And if variables prefix is ``airflow/variables/hello``, this would be accessible
|
||||
if you provide ``{"variables_prefix": "airflow/variables"}`` and request variable key ``hello``.
|
||||
|
||||
You can also pass additional keyword arguments like ``aws_secret_access_key``, ``aws_access_key_id``
|
||||
or ``region_name`` to this class and they would be passed on to Boto3 client.
|
||||
|
||||
:param connections_prefix: Specifies the prefix of the secret to read to get Connections.
|
||||
:type connections_prefix: str
|
||||
:param variables_prefix: Specifies the prefix of the secret to read to get Variables.
|
||||
:type variables_prefix: str
|
||||
:param profile_name: The name of a profile to use. If not given, then the default profile is used.
|
||||
:type profile_name: str
|
||||
:param sep: separator used to concatenate secret_prefix and secret_id. Default: "/"
|
||||
:type sep: str
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
connections_prefix: str = 'airflow/connections',
|
||||
variables_prefix: str = 'airflow/variables',
|
||||
profile_name: Optional[str] = None,
|
||||
sep: str = "/",
|
||||
**kwargs
|
||||
):
|
||||
super().__init__(**kwargs)
|
||||
self.connections_prefix = connections_prefix.rstrip("/")
|
||||
self.variables_prefix = variables_prefix.rstrip('/')
|
||||
self.profile_name = profile_name
|
||||
self.sep = sep
|
||||
self.kwargs = kwargs
|
||||
|
||||
@cached_property
|
||||
def client(self):
|
||||
"""
|
||||
Create a Secrets Manager client
|
||||
"""
|
||||
session = boto3.session.Session(
|
||||
profile_name=self.profile_name,
|
||||
)
|
||||
return session.client(service_name="secretsmanager", **self.kwargs)
|
||||
|
||||
def get_conn_uri(self, conn_id: str) -> Optional[str]:
|
||||
"""
|
||||
Get Connection Value
|
||||
|
||||
:param conn_id: connection id
|
||||
:type conn_id: str
|
||||
"""
|
||||
return self._get_secret(self.connections_prefix, conn_id)
|
||||
|
||||
def get_variable(self, key: str) -> Optional[str]:
|
||||
"""
|
||||
Get Airflow Variable from Secrets Manager
|
||||
|
||||
:param key: Variable Key
|
||||
:return: Variable Value
|
||||
"""
|
||||
return self._get_secret(self.variables_prefix, key)
|
||||
|
||||
def _get_secret(self, path_prefix: str, secret_id: str) -> Optional[str]:
|
||||
"""
|
||||
Get secret value from Secrets Manager
|
||||
|
||||
:param path_prefix: Prefix for the Path to get Secret
|
||||
:type path_prefix: str
|
||||
:param secret_id: Secret Key
|
||||
:type secret_id: str
|
||||
"""
|
||||
secrets_path = self.build_path(path_prefix, secret_id, self.sep)
|
||||
try:
|
||||
response = self.client.get_secret_value(
|
||||
SecretId=secrets_path,
|
||||
)
|
||||
return response.get('SecretString')
|
||||
except self.client.exceptions.ResourceNotFoundException:
|
||||
self.log.debug(
|
||||
"An error occurred (ResourceNotFoundException) when calling the "
|
||||
"get_secret_value operation: "
|
||||
"Secret %s not found.", secrets_path
|
||||
)
|
||||
return None
|
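A minimal usage sketch for this backend (illustrative only; the secret names below are hypothetical and must exist in AWS Secrets Manager for the lookups to return anything):

from kite_airflow.secrets_backend import SecretsManagerBackend

backend = SecretsManagerBackend()
# Reads the secret "airflow/connections/smtp_default" and returns it as a connection URI.
smtp_uri = backend.get_conn_uri('smtp_default')
# Reads the secret "airflow/variables/hello" and returns its string value.
hello = backend.get_variable('hello')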
41
airflow/kite_airflow/slack_alerts.py
Normal file
@ -0,0 +1,41 @@
|
||||
from airflow.contrib.operators.slack_webhook_operator import SlackWebhookOperator
|
||||
from airflow.hooks.base_hook import BaseHook
|
||||
from airflow.models import Variable
|
||||
|
||||
|
||||
SLACK_CONN_ID = "slack_devops_notifications"
|
||||
|
||||
|
||||
def task_fail_slack_alert(context):
|
||||
"""
|
||||
Callback that can be used in a DAG to send an alert when a task fails
|
||||
Args:
|
||||
context (dict): Context variable passed in from Airflow
|
||||
Returns:
|
||||
None: Calls the SlackWebhookOperator execute method internally
|
||||
"""
|
||||
|
||||
if Variable.get('env', 'dev') == 'dev':
|
||||
return
|
||||
|
||||
slack_webhook_token = BaseHook.get_connection(SLACK_CONN_ID).password
|
||||
slack_msg = """
|
||||
:red_circle: Task Failed.
|
||||
*Task*: {task}
|
||||
*Dag*: {dag} (https://airflow.kite.dev/admin/airflow/tree?dag_id={dag})
|
||||
*Execution Time*: {exec_date}
|
||||
""".format(
|
||||
task=context.get("task_instance").task_id,
|
||||
dag=context.get("task_instance").dag_id,
|
||||
exec_date=context.get("execution_date"),
|
||||
)
|
||||
|
||||
failed_alert = SlackWebhookOperator(
|
||||
task_id="slack_test",
|
||||
http_conn_id=SLACK_CONN_ID,
|
||||
webhook_token=slack_webhook_token,
|
||||
message=slack_msg,
|
||||
username="airflow",
|
||||
)
|
||||
|
||||
return failed_alert.execute(context=context)
|
@ -0,0 +1,38 @@
|
||||
-- Week range is Sunday - Saturday
|
||||
{% set start_date=execution_date %}
|
||||
{% set end_date=execution_date.add(days=6) %}
|
||||
|
||||
WITH any_edit AS (
|
||||
SELECT
|
||||
userid,
|
||||
CAST(
|
||||
COUNT_IF(
|
||||
{% for language in params.languages %}
|
||||
properties__{{language}}_edit > 0 {% if not loop.last -%} OR {%- endif -%}
|
||||
{% endfor %}
|
||||
) AS double
|
||||
) / 6 AS edits -- dividing by 6 because events are reported every 10 minutes (6 per hour)
|
||||
FROM
|
||||
kite_status_normalized
|
||||
WHERE
|
||||
(
|
||||
year > {{ start_date.year }}
|
||||
OR (year = {{ start_date.year }} AND month > {{ start_date.month }})
|
||||
OR (year = {{ start_date.year }} AND month = {{ start_date.month }} AND day >= {{ start_date.day }})
|
||||
)
|
||||
AND (
|
||||
year < {{ end_date.year }}
|
||||
OR (year = {{ end_date.year }} AND month < {{ end_date.month }})
|
||||
OR (year = {{ end_date.year }} AND month = {{ end_date.month }} AND day <= {{ end_date.day }})
|
||||
)
|
||||
GROUP BY
|
||||
userid
|
||||
)
|
||||
SELECT
|
||||
{% for i in range(1, 100) %}
|
||||
approx_percentile(edits, {{i/100}}) AS pct_{{i}} {% if i < 99 -%} , {%- endif -%}
|
||||
{% endfor %}
|
||||
FROM
|
||||
any_edit
|
||||
WHERE
|
||||
edits > 0;
|
@ -0,0 +1,80 @@
|
||||
-- Calculate coding stats for the provided NUM_OF_WEEKS.
|
||||
-- Week range is Sunday - Saturday
|
||||
{% set start_date=execution_date.subtract(days=(7 * (params.num_of_weeks - 1))) %}
|
||||
{% set end_date=execution_date.add(days=6) %}
|
||||
|
||||
WITH coding_stats AS (
|
||||
SELECT
|
||||
userid,
|
||||
date_diff(
|
||||
'day',
|
||||
from_iso8601_timestamp(timestamp),
|
||||
CAST('{{end_date.to_date_string()}}' AS timestamp)
|
||||
) / 7 AS week,
|
||||
SUM(
|
||||
{% for language in params.languages %}
|
||||
COALESCE(properties__{{language}}_completions_num_selected, 0) {% if not loop.last -%} + {%- endif -%}
|
||||
{% endfor %}
|
||||
) AS completions_selected,
|
||||
CAST(
|
||||
COUNT_IF(
|
||||
{% for language in params.languages %}
|
||||
properties__{{language}}_edit > 0 {% if not loop.last -%} OR {%- endif -%}
|
||||
{% endfor %}
|
||||
) AS double
|
||||
) / 6 AS coding_hours,
|
||||
CAST(
|
||||
COUNT_IF(properties__python_edit > 0) AS double
|
||||
) / 6 AS python_hours
|
||||
FROM
|
||||
kite_status_normalized
|
||||
WHERE
|
||||
(
|
||||
year > {{ start_date.year }}
|
||||
OR (year = {{ start_date.year }} AND month > {{ start_date.month }})
|
||||
OR (year = {{ start_date.year }} AND month = {{ start_date.month }} AND day >= {{ start_date.day }})
|
||||
)
|
||||
AND (
|
||||
year < {{ end_date.year }}
|
||||
OR (year = {{ end_date.year }} AND month < {{ end_date.month }})
|
||||
OR (year = {{ end_date.year }} AND month = {{ end_date.month }} AND day <= {{ end_date.day }})
|
||||
)
|
||||
AND event = 'kite_status'
|
||||
AND regexp_like(kite_status_normalized.userid, '\\p{Cc}') = FALSE -- filter user ids that contain control characters (e.g. null bytes)
|
||||
AND regexp_replace(kite_status_normalized.userid, '\x{00}') != '' -- filter user ids that consist only of null bytes
|
||||
GROUP BY
|
||||
1,
|
||||
2
|
||||
)
|
||||
SELECT
|
||||
coding_stats.userid,
|
||||
map_agg(week, completions_selected) AS completions_selected,
|
||||
map_agg(week, coding_hours) AS coding_hours,
|
||||
map_agg(week, python_hours) AS python_hours,
|
||||
reduce(
|
||||
array_agg(
|
||||
from_iso8601_timestamp({{ params.table_daily_active_users }}.timestamp)
|
||||
ORDER BY
|
||||
{{ params.table_daily_active_users }}.timestamp DESC
|
||||
),
|
||||
0,
|
||||
(acc, current) -> if(
|
||||
date_diff('day', current, CAST('{{end_date.to_date_string()}}' AS timestamp)) / 7 - acc < 1,
|
||||
date_diff('day', current, CAST('{{end_date.to_date_string()}}' AS timestamp)) / 7 + 1,
|
||||
acc
|
||||
),
|
||||
acc -> acc
|
||||
) AS streak,
|
||||
COUNT(
|
||||
DISTINCT date_diff(
|
||||
'day',
|
||||
from_iso8601_timestamp({{ params.table_daily_active_users }}.timestamp),
|
||||
CAST('{{end_date.to_date_string()}}' AS timestamp)
|
||||
) / 7
|
||||
) AS total_weeks
|
||||
FROM
|
||||
coding_stats
|
||||
LEFT OUTER JOIN {{ params.table_daily_active_users }} ON coding_stats.userid = {{ params.table_daily_active_users }}.userid
|
||||
GROUP BY
|
||||
coding_stats.userid
|
||||
;
|
@ -0,0 +1,8 @@
|
||||
{% if prev_execution_date_success == None -%}
|
||||
DROP TABLE IF EXISTS {{ params.table_name }};
|
||||
{%- else -%}
|
||||
-- no-op query that prevents "ERROR: Parameter validation failed", raised when the rendered file contains no query
|
||||
SELECT *
|
||||
FROM {{ params.table_name }}
|
||||
LIMIT 0;
|
||||
{%- endif -%}
|
@ -0,0 +1,20 @@
|
||||
INSERT INTO
|
||||
{{ params.table_name }} (userid, timestamp)
|
||||
SELECT
|
||||
DISTINCT userid,
|
||||
timestamp
|
||||
FROM
|
||||
kite_status_normalized
|
||||
WHERE
|
||||
(
|
||||
{% for language in params.languages %}
|
||||
properties__{{language}}_events > 0 {% if not loop.last -%} OR {%- endif -%}
|
||||
{% endfor %}
|
||||
)
|
||||
AND regexp_like(userid, '\p{Cc}') = FALSE -- filter user ids that contain control characters (e.g. null bytes)
|
||||
AND regexp_replace(kite_status_normalized.userid, '\x{00}') != '' -- filter user ids that consist only of null bytes
|
||||
AND year >= {{ execution_date.year }}
|
||||
AND month >= {{ execution_date.month }}
|
||||
AND day >= {{ execution_date.day }}
|
||||
AND hour > {{ execution_date.hour }}
|
||||
;
|
@ -0,0 +1,7 @@
|
||||
CREATE EXTERNAL TABLE IF NOT EXISTS `{{params.table_name}}`(
|
||||
userid string,
|
||||
timestamp string
|
||||
)
|
||||
STORED AS PARQUET
|
||||
LOCATION
|
||||
'{{ params.data_location }}'
|
@ -0,0 +1,30 @@
|
||||
CREATE TABLE cio_profile_attrs_{{ds_nodash}}
|
||||
WITH (
|
||||
format='JSON',
|
||||
external_location = 's3://kite-metrics/athena/cio_profile_attrs/{{ds}}'
|
||||
)
|
||||
AS
|
||||
WITH current AS (
|
||||
SELECT *
|
||||
FROM hubspot_intermediate
|
||||
WHERE year = {{execution_date.year}}
|
||||
AND month = {{execution_date.month}}
|
||||
AND day = {{execution_date.day}}
|
||||
AND delta=0
|
||||
)
|
||||
SELECT
|
||||
current.userid id,
|
||||
{% for prop in params.props -%}
|
||||
subquery_{{prop}}.value {{prop}}
|
||||
{%- if not loop.last -%},{% endif %}
|
||||
{%- endfor %}
|
||||
FROM current
|
||||
{% for prop in params.props %}
|
||||
LEFT JOIN (
|
||||
SELECT userid, max_by(k, v) value
|
||||
FROM current
|
||||
CROSS JOIN unnest(user_data_{{prop}}) AS t(k, v)
|
||||
GROUP BY userid
|
||||
) subquery_{{ prop }}
|
||||
ON current.userid = subquery_{{ prop }}.userid
|
||||
{%- endfor %}
|
@ -0,0 +1,21 @@
|
||||
CREATE TABLE hubspot_delta_{{ds_nodash}}
|
||||
WITH (
|
||||
format='PARQUET',
|
||||
parquet_compression='SNAPPY',
|
||||
external_location = 's3://kite-metrics/athena/hubspot/intermediate/year={{execution_date.year}}/month={{execution_date.month}}/day={{execution_date.day}}/delta=1'
|
||||
)
|
||||
AS
|
||||
SELECT
|
||||
{% for prop in params.props %}
|
||||
{% if prop.sql.delta -%}
|
||||
CAST({{prop.sql.delta}} AS {{prop.sql.type}}) {{prop.name}},
|
||||
{%- endif -%}
|
||||
{% if prop.sql.map_delta -%}
|
||||
transform_values(multimap_agg({{prop.sql.map_delta}}), (k, v) -> reduce(v, 0, (s, x) -> s + x, (s) -> s)) {{prop.name}},
|
||||
{%- endif -%}
|
||||
{% endfor %}
|
||||
userid
|
||||
FROM kite_status_normalized
|
||||
WHERE year={{execution_date.year}} AND month={{execution_date.month}} AND day={{execution_date.day}}
|
||||
AND regexp_like(userid, '^[0-9]+$')
|
||||
GROUP BY userid
|
@ -0,0 +1 @@
|
||||
XXXXXXX
|
@ -0,0 +1,80 @@
|
||||
CREATE TABLE hubspot_rollup_{{ds_nodash}}
|
||||
WITH (
|
||||
format='PARQUET',
|
||||
parquet_compression='SNAPPY',
|
||||
external_location = 's3://kite-metrics/athena/hubspot/intermediate/year={{execution_date.year}}/month={{execution_date.month}}/day={{execution_date.day}}/delta=0'
|
||||
)
|
||||
AS
|
||||
WITH current AS (
|
||||
SELECT *
|
||||
FROM hubspot_intermediate hs
|
||||
WHERE (
|
||||
hs.delta=1 AND
|
||||
hs.year={{execution_date.year}} AND
|
||||
hs.month={{execution_date.month}} AND
|
||||
hs.day={{execution_date.day}}
|
||||
) OR (
|
||||
hs.delta=0 AND
|
||||
hs.year={{(execution_date - macros.timedelta(days=1)).year}} AND
|
||||
hs.month={{(execution_date - macros.timedelta(days=1)).month}} AND
|
||||
hs.day={{(execution_date - macros.timedelta(days=1)).day}}
|
||||
)
|
||||
),
|
||||
scalar_aggs AS (
|
||||
SELECT
|
||||
{% for prop in params.scalar_props %}
|
||||
{%- if prop.sql.agg == 'latest' -%}
|
||||
coalesce(max_by({{prop.name}}, delta)) {{prop.name}},
|
||||
{%- elif prop.sql.delta_field -%}
|
||||
{{ prop.sql.agg }}(coalesce({{prop.name}}, {{prop.sql.delta_field}})) {{prop.name}},
|
||||
{%- else -%}
|
||||
{{ prop.sql.agg }}({{prop.name}}) {{prop.name}},
|
||||
{%- endif -%}
|
||||
{% endfor %}
|
||||
current.userid
|
||||
FROM current
|
||||
GROUP BY current.userid
|
||||
)
|
||||
SELECT scalar_aggs.userid,
|
||||
{% for prop in params.scalar_props %}
|
||||
{%- if prop.sql.agg_days -%}
|
||||
|
||||
scalar_aggs.{{prop.name}} - coalesce(scalar_diff_{{ prop.sql.agg_days}}d.{{ prop.sql.delta_field or prop.name }}, 0) {{prop.name}}
|
||||
{%- else -%}
|
||||
scalar_aggs.{{prop.name}} {{prop.name}}
|
||||
{%- endif -%}
|
||||
{%- if (not loop.last) or params.map_props %},{% endif %}
|
||||
{% endfor %}
|
||||
{% for prop in params.map_props %}
|
||||
{{prop.name}}_aggs.value {{prop.name}}
|
||||
{%- if not loop.last %},{% endif %}
|
||||
{% endfor %}
|
||||
FROM scalar_aggs
|
||||
{% for tbl in params.scalar_time_rollups %}
|
||||
LEFT JOIN hubspot_intermediate scalar_diff_{{tbl}}d
|
||||
ON scalar_aggs.userid = scalar_diff_{{tbl}}d.userid
|
||||
AND scalar_diff_{{tbl}}d.delta = 1
|
||||
AND scalar_diff_{{tbl}}d.year={{(execution_date - macros.timedelta(days=tbl)).year}}
|
||||
AND scalar_diff_{{tbl}}d.month={{(execution_date - macros.timedelta(days=tbl)).month}}
|
||||
AND scalar_diff_{{tbl}}d.day={{(execution_date - macros.timedelta(days=tbl)).day}}
|
||||
{% endfor %}
|
||||
{% for prop in params.map_props %}
|
||||
LEFT JOIN (
|
||||
SELECT userid, transform_values(multimap_agg(k, v), (inner_k, inner_v) -> reduce(inner_v, cast(0 as bigint), (s, x) -> s + x, (s) -> s)) value
|
||||
FROM (
|
||||
SELECT userid, k, v
|
||||
FROM current
|
||||
CROSS JOIN unnest(coalesce({{prop.name}}, {{prop.sql.delta_field}})) as t(k, v)
|
||||
UNION ALL
|
||||
SELECT userid, k, v * -1
|
||||
FROM hubspot_intermediate
|
||||
CROSS JOIN unnest({{prop.sql.delta_field}}) as t(k, v)
|
||||
WHERE hubspot_intermediate.delta = 1
|
||||
AND hubspot_intermediate.year={{(execution_date - macros.timedelta(days=prop.sql.agg_days)).year}}
|
||||
AND hubspot_intermediate.month={{(execution_date - macros.timedelta(days=prop.sql.agg_days)).month}}
|
||||
AND hubspot_intermediate.day={{(execution_date - macros.timedelta(days=prop.sql.agg_days)).day}}
|
||||
)
|
||||
GROUP BY userid
|
||||
) {{prop.name}}_aggs
|
||||
ON scalar_aggs.userid={{prop.name}}_aggs.userid
|
||||
{% endfor %}
|
@ -0,0 +1,19 @@
|
||||
INSERT INTO activations
|
||||
WITH new_activations as (
|
||||
SELECT coalesce(properties__user_id, properties__anonymous_id) userid,
|
||||
min(from_iso8601_timestamp(timestamp)) activation_date,
|
||||
to_unixtime(min(from_iso8601_timestamp(timestamp))) activation_timestamp
|
||||
FROM kite_status_normalized
|
||||
WHERE year = {{execution_date.year}} AND month = {{execution_date.month}} AND day = {{execution_date.day}}
|
||||
AND (event='ast_node_resolved' OR event='anon_supported_file_edited')
|
||||
GROUP BY coalesce(properties__user_id, properties__anonymous_id)
|
||||
)
|
||||
SELECT new_activations.userid,
|
||||
new_activations.activation_timestamp,
|
||||
day(new_activations.activation_date) day,
|
||||
year(new_activations.activation_date) year,
|
||||
month(new_activations.activation_date) month
|
||||
FROM activations
|
||||
RIGHT OUTER JOIN new_activations ON activations.userid=new_activations.userid
|
||||
WHERE new_activations.activation_timestamp < activations.activation_timestamp
|
||||
OR activations.userid IS NULL
|
@ -0,0 +1 @@
|
||||
XXXXXXX
|
@ -0,0 +1,12 @@
|
||||
CREATE TABLE kite_status_1d_{{params.key}}_{{ds_nodash}}_json
|
||||
WITH (
|
||||
format='JSON',
|
||||
external_location = 's3://kite-metrics/athena/kite_status_1d_{{params.key}}/json/{{ds}}'
|
||||
)
|
||||
AS
|
||||
SELECT *
|
||||
FROM kite_status_1d_{{params.key}}_{{ds_nodash}}
|
||||
WHERE ({% for lang in params.languages %}{{lang}}_events > 0{% if not loop.last %} OR {% endif %}{% endfor %})
|
||||
AND year = {{execution_date.year}}
|
||||
AND month = {{execution_date.month}}
|
||||
AND day = {{execution_date.day}}
|
@ -0,0 +1,81 @@
|
||||
CREATE table kite_metrics.kite_status_normalized_{{ds_nodash}}
|
||||
WITH (
|
||||
format='PARQUET',
|
||||
parquet_compression='SNAPPY',
|
||||
partitioned_by = ARRAY['hour'],
|
||||
external_location = 's3://kite-metrics/athena/kite_status_normalized/year={{execution_date.year}}/month={{execution_date.month}}/day={{execution_date.day}}/'
|
||||
)
|
||||
AS
|
||||
WITH kite_status_normalized_ts AS (
|
||||
SELECT
|
||||
{% for field in params.schema['properties'] %}
|
||||
{%- if field != 'timestamp' %}{{field}},{% endif %}
|
||||
{%- endfor %}
|
||||
{#- Normalize older timestamps. Convert to ISO format and reset them based on prefix because they were client-reported and unreliable. #}
|
||||
if(regexp_like(timestamp, '^[0-9]+$'), to_iso8601(date_add('second', cast(timestamp as bigint) / 1000 - cast(to_unixtime(timestamp '{{execution_date.strftime('%Y-%m-%d %H:00')}}') as bigint), timestamp '{{execution_date.strftime('%Y-%m-%d %H:00')}}')), timestamp) timestamp
|
||||
FROM kite_metrics.kite_status
|
||||
WHERE event IS NOT NULL
|
||||
AND event != ''
|
||||
AND prefix >= '{{(execution_date.replace(hour=0, minute=0, second=0, microsecond=0) - macros.timedelta(hours=1)).strftime('%Y/%m/%d/%H')}}'
|
||||
AND prefix <= '{{(execution_date.replace(hour=0, minute=0, second=0, microsecond=0) + macros.timedelta(hours=25)).strftime('%Y/%m/%d/%H')}}'
|
||||
),
|
||||
kite_status_filtered AS (
|
||||
SELECT *,
|
||||
reduce(zip_with(split(sourceip, '.'),
|
||||
sequence(3, 0, -1),
|
||||
(n, p) -> cast(cast(n as bigint) * pow(256, p) as bigint)
|
||||
),
|
||||
cast(0 as bigint),
|
||||
(s, x) -> s + x,
|
||||
(s)->s
|
||||
) sourceIpNumber
|
||||
FROM kite_status_normalized_ts
|
||||
WHERE timestamp >= '{{execution_date.replace(hour=0, minute=0, second=0, microsecond=0).strftime('%Y-%m-%dT%H:%M:%S')}}'
|
||||
AND timestamp < '{{(execution_date.replace(hour=0, minute=0, second=0, microsecond=0) + macros.timedelta(days=1)).strftime('%Y-%m-%dT%H:%M:%S')}}'
|
||||
),
|
||||
maxmind_masks AS (
|
||||
SELECT DISTINCT kite_status_filtered.sourceIp sourceip,
|
||||
bitwise_and(kite_status_filtered.sourceIpNumber, maxmind.mask) maskedSourceIpNumber,
|
||||
maxmind.mask
|
||||
FROM kite_status_filtered
|
||||
CROSS JOIN (SELECT DISTINCT mask FROM maxmind_city_ipv4) maxmind
|
||||
WHERE regexp_like(kite_status_filtered.sourceIp, '^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+$')
|
||||
),
|
||||
maxmind_cities AS (
|
||||
SELECT sourceip,
|
||||
arbitrary(maxmind.country_name) country_name,
|
||||
arbitrary(maxmind.country_iso_code) country_iso_code,
|
||||
arbitrary(maxmind.subdivision_1_name) subdivision_1_name,
|
||||
arbitrary(maxmind.city_name) city_name,
|
||||
arbitrary(maxmind.time_zone) time_zone
|
||||
FROM maxmind_masks
|
||||
JOIN maxmind_city_ipv4 maxmind
|
||||
ON maxmind_masks.mask = maxmind.mask
|
||||
AND maxmind_masks.maskedSourceIpNumber = maxmind.address
|
||||
GROUP BY sourceip
|
||||
)
|
||||
SELECT
|
||||
{% for key, value in params.schema['properties'].items()|sort if key != "properties" and value.get("kite_status_normalized") != False %}
|
||||
kite_status_filtered.{{ key }} {{ key }},
|
||||
{% endfor %}
|
||||
|
||||
{% for key, value in params.schema['properties']['properties']['properties'].items()|sort if value.get("kite_status_normalized") != False %}
|
||||
{% if value.type.startswith('array') or value.type.startswith('map') -%}
|
||||
if(cardinality(kite_status_filtered.properties.{{ key }}) > 0, kite_status_filtered.properties.{{ key }}) properties__{{ key }},
|
||||
{%- else -%}
|
||||
kite_status_filtered.properties.{{ key }} properties__{{ key }},
|
||||
{%- endif -%}
|
||||
{% endfor %}
|
||||
maxmind_cities.country_name maxmind__country_name,
|
||||
maxmind_cities.country_iso_code maxmind__country_iso_code,
|
||||
maxmind_cities.subdivision_1_name maxmind__subdivision_1_name,
|
||||
maxmind_cities.city_name maxmind__city_name,
|
||||
maxmind_cities.time_zone maxmind__time_zone,
|
||||
monetizable_scores.score monetizable_score,
|
||||
monetizable_scores.model_version monetizable_model_version,
|
||||
hour(from_iso8601_timestamp(kite_status_filtered.timestamp)) hour
|
||||
FROM kite_status_filtered
|
||||
LEFT OUTER JOIN maxmind_cities
|
||||
ON kite_status_filtered.sourceIp = maxmind_cities.sourceip
|
||||
LEFT OUTER JOIN monetizable_scores
|
||||
ON kite_status_filtered.userid = monetizable_scores.userid
|
@ -0,0 +1,31 @@
|
||||
{% set execution_day = execution_date.replace(hour=0, minute=0, second=0, microsecond=0) %}
|
||||
CREATE table kite_metrics.kite_status_normalized_{{ds_nodash}}
|
||||
WITH (
|
||||
format='PARQUET',
|
||||
parquet_compression='SNAPPY',
|
||||
partitioned_by = ARRAY['hour'],
|
||||
external_location = 's3://kite-metrics/athena/kite_status_normalized/year={{execution_date.year}}/month={{execution_date.month}}/day={{execution_date.day}}/'
|
||||
)
|
||||
AS SELECT
|
||||
{% for key, value in params.schema.items()|sort if key != "properties" %}
|
||||
kite_status_segment.{{ key }} {{ key }},
|
||||
{% endfor %}
|
||||
|
||||
{% for key, value in params.schema['properties'].items()|sort %}
|
||||
{% if value.startswith('array') or value.startswith('map') -%}
|
||||
if(cardinality(kite_status_segment.properties.{{ key }}) > 0, kite_status_segment.properties.{{ key }}) properties__{{ key }},
|
||||
{%- else -%}
|
||||
kite_status_segment.properties.{{ key }} properties__{{ key }},
|
||||
{%- endif -%}
|
||||
{% endfor %}
|
||||
hour(from_iso8601_timestamp(kite_status_segment.timestamp)) hour
|
||||
FROM kite_metrics.kite_status_segment
|
||||
WHERE event IS NOT NULL
|
||||
AND event != ''
|
||||
AND prefix IN (
|
||||
'{{1000 * (execution_day - macros.timedelta(days=1)).int_timestamp}}',
|
||||
'{{1000 * execution_day.int_timestamp}}',
|
||||
'{{1000 * (execution_day + macros.timedelta(days=1)).int_timestamp}}'
|
||||
)
|
||||
AND timestamp >= '{{execution_day.strftime('%Y-%m-%dT%H:%M:%S')}}'
|
||||
AND timestamp < '{{(execution_day + macros.timedelta(days=1)).strftime('%Y-%m-%dT%H:%M:%S')}}'
|
@ -0,0 +1,22 @@
|
||||
CREATE TABLE mixpanel_people_rollup_{{ds_nodash}}
|
||||
WITH (
|
||||
format='PARQUET',
|
||||
parquet_compression='SNAPPY',
|
||||
external_location = 's3://kite-metrics/mixpanel/people/rollups/year={{execution_date.year}}/month={{execution_date.month}}/day={{execution_date.day}}/'
|
||||
)
|
||||
AS
|
||||
WITH candidates AS (
|
||||
SELECT {% for key in params.schema|sort %}{{ key }}{% if not loop.last %}, {% endif %}{% endfor %}
|
||||
FROM mixpanel_people_raw
|
||||
{% if prev_execution_date_success %}WHERE year > {{ prev_execution_date_success.year }} OR (year = {{ prev_execution_date_success.year }} AND month > {{ prev_execution_date_success.month }} ) OR (year = {{ prev_execution_date_success.year }} AND month = {{ prev_execution_date_success.month }} AND day > {{ prev_execution_date_success.day }}) {% endif %}
|
||||
UNION ALL
|
||||
SELECT {% for key in params.schema|sort %}{{ key }}{% if not loop.last %}, {% endif %}{% endfor %}
|
||||
FROM mixpanel_people
|
||||
)
|
||||
SELECT
|
||||
distinct_id,
|
||||
{% for key in params.schema if key != 'distinct_id' %}
|
||||
max_by({{ key}}, time) {{ key }}{% if not loop.last %},{% endif %}
|
||||
{% endfor %}
|
||||
FROM candidates
|
||||
GROUP BY distinct_id
|
@ -0,0 +1,18 @@
|
||||
CREATE TABLE monetizable_scores_{{ds_nodash}}
|
||||
WITH (
|
||||
format='PARQUET',
|
||||
parquet_compression='SNAPPY',
|
||||
external_location = 's3://{{params.bucket}}/monetizable/final_users/{{ds_nodash}}'
|
||||
)
|
||||
AS SELECT
|
||||
userid,
|
||||
max_by(score, timestamp) score,
|
||||
max_by(model_version, timestamp) model_version,
|
||||
max(timestamp) timestamp
|
||||
FROM
|
||||
(
|
||||
SELECT userid, score, model_version, timestamp FROM monetizable_scores
|
||||
UNION ALL
|
||||
SELECT userid, score, model_version, {{ execution_date.int_timestamp }} FROM monetizable_inf_results_{{ds_nodash}}
|
||||
) AS subq
|
||||
GROUP BY userid
|
@ -0,0 +1,73 @@
|
||||
CREATE TABLE monetizable_new_users_{{ds_nodash}}
|
||||
WITH (
|
||||
external_location ='s3://{{params.bucket}}/monetizable/new_users/{{ds_nodash}}',
|
||||
format='JSON'
|
||||
)
|
||||
AS WITH
|
||||
people AS (
|
||||
SELECT
|
||||
CAST(properties.user_id AS VARCHAR) AS userid,
|
||||
BOOL_OR(properties.windows_domain_membership) AS windows_domain_membership,
|
||||
ARBITRARY(properties.cio_experiment_trial_end_v1) AS discount
|
||||
FROM mixpanel_people
|
||||
GROUP BY 1
|
||||
),
|
||||
status AS (
|
||||
SELECT
|
||||
kite_status_normalized.userid,
|
||||
MIN(month) AS activation_month,
|
||||
ARBITRARY(properties__os) AS os,
|
||||
ARBITRARY(maxmind__country_iso_code) AS country_iso_code,
|
||||
ARBITRARY(properties__cpu_threads) AS cpu_threads,
|
||||
BOOL_OR(properties__git_found) AS git_found,
|
||||
BOOL_OR(properties__atom_installed) AS atom_installed,
|
||||
BOOL_OR(properties__intellij_installed) AS intellij_installed,
|
||||
false AS pycharm_installed,
|
||||
BOOL_OR(properties__sublime3_installed) AS sublime3_installed,
|
||||
BOOL_OR(properties__vim_installed) AS vim_installed,
|
||||
BOOL_OR(properties__vscode_installed) AS vscode_installed,
|
||||
BOOL_OR(SUBSTR(properties__intellij_version, 1, 2) NOT IN ('IC', 'PC')) AS intellij_paid,
|
||||
BOOL_OR(properties__plan IN ('pro_yearly', 'pro_monthly', 'pro_trial')) AS trial_or_converted,
|
||||
BOOL_OR(properties__plan IN ('pro_yearly', 'pro_monthly')) AS converted
|
||||
FROM kite_status_normalized
|
||||
LEFT OUTER JOIN monetizable_scores ON kite_status_normalized.userid = monetizable_scores.userid
|
||||
WHERE
|
||||
event = 'kite_status'
|
||||
AND (
|
||||
year > {{ prev_execution_date.year }}
|
||||
OR (year = {{ prev_execution_date.year }} AND month > {{ prev_execution_date.month }})
|
||||
OR (year = {{ prev_execution_date.year }} AND month = {{ prev_execution_date.month }} AND day = {{ prev_execution_date.day }})
|
||||
)
|
||||
AND (
|
||||
year < {{ next_execution_date.year }}
|
||||
OR (year = {{ next_execution_date.year }} AND month < {{ next_execution_date.month }})
|
||||
OR (year = {{ next_execution_date.year }} AND month = {{ next_execution_date.month }} AND day <= {{ next_execution_date.day }})
|
||||
)
|
||||
AND kite_status_normalized.userid IS NOT NULL
|
||||
AND kite_status_normalized.userid != '0'
|
||||
AND (monetizable_scores.timestamp IS NULL OR date_diff('day', from_unixtime(monetizable_scores.timestamp, 'utc'), now()) >= 7)
|
||||
GROUP BY 1
|
||||
)
|
||||
SELECT
|
||||
status.userid,
|
||||
activation_month,
|
||||
cast(to_unixtime(current_timestamp) as bigint) timestamp,
|
||||
COALESCE(os, '{unknown}') AS os,
|
||||
COALESCE(country_iso_code, '{unknown}') AS country_iso_code,
|
||||
COALESCE(cpu_threads, 0) cpu_threads,
|
||||
COALESCE(git_found, FALSE) AS git_found,
|
||||
COALESCE(atom_installed, FALSE) AS atom_installed,
|
||||
COALESCE(intellij_installed, FALSE) AS intellij_installed,
|
||||
COALESCE(pycharm_installed, FALSE) AS pycharm_installed,
|
||||
COALESCE(sublime3_installed, FALSE) AS sublime3_installed,
|
||||
COALESCE(vim_installed, FALSE) AS vim_installed,
|
||||
COALESCE(vscode_installed, FALSE) AS vscode_installed,
|
||||
COALESCE(intellij_paid, FALSE) AS intellij_paid,
|
||||
COALESCE(windows_domain_membership, FALSE) AS windows_domain_membership,
|
||||
COALESCE(discount, 'no discount') AS discount,
|
||||
COALESCE(trial_or_converted, FALSE) AS trial_or_converted,
|
||||
COALESCE(converted, FALSE) AS converted
|
||||
|
||||
FROM status
|
||||
LEFT JOIN people
|
||||
ON status.userid = people.userid
|
@ -0,0 +1,26 @@
|
||||
CREATE EXTERNAL TABLE IF NOT EXISTS `hubspot_intermediate` (
|
||||
userid string,
|
||||
{% for prop in params.props if prop.sql.type %}
|
||||
{{ prop.name }} {{ prop.sql.type }}{% if not loop.last %},{% endif %}
|
||||
{% endfor %}
|
||||
)
|
||||
PARTITIONED BY (
|
||||
`year` int,
|
||||
`month` int,
|
||||
`day` int,
|
||||
`delta` int
|
||||
)
|
||||
STORED AS PARQUET
|
||||
LOCATION 's3://kite-metrics/athena/hubspot/intermediate/'
|
||||
TBLPROPERTIES (
|
||||
'projection.enabled'='true',
|
||||
'projection.year.type'='integer',
|
||||
'projection.year.range'='2010,2100',
|
||||
'projection.month.type'='integer',
|
||||
'projection.month.range'='1,12',
|
||||
'projection.day.type'='integer',
|
||||
'projection.day.range'='1,31',
|
||||
'projection.delta.type'='integer',
|
||||
'projection.delta.range'='0,1',
|
||||
'storage.location.template'='s3://kite-metrics/athena/hubspot/intermediate/year=${year}/month=${month}/day=${day}/delta=${delta}'
|
||||
);
|
@ -0,0 +1,15 @@
|
||||
CREATE EXTERNAL TABLE `kite_link_stats_youtube_channels`(
|
||||
id string,
|
||||
name string,
|
||||
last_updated string,
|
||||
is_backfilled boolean,
|
||||
last_backfill_until string
|
||||
)
|
||||
ROW FORMAT SERDE
|
||||
'org.openx.data.jsonserde.JsonSerDe'
|
||||
STORED AS INPUTFORMAT
|
||||
'org.apache.hadoop.mapred.TextInputFormat'
|
||||
OUTPUTFORMAT
|
||||
'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
|
||||
LOCATION
|
||||
'{{ params.data_location }}'
|
@ -0,0 +1,12 @@
|
||||
CREATE EXTERNAL TABLE `kite_link_stats_youtube_videos`(
|
||||
id string,
|
||||
channel_id string
|
||||
)
|
||||
ROW FORMAT SERDE
|
||||
'org.openx.data.jsonserde.JsonSerDe'
|
||||
STORED AS INPUTFORMAT
|
||||
'org.apache.hadoop.mapred.TextInputFormat'
|
||||
OUTPUTFORMAT
|
||||
'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
|
||||
LOCATION
|
||||
'{{ params.data_location }}'
|
@ -0,0 +1,29 @@
|
||||
CREATE EXTERNAL TABLE `kite_status`(
|
||||
{%- for key, value in params.schema['properties'].items()|sort if key != "properties" and value.get("kite_status_normalized") != False %}
|
||||
`{{ key }}` {{ value.type|safe }},
|
||||
{%- endfor %}
|
||||
`properties` struct<
|
||||
{% for key, value in params.schema['properties']['properties']['properties'].items()|sort if value.get("kite_status_normalized") != False -%}
|
||||
{{ key }}:{{ value.type|safe }}{% if not loop.last %},{% endif %}
|
||||
{%- endfor %}
|
||||
>
|
||||
)
|
||||
PARTITIONED BY (
|
||||
`prefix` string)
|
||||
ROW FORMAT SERDE
|
||||
'org.openx.data.jsonserde.JsonSerDe'
|
||||
STORED AS INPUTFORMAT
|
||||
'org.apache.hadoop.mapred.TextInputFormat'
|
||||
OUTPUTFORMAT
|
||||
'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
|
||||
LOCATION
|
||||
's3://kite-metrics/firehose/kite_status'
|
||||
TBLPROPERTIES (
|
||||
'projection.enabled'='true',
|
||||
'projection.prefix.format'='yyyy/MM/dd/HH',
|
||||
'projection.prefix.interval'='1',
|
||||
'projection.prefix.interval.unit'='HOURS',
|
||||
'projection.prefix.range'='2018/01/01/00,NOW',
|
||||
'projection.prefix.type'='date',
|
||||
'storage.location.template'='s3://kite-metrics/firehose/kite_status/${prefix}'
|
||||
)
|
@ -0,0 +1,37 @@
|
||||
CREATE EXTERNAL TABLE `kite_status_normalized`(
|
||||
{% for key, value in params.schema['properties'].items()|sort if key != "properties" and value.get("kite_status_normalized") != False %}
|
||||
`{{ key }}` {{ value.type|safe }},
|
||||
{% endfor %}
|
||||
|
||||
{% for key, value in params.schema['properties']['properties']['properties'].items()|sort if value.get("kite_status_normalized") != False %}
|
||||
`properties__{{ key }}` {{ value.type|safe }},
|
||||
{% endfor %}
|
||||
|
||||
`maxmind__country_name` string,
|
||||
`maxmind__country_iso_code` string,
|
||||
`maxmind__subdivision_1_name` string,
|
||||
`maxmind__city_name` string,
|
||||
`maxmind__time_zone` string,
|
||||
`monetizable_score` double,
|
||||
`monetizable_model_version` string
|
||||
)
|
||||
PARTITIONED BY (
|
||||
`year` int,
|
||||
`month` int,
|
||||
`day` int,
|
||||
`hour` int
|
||||
)
|
||||
STORED AS PARQUET
|
||||
LOCATION 's3://kite-metrics/athena/kite_status_normalized/'
|
||||
TBLPROPERTIES (
|
||||
'projection.enabled'='true',
|
||||
'projection.year.type'='integer',
|
||||
'projection.year.range'='2010,2100',
|
||||
'projection.month.type'='integer',
|
||||
'projection.month.range'='1,12',
|
||||
'projection.day.type'='integer',
|
||||
'projection.day.range'='1,31',
|
||||
'projection.hour.type'='integer',
|
||||
'projection.hour.range'='0,23',
|
||||
'storage.location.template'='s3://kite-metrics/athena/kite_status_normalized/year=${year}/month=${month}/day=${day}/hour=${hour}'
|
||||
);
|
@ -0,0 +1,27 @@
|
||||
CREATE EXTERNAL TABLE `kite_status_segment`(
|
||||
{%- for key, value in params.schema['properties'].items()|sort if key != "properties" and value.get("kite_status_normalized") != False %}
|
||||
`{{ key }}` {{ value.type|safe }},
|
||||
{%- endfor %}
|
||||
`properties` struct<
|
||||
{% for key, value in params.schema['properties']['properties']['properties'].items()|sort if value.get("kite_status_normalized") != False -%}
|
||||
{{ key }}:{{ value.type|safe }}{% if not loop.last %},{% endif %}
|
||||
{%- endfor %}
|
||||
>
|
||||
)
|
||||
PARTITIONED BY (
|
||||
`prefix` string)
|
||||
ROW FORMAT SERDE
|
||||
'org.openx.data.jsonserde.JsonSerDe'
|
||||
STORED AS INPUTFORMAT
|
||||
'org.apache.hadoop.mapred.TextInputFormat'
|
||||
OUTPUTFORMAT
|
||||
'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
|
||||
LOCATION
|
||||
's3://kite-metrics/segment-logs/XXXXXXX'
|
||||
TBLPROPERTIES (
|
||||
'projection.enabled'='true',
|
||||
'projection.prefix.interval'='XXXXXXX',
|
||||
'projection.prefix.range'='XXXXXXX,XXXXXXX',
|
||||
'projection.prefix.type'='integer',
|
||||
'storage.location.template'='s3://kite-metrics/segment-logs/XXXXXXX/${prefix}'
|
||||
)
|
@ -0,0 +1,44 @@
|
||||
{% macro struct(dct) -%}
|
||||
struct<
|
||||
{% for key, value in dct.items() %}
|
||||
{{ key }}: {% if value is mapping %}{{ struct(value) }}{% else %}{{ value }}{% endif %}{% if not loop.last %},{% endif %}
|
||||
{% endfor %}
|
||||
>
|
||||
{%- endmacro %}
|
||||
|
||||
CREATE EXTERNAL TABLE `{{ params.table_name }}` (
|
||||
{% for key, value in params.schema.items() %}
|
||||
{{ key }} {% if value is mapping %}{{ struct(value) }}{% else %}{{ value }}{% endif %}{% if not loop.last %},{% endif %}
|
||||
{% endfor %}
|
||||
)
|
||||
{% if params.partitioned %}
|
||||
PARTITIONED BY (
|
||||
`year` int,
|
||||
`month` int,
|
||||
`day` int
|
||||
)
|
||||
{% endif %}
|
||||
{% if params.json %}
|
||||
ROW FORMAT SERDE
|
||||
'org.openx.data.jsonserde.JsonSerDe'
|
||||
STORED AS INPUTFORMAT
|
||||
'org.apache.hadoop.mapred.TextInputFormat'
|
||||
OUTPUTFORMAT
|
||||
'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
|
||||
LOCATION
|
||||
's3://kite-metrics/{{ params.s3_prefix }}'
|
||||
TBLPROPERTIES (
|
||||
'projection.enabled'='true',
|
||||
'projection.year.type'='integer',
|
||||
'projection.year.range'='2020,2100',
|
||||
'projection.month.type'='integer',
|
||||
'projection.month.range'='1,12',
|
||||
'projection.day.type'='integer',
|
||||
'projection.day.range'='1,31',
|
||||
'storage.location.template'='s3://kite-metrics/{{ params.s3_prefix }}/year=${year}/month=${month}/day=${day}'
|
||||
)
|
||||
{% else %}
|
||||
STORED AS PARQUET
|
||||
LOCATION
|
||||
's3://kite-metrics/{{ params.s3_prefix }}'
|
||||
{% endif %}
|
@ -0,0 +1,10 @@
|
||||
CREATE EXTERNAL TABLE monetizable_inf_results_{{ds_nodash}} (
|
||||
userid string,
|
||||
score double,
|
||||
model_version string,
|
||||
timestamp bigint
|
||||
)
|
||||
ROW FORMAT SERDE
|
||||
'org.openx.data.jsonserde.JsonSerDe'
|
||||
LOCATION
|
||||
's3://{{params.bucket}}/monetizable/inf_results/{{ds_nodash}}/'
|
@ -0,0 +1,21 @@
|
||||
CREATE EXTERNAL TABLE `youtube_channel_details`(
|
||||
id string,
|
||||
forUsername string,
|
||||
snippet struct<
|
||||
title: string,
|
||||
customUrl: string
|
||||
>,
|
||||
statistics struct<
|
||||
viewCount: string,
|
||||
subscriberCount: string,
|
||||
videoCount: string
|
||||
>
|
||||
)
|
||||
ROW FORMAT SERDE
|
||||
'org.openx.data.jsonserde.JsonSerDe'
|
||||
STORED AS INPUTFORMAT
|
||||
'org.apache.hadoop.mapred.TextInputFormat'
|
||||
OUTPUTFORMAT
|
||||
'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
|
||||
LOCATION
|
||||
's3://kite-youtube-data/channel_details/'
|
@ -0,0 +1,11 @@
|
||||
CREATE EXTERNAL TABLE `youtube_channels`(
|
||||
id string
|
||||
)
|
||||
ROW FORMAT SERDE
|
||||
'org.openx.data.jsonserde.JsonSerDe'
|
||||
STORED AS INPUTFORMAT
|
||||
'org.apache.hadoop.mapred.TextInputFormat'
|
||||
OUTPUTFORMAT
|
||||
'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
|
||||
LOCATION
|
||||
's3://kite-youtube-data/channels/'
|
@ -0,0 +1,15 @@
|
||||
CREATE EXTERNAL TABLE `youtube_queries`(
|
||||
tagname string,
|
||||
count bigint,
|
||||
query string,
|
||||
seed boolean,
|
||||
generation int
|
||||
)
|
||||
ROW FORMAT SERDE
|
||||
'org.openx.data.jsonserde.JsonSerDe'
|
||||
STORED AS INPUTFORMAT
|
||||
'org.apache.hadoop.mapred.TextInputFormat'
|
||||
OUTPUTFORMAT
|
||||
'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
|
||||
LOCATION
|
||||
's3://kite-youtube-data/search_queries/'
|
@ -0,0 +1,15 @@
|
||||
CREATE EXTERNAL TABLE `youtube_searches`(
|
||||
query string,
|
||||
query_hash string,
|
||||
timestamp string,
|
||||
total int,
|
||||
unique int
|
||||
)
|
||||
ROW FORMAT SERDE
|
||||
'org.openx.data.jsonserde.JsonSerDe'
|
||||
STORED AS INPUTFORMAT
|
||||
'org.apache.hadoop.mapred.TextInputFormat'
|
||||
OUTPUTFORMAT
|
||||
'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
|
||||
LOCATION
|
||||
's3://kite-youtube-data/searches/'
|
@ -0,0 +1,14 @@
|
||||
CREATE EXTERNAL TABLE `youtube_socialblade_stats`(
|
||||
id string,
|
||||
timestamp timestamp,
|
||||
success boolean,
|
||||
monthlyViews string
|
||||
)
|
||||
ROW FORMAT SERDE
|
||||
'org.openx.data.jsonserde.JsonSerDe'
|
||||
STORED AS INPUTFORMAT
|
||||
'org.apache.hadoop.mapred.TextInputFormat'
|
||||
OUTPUTFORMAT
|
||||
'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
|
||||
LOCATION
|
||||
's3://kite-youtube-data/socialblade_stats/'
|
0
airflow/kite_airflow/youtube_dashboard/__init__.py
Normal file
178
airflow/kite_airflow/youtube_dashboard/api.py
Normal file
@ -0,0 +1,178 @@
|
||||
import time
|
||||
|
||||
from kite_airflow.common import utils as common_utils
|
||||
from kite_airflow.youtube_dashboard import utils
|
||||
|
||||
|
||||
def get_activity_list(yt_client, channel_id, published_after=None, page_token=None):
|
||||
'''
|
||||
Uses YouTube Activity List API to get activities.
|
||||
|
||||
Returns:\n
|
||||
list: activity items
|
||||
string: token which we can use to request next page
|
||||
'''
|
||||
request = yt_client.activities().list(
|
||||
part='id,snippet,contentDetails',
|
||||
channelId=channel_id,
|
||||
maxResults=50,
|
||||
publishedAfter=published_after if published_after else common_utils.get_date_time_in_ISO(),
|
||||
pageToken=page_token,
|
||||
)
|
||||
activity_list_response = request.execute()
|
||||
|
||||
return activity_list_response['items'], activity_list_response.get('nextPageToken')
|
||||
|
||||
|
||||
def get_all_activity_list(yt_client, channel_id, published_after=None):
|
||||
'''
|
||||
Uses YouTube Activity List API to get the list of all activities from the given date.
|
||||
|
||||
Returns:\n
|
||||
list: all activities found
|
||||
'''
|
||||
|
||||
all_activities = []
|
||||
next_page_token = None
|
||||
exception = None
|
||||
|
||||
try:
|
||||
while True:
|
||||
activity_list, next_page_token = get_activity_list(
|
||||
yt_client,
|
||||
channel_id,
|
||||
published_after,
|
||||
next_page_token,
|
||||
)
|
||||
|
||||
if activity_list:
|
||||
all_activities.extend(activity_list)
|
||||
|
||||
if not next_page_token:
|
||||
break
|
||||
|
||||
except Exception as e:
|
||||
exception = e
|
||||
|
||||
finally:
|
||||
return all_activities, exception
|
||||
|
||||
|
||||
def filter_video_activity_from_list(activity_list):
|
||||
'''
|
||||
Filters upload video activities from all activities
|
||||
'''
|
||||
|
||||
new_upload_video_activity_list = []
|
||||
|
||||
for activity in activity_list:
|
||||
if activity['snippet']['type'] == 'upload':
|
||||
new_upload_video_activity_list.append(activity)
|
||||
|
||||
return new_upload_video_activity_list
|
||||
|
||||
|
||||
def get_unique_upload_video_activity_list(video_activity_list):
|
||||
'''
|
||||
Filters duplicated upload video activities.
|
||||
|
||||
The YouTube Activity API can return the same upload video activity more than once
|
||||
(the exact reason is unclear), and there is no easy way to filter
|
||||
the duplicates upstream, so this function removes them based
|
||||
on video IDs.
|
||||
'''
|
||||
|
||||
video_ids = set() # using it to filter videos
|
||||
unique_video_activity_list = []
|
||||
|
||||
for video_activity in video_activity_list:
|
||||
video_id = utils.get_id_of_video_activity(video_activity)
|
||||
|
||||
if video_id not in video_ids:
|
||||
video_ids.add(video_id)
|
||||
unique_video_activity_list.append(video_activity)
|
||||
|
||||
return unique_video_activity_list
|
||||
|
||||
|
||||
def get_video_search_list(yt_client, channel_id, published_before=None, page_token=None):
|
||||
'''
|
||||
Uses YouTube Search List API to get recent videos.
|
||||
|
||||
Returns:\n
|
||||
list: searched video items
|
||||
string: token which we can use to request next page
|
||||
'''
|
||||
|
||||
request = yt_client.search().list(
|
||||
part='snippet',
|
||||
channelId=channel_id,
|
||||
maxResults=50,
|
||||
publishedBefore=published_before if published_before else common_utils.get_date_time_in_ISO(),
|
||||
type='video',
|
||||
order='date',
|
||||
pageToken=page_token,
|
||||
)
|
||||
video_search_list_response = request.execute()
|
||||
|
||||
return video_search_list_response['items'], video_search_list_response.get('nextPageToken')
|
||||
|
||||
|
||||
def get_all_video_search_list(yt_client, channel_id, published_before, search_budget):
|
||||
'''
|
||||
Uses YouTube Search List API to get all available videos of a channel
|
||||
|
||||
Returns:\n
|
||||
list: all videos of channel
|
||||
'''
|
||||
|
||||
no_of_searches = 0
|
||||
all_video_searches = []
|
||||
next_page_token = None
|
||||
has_channel_search_remaining = True
|
||||
exception = None
|
||||
|
||||
try:
|
||||
while True:
|
||||
video_search_list, next_page_token = get_video_search_list(
|
||||
yt_client,
|
||||
channel_id,
|
||||
published_before,
|
||||
next_page_token
|
||||
)
|
||||
|
||||
has_channel_search_remaining = bool(next_page_token)
|
||||
|
||||
if video_search_list:
|
||||
all_video_searches.extend(video_search_list)
|
||||
|
||||
if not next_page_token:
|
||||
break
|
||||
|
||||
no_of_searches += 1
|
||||
|
||||
if search_budget - no_of_searches <= 0:
|
||||
break
|
||||
|
||||
except Exception as e:
|
||||
exception = e
|
||||
|
||||
finally:
|
||||
return all_video_searches, has_channel_search_remaining, no_of_searches, exception
|
||||
|
||||
|
||||
def get_video_list(yt_client, videos_id_list):
|
||||
'''
|
||||
Uses YouTube Video List API to get details about the video
|
||||
|
||||
Returns:\n
|
||||
list: detailed info of videos
|
||||
'''
|
||||
|
||||
request = yt_client.videos().list(
|
||||
part='snippet,statistics',
|
||||
id=','.join(videos_id_list)
|
||||
)
|
||||
video_list_response = request.execute()
|
||||
|
||||
return video_list_response['items']
|
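# A minimal usage sketch (not part of the original module): `yt_client` is
# assumed to be built with googleapiclient.discovery.build('youtube', 'v3', ...).
# It chains the helpers above: fetch a channel's recent activities, keep the
# unique uploads, then look up their details.
#
# activities, err = get_all_activity_list(yt_client, channel_id)
# uploads = get_unique_upload_video_activity_list(filter_video_activity_from_list(activities))
# video_ids = [utils.get_id_of_video_activity(a) for a in uploads]
# videos = get_video_list(yt_client, video_ids)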
106
airflow/kite_airflow/youtube_dashboard/files.py
Normal file
@ -0,0 +1,106 @@
|
||||
import datetime
|
||||
import json
|
||||
import csv
|
||||
import codecs
|
||||
|
||||
from airflow.hooks.S3_hook import S3Hook
|
||||
|
||||
from kite_airflow.common import configs
|
||||
from kite_airflow.common import utils
|
||||
from kite_airflow.common import files
|
||||
|
||||
|
||||
BUCKET = 'kite-youtube-data' if utils.is_production() else 'kite-metrics-test'
|
||||
|
||||
DIR_PROJECT = 'youtube-dashboard'
|
||||
DIR_SCRATCH = 'athena-scratch-space'
|
||||
DIR_CHANNELS = '{}/channels'.format(DIR_PROJECT)
|
||||
DIR_VIDEOS = '{}/videos'.format(DIR_PROJECT)
|
||||
DIR_ACTIVITIES = '{}/activities'.format(DIR_PROJECT)
|
||||
DIR_SNAPSHOTS = '{}/snapshots'.format(DIR_PROJECT)
|
||||
|
||||
FILE_CACHED_URLS = '{}/cached_urls.csv'.format(DIR_PROJECT)
|
||||
|
||||
|
||||
def get_scratch_space_csv(ti, task_id):
|
||||
'''
|
||||
Get the contents of a CSV file in the scratch space as a list of dicts.
|
||||
'''
|
||||
|
||||
s3 = S3Hook(configs.AWS_CONN_ID)
|
||||
filename = ti.xcom_pull(task_ids=task_id)
|
||||
s3key = s3.get_key(
|
||||
'{}/{}.csv'.format(DIR_SCRATCH, filename),
|
||||
BUCKET,
|
||||
)
|
||||
|
||||
json_list = []
|
||||
reader = csv.DictReader(
|
||||
codecs.getreader("utf-8")(s3key.get()['Body'])
|
||||
)
|
||||
|
||||
for row in reader:
|
||||
json_list.append(row)
|
||||
|
||||
return json_list
|
||||
|
||||
|
||||
def write_json_list_on_file(file_path, json_list):
|
||||
s3_hook = S3Hook(configs.AWS_CONN_ID)
|
||||
data = []
|
||||
|
||||
for json_obj in json_list:
|
||||
data.append(json.dumps(json_obj))
|
||||
|
||||
s3_hook.load_bytes(
|
||||
'\n'.join(data).encode('utf-8'),
|
||||
file_path,
|
||||
BUCKET,
|
||||
replace=True,
|
||||
)
|
||||
|
||||
|
||||
def get_cached_urls_from_file():
|
||||
try:
|
||||
cached_urls_list = files.get_csv_file_as_dict(BUCKET, FILE_CACHED_URLS)
|
||||
|
||||
except:
|
||||
cached_urls_list = []
|
||||
|
||||
cached_urls_dict = {}
|
||||
for cached_url in cached_urls_list:
|
||||
cached_urls_dict[cached_url['url']] = bool(cached_url['is_a_kite_redirect'])
|
||||
|
||||
return cached_urls_dict
|
||||
|
||||
|
||||
def write_cached_urls_on_file(cached_urls_dict):
|
||||
cached_urls_list = []
|
||||
for url, is_kite_redirect in cached_urls_dict.items():
|
||||
cached_urls_list.append(
|
||||
{
|
||||
'url': url,
|
||||
'is_a_kite_redirect': 'True' if is_kite_redirect else '' # empty string represents false
|
||||
}
|
||||
)
|
||||
|
||||
files.write_dict_on_csv_file(BUCKET, FILE_CACHED_URLS, cached_urls_list)
|
||||
|
||||
|
||||
def write_channels_on_file(channel_list):
|
||||
write_json_list_on_file(DIR_CHANNELS + '/channels.json', channel_list)
|
||||
|
||||
|
||||
def write_activities_on_file(activity_list):
|
||||
file_path = DIR_ACTIVITIES + '/activities' + utils.get_unique_suffix()
|
||||
write_json_list_on_file(file_path, activity_list)
|
||||
|
||||
|
||||
def write_videos_on_file(video_list):
|
||||
file_path = DIR_VIDEOS + '/videos' + utils.get_unique_suffix()
|
||||
write_json_list_on_file(file_path, video_list)
|
||||
|
||||
|
||||
def write_snapshots_on_file(snapshot_list):
|
||||
file_path = DIR_SNAPSHOTS + '/snapshots' + utils.get_unique_suffix()
|
||||
write_json_list_on_file(file_path, snapshot_list)
|
75
airflow/kite_airflow/youtube_dashboard/utils.py
Normal file
@ -0,0 +1,75 @@
|
||||
import re
|
||||
import requests
|
||||
|
||||
|
||||
def get_video_id_of_search_item(search_item):
|
||||
return search_item['id']['videoId']
|
||||
|
||||
|
||||
def get_published_date_of_search_item(search_item):
|
||||
return search_item['snippet']['publishedAt']
|
||||
|
||||
|
||||
def get_id_of_video_activity(video_activity):
|
||||
return video_activity['contentDetails']['upload']['videoId']
|
||||
|
||||
|
||||
def get_id_of_video_item(video_item):
|
||||
return video_item['id']
|
||||
|
||||
|
||||
def get_description_of_video_item(video_item):
|
||||
return video_item['snippet']['description']
|
||||
|
||||
|
||||
def get_views_of_video_item(video_item):
|
||||
return video_item['statistics'].get('viewCount')
|
||||
|
||||
|
||||
def is_link_present_in_description(video_item, cached_urls_dict):
|
||||
'''
|
||||
Looks for a Kite link in the description; for shortened URLs it also updates the cache
|
||||
that we use to avoid repeat requests for the same URL, since descriptions
|
||||
from the same channel mostly contain the same repetitive links.
|
||||
|
||||
Returns:\n
|
||||
boolean:
|
||||
indicates whether a Kite link is present
|
||||
'''
|
||||
|
||||
kite_url = 'kite.com'
|
||||
description = get_description_of_video_item(video_item)
|
||||
|
||||
# YouTubers always use the word Kite in the description, so if it's not present
|
||||
# then no further search is needed
|
||||
if 'kite' not in description.lower():
|
||||
return False
|
||||
|
||||
if kite_url in description:
|
||||
return True
|
||||
|
||||
# Some YouTubers use link shorteners, so for those we use a combination of the cache
|
||||
# and HEAD requests to check whether Kite redirects are present
|
||||
urls = re.findall('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', description)
|
||||
unique_urls = list(dict.fromkeys(urls))
|
||||
|
||||
for url in unique_urls:
|
||||
if url in cached_urls_dict:
|
||||
if cached_urls_dict[url]:
|
||||
return True
|
||||
else:
|
||||
continue # not returning False because a Kite link can be added after we have taken the snapshot
|
||||
|
||||
try:
|
||||
response = requests.head(url)
|
||||
location_header = response.headers.get('Location')
|
||||
is_a_kite_redirect = location_header and kite_url in location_header
|
||||
cached_urls_dict[url] = 'True' if is_a_kite_redirect else '' # empty string represents false
|
||||
|
||||
if is_a_kite_redirect:
|
||||
return True
|
||||
|
||||
except Exception:
|
||||
cached_urls_dict[url] = ''
|
||||
|
||||
return False
|
9
airflow/requirements.txt
Normal file
@ -0,0 +1,9 @@
|
||||
elasticsearch==7.7.0
|
||||
gevent
|
||||
mixpanel
|
||||
customerio
|
||||
requests
|
||||
sagemaker
|
||||
google-api-python-client
|
||||
google-auth-httplib2
|
||||
google-auth-oauthlib
|
17
airflow/setup.py
Normal file
@ -0,0 +1,17 @@
|
||||
import setuptools
|
||||
|
||||
setuptools.setup(
|
||||
name="kite-airflow-dags", # Replace with your own username
|
||||
version="0.0.1",
|
||||
author="Kite Team",
|
||||
description="Kite Airflow codes.",
|
||||
packages=setuptools.find_packages(),
|
||||
python_requires='>=3.6',
|
||||
include_package_data = True,
|
||||
|
||||
entry_points = {
|
||||
'airflow.plugins': [
|
||||
'google_plugin = kite_airflow.plugins.google:GoogleSheetsPlugin'
|
||||
]
|
||||
}
|
||||
)
|
1
airflow/terraform/.gitignore
vendored
Normal file
@ -0,0 +1 @@
|
||||
.terraform
|
383
airflow/terraform/main.tf
Normal file
@ -0,0 +1,383 @@
|
||||
terraform {
|
||||
backend "s3" {
|
||||
bucket = "kite-terraform-state"
|
||||
workspace_key_prefix = "deployments/airflow"
|
||||
key = "terraform.tfstate"
|
||||
region = "us-west-1"
|
||||
}
|
||||
}
|
||||
|
||||
provider "aws" {
|
||||
region = var.region
|
||||
}
|
||||
|
||||
provider "aws" {
|
||||
region = "us-west-1"
|
||||
alias = "uswest1"
|
||||
}
|
||||
|
||||
resource "aws_ecs_cluster" "airflow" {
|
||||
name = var.service_name
|
||||
capacity_providers = ["FARGATE"]
|
||||
}
|
||||
|
||||
resource "aws_iam_role" "airflow_task_execution" {
|
||||
name = "instance_role_airflow"
|
||||
|
||||
assume_role_policy = <<EOF
|
||||
{
|
||||
"Version": "2012-10-17",
|
||||
"Statement": [
|
||||
{
|
||||
"Action": "sts:AssumeRole",
|
||||
"Principal": {
|
||||
"Service": "ecs-tasks.amazonaws.com"
|
||||
},
|
||||
"Effect": "Allow",
|
||||
"Sid": ""
|
||||
}
|
||||
]
|
||||
}
|
||||
EOF
|
||||
}
|
||||
|
||||
resource "aws_iam_role_policy" "airflow_task_execution" {
|
||||
name = "airflow-execution-policy"
|
||||
role = aws_iam_role.airflow_task_execution.id
|
||||
|
||||
policy = <<-EOF
|
||||
{
|
||||
"Version": "2012-10-17",
|
||||
"Statement": [
|
||||
{
|
||||
"Effect": "Allow",
|
||||
"Action": [
|
||||
"ecr:GetAuthorizationToken",
|
||||
"ecr:BatchCheckLayerAvailability",
|
||||
"ecr:GetDownloadUrlForLayer",
|
||||
"ecr:BatchGetImage",
|
||||
"logs:CreateLogGroup",
|
||||
"logs:CreateLogStream",
|
||||
"logs:PutLogEvents"
|
||||
],
|
||||
"Resource": "*"
|
||||
},
|
||||
{
|
||||
"Effect": "Allow",
|
||||
"Action": [
|
||||
"secretsmanager:GetResourcePolicy",
|
||||
"secretsmanager:GetSecretValue",
|
||||
"secretsmanager:DescribeSecret",
|
||||
"secretsmanager:ListSecretVersionIds"
|
||||
],
|
||||
"Resource": [
|
||||
"${data.aws_secretsmanager_secret.sql_alchemy_conn_str.arn}",
|
||||
"${data.aws_secretsmanager_secret.result_db_uri.arn}"
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
EOF
|
||||
}
|
||||
|
||||
resource "aws_iam_role" "airflow_task" {
|
||||
name = "airflow-container-role"
|
||||
|
||||
assume_role_policy = <<EOF
|
||||
{
|
||||
"Version": "2012-10-17",
|
||||
"Statement": [
|
||||
{
|
||||
"Action": "sts:AssumeRole",
|
||||
"Principal": {
|
||||
"Service": "ecs-tasks.amazonaws.com"
|
||||
},
|
||||
"Effect": "Allow",
|
||||
"Sid": ""
|
||||
}
|
||||
]
|
||||
}
|
||||
EOF
|
||||
}
|
||||
|
||||
resource "aws_iam_role_policy_attachment" "airflow-ecr" {
|
||||
role = aws_iam_role.airflow_task.name
|
||||
policy_arn = "arn:aws:iam::aws:policy/AmazonEC2ContainerRegistryReadOnly"
|
||||
}
|
||||
|
||||
resource "aws_iam_role_policy_attachment" "airflow-sm" {
|
||||
role = aws_iam_role.airflow_task.name
|
||||
policy_arn = "arn:aws:iam::aws:policy/SecretsManagerReadWrite"
|
||||
}
|
||||
|
||||
resource "aws_iam_role_policy_attachment" "airflow-s3" {
|
||||
role = aws_iam_role.airflow_task.name
|
||||
policy_arn = "arn:aws:iam::aws:policy/AmazonS3FullAccess"
|
||||
}
|
||||
|
||||
resource "aws_iam_role_policy_attachment" "airflow-athena" {
|
||||
role = aws_iam_role.airflow_task.name
|
||||
policy_arn = "arn:aws:iam::aws:policy/AmazonAthenaFullAccess"
|
||||
}
|
||||
|
||||
resource "aws_iam_role_policy_attachment" "airflow-ecs" {
|
||||
role = aws_iam_role.airflow_task.name
|
||||
policy_arn = "arn:aws:iam::aws:policy/AmazonECS_FullAccess"
|
||||
}
|
||||
|
||||
resource "aws_iam_role_policy" "airflow-cloudwatch" {
|
||||
name = "airflow-cloudwatch-policy"
|
||||
role = aws_iam_role.airflow_task.id
|
||||
|
||||
policy = <<-EOF
|
||||
{
|
||||
"Version": "2012-10-17",
|
||||
"Statement": [
|
||||
{
|
||||
"Effect": "Allow",
|
||||
"Action": [
|
||||
"logs:CreateLogGroup",
|
||||
"logs:CreateLogStream",
|
||||
"logs:PutLogEvents",
|
||||
"logs:DescribeLogStreams"
|
||||
],
|
||||
"Resource": [
|
||||
"arn:aws:logs:*:*:*"
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
EOF
|
||||
}
|
||||
|
||||
data "aws_vpc" "kite_prod" {
|
||||
filter {
|
||||
name = "tag:Name"
|
||||
values = ["kite-prod"]
|
||||
}
|
||||
}
|
||||
|
||||
data "aws_subnet" "private1" {
|
||||
vpc_id = data.aws_vpc.kite_prod.id
|
||||
filter {
|
||||
name = "tag:Name"
|
||||
values = ["az1-private"]
|
||||
}
|
||||
}
|
||||
|
||||
data "aws_subnet" "private2" {
|
||||
vpc_id = data.aws_vpc.kite_prod.id
|
||||
filter {
|
||||
name = "tag:Name"
|
||||
values = ["az2-private"]
|
||||
}
|
||||
}
|
||||
|
||||
data "aws_security_group" "vpn" {
|
||||
name = "all-vpn"
|
||||
vpc_id = data.aws_vpc.kite_prod.id
|
||||
}
|
||||
|
||||
resource "aws_security_group" "airflow" {
|
||||
name = "Airflow"
|
||||
description = "Airflow test security group"
|
||||
vpc_id = data.aws_vpc.kite_prod.id
|
||||
|
||||
ingress {
|
||||
from_port = 8080
|
||||
to_port = 8080
|
||||
protocol = "TCP"
|
||||
|
||||
self = true
|
||||
security_groups = [data.aws_security_group.vpn.id]
|
||||
}
|
||||
|
||||
egress {
|
||||
from_port = 0
|
||||
to_port = 0
|
||||
protocol = "-1"
|
||||
cidr_blocks = ["0.0.0.0/0"]
|
||||
}
|
||||
}
|
||||
|
||||
data "aws_secretsmanager_secret" "sql_alchemy_conn_str" {
|
||||
provider = aws.uswest1
|
||||
name = "airflow/db_uri"
|
||||
}
|
||||
|
||||
data "aws_secretsmanager_secret" "result_db_uri" {
|
||||
provider = aws.uswest1
|
||||
name = "airflow/result_db_uri"
|
||||
}
|
||||
|
||||
# Verify the image is published
|
||||
data "aws_ecr_image" "airflow" {
|
||||
provider = aws.uswest1
|
||||
|
||||
repository_name = var.repository_name
|
||||
image_tag = var.tag
|
||||
}
|
||||
|
||||
resource "aws_ecs_task_definition" "airflow" {
|
||||
for_each = var.tasks
|
||||
|
||||
family = each.key
|
||||
container_definitions = jsonencode(
|
||||
[
|
||||
{
|
||||
"name" = each.key,
|
||||
"image" = "${data.aws_ecr_image.airflow.registry_id}.dkr.ecr.us-west-1.amazonaws.com/${var.repository_name}:${var.tag}",
|
||||
"portMappings" = [
|
||||
{
|
||||
"containerPort" = each.value.port,
|
||||
"protocol" = "tcp"
|
||||
}
|
||||
],
|
||||
"essential" = true,
|
||||
"entryPoint" = ["airflow", each.key],
|
||||
"environment" = [
|
||||
{ "name" = "AIRFLOW_VAR_ENV", "value" = "production" },
|
||||
],
|
||||
"secrets" = [
|
||||
{ "name" = "AIRFLOW__CORE__SQL_ALCHEMY_CONN", "valueFrom" = data.aws_secretsmanager_secret.sql_alchemy_conn_str.arn },
|
||||
{ "name" = "AIRFLOW__CELERY__RESULT_BACKEND", "valueFrom" = data.aws_secretsmanager_secret.result_db_uri.arn }
|
||||
],
|
||||
"logConfiguration" = {
|
||||
"logDriver" = "awslogs",
|
||||
"options" = {
|
||||
"awslogs-create-group" = "true",
|
||||
"awslogs-region" = var.region,
|
||||
"awslogs-group" = "/ecs/airflow/${each.key}",
|
||||
"awslogs-stream-prefix" = "ecs"
|
||||
}
|
||||
}
|
||||
}
|
||||
]
|
||||
)
|
||||
|
||||
requires_compatibilities = ["FARGATE"]
|
||||
network_mode = "awsvpc"
|
||||
|
||||
execution_role_arn = aws_iam_role.airflow_task_execution.arn
|
||||
task_role_arn = aws_iam_role.airflow_task.arn
|
||||
|
||||
cpu = each.value.cpu
|
||||
memory = each.value.memory
|
||||
}
|
||||
|
||||
resource "aws_appautoscaling_target" "worker" {
|
||||
max_capacity = 8
|
||||
min_capacity = 1
|
||||
resource_id = "service/${aws_ecs_cluster.airflow.name}/${aws_ecs_service.airflow["worker"].name}"
|
||||
scalable_dimension = "ecs:service:DesiredCount"
|
||||
service_namespace = "ecs"
|
||||
}
|
||||
|
||||
resource "aws_appautoscaling_policy" "worker_policy_memory" {
|
||||
name = "memory-autoscaling"
|
||||
policy_type = "TargetTrackingScaling"
|
||||
resource_id = aws_appautoscaling_target.worker.resource_id
|
||||
scalable_dimension = aws_appautoscaling_target.worker.scalable_dimension
|
||||
service_namespace = aws_appautoscaling_target.worker.service_namespace
|
||||
|
||||
target_tracking_scaling_policy_configuration {
|
||||
predefined_metric_specification {
|
||||
predefined_metric_type = "ECSServiceAverageMemoryUtilization"
|
||||
}
|
||||
|
||||
target_value = 70
|
||||
scale_in_cooldown = 60
|
||||
scale_out_cooldown = 120
|
||||
}
|
||||
}
|
||||
|
||||
resource "aws_appautoscaling_policy" "worker_policy_cpu" {
|
||||
name = "cpu-autoscaling"
|
||||
policy_type = "TargetTrackingScaling"
|
||||
resource_id = aws_appautoscaling_target.worker.resource_id
|
||||
scalable_dimension = aws_appautoscaling_target.worker.scalable_dimension
|
||||
service_namespace = aws_appautoscaling_target.worker.service_namespace
|
||||
|
||||
target_tracking_scaling_policy_configuration {
|
||||
predefined_metric_specification {
|
||||
predefined_metric_type = "ECSServiceAverageCPUUtilization"
|
||||
}
|
||||
|
||||
target_value = 70
|
||||
scale_in_cooldown = 60
|
||||
scale_out_cooldown = 120
|
||||
}
|
||||
}
|
||||
|
||||
resource "aws_ecs_service" "airflow" {
|
||||
for_each = var.tasks
|
||||
|
||||
name = each.key
|
||||
cluster = aws_ecs_cluster.airflow.arn
|
||||
launch_type = "FARGATE"
|
||||
platform_version = "LATEST"
|
||||
task_definition = aws_ecs_task_definition.airflow[each.key].arn
|
||||
desired_count = 1
|
||||
|
||||
network_configuration {
|
||||
subnets = [data.aws_subnet.private1.id]
|
||||
security_groups = [aws_security_group.airflow.id]
|
||||
}
|
||||
|
||||
dynamic "load_balancer" {
|
||||
for_each = each.value.load_balancer ? [1] : []
|
||||
|
||||
content {
|
||||
target_group_arn = aws_lb_target_group.airflow.arn
|
||||
container_name = each.key
|
||||
container_port = each.value.port
|
||||
}
|
||||
}
|
||||
|
||||
enable_ecs_managed_tags = true
|
||||
propagate_tags = "TASK_DEFINITION"
|
||||
|
||||
lifecycle {
|
||||
ignore_changes = [desired_count]
|
||||
}
|
||||
}
|
||||
|
||||
resource "aws_lb" "airflow" {
|
||||
name = "airflow"
|
||||
subnets = [data.aws_subnet.private1.id, data.aws_subnet.private2.id]
|
||||
load_balancer_type = "application"
|
||||
internal = true
|
||||
security_groups = [data.aws_security_group.vpn.id]
|
||||
}
|
||||
|
||||
resource "aws_lb_target_group" "airflow" {
|
||||
name = "airflow"
|
||||
port = var.webserver_port
|
||||
protocol = "HTTP"
|
||||
vpc_id = data.aws_vpc.kite_prod.id
|
||||
target_type = "ip"
|
||||
|
||||
health_check {
|
||||
path = "/health"
|
||||
matcher = "200"
|
||||
interval = 300
|
||||
}
|
||||
}
|
||||
|
||||
data "aws_acm_certificate" "kite_dev" {
|
||||
domain = "*.kite.dev"
|
||||
statuses = ["ISSUED"]
|
||||
}
|
||||
|
||||
resource "aws_lb_listener" "airflow" {
|
||||
load_balancer_arn = aws_lb.airflow.arn
|
||||
port = 443
|
||||
protocol = "HTTPS"
|
||||
|
||||
default_action {
|
||||
target_group_arn = aws_lb_target_group.airflow.arn
|
||||
type = "forward"
|
||||
}
|
||||
|
||||
certificate_arn = data.aws_acm_certificate.kite_dev.arn
|
||||
}
|
30
airflow/terraform/monetizable.tf
Normal file
@ -0,0 +1,30 @@
|
||||
resource "aws_ecs_task_definition" "monetizable" {
|
||||
family = "monetizable"
|
||||
container_definitions = jsonencode(
|
||||
[
|
||||
{
|
||||
"name" = "monetizable",
|
||||
"image" = "${data.aws_ecr_image.airflow.registry_id}.dkr.ecr.us-west-1.amazonaws.com/kite-airflow-monetizable:${var.tag}",
|
||||
"essential" = true,
|
||||
"logConfiguration" = {
|
||||
"logDriver" = "awslogs",
|
||||
"options" = {
|
||||
"awslogs-create-group" = "true",
|
||||
"awslogs-region" = var.region,
|
||||
"awslogs-group" = "/ecs/airflow/monetizable",
|
||||
"awslogs-stream-prefix" = "ecs"
|
||||
}
|
||||
}
|
||||
}
|
||||
]
|
||||
)
|
||||
|
||||
requires_compatibilities = ["FARGATE"]
|
||||
network_mode = "awsvpc"
|
||||
|
||||
execution_role_arn = aws_iam_role.airflow_task_execution.arn
|
||||
task_role_arn = aws_iam_role.airflow_task.arn
|
||||
|
||||
cpu = 1 * 1024.0
|
||||
memory = 2 * 1024.0
|
||||
}
|
4
airflow/terraform/outputs.tf
Normal file
@ -0,0 +1,4 @@
|
||||
output "tag" {
|
||||
value = var.tag
|
||||
description = "The currently-deployed tag."
|
||||
}
|
43
airflow/terraform/variables.tf
Normal file
@ -0,0 +1,43 @@
|
||||
variable service_name {
|
||||
default = "airflow"
|
||||
}
|
||||
|
||||
variable region {
|
||||
default = "us-east-1"
|
||||
}
|
||||
|
||||
variable webserver_port {
|
||||
default = 8080
|
||||
}
|
||||
|
||||
variable repository_name {
|
||||
type = string
|
||||
default = "kite-airflow"
|
||||
}
|
||||
|
||||
variable tag {
|
||||
type = string
|
||||
}
|
||||
|
||||
variable tasks {
|
||||
default = {
|
||||
webserver = {
|
||||
port = 8080
|
||||
cpu = 0.5 * 1024.0
|
||||
memory = 1 * 1024.0
|
||||
load_balancer = true
|
||||
},
|
||||
scheduler = {
|
||||
port = 8793
|
||||
cpu = 1 * 1024.0
|
||||
memory = 2 * 1024.0
|
||||
load_balancer = false
|
||||
},
|
||||
worker = {
|
||||
port = 8793
|
||||
cpu = 2 * 1024.0
|
||||
memory = 4 * 1024.0
|
||||
load_balancer = false
|
||||
}
|
||||
}
|
||||
}
|
105
airflow/unittests.cfg
Normal file
@ -0,0 +1,105 @@
|
||||
[core]
|
||||
unit_test_mode = True
|
||||
dags_folder = /opt/airflow/dags
|
||||
plugins_folder = /opt/airflow/plugins
|
||||
base_log_folder = /opt/airflow//logs
|
||||
logging_level = INFO
|
||||
fab_logging_level = WARN
|
||||
log_filename_template = {{ ti.dag_id }}/{{ ti.task_id }}/{{ ts }}/{{ try_number }}.log
|
||||
log_processor_filename_template = {{ filename }}.log
|
||||
dag_processor_manager_log_location = /opt/airflow//logs/dag_processor_manager/dag_processor_manager.log
|
||||
executor = SequentialExecutor
|
||||
sql_alchemy_conn = sqlite:////opt/airflow//unittests.db
|
||||
load_examples = True
|
||||
donot_pickle = False
|
||||
load_default_connections = True
|
||||
dag_concurrency = 16
|
||||
dags_are_paused_at_creation = False
|
||||
fernet_key = XXXXXXX
|
||||
enable_xcom_pickling = False
|
||||
killed_task_cleanup_time = 5
|
||||
secure_mode = False
|
||||
hostname_callable = socket:getfqdn
|
||||
worker_precheck = False
|
||||
default_task_retries = 0
|
||||
|
||||
[cli]
|
||||
api_client = airflow.api.client.local_client
|
||||
endpoint_url = http://localhost:8080
|
||||
|
||||
[api]
|
||||
auth_backend = airflow.api.auth.backend.default
|
||||
|
||||
[operators]
|
||||
default_owner = airflow
|
||||
|
||||
[hive]
|
||||
default_hive_mapred_queue = airflow
|
||||
|
||||
[webserver]
|
||||
base_url = http://localhost:8080
|
||||
web_server_host = 0.0.0.0
|
||||
web_server_port = 8080
|
||||
dag_orientation = LR
|
||||
dag_default_view = tree
|
||||
log_fetch_timeout_sec = 5
|
||||
hide_paused_dags_by_default = False
|
||||
page_size = 100
|
||||
rbac = False
|
||||
|
||||
[email]
|
||||
email_backend = airflow.utils.email.send_email_smtp
|
||||
|
||||
[smtp]
|
||||
smtp_host = localhost
|
||||
smtp_user = airflow
|
||||
smtp_port = 25
|
||||
smtp_password = airflow
|
||||
smtp_mail_from = airflow@example.com
|
||||
|
||||
[celery]
|
||||
celery_app_name = airflow.executors.celery_executor
|
||||
worker_concurrency = 16
|
||||
worker_log_server_port = 8793
|
||||
broker_url = sqla+mysql://airflow:airflow@localhost:3306/airflow
|
||||
result_backend = db+mysql://airflow:airflow@localhost:3306/airflow
|
||||
flower_host = 0.0.0.0
|
||||
flower_port = 5555
|
||||
default_queue = default
|
||||
sync_parallelism = 0
|
||||
|
||||
[mesos]
|
||||
master = localhost:5050
|
||||
framework_name = Airflow
|
||||
task_cpu = 1
|
||||
task_memory = 256
|
||||
checkpoint = False
|
||||
authenticate = False
|
||||
docker_image_slave = test/docker-airflow
|
||||
|
||||
[scheduler]
|
||||
job_heartbeat_sec = 1
|
||||
scheduler_heartbeat_sec = 5
|
||||
scheduler_health_check_threshold = 30
|
||||
authenticate = true
|
||||
max_threads = 2
|
||||
catchup_by_default = True
|
||||
scheduler_zombie_task_threshold = 300
|
||||
dag_dir_list_interval = 0
|
||||
max_tis_per_query = 512
|
||||
|
||||
[admin]
|
||||
hide_sensitive_variable_fields = True
|
||||
|
||||
[elasticsearch]
|
||||
host =
|
||||
log_id_template = {dag_id}-{task_id}-{execution_date}-{try_number}
|
||||
end_of_log_mark = end_of_log
|
||||
|
||||
[elasticsearch_configs]
|
||||
|
||||
use_ssl = False
|
||||
verify_certs = True
|
||||
|
||||
[kubernetes]
|
||||
dags_volume_claim = default
|
11
codecov.yml
Normal file
@ -0,0 +1,11 @@
|
||||
codecov:
|
||||
allow_coverage_offsets: true
|
||||
coverage:
|
||||
status:
|
||||
project: off
|
||||
patch:
|
||||
default:
|
||||
target: 95%
|
||||
ignore:
|
||||
- "**/cmd/*"
|
||||
- "**/cmd/**/*"
|
1
concourse/.gitignore
vendored
Normal file
@ -0,0 +1 @@
|
||||
secrets.yml
|
28
concourse/Makefile
Normal file
@ -0,0 +1,28 @@
|
||||
all:
|
||||
|
||||
images/docker/image:
|
||||
cd images/docker && docker build -t kiteco/concourse .
|
||||
|
||||
images/docker/push: images/docker/image
|
||||
docker push kiteco/concourse
|
||||
|
||||
pipelines/bundle-plugins/set:
|
||||
fly -t kite sp -p bundle-plugins -c pipelines/bundle-plugins/pipeline.yml
|
||||
|
||||
YTT_ARGS=''
|
||||
ifneq ($(BRANCH),)
|
||||
YTT_ARGS="--data-value dev_branch=$(BRANCH)"
|
||||
endif
|
||||
|
||||
BE_SVCS_DIR=pipelines/deploy-backend-services
|
||||
$(BE_SVCS_DIR)/%/set: PIPELINE=$*
|
||||
$(BE_SVCS_DIR)/%/set:
|
||||
ytt "$(YTT_ARGS)" -f $(BE_SVCS_DIR)/pipeline-template.ytt.yml -f $(BE_SVCS_DIR)/data-defaults.ytt.yml -f $(BE_SVCS_DIR)/$(PIPELINE)/data.ytt.yml > $(BE_SVCS_DIR)/$(PIPELINE)/pipeline.yml
|
||||
fly -t kite sp -l secrets.yml -p $(PIPELINE) -c $(BE_SVCS_DIR)/$(PIPELINE)/pipeline.yml
|
||||
rm $(BE_SVCS_DIR)/$(PIPELINE)/pipeline.yml
|
||||
|
||||
pipelines/%/set: PIPELINE=$*
|
||||
pipelines/%/set:
|
||||
ytt -f pipelines/$(PIPELINE)/pipeline.ytt.yml > pipelines/$(PIPELINE)/pipeline.yml
|
||||
fly -t kite sp -p $(PIPELINE) -c pipelines/$(PIPELINE)/pipeline.yml
|
||||
rm pipelines/$(PIPELINE)/pipeline.yml
|
92
concourse/README.md
Normal file
@ -0,0 +1,92 @@
|
||||
> [Concourse](https://concourse-ci.org) is an open-source continuous thing-doer.
|
||||
|
||||
At Kite, we use Concourse for at least parts of our build/deploy pipelines.
|
||||
The goal is to incrementally port all deployment jobs to Concourse,
|
||||
but in the meanwhile, our prior build system (Solness) will trigger Concourse jobs as needed.
|
||||
|
||||
In order to manually run the pipelines, you can log in at [concourse.kite.com](http://concourse.kite.com)
|
||||
from within the AWS dev VPN. Find credentials in Quip.
|
||||
|
||||
For now we *do not* intend to move developer CI onto Concourse,
|
||||
since scaling up a self-hosted CI system comes with its own set of challenges,
|
||||
and our current solution (Travis) is "good enough." This is purely for deployments.
|
||||
|
||||
## Development
|
||||
|
||||
Read the Concourse docs!
|
||||
|
||||
Pipelines are composed of jobs which are in turn composed of tasks.
|
||||
We have a pipeline called "release" defined in `pipelines/release/pipeline.ytt`.
|
||||
|
||||
In order to develop this pipeline, you need the Concourse `fly` tool,
|
||||
as well as the YAML templating tool `ytt`.
|
||||
|
||||
This pipeline can be updated using the `fly` CLI tool, or with the `make` command:
|
||||
```
|
||||
make pipelines/deploy/set
|
||||
```
|
||||
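Under the hood, the `pipelines/%/set` target in the Makefile in this directory renders the `ytt` template and pushes it with `fly set-pipeline` (`sp`). For a pipeline named `release`, for example, this is roughly equivalent to:
```
ytt -f pipelines/release/pipeline.ytt.yml > pipelines/release/pipeline.yml
fly -t kite sp -p release -c pipelines/release/pipeline.yml
rm pipelines/release/pipeline.yml
```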
|
||||
### Secrets
|
||||
|
||||
All secrets are currently stored in AWS Systems Manager Parameter Store in us-west-1.
|
||||
The Concourse Web node is configured to look up secrets from SSM.
|
||||
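Pipelines reference these secrets with Concourse's `((var))` interpolation, which the web node resolves against SSM at runtime; for example, the bundle-plugins pipeline uses `((ssh_private))` for its private SSH key:
```
source:
  uri: git@github.com:kiteco/intellij-plugin-private.git
  private_key: ((ssh_private))
```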
|
||||
## Provisioning a Worker
|
||||
|
||||
Eventually, we should use Packer to provision worker AMIs, but for now
|
||||
workers must be manually configured.
|
||||
|
||||
### Windows
|
||||
|
||||
1. Start with a "Windows Server 2019 with Containers" machine image.
|
||||
2. Provision all the tools needed for building Kite,
|
||||
as per the Windows [README](../windows/README.md).
|
||||
* also `choco install windows-sdk-10.0`
|
||||
3. Allocate and mount a separate disk (100G) for all Concourse-related data
|
||||
* Below, we assume it's mounted at `D:`.
|
||||
* `mkdir D:\containers`, `mkdir D:\concourse`
|
||||
4. Enable long paths using registry editor.
|
||||
* set `HKEY_LOCAL_MACHINE\SYSTEM\CurrentControlSet\Control\FileSystem\LongPathsEnabled` to 1.
|
||||
5. Download [WinSW](https://github.com/kohsuke/winsw)
|
||||
and [Concourse](https://github.com/concourse/concourse/).
|
||||
* Move Concourse binary to `D:\concourse\concourse-bin.exe`
|
||||
* Move the WinSW binary to `D:\concourse\concourse.exe`
|
||||
6. Provision a worker key on the Windows machine.
|
||||
```
|
||||
cd D:\concourse
|
||||
ssh-keygen -t rsa -b 4096 -f tsa-worker-key
|
||||
...
|
||||
cat D:\concourse\tsa-worker-key.pub
|
||||
```
|
||||
* add the public key to `authorized_keys` on the Concourse web node.
|
||||
* restart the web node.
|
||||
7. Create `D:\concourse\concourse.xml` to configure all the Concourse options.
|
||||
```
|
||||
<service>
|
||||
<id>concourse</id>
|
||||
<name>Concourse</name>
|
||||
<description>Concourse Windows worker.</description>
|
||||
<startmode>Automatic</startmode>
|
||||
<executable>D:\concourse\concourse-bin.exe</executable>
|
||||
<argument>worker</argument>
|
||||
<argument>/work-dir</argument>
|
||||
<argument>D:\containers</argument>
|
||||
<argument>/tsa-worker-private-key</argument>
|
||||
<argument>D:\concourse\tsa-worker-key</argument>
|
||||
<argument>/tsa-public-key</argument>
|
||||
<argument>D:\concourse\tsa-host-key.pub</argument>
|
||||
<argument>/tsa-host</argument> <argument>10.86.0.122:2222</argument>
|
||||
<onfailure action="restart" delay="10 sec"/>
|
||||
<onfailure action="restart" delay="20 sec"/>
|
||||
<logmode>rotate</logmode>
|
||||
</service>
|
||||
```
|
||||
7. Install and start the Concourse service
|
||||
```
|
||||
D:\concourse\concourse.exe install
|
||||
D:\concourse\concourse.exe start
|
||||
```
|
||||
8. License VS Community 2019 under the system user.
|
||||
* Download [`PsExec.exe`](https://docs.microsoft.com/en-us/sysinternals/downloads/psexec)
|
||||
* Start VS under the system user: `PsExec.exe -sid "C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\Common7\IDE\devenv.com"`
|
||||
* Log in to license the software.
|
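Once the service is installed and started, a quick way to confirm the new worker registered with the web node is to list workers with `fly` from any machine on the dev VPN. This is a sketch; it assumes the `kite` fly target used in the Makefile.

```
# Log in against the web node, then list registered workers; the new
# Windows worker should show up with platform "windows".
fly -t kite login -c http://concourse.kite.com
fly -t kite workers
```
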
50
concourse/images/docker/Dockerfile
Normal file
@ -0,0 +1,50 @@
# Build image for Kite's Concourse tasks, based on Ubuntu 18.04.
FROM ubuntu:bionic

ARG GO_VERSION=1.15.3

# Base build and packaging tools
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
    curl \
    wget \
    gzip \
    zip unzip \
    openssl \
    libssl-dev \
    make \
    openssh-client \
    libstdc++6 \
    software-properties-common \
    openjdk-11-jre-headless \
    openjdk-8-jdk \
    makeself \
    chrpath \
    gcc \
    build-essential \
    gpg-agent \
    jq

# Node.js 12 from NodeSource, then pin 11.12.0 via n
RUN curl -sL https://deb.nodesource.com/setup_12.x | bash -
RUN apt-get install -y nodejs
RUN npm install -g n
RUN n 11.12.0

# Recent git from the git-core PPA
RUN apt-add-repository ppa:git-core/ppa \
    && apt-get update \
    && apt-get install -y git \
    && rm -rf /var/lib/apt/lists/*

RUN curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | bash
RUN apt-get install -y git-lfs

# Go toolchain
RUN wget https://dl.google.com/go/go$GO_VERSION.linux-amd64.tar.gz
RUN tar -C /usr/local -xzf go$GO_VERSION.linux-amd64.tar.gz
RUN rm go$GO_VERSION.linux-amd64.tar.gz
ENV PATH=/usr/local/go/bin:$PATH

# Python 3.7 plus the AWS CLI and packaging tools
RUN apt-get install -y --no-install-recommends python3.7 python3-pip python3-setuptools
RUN pip3 install awscli wheel pipenv

# Default to Java 8 for builds that expect it
RUN update-alternatives --set java /usr/lib/jvm/java-8-openjdk-amd64/jre/bin/java

RUN git config --global user.email "ops@kite.com"
RUN git config --global user.name "Kite Concourse"
85
concourse/pipelines/bundle-plugins/pipeline.yml
Normal file
@ -0,0 +1,85 @@
resource_types:
- name: slack-notification
  type: docker-image
  source:
    repository: cfcommunity/slack-notification-resource

resources:
- name: time.8am
  type: time
  source:
    start: 7:00 AM
    stop: 8:00 AM
    location: America/Los_Angeles
    days: [Monday, Tuesday, Wednesday, Thursday, Friday]

- name: git.kiteco.intellij-plugin-private
  type: git
  source:
    uri: git@github.com:kiteco/intellij-plugin-private.git
    branch: master
    private_key: ((ssh_private))
    disable_ci_skip: true

- name: image-build
  type: docker-image
  source: {repository: kiteco/concourse}

- name: slack-deep-intellij
  type: slack-notification
  source:
    url: ((slack_deep-intellij))

jobs:
- name: stage-intellij-release
  plan:

  - get: time.8am
    trigger: true

  - get: intellij-plugin-private
    resource: git.kiteco.intellij-plugin-private

  - get: image-build

  - task: version-bump-intellij
    image: image-build
    config:
      platform: linux
      inputs:
      - name: intellij-plugin-private
      outputs:
      - name: intellij-plugin-private
      run:
        path: intellij-plugin-private/release_version.bash

  - put: intellij-plugin-private
    resource: git.kiteco.intellij-plugin-private
    params:
      repository: intellij-plugin-private
      tag: intellij-plugin-private/pluginVersion.txt
      tag_prefix: v

  - task: intellij-build-binaries
    image: image-build
    config:
      platform: linux
      params:
        AWS_ACCESS_KEY_ID: ((aws_id))
        AWS_SECRET_ACCESS_KEY: ((aws_secret))

      inputs:
      - name: intellij-plugin-private

      run:
        path: intellij-plugin-private/concourse/stage-plugin.bash

  on_failure:
    put: slack-deep-intellij
    params:
      text: "IntelliJ <http://concourse.kite.com/builds/$BUILD_ID|build> failed! <@XXXXXXX> <@XXXXXXX>"

  on_success:
    put: slack-deep-intellij
    params:
      text: "IntelliJ <http://concourse.kite.com/builds/$BUILD_ID|build> succeeded!"
@ -0,0 +1,5 @@
#@data/values
---
dev_branch: ""
package_regexp: ""
terraform_location: ""
@ -0,0 +1,4 @@
#@data/values
---
package_regexp: kite-server.tgz
terraform_location: kiteserver
@ -0,0 +1,166 @@
#@ load("@ytt:data", "data")
resource_types:
- name: terraform
  type: docker-image
  source:
    repository: ljfranklin/terraform-resource
    tag: latest

#@yaml/text-templated-strings
resources:
- name: kiteco
  type: git
  source:
    uri: git@github.com:kiteco/kiteco.git
    private_key: ((ssh.private))
    disable_ci_skip: true
    #@ if data.values.dev_branch != '':
    branch: (@= data.values.dev_branch @)
    #@ else:
    branch: release
    fetch_tags: true
    tag_filter: v2*
    #@ end
- name: kite-deploy-package
  type: s3
  source:
    bucket: kite-deploys
    regexp: v(.*)/(@= data.values.package_regexp @)
    region_name: us-west-1
    access_key_id: ((aws.id))
    secret_access_key: ((aws.secret))
- name: terraform
  type: terraform
  source:
    backend_type: s3
    env_name: production
    terraform_source: "kiteco/devops/terraform/cloud/deployments/(@= data.values.terraform_location @)/"
    backend_config:
      access_key: ((aws.id))
      bucket: kite-terraform-state
      key: terraform.tfstate
      region: us-west-1
      secret_key: ((aws.secret))
      workspace_key_prefix: deployments/(@= data.values.terraform_location @)
    env:
      AWS_ACCESS_KEY_ID: ((aws.id))
      AWS_SECRET_ACCESS_KEY: ((aws.secret))
      GOOGLE_CREDENTIALS: ((gcloud))

jobs:
- name: stage-jump-instances-plan
  plan:
  - get: kiteco
    params: &kiteco_get_params
      depth: 10
      submodules: none
      disable_git_lfs: true
  - get: kite-deploy-package
    trigger: true
  - load_var: version-tag
    file: kite-deploy-package/version
  - put: terraform
    params:
      plan_only: true
      vars:
        versions:
          gray: ((.:version-tag))
          blue: blue

- name: stage-jump-instances-apply
  plan:
  - get: kiteco
    params: *kiteco_get_params
    passed: [stage-jump-instances-plan]
  - put: terraform
    params:
      plan_run: true

- name: stage-add-to-lb-plan
  plan:
  - get: kiteco
    params: *kiteco_get_params
    passed: [stage-jump-instances-apply]
    trigger: true
  - put: terraform
    params:
      plan_only: true
      vars:
        versions:
          blue: blue
          green: gray

- name: stage-add-to-lb-apply
  plan:
  - get: kiteco
    params: *kiteco_get_params
    passed: [stage-add-to-lb-plan]
  - put: terraform
    params:
      plan_run: true

- name: switch-plan
  plan:
  - get: kiteco
    params: *kiteco_get_params
    passed: [stage-add-to-lb-apply]
    trigger: true
  - put: terraform
    params:
      plan_only: true
      vars:
        versions:
          green: blue
          blue: green

- name: switch-apply
  plan:
  - get: kiteco
    params: *kiteco_get_params
    passed: [switch-plan]
  - put: terraform
    params:
      plan_run: true

- name: retire-remove-lb-plan
  plan:
  - get: kiteco
    params: *kiteco_get_params
  - put: terraform
    params:
      plan_only: true
      vars:
        versions:
          blue: blue
          gray: green

- name: retire-remove-lb-apply
  plan:
  - get: kiteco
    params: *kiteco_get_params
    passed: [retire-remove-lb-plan]
  - put: terraform
    params:
      plan_run: true

- name: retire-terminate-plan
  plan:
  - get: kiteco
    params: *kiteco_get_params
    passed: [retire-remove-lb-apply]
    trigger: true
  - put: terraform
    params:
      plan_only: true
      vars:
        versions:
          blue: blue

- name: retire-terminate-apply
  plan:
  - get: kiteco
    params: *kiteco_get_params
    passed: [retire-terminate-plan]
  - put: terraform
    params:
      plan_run: true
114
concourse/pipelines/deploy-metrics-collector/pipeline.ytt.yml
Normal file
@ -0,0 +1,114 @@
resource_types:
- name: terraform
  type: docker-image
  source:
    repository: ljfranklin/terraform-resource
    tag: latest

resources:
- name: kiteco
  type: git
  source:
    uri: git@github.com:kiteco/kiteco.git
    branch: release
    private_key: ((ssh_private))
    disable_ci_skip: true
    fetch_tags: true
    tag_filter: v2* #! this'll last for the millennium
- name: puppet
  type: s3
  source:
    bucket: kite-deploys
    regexp: puppet/puppet-v(.*).tar.gz
    region_name: us-west-1
    access_key_id: ((aws_id))
    secret_access_key: ((aws_secret))
- name: terraform
  type: terraform
  source:
    backend_type: s3
    env_name: us-east-1
    terraform_source: kiteco/devops/terraform/cloud/deployments/metrics/
    vars:
      region: us-east-1
    backend_config:
      access_key: ((aws_id))
      bucket: kite-terraform-state
      key: terraform.tfstate
      region: us-west-1
      secret_key: ((aws_secret))
      workspace_key_prefix: deployments/metrics-collector
    env:
      AWS_ACCESS_KEY_ID: ((aws_id))
      AWS_SECRET_ACCESS_KEY: ((aws_secret))

jobs:
- name: stage-plan
  plan:
  - get: kiteco
    params: &kiteco_get_params
      depth: 10
      submodules: none
      disable_git_lfs: true
  - get: puppet
  - task: tfvars
    file: kiteco/concourse/tasks/tf-vars/task.yml
    vars:
      build: puppet
      versions: '{"green": "VERSION", "blue": "blue"}'
  - put: terraform
    params:
      plan_only: true
      var_files: [tfvars/terraform.tfvars]

- name: stage-apply
  plan:
  - get: kiteco
    params: *kiteco_get_params
    passed: [stage-plan]
  - put: terraform
    params:
      plan_run: true

- name: switch-plan
  plan:
  - get: kiteco
    params: *kiteco_get_params
    passed: [stage-apply]
    trigger: true
  - put: terraform
    params:
      plan_only: true
      vars:
        versions:
          green: blue
          blue: green

- name: switch-apply
  plan:
  - get: kiteco
    params: *kiteco_get_params
    passed: [switch-plan]
  - put: terraform
    params:
      plan_run: true

- name: cleanup-plan
  plan:
  - get: kiteco
    params: *kiteco_get_params
  - put: terraform
    params:
      plan_only: true
      vars:
        versions:
          blue: blue

- name: cleanup-apply
  plan:
  - get: kiteco
    params: *kiteco_get_params
    passed: [cleanup-plan]
  - put: terraform
    params:
      plan_run: true
114
concourse/pipelines/deploy-nchan/pipeline.ytt.yml
Normal file
@ -0,0 +1,114 @@
resource_types:
- name: terraform
  type: docker-image
  source:
    repository: ljfranklin/terraform-resource
    tag: latest

resources:
- name: kiteco
  type: git
  source:
    branch: release
    disable_ci_skip: true
    fetch_tags: true
    private_key: ((ssh_private))
    uri: git@github.com:kiteco/kiteco.git
    tag_filter: v2*
- name: convcohort
  type: s3
  source:
    bucket: kite-deploys
    regexp: v(.*)/convcohort
    region_name: us-west-1
    access_key_id: ((aws_id))
    secret_access_key: ((aws_secret))
- name: terraform
  type: terraform
  source:
    backend_type: s3
    env_name: production
    terraform_source: kiteco/devops/terraform/cloud/deployments/nchan/
    backend_config:
      access_key: ((aws_id))
      bucket: kite-terraform-state
      key: terraform.tfstate
      region: us-west-1
      secret_key: ((aws_secret))
      workspace_key_prefix: deployments/nchan
    env:
      AWS_ACCESS_KEY_ID: ((aws_id))
      AWS_SECRET_ACCESS_KEY: ((aws_secret))
      GOOGLE_CREDENTIALS: ((gcloud))

jobs:
- name: stage-plan
  plan:
  - get: kiteco
    params: &kiteco_get_params
      depth: 10
      submodules: none
      disable_git_lfs: true
  - get: convcohort
    trigger: true
  - task: tfvars
    file: kiteco/concourse/tasks/tf-vars/task.yml
    vars:
      build: convcohort
      versions: '{"green": "VERSION", "blue": "blue"}'
  - put: terraform
    params:
      plan_only: true
      var_files: [tfvars/terraform.tfvars]

- name: stage-apply
  plan:
  - get: kiteco
    params: *kiteco_get_params
    passed: [stage-plan]
  - put: terraform
    params:
      plan_run: true

- name: switch-plan
  plan:
  - get: kiteco
    params: *kiteco_get_params
    passed: [stage-apply]
    trigger: true
  - put: terraform
    params:
      plan_only: true
      vars:
        versions:
          green: blue
          blue: green

- name: switch-apply
  plan:
  - get: kiteco
    params: *kiteco_get_params
    passed: [switch-plan]
  - put: terraform
    params:
      plan_run: true

- name: cleanup-plan
  plan:
  - get: kiteco
    params: *kiteco_get_params
  - put: terraform
    params:
      plan_only: true
      vars:
        versions:
          blue: blue

- name: cleanup-apply
  plan:
  - get: kiteco
    params: *kiteco_get_params
    passed: [cleanup-plan]
  - put: terraform
    params:
      plan_run: true
115
concourse/pipelines/deploy-release-server/pipeline.ytt.yml
Normal file
@ -0,0 +1,115 @@
resource_types:
- name: terraform
  type: docker-image
  source:
    repository: ljfranklin/terraform-resource
    tag: latest


resources:
- name: kiteco
  type: git
  source:
    branch: release
    disable_ci_skip: true
    fetch_tags: true
    private_key: ((ssh_private))
    uri: git@github.com:kiteco/kiteco.git
    tag_filter: v2*
- name: release
  type: s3
  source:
    bucket: kite-deploys
    regexp: v(.*)/release
    region_name: us-west-1
    access_key_id: ((aws_id))
    secret_access_key: ((aws_secret))
- name: terraform
  type: terraform
  source:
    backend_type: s3
    env_name: production
    terraform_source: kiteco/devops/terraform/cloud/deployments/release/
    backend_config:
      access_key: ((aws_id))
      bucket: kite-terraform-state
      key: terraform.tfstate
      region: us-west-1
      secret_key: ((aws_secret))
      workspace_key_prefix: deployments/release
    env:
      AWS_ACCESS_KEY_ID: ((aws_id))
      AWS_SECRET_ACCESS_KEY: ((aws_secret))
      GOOGLE_CREDENTIALS: ((gcloud))

jobs:
- name: stage-plan
  plan:
  - get: kiteco
    params: &kiteco_get_params
      depth: 10
      submodules: none
      disable_git_lfs: true
  - get: release
    trigger: true
  - task: tfvars
    file: kiteco/concourse/tasks/tf-vars/task.yml
    vars:
      build: release
      versions: '{"green": "VERSION", "blue": "blue"}'
  - put: terraform
    params:
      plan_only: true
      var_files: [tfvars/terraform.tfvars]

- name: stage-apply
  plan:
  - get: kiteco
    params: *kiteco_get_params
    passed: [stage-plan]
  - put: terraform
    params:
      plan_run: true

- name: switch-plan
  plan:
  - get: kiteco
    params: *kiteco_get_params
    passed: [stage-apply]
    trigger: true
  - put: terraform
    params:
      plan_only: true
      vars:
        versions:
          green: blue
          blue: green

- name: switch-apply
  plan:
  - get: kiteco
    params: *kiteco_get_params
    passed: [switch-plan]
  - put: terraform
    params:
      plan_run: true

- name: cleanup-plan
  plan:
  - get: kiteco
    params: *kiteco_get_params
  - put: terraform
    params:
      plan_only: true
      vars:
        versions:
          blue: blue

- name: cleanup-apply
  plan:
  - get: kiteco
    params: *kiteco_get_params
    passed: [cleanup-plan]
  - put: terraform
    params:
      plan_run: true
148
concourse/pipelines/release/pipeline.ytt.yml
Normal file
@ -0,0 +1,148 @@
resource_types:
- name: slack-notification
  type: docker-image
  source:
    repository: cfcommunity/slack-notification-resource

resources:
- name: image-lfs-pull
  type: docker-image
  source: {repository: kiteco/concourse.lfs-pull}
- name: image-build
  type: docker-image
  source: {repository: kiteco/concourse}
- name: kiteco
  type: git
  source:
    uri: git@github.com:kiteco/kiteco.git
    branch: release
    private_key: ((ssh_private))
    disable_ci_skip: true
    tag_filter: v2* #! this'll last for the millennium
- name: slack-release-notifications
  type: slack-notification
  source:
    url: ((slack_release-notifications))

jobs:
#@ platforms = ["windows", "linux"]
#@ for platform in platforms:
- name: #@ "stage-{}-release".format(platform)
  plan:
  - get: image-lfs-pull
  - get: image-build
  - get: kiteco
    trigger: true
    params: &kiteco_get_params
      depth: 10
      submodules: none
      disable_git_lfs: true

  - in_parallel:
    - task: kiteco-lfs-pull
      file: kiteco/concourse/tasks/lfs-pull/task.yml
      image: image-lfs-pull
      input_mapping: {repo: kiteco}
      output_mapping: {repo: kiteco}
      vars:
        private_key: ((ssh_private))

    - do:
      - task: build-release-binary
        file: kiteco/concourse/tasks/build-release-binary/task.yml
        image: image-build

      - task: prepare-release
        file: kiteco/concourse/tasks/prepare-release/task.yml
        image: image-build
        on_success:
          put: slack-release-notifications
          params:
            text_file: slack/message
        vars:
          platform: #@ platform
          release_db_uri: ((release_gcp-db-uri))

  - task: build-client
    file: #@ "kiteco/concourse/tasks/build-{}-client/task.yml".format(platform)
    #@ if platform == "linux":
    image: image-build
    #@ end
    vars:
      #@ if platform == "linux":
      private_key: ((linux_update-signing-key))
      aws_access_key_id: ((aws_id))
      aws_access_key_secret: ((aws_secret))
      #@ elif platform == "windows":
      update_signing: ((windows_update-signing-key-password))
      #@ end

  - task: upload-client
    file: kiteco/concourse/tasks/upload-client-build/task.yml
    image: image-build
    vars:
      aws_access_key_id: ((aws_id))
      aws_access_key_secret: ((aws_secret))

  - task: stage-client
    file: kiteco/concourse/tasks/release-client-build/task.yml
    image: image-build
    input_mapping: {meta: build}
    vars:
      release_db_uri: ((release_gcp-db-uri))
      percentage: 100

  on_failure: &fail_notif
    put: slack-release-notifications
    params:
      text: #@ "{} client release <http://concourse.kite.com/builds/$BUILD_ID|build> failed <@XXXXXXX>".format(platform.capitalize())
  on_success:
    put: slack-release-notifications
    params:
      text: #@ "{} client release available on staging <@XXXXXXX>".format(platform.capitalize())
#@ end

- name: stage-backend-release
  plan:
  - get: image-lfs-pull
  - get: image-build
  - get: kiteco
    trigger: true
    params: *kiteco_get_params
  - task: kiteco-lfs-pull
    file: kiteco/concourse/tasks/lfs-pull/task.yml
    image: image-lfs-pull
    input_mapping: {repo: kiteco}
    output_mapping: {repo: kiteco}
    vars:
      private_key: ((ssh_private))
  - task: build-backend
    file: kiteco/concourse/tasks/build-backend/task.yml
    image: image-build
  - task: upload-backend
    file: kiteco/concourse/tasks/upload-backend-build/task.yml
    image: image-build
    vars:
      aws_access_key_id: ((aws_id))
      aws_access_key_secret: ((aws_secret))
  on_failure:
    put: slack-release-notifications
    params:
      text: "Backend build <http://concourse.kite.com/builds/$BUILD_ID|build> failed <@XXXXXXX>"
  on_success:
    put: slack-release-notifications
    params:
      text: "Backend build <http://concourse.kite.com/builds/$BUILD_ID|build> succeeded <@XXXXXXX>"
- name: puppet
  plan:
  - get: image-build
  - get: kiteco
    params: *kiteco_get_params
  - task: build-puppet
    file: kiteco/concourse/tasks/build-puppet/task.yml
  - task: upload-puppet
    file: kiteco/concourse/tasks/upload-puppet/task.yml
    image: image-build
    vars:
      aws_access_key_id: ((aws_id))
      aws_access_key_secret: ((aws_secret))