{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2019-12-31T09:20:47.099673Z", "start_time": "2019-12-31T09:20:47.086203Z" } }, "outputs": [], "source": [ "# PACKAGE IMPORTS #\n", "from datetime import datetime, timedelta\n", "import numpy as np\n", "import pandas as pd\n", "pd.set_option('display.float_format', lambda x: '%.2f' % x)\n", "\n", "import matplotlib.pyplot as plt\n", "%matplotlib inline\n", "\n", "import seaborn as sns\n", "# Use seaborn style defaults and set the default figure size\n", "sns.set(rc={'figure.figsize':(11, 4)})\n", "\n", "from scipy import stats\n", "from pprint import pprint" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Create dataframe object from data in local file (.csv)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2019-12-31T07:36:00.582248Z", "start_time": "2019-12-31T07:36:00.558445Z" } }, "outputs": [], "source": [ "def load_and_preprocess_data(filename, emails_req=True):\n", " \"\"\"Load data from file and apply transformations\"\"\"\n", " #Load data from csv\n", " df = pd.read_csv(filename)\n", " \n", " #pre-process data cleaning\n", " df['date'] = pd.to_datetime(df['date']) # convert to datetime object\n", " df['week_num'] = df['date'].dt.week # get week number\n", " df['email'] = df['email'].dropna() # remove records with no email address\n", " \n", " \n", " #filter to relevant columns\n", "# col_list = [\n", "# 'user_distinct_id', \n", "# 'email', \n", "# 'date',\n", "# 'week_num',\n", "# 'python_edit',\n", "# 'completions_num_selected',\n", "# ]\n", "# df = df[col_list]\n", " \n", " return df" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2019-12-31T07:36:04.062304Z", "start_time": "2019-12-31T07:36:00.586691Z" } }, "outputs": [], "source": [ "df_ks1d_all = load_and_preprocess_data('Completions_Analysis_ks1d.csv')" ] }, { "cell_type": "markdown", "metadata": { "heading_collapsed": true }, "source": [ "##### Data Import Option: Get JQL query results directly from Mixpanel API" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2019-12-31T07:36:04.091061Z", "start_time": "2019-12-31T07:36:04.066059Z" }, "hidden": true, "jupyter": { "source_hidden": true } }, "outputs": [], "source": [ "#!pip install mixpanel-jql\n", "\n", "#from datetime import datetime\n", "#from mixpanel_jql import JQL, Reducer, Events\n", "\n", "#mixpanel_api_secret = '...'\n", "\n", "# query = JQL(\n", "# mixpanel_api_secret,\n", "# events=Events({\n", "# 'event_selectors': [{'event': \"X\"}],\n", "# 'from_date': datetime(2016, 5, 1),\n", "# 'to_date': datetime(2016, 5, 31)\n", "# })\n", "# ).group_by(\n", "# keys=[\n", "# \"new Date(e.time).toISOString().split('T')[0]\",\n", "# ],\n", "# accumulator=Reducer.count()\n", "# )\n", "\n", "### Build dataframe from mixpanel-jql query which streams back results\n", "#for row in query.send():\n", "# df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Analyze kite_status_1d events in Dataframe" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2019-12-31T07:36:04.506909Z", "start_time": "2019-12-31T07:36:04.114425Z" } }, "outputs": [], "source": [ "#dataframe for ks1d events including all results from JQL query\n", "df = df_ks1d_all.copy()\n", "\n", "df.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2019-12-31T07:36:04.665210Z", "start_time": "2019-12-31T07:36:04.510597Z" } }, "outputs": [], "source": [ "# example of how to create new dataframe objects that contain only desired rows, cols\n", "df_recent = df.loc[df['date'] > '2019-09-01']\n", "\n", "#df_recent.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2019-12-31T07:36:04.674540Z", "start_time": "2019-12-31T07:36:04.667981Z" } }, "outputs": [], "source": [ "#df.loc[df['email'] == 'caelan@kite.com'].sort_values('python_edit', ascending=False)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2019-12-31T07:36:04.690041Z", "start_time": "2019-12-31T07:36:04.684756Z" } }, "outputs": [], "source": [ "# Example of computing new stats and append a new column to the dataframe\n", "#df['ratio_compl_triggered_to_pyedit'] = df.completions_triggered / df.python_edit\n", "#df.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Visualization tools" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2019-12-31T07:36:04.717936Z", "start_time": "2019-12-31T07:36:04.704136Z" } }, "outputs": [], "source": [ "# Add Plotly" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Python_edit total for a prior calendar period" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2019-12-31T07:36:04.732918Z", "start_time": "2019-12-31T07:36:04.722481Z" } }, "outputs": [], "source": [ "#df.loc[:,['date', 'python_edit']][df.date < '2019-10-21']" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2019-12-31T07:36:04.756493Z", "start_time": "2019-12-31T07:36:04.746351Z" } }, "outputs": [], "source": [ "# use datetime library for relative time operations\n", "now = datetime.now()\n", "one_week = timedelta(days=7)\n", "\n", "one_week_ago = now - one_week\n", "print(one_week_ago.month, one_week_ago.day)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2019-12-31T07:36:05.075427Z", "start_time": "2019-12-31T07:36:04.763488Z" } }, "outputs": [], "source": [ "# Dataframe with events from last week only\n", "df_lastweek = df.loc[:,['email','date', 'python_edit']][df.date > '2019-10-21']\n", "\n", "# Dataframe with events from this calendar year-to-date\n", "df_ytd = df.loc[:,:][df.date >= '2019-01-01']" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Counts by week per user (email)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2019-12-31T07:36:07.090608Z", "start_time": "2019-12-31T07:36:05.356209Z" } }, "outputs": [], "source": [ "df_weekly_totals = df.groupby(['email', 'week_num']).sum()\n", "df_weekly_totals" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Counts Year-to-Date per user" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2019-12-31T09:33:42.774044Z", "start_time": "2019-12-31T09:33:41.739699Z" } }, "outputs": [], "source": [ "df_ytd_totals = df_ytd.groupby(['email']).sum()\n", "#df_ytd_totals" ] }, { "cell_type": "markdown", "metadata": { "ExecuteTime": { "end_time": "2019-12-31T09:33:19.079013Z", "start_time": "2019-12-31T09:33:19.074966Z" } }, "source": [ "## Counts for 2019" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2019-12-31T09:32:16.015014Z", "start_time": "2019-12-31T09:32:15.329247Z" } }, "outputs": [], "source": [ "#Dataframe with all data from 2019\n", "df_2019 = df.loc[:,:][(df.date >= '2019-01-01') & (df.date <= '2019-12-31')]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2019-12-31T09:32:32.296433Z", "start_time": "2019-12-31T09:32:31.568585Z" } }, "outputs": [], "source": [ "df_2019.describe()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### _How does completions_selected trend daily over 2019?_" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Outlier Analysis & Removal" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2019-12-31T08:45:20.241840Z", "start_time": "2019-12-31T08:45:16.918249Z" } }, "outputs": [], "source": [ "df_2019.plot(y='completions_num_selected', x='date', title=\"Completions Selected 2019\")\n", "# This plot shows abnormal spikes on a few dates in Feb, June, July, and Aug" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2019-12-31T08:45:20.308304Z", "start_time": "2019-12-31T08:45:20.249025Z" }, "scrolled": true }, "outputs": [], "source": [ "outlier_rows = df_2019[(stats.zscore(df_2019['completions_num_selected']) > 5)]\n", "outlier_dates = pd.Series(outlier_rows['date'].unique()).dt.date.sort_values()\n", "\n", "outlier_dates.apply(lambda x: x.strftime('%Y-%m-%d')).tolist()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The list of outlier dates was obtained by this method:\n", "- For each column, first it computes the Z-score of each value in the column, \n", "relative to the column mean and standard deviation.\n", "- Then it takes the absolute of Z-score because the direction does not matter.\n", "- Z-score must be within 5 (i.e., value not more than 5 standard deviations from col mean)\n", "- Below, the result is used to index a new dataframe designated \"cleaned\"" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2019-12-31T08:45:20.856070Z", "start_time": "2019-12-31T08:45:20.492709Z" } }, "outputs": [], "source": [ "df_2019_cleaned = df_2019[(np.abs(stats.zscore(df_2019['completions_num_selected'])) < 5)]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2019-12-31T08:45:21.200314Z", "start_time": "2019-12-31T08:45:21.155143Z" }, "scrolled": true }, "outputs": [], "source": [ "df_2019_cleaned" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### 2019 Dataframe with outlier rows removed" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2019-12-31T08:45:53.574080Z", "start_time": "2019-12-31T08:45:50.166408Z" }, "scrolled": false }, "outputs": [], "source": [ "df_2019_cleaned.plot(y='completions_num_selected', x='date', title=\"Completions Selected 2019\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2019-12-31T08:51:27.168014Z", "start_time": "2019-12-31T08:51:27.028061Z" }, "scrolled": false }, "outputs": [], "source": [ "df_2019_cleaned['completions_num_selected'].describe()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2019-12-31T09:10:40.043465Z", "start_time": "2019-12-31T09:10:39.803603Z" }, "scrolled": true }, "outputs": [], "source": [ "# Sum all activity for each date to a single aggregate value, for each column\n", "date_grouped_df = df_2019_cleaned.groupby('date').sum()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2019-12-31T09:37:10.962355Z", "start_time": "2019-12-31T09:37:09.591125Z" } }, "outputs": [], "source": [ "# Rolling 14 day average completions_num_selected\n", "date_grouped_df['completions_num_selected'].rolling(window=14).mean().plot(title=\"Rolling 14 Day Average completions_num_selected\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2019-12-31T09:39:56.159332Z", "start_time": "2019-12-31T09:39:55.084351Z" }, "scrolled": true }, "outputs": [], "source": [ "# Rolling 14 day average completions_num_selected\n", "date_grouped_df['completions_num_selected'].rolling(window=1).max().plot(title=\"Daily max completions_num_selected\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2019-12-31T09:41:23.473299Z", "start_time": "2019-12-31T09:41:23.467552Z" } }, "outputs": [], "source": [ "# Note: likely more individual outliers to remove from Feb, Jul, Aug based on above chart" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### _What’s the total number of completions_selected by all Kite users in 2019?_" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2019-12-31T09:35:55.162325Z", "start_time": "2019-12-31T09:35:55.146261Z" }, "scrolled": true }, "outputs": [], "source": [ "print(\"Total completions_num_selected in 2019: \", df_2019_cleaned['completions_num_selected'].sum())\n", "# approx. 70 million before / 50 million after removing some outliers" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Q4 2019\n", "Ranking users by completion feature usage" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#Dataframe with all data from Q4 2019 only\n", "df_q42019 = df_2019_cleaned.loc[:,:][(df_2019_cleaned.date >= '2019-10-01') & (df_2019_cleaned.date <= '2019-12-31')]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### python_edit top ranked users" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df_q42019 = df_q42019.sort_values('python_edit', ascending=False)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#Top 10 users ranked by highest single day python_edit total\n", "df_q42019.head(10)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#Top 10 users ranked by highest python_edit total (sum of user's activity for dates in Q4)\n", "df_q42019.groupby('email').sum().sort_values('python_edit', ascending=False).head(10)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#Just these users' email addresseses as a rank ordered list\n", "df_q42019.groupby('email').sum().sort_values('python_edit', ascending=False).index.tolist()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### completions_num_selected top ranked users" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#Top 10 users ranked by highest completions_num_selected total (sum of user's activity for dates in Q4)\n", "df_q42019.groupby('email').sum().sort_values('completions_num_selected', ascending=False).head(10)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": false }, "outputs": [], "source": [ "#The top 100 users' email addresseses as a rank ordered list\n", "top_user_emails = df_q42019.groupby('email').sum().sort_values('completions_num_selected', ascending=False).index.tolist()[:100]\n", "for email in top_user_emails:\n", " print(email)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.3" }, "pycharm": { "stem_cell": { "cell_type": "raw", "metadata": { "collapsed": false }, "source": [] } }, "toc": { "nav_menu": {}, "number_sections": true, "sideBar": true, "skip_h1_title": false, "toc_cell": false, "toc_position": {}, "toc_section_display": "block", "toc_window_display": false } }, "nbformat": 4, "nbformat_minor": 4 }