kiteco-public/jupyter-notebooks/Kite_User_Completions_Analysis_SkipEverling_20200110.ipynb
2021-12-31 23:54:19 -08:00

726 lines
18 KiB
Plaintext
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-31T09:20:47.099673Z",
"start_time": "2019-12-31T09:20:47.086203Z"
}
},
"outputs": [],
"source": [
"# PACKAGE IMPORTS #\n",
"from datetime import datetime, timedelta\n",
"import numpy as np\n",
"import pandas as pd\n",
"pd.set_option('display.float_format', lambda x: '%.2f' % x)\n",
"\n",
"import matplotlib.pyplot as plt\n",
"%matplotlib inline\n",
"\n",
"import seaborn as sns\n",
"# Use seaborn style defaults and set the default figure size\n",
"sns.set(rc={'figure.figsize':(11, 4)})\n",
"\n",
"from scipy import stats\n",
"from pprint import pprint"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Create dataframe object from data in local file (.csv)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-31T07:36:00.582248Z",
"start_time": "2019-12-31T07:36:00.558445Z"
}
},
"outputs": [],
"source": [
"def load_and_preprocess_data(filename, emails_req=True):\n",
" \"\"\"Load data from file and apply transformations\"\"\"\n",
" #Load data from csv\n",
" df = pd.read_csv(filename)\n",
" \n",
" #pre-process data cleaning\n",
" df['date'] = pd.to_datetime(df['date']) # convert to datetime object\n",
" df['week_num'] = df['date'].dt.week # get week number\n",
" df['email'] = df['email'].dropna() # remove records with no email address\n",
" \n",
" \n",
" #filter to relevant columns\n",
"# col_list = [\n",
"# 'user_distinct_id', \n",
"# 'email', \n",
"# 'date',\n",
"# 'week_num',\n",
"# 'python_edit',\n",
"# 'completions_num_selected',\n",
"# ]\n",
"# df = df[col_list]\n",
" \n",
" return df"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-31T07:36:04.062304Z",
"start_time": "2019-12-31T07:36:00.586691Z"
}
},
"outputs": [],
"source": [
"df_ks1d_all = load_and_preprocess_data('Completions_Analysis_ks1d.csv')"
]
},
{
"cell_type": "markdown",
"metadata": {
"heading_collapsed": true
},
"source": [
"##### Data Import Option: Get JQL query results directly from Mixpanel API"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-31T07:36:04.091061Z",
"start_time": "2019-12-31T07:36:04.066059Z"
},
"hidden": true,
"jupyter": {
"source_hidden": true
}
},
"outputs": [],
"source": [
"#!pip install mixpanel-jql\n",
"\n",
"#from datetime import datetime\n",
"#from mixpanel_jql import JQL, Reducer, Events\n",
"\n",
"#mixpanel_api_secret = '...'\n",
"\n",
"# query = JQL(\n",
"# mixpanel_api_secret,\n",
"# events=Events({\n",
"# 'event_selectors': [{'event': \"X\"}],\n",
"# 'from_date': datetime(2016, 5, 1),\n",
"# 'to_date': datetime(2016, 5, 31)\n",
"# })\n",
"# ).group_by(\n",
"# keys=[\n",
"# \"new Date(e.time).toISOString().split('T')[0]\",\n",
"# ],\n",
"# accumulator=Reducer.count()\n",
"# )\n",
"\n",
"### Build dataframe from mixpanel-jql query which streams back results\n",
"#for row in query.send():\n",
"# df"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Analyze kite_status_1d events in Dataframe"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-31T07:36:04.506909Z",
"start_time": "2019-12-31T07:36:04.114425Z"
}
},
"outputs": [],
"source": [
"#dataframe for ks1d events including all results from JQL query\n",
"df = df_ks1d_all.copy()\n",
"\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-31T07:36:04.665210Z",
"start_time": "2019-12-31T07:36:04.510597Z"
}
},
"outputs": [],
"source": [
"# example of how to create new dataframe objects that contain only desired rows, cols\n",
"df_recent = df.loc[df['date'] > '2019-09-01']\n",
"\n",
"#df_recent.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-31T07:36:04.674540Z",
"start_time": "2019-12-31T07:36:04.667981Z"
}
},
"outputs": [],
"source": [
"#df.loc[df['email'] == 'caelan@kite.com'].sort_values('python_edit', ascending=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-31T07:36:04.690041Z",
"start_time": "2019-12-31T07:36:04.684756Z"
}
},
"outputs": [],
"source": [
"# Example of computing new stats and append a new column to the dataframe\n",
"#df['ratio_compl_triggered_to_pyedit'] = df.completions_triggered / df.python_edit\n",
"#df.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Visualization tools"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-31T07:36:04.717936Z",
"start_time": "2019-12-31T07:36:04.704136Z"
}
},
"outputs": [],
"source": [
"# Add Plotly"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Python_edit total for a prior calendar period"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-31T07:36:04.732918Z",
"start_time": "2019-12-31T07:36:04.722481Z"
}
},
"outputs": [],
"source": [
"#df.loc[:,['date', 'python_edit']][df.date < '2019-10-21']"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-31T07:36:04.756493Z",
"start_time": "2019-12-31T07:36:04.746351Z"
}
},
"outputs": [],
"source": [
"# use datetime library for relative time operations\n",
"now = datetime.now()\n",
"one_week = timedelta(days=7)\n",
"\n",
"one_week_ago = now - one_week\n",
"print(one_week_ago.month, one_week_ago.day)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-31T07:36:05.075427Z",
"start_time": "2019-12-31T07:36:04.763488Z"
}
},
"outputs": [],
"source": [
"# Dataframe with events from last week only\n",
"df_lastweek = df.loc[:,['email','date', 'python_edit']][df.date > '2019-10-21']\n",
"\n",
"# Dataframe with events from this calendar year-to-date\n",
"df_ytd = df.loc[:,:][df.date >= '2019-01-01']"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Counts by week per user (email)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-31T07:36:07.090608Z",
"start_time": "2019-12-31T07:36:05.356209Z"
}
},
"outputs": [],
"source": [
"df_weekly_totals = df.groupby(['email', 'week_num']).sum()\n",
"df_weekly_totals"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Counts Year-to-Date per user"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-31T09:33:42.774044Z",
"start_time": "2019-12-31T09:33:41.739699Z"
}
},
"outputs": [],
"source": [
"df_ytd_totals = df_ytd.groupby(['email']).sum()\n",
"#df_ytd_totals"
]
},
{
"cell_type": "markdown",
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-31T09:33:19.079013Z",
"start_time": "2019-12-31T09:33:19.074966Z"
}
},
"source": [
"## Counts for 2019"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-31T09:32:16.015014Z",
"start_time": "2019-12-31T09:32:15.329247Z"
}
},
"outputs": [],
"source": [
"#Dataframe with all data from 2019\n",
"df_2019 = df.loc[:,:][(df.date >= '2019-01-01') & (df.date <= '2019-12-31')]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-31T09:32:32.296433Z",
"start_time": "2019-12-31T09:32:31.568585Z"
}
},
"outputs": [],
"source": [
"df_2019.describe()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### _How does completions_selected trend daily over 2019?_"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Outlier Analysis & Removal"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-31T08:45:20.241840Z",
"start_time": "2019-12-31T08:45:16.918249Z"
}
},
"outputs": [],
"source": [
"df_2019.plot(y='completions_num_selected', x='date', title=\"Completions Selected 2019\")\n",
"# This plot shows abnormal spikes on a few dates in Feb, June, July, and Aug"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-31T08:45:20.308304Z",
"start_time": "2019-12-31T08:45:20.249025Z"
},
"scrolled": true
},
"outputs": [],
"source": [
"outlier_rows = df_2019[(stats.zscore(df_2019['completions_num_selected']) > 5)]\n",
"outlier_dates = pd.Series(outlier_rows['date'].unique()).dt.date.sort_values()\n",
"\n",
"outlier_dates.apply(lambda x: x.strftime('%Y-%m-%d')).tolist()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The list of outlier dates was obtained by this method:\n",
"- For each column, first it computes the Z-score of each value in the column, \n",
"relative to the column mean and standard deviation.\n",
"- Then it takes the absolute of Z-score because the direction does not matter.\n",
"- Z-score must be within 5 (i.e., value not more than 5 standard deviations from col mean)\n",
"- Below, the result is used to index a new dataframe designated \"cleaned\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-31T08:45:20.856070Z",
"start_time": "2019-12-31T08:45:20.492709Z"
}
},
"outputs": [],
"source": [
"df_2019_cleaned = df_2019[(np.abs(stats.zscore(df_2019['completions_num_selected'])) < 5)]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-31T08:45:21.200314Z",
"start_time": "2019-12-31T08:45:21.155143Z"
},
"scrolled": true
},
"outputs": [],
"source": [
"df_2019_cleaned"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### 2019 Dataframe with outlier rows removed"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-31T08:45:53.574080Z",
"start_time": "2019-12-31T08:45:50.166408Z"
},
"scrolled": false
},
"outputs": [],
"source": [
"df_2019_cleaned.plot(y='completions_num_selected', x='date', title=\"Completions Selected 2019\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-31T08:51:27.168014Z",
"start_time": "2019-12-31T08:51:27.028061Z"
},
"scrolled": false
},
"outputs": [],
"source": [
"df_2019_cleaned['completions_num_selected'].describe()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-31T09:10:40.043465Z",
"start_time": "2019-12-31T09:10:39.803603Z"
},
"scrolled": true
},
"outputs": [],
"source": [
"# Sum all activity for each date to a single aggregate value, for each column\n",
"date_grouped_df = df_2019_cleaned.groupby('date').sum()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-31T09:37:10.962355Z",
"start_time": "2019-12-31T09:37:09.591125Z"
}
},
"outputs": [],
"source": [
"# Rolling 14 day average completions_num_selected\n",
"date_grouped_df['completions_num_selected'].rolling(window=14).mean().plot(title=\"Rolling 14 Day Average completions_num_selected\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-31T09:39:56.159332Z",
"start_time": "2019-12-31T09:39:55.084351Z"
},
"scrolled": true
},
"outputs": [],
"source": [
"# Rolling 14 day average completions_num_selected\n",
"date_grouped_df['completions_num_selected'].rolling(window=1).max().plot(title=\"Daily max completions_num_selected\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-31T09:41:23.473299Z",
"start_time": "2019-12-31T09:41:23.467552Z"
}
},
"outputs": [],
"source": [
"# Note: likely more individual outliers to remove from Feb, Jul, Aug based on above chart"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### _Whats the total number of completions_selected by all Kite users in 2019?_"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-31T09:35:55.162325Z",
"start_time": "2019-12-31T09:35:55.146261Z"
},
"scrolled": true
},
"outputs": [],
"source": [
"print(\"Total completions_num_selected in 2019: \", df_2019_cleaned['completions_num_selected'].sum())\n",
"# approx. 70 million before / 50 million after removing some outliers"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Q4 2019\n",
"Ranking users by completion feature usage"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#Dataframe with all data from Q4 2019 only\n",
"df_q42019 = df_2019_cleaned.loc[:,:][(df_2019_cleaned.date >= '2019-10-01') & (df_2019_cleaned.date <= '2019-12-31')]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### python_edit top ranked users"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df_q42019 = df_q42019.sort_values('python_edit', ascending=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#Top 10 users ranked by highest single day python_edit total\n",
"df_q42019.head(10)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#Top 10 users ranked by highest python_edit total (sum of user's activity for dates in Q4)\n",
"df_q42019.groupby('email').sum().sort_values('python_edit', ascending=False).head(10)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#Just these users' email addresseses as a rank ordered list\n",
"df_q42019.groupby('email').sum().sort_values('python_edit', ascending=False).index.tolist()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### completions_num_selected top ranked users"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#Top 10 users ranked by highest completions_num_selected total (sum of user's activity for dates in Q4)\n",
"df_q42019.groupby('email').sum().sort_values('completions_num_selected', ascending=False).head(10)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"#The top 100 users' email addresseses as a rank ordered list\n",
"top_user_emails = df_q42019.groupby('email').sum().sort_values('completions_num_selected', ascending=False).index.tolist()[:100]\n",
"for email in top_user_emails:\n",
" print(email)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
},
"pycharm": {
"stem_cell": {
"cell_type": "raw",
"metadata": {
"collapsed": false
},
"source": []
}
},
"toc": {
"nav_menu": {},
"number_sections": true,
"sideBar": true,
"skip_h1_title": false,
"toc_cell": false,
"toc_position": {},
"toc_section_display": "block",
"toc_window_display": false
}
},
"nbformat": 4,
"nbformat_minor": 4
}