kiteco-public/jupyter-notebooks/Kite_User_Completions_Analysis_SkipEverling_20200110.ipynb

726 lines
18 KiB
Plaintext
Raw Normal View History

2022-01-01 10:54:19 +03:00
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-31T09:20:47.099673Z",
"start_time": "2019-12-31T09:20:47.086203Z"
}
},
"outputs": [],
"source": [
"# PACKAGE IMPORTS #\n",
"from datetime import datetime, timedelta\n",
"import numpy as np\n",
"import pandas as pd\n",
"pd.set_option('display.float_format', lambda x: '%.2f' % x)\n",
"\n",
"import matplotlib.pyplot as plt\n",
"%matplotlib inline\n",
"\n",
"import seaborn as sns\n",
"# Use seaborn style defaults and set the default figure size\n",
"sns.set(rc={'figure.figsize':(11, 4)})\n",
"\n",
"from scipy import stats\n",
"from pprint import pprint"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Create dataframe object from data in local file (.csv)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-31T07:36:00.582248Z",
"start_time": "2019-12-31T07:36:00.558445Z"
}
},
"outputs": [],
"source": [
"def load_and_preprocess_data(filename, emails_req=True):\n",
" \"\"\"Load data from file and apply transformations\"\"\"\n",
" #Load data from csv\n",
" df = pd.read_csv(filename)\n",
" \n",
" #pre-process data cleaning\n",
" df['date'] = pd.to_datetime(df['date']) # convert to datetime object\n",
" df['week_num'] = df['date'].dt.week # get week number\n",
" df['email'] = df['email'].dropna() # remove records with no email address\n",
" \n",
" \n",
" #filter to relevant columns\n",
"# col_list = [\n",
"# 'user_distinct_id', \n",
"# 'email', \n",
"# 'date',\n",
"# 'week_num',\n",
"# 'python_edit',\n",
"# 'completions_num_selected',\n",
"# ]\n",
"# df = df[col_list]\n",
" \n",
" return df"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-31T07:36:04.062304Z",
"start_time": "2019-12-31T07:36:00.586691Z"
}
},
"outputs": [],
"source": [
"df_ks1d_all = load_and_preprocess_data('Completions_Analysis_ks1d.csv')"
]
},
{
"cell_type": "markdown",
"metadata": {
"heading_collapsed": true
},
"source": [
"##### Data Import Option: Get JQL query results directly from Mixpanel API"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-31T07:36:04.091061Z",
"start_time": "2019-12-31T07:36:04.066059Z"
},
"hidden": true,
"jupyter": {
"source_hidden": true
}
},
"outputs": [],
"source": [
"#!pip install mixpanel-jql\n",
"\n",
"#from datetime import datetime\n",
"#from mixpanel_jql import JQL, Reducer, Events\n",
"\n",
"#mixpanel_api_secret = '...'\n",
"\n",
"# query = JQL(\n",
"# mixpanel_api_secret,\n",
"# events=Events({\n",
"# 'event_selectors': [{'event': \"X\"}],\n",
"# 'from_date': datetime(2016, 5, 1),\n",
"# 'to_date': datetime(2016, 5, 31)\n",
"# })\n",
"# ).group_by(\n",
"# keys=[\n",
"# \"new Date(e.time).toISOString().split('T')[0]\",\n",
"# ],\n",
"# accumulator=Reducer.count()\n",
"# )\n",
"\n",
"### Build dataframe from mixpanel-jql query which streams back results\n",
"#for row in query.send():\n",
"# df"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Analyze kite_status_1d events in Dataframe"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-31T07:36:04.506909Z",
"start_time": "2019-12-31T07:36:04.114425Z"
}
},
"outputs": [],
"source": [
"#dataframe for ks1d events including all results from JQL query\n",
"df = df_ks1d_all.copy()\n",
"\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-31T07:36:04.665210Z",
"start_time": "2019-12-31T07:36:04.510597Z"
}
},
"outputs": [],
"source": [
"# example of how to create new dataframe objects that contain only desired rows, cols\n",
"df_recent = df.loc[df['date'] > '2019-09-01']\n",
"\n",
"#df_recent.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-31T07:36:04.674540Z",
"start_time": "2019-12-31T07:36:04.667981Z"
}
},
"outputs": [],
"source": [
"#df.loc[df['email'] == 'caelan@kite.com'].sort_values('python_edit', ascending=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-31T07:36:04.690041Z",
"start_time": "2019-12-31T07:36:04.684756Z"
}
},
"outputs": [],
"source": [
"# Example of computing new stats and append a new column to the dataframe\n",
"#df['ratio_compl_triggered_to_pyedit'] = df.completions_triggered / df.python_edit\n",
"#df.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Visualization tools"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-31T07:36:04.717936Z",
"start_time": "2019-12-31T07:36:04.704136Z"
}
},
"outputs": [],
"source": [
"# Add Plotly"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Python_edit total for a prior calendar period"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-31T07:36:04.732918Z",
"start_time": "2019-12-31T07:36:04.722481Z"
}
},
"outputs": [],
"source": [
"#df.loc[:,['date', 'python_edit']][df.date < '2019-10-21']"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-31T07:36:04.756493Z",
"start_time": "2019-12-31T07:36:04.746351Z"
}
},
"outputs": [],
"source": [
"# use datetime library for relative time operations\n",
"now = datetime.now()\n",
"one_week = timedelta(days=7)\n",
"\n",
"one_week_ago = now - one_week\n",
"print(one_week_ago.month, one_week_ago.day)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-31T07:36:05.075427Z",
"start_time": "2019-12-31T07:36:04.763488Z"
}
},
"outputs": [],
"source": [
"# Dataframe with events from last week only\n",
"df_lastweek = df.loc[:,['email','date', 'python_edit']][df.date > '2019-10-21']\n",
"\n",
"# Dataframe with events from this calendar year-to-date\n",
"df_ytd = df.loc[:,:][df.date >= '2019-01-01']"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Counts by week per user (email)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-31T07:36:07.090608Z",
"start_time": "2019-12-31T07:36:05.356209Z"
}
},
"outputs": [],
"source": [
"df_weekly_totals = df.groupby(['email', 'week_num']).sum()\n",
"df_weekly_totals"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Counts Year-to-Date per user"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-31T09:33:42.774044Z",
"start_time": "2019-12-31T09:33:41.739699Z"
}
},
"outputs": [],
"source": [
"df_ytd_totals = df_ytd.groupby(['email']).sum()\n",
"#df_ytd_totals"
]
},
{
"cell_type": "markdown",
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-31T09:33:19.079013Z",
"start_time": "2019-12-31T09:33:19.074966Z"
}
},
"source": [
"## Counts for 2019"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-31T09:32:16.015014Z",
"start_time": "2019-12-31T09:32:15.329247Z"
}
},
"outputs": [],
"source": [
"#Dataframe with all data from 2019\n",
"df_2019 = df.loc[:,:][(df.date >= '2019-01-01') & (df.date <= '2019-12-31')]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-31T09:32:32.296433Z",
"start_time": "2019-12-31T09:32:31.568585Z"
}
},
"outputs": [],
"source": [
"df_2019.describe()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### _How does completions_selected trend daily over 2019?_"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Outlier Analysis & Removal"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-31T08:45:20.241840Z",
"start_time": "2019-12-31T08:45:16.918249Z"
}
},
"outputs": [],
"source": [
"df_2019.plot(y='completions_num_selected', x='date', title=\"Completions Selected 2019\")\n",
"# This plot shows abnormal spikes on a few dates in Feb, June, July, and Aug"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-31T08:45:20.308304Z",
"start_time": "2019-12-31T08:45:20.249025Z"
},
"scrolled": true
},
"outputs": [],
"source": [
"outlier_rows = df_2019[(stats.zscore(df_2019['completions_num_selected']) > 5)]\n",
"outlier_dates = pd.Series(outlier_rows['date'].unique()).dt.date.sort_values()\n",
"\n",
"outlier_dates.apply(lambda x: x.strftime('%Y-%m-%d')).tolist()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The list of outlier dates was obtained by this method:\n",
"- For each column, first it computes the Z-score of each value in the column, \n",
"relative to the column mean and standard deviation.\n",
"- Then it takes the absolute of Z-score because the direction does not matter.\n",
"- Z-score must be within 5 (i.e., value not more than 5 standard deviations from col mean)\n",
"- Below, the result is used to index a new dataframe designated \"cleaned\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-31T08:45:20.856070Z",
"start_time": "2019-12-31T08:45:20.492709Z"
}
},
"outputs": [],
"source": [
"df_2019_cleaned = df_2019[(np.abs(stats.zscore(df_2019['completions_num_selected'])) < 5)]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-31T08:45:21.200314Z",
"start_time": "2019-12-31T08:45:21.155143Z"
},
"scrolled": true
},
"outputs": [],
"source": [
"df_2019_cleaned"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### 2019 Dataframe with outlier rows removed"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-31T08:45:53.574080Z",
"start_time": "2019-12-31T08:45:50.166408Z"
},
"scrolled": false
},
"outputs": [],
"source": [
"df_2019_cleaned.plot(y='completions_num_selected', x='date', title=\"Completions Selected 2019\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-31T08:51:27.168014Z",
"start_time": "2019-12-31T08:51:27.028061Z"
},
"scrolled": false
},
"outputs": [],
"source": [
"df_2019_cleaned['completions_num_selected'].describe()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-31T09:10:40.043465Z",
"start_time": "2019-12-31T09:10:39.803603Z"
},
"scrolled": true
},
"outputs": [],
"source": [
"# Sum all activity for each date to a single aggregate value, for each column\n",
"date_grouped_df = df_2019_cleaned.groupby('date').sum()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-31T09:37:10.962355Z",
"start_time": "2019-12-31T09:37:09.591125Z"
}
},
"outputs": [],
"source": [
"# Rolling 14 day average completions_num_selected\n",
"date_grouped_df['completions_num_selected'].rolling(window=14).mean().plot(title=\"Rolling 14 Day Average completions_num_selected\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-31T09:39:56.159332Z",
"start_time": "2019-12-31T09:39:55.084351Z"
},
"scrolled": true
},
"outputs": [],
"source": [
"# Rolling 14 day average completions_num_selected\n",
"date_grouped_df['completions_num_selected'].rolling(window=1).max().plot(title=\"Daily max completions_num_selected\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-31T09:41:23.473299Z",
"start_time": "2019-12-31T09:41:23.467552Z"
}
},
"outputs": [],
"source": [
"# Note: likely more individual outliers to remove from Feb, Jul, Aug based on above chart"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### _Whats the total number of completions_selected by all Kite users in 2019?_"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-31T09:35:55.162325Z",
"start_time": "2019-12-31T09:35:55.146261Z"
},
"scrolled": true
},
"outputs": [],
"source": [
"print(\"Total completions_num_selected in 2019: \", df_2019_cleaned['completions_num_selected'].sum())\n",
"# approx. 70 million before / 50 million after removing some outliers"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Q4 2019\n",
"Ranking users by completion feature usage"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#Dataframe with all data from Q4 2019 only\n",
"df_q42019 = df_2019_cleaned.loc[:,:][(df_2019_cleaned.date >= '2019-10-01') & (df_2019_cleaned.date <= '2019-12-31')]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### python_edit top ranked users"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df_q42019 = df_q42019.sort_values('python_edit', ascending=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#Top 10 users ranked by highest single day python_edit total\n",
"df_q42019.head(10)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#Top 10 users ranked by highest python_edit total (sum of user's activity for dates in Q4)\n",
"df_q42019.groupby('email').sum().sort_values('python_edit', ascending=False).head(10)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#Just these users' email addresseses as a rank ordered list\n",
"df_q42019.groupby('email').sum().sort_values('python_edit', ascending=False).index.tolist()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### completions_num_selected top ranked users"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#Top 10 users ranked by highest completions_num_selected total (sum of user's activity for dates in Q4)\n",
"df_q42019.groupby('email').sum().sort_values('completions_num_selected', ascending=False).head(10)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"#The top 100 users' email addresseses as a rank ordered list\n",
"top_user_emails = df_q42019.groupby('email').sum().sort_values('completions_num_selected', ascending=False).index.tolist()[:100]\n",
"for email in top_user_emails:\n",
" print(email)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
},
"pycharm": {
"stem_cell": {
"cell_type": "raw",
"metadata": {
"collapsed": false
},
"source": []
}
},
"toc": {
"nav_menu": {},
"number_sections": true,
"sideBar": true,
"skip_h1_title": false,
"toc_cell": false,
"toc_position": {},
"toc_section_display": "block",
"toc_window_display": false
}
},
"nbformat": 4,
"nbformat_minor": 4
}