kiteco-public/jupyter-notebooks/Kite_User_Completions_Analysis_SkipEverling_20200110.ipynb

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-12-31T09:20:47.099673Z",
     "start_time": "2019-12-31T09:20:47.086203Z"
    }
   },
   "outputs": [],
   "source": [
    "# PACKAGE IMPORTS #\n",
    "from datetime import datetime, timedelta\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "pd.set_option('display.float_format', lambda x: '%.2f' % x)\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "%matplotlib inline\n",
    "\n",
    "import seaborn as sns\n",
    "# Use seaborn style defaults and set the default figure size\n",
    "sns.set(rc={'figure.figsize':(11, 4)})\n",
    "\n",
    "from scipy import stats\n",
    "from pprint import pprint"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Create dataframe object from data in local file (.csv)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-12-31T07:36:00.582248Z",
     "start_time": "2019-12-31T07:36:00.558445Z"
    }
   },
   "outputs": [],
   "source": [
    "def load_and_preprocess_data(filename, emails_req=True):\n",
    "    \"\"\"Load data from file and apply transformations\"\"\"\n",
    "    #Load data from csv\n",
    "    df = pd.read_csv(filename)\n",
    "    \n",
    "    #pre-process data cleaning\n",
    "    df['date'] = pd.to_datetime(df['date']) # convert to datetime object\n",
    "    df['week_num'] = df['date'].dt.week # get week number\n",
    "    df['email'] = df['email'].dropna() # remove records with no email address\n",
    "    \n",
    "     \n",
    "    #filter to relevant columns\n",
    "#     col_list = [\n",
    "#             'user_distinct_id', \n",
    "#             'email', \n",
    "#             'date',\n",
    "#             'week_num',\n",
    "#             'python_edit',\n",
    "#             'completions_num_selected',\n",
    "#         ]\n",
    "#     df = df[col_list]\n",
    "   \n",
    "    return df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-12-31T07:36:04.062304Z",
     "start_time": "2019-12-31T07:36:00.586691Z"
    }
   },
   "outputs": [],
   "source": [
    "df_ks1d_all = load_and_preprocess_data('Completions_Analysis_ks1d.csv')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "heading_collapsed": true
   },
   "source": [
    "##### Data Import Option: Get JQL query results directly from Mixpanel API"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-12-31T07:36:04.091061Z",
     "start_time": "2019-12-31T07:36:04.066059Z"
    },
    "hidden": true,
    "jupyter": {
     "source_hidden": true
    }
   },
   "outputs": [],
   "source": [
    "#!pip install mixpanel-jql\n",
    "\n",
    "#from datetime import datetime\n",
    "#from mixpanel_jql import JQL, Reducer, Events\n",
    "\n",
    "#mixpanel_api_secret = '...'\n",
    "\n",
    "# query = JQL(\n",
    "#             mixpanel_api_secret,\n",
    "#             events=Events({\n",
    "#                 'event_selectors': [{'event': \"X\"}],\n",
    "#                 'from_date': datetime(2016, 5, 1),\n",
    "#                 'to_date': datetime(2016, 5, 31)\n",
    "#             })\n",
    "#         ).group_by(\n",
    "#             keys=[\n",
    "#                 \"new Date(e.time).toISOString().split('T')[0]\",\n",
    "#             ],\n",
    "#             accumulator=Reducer.count()\n",
    "#         )\n",
    "\n",
    "### Build dataframe from mixpanel-jql query which streams back results\n",
    "#for row in query.send():\n",
    "#  df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Analyze kite_status_1d events in Dataframe"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-12-31T07:36:04.506909Z",
     "start_time": "2019-12-31T07:36:04.114425Z"
    }
   },
   "outputs": [],
   "source": [
    "#dataframe for ks1d events including all results from JQL query\n",
    "df = df_ks1d_all.copy()\n",
    "\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-12-31T07:36:04.665210Z",
     "start_time": "2019-12-31T07:36:04.510597Z"
    }
   },
   "outputs": [],
   "source": [
    "# example of how to create new dataframe objects that contain only desired rows, cols\n",
    "df_recent = df.loc[df['date'] > '2019-09-01']\n",
    "\n",
    "#df_recent.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-12-31T07:36:04.674540Z",
     "start_time": "2019-12-31T07:36:04.667981Z"
    }
   },
   "outputs": [],
   "source": [
    "#df.loc[df['email'] == 'caelan@kite.com'].sort_values('python_edit', ascending=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-12-31T07:36:04.690041Z",
     "start_time": "2019-12-31T07:36:04.684756Z"
    }
   },
   "outputs": [],
   "source": [
    "# Example of computing new stats and append a new column to the dataframe\n",
    "#df['ratio_compl_triggered_to_pyedit'] = df.completions_triggered / df.python_edit\n",
    "#df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Visualization tools"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-12-31T07:36:04.717936Z",
     "start_time": "2019-12-31T07:36:04.704136Z"
    }
   },
   "outputs": [],
   "source": [
    "# Add Plotly"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Python_edit total for a prior calendar period"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-12-31T07:36:04.732918Z",
     "start_time": "2019-12-31T07:36:04.722481Z"
    }
   },
   "outputs": [],
   "source": [
    "#df.loc[:,['date', 'python_edit']][df.date < '2019-10-21']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-12-31T07:36:04.756493Z",
     "start_time": "2019-12-31T07:36:04.746351Z"
    }
   },
   "outputs": [],
   "source": [
    "# use datetime library for relative time operations\n",
    "now = datetime.now()\n",
    "one_week = timedelta(days=7)\n",
    "\n",
    "one_week_ago = now - one_week\n",
    "print(one_week_ago.month, one_week_ago.day)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-12-31T07:36:05.075427Z",
     "start_time": "2019-12-31T07:36:04.763488Z"
    }
   },
   "outputs": [],
   "source": [
    "# Dataframe with events from last week only\n",
    "df_lastweek = df.loc[:,['email','date', 'python_edit']][df.date > '2019-10-21']\n",
    "\n",
    "# Dataframe with events from this calendar year-to-date\n",
    "df_ytd = df.loc[:,:][df.date >= '2019-01-01']"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Counts by week per user (email)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-12-31T07:36:07.090608Z",
     "start_time": "2019-12-31T07:36:05.356209Z"
    }
   },
   "outputs": [],
   "source": [
    "df_weekly_totals = df.groupby(['email', 'week_num']).sum()\n",
    "df_weekly_totals"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Counts Year-to-Date per user"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-12-31T09:33:42.774044Z",
     "start_time": "2019-12-31T09:33:41.739699Z"
    }
   },
   "outputs": [],
   "source": [
    "df_ytd_totals = df_ytd.groupby(['email']).sum()\n",
    "#df_ytd_totals"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-12-31T09:33:19.079013Z",
     "start_time": "2019-12-31T09:33:19.074966Z"
    }
   },
   "source": [
    "## Counts for 2019"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-12-31T09:32:16.015014Z",
     "start_time": "2019-12-31T09:32:15.329247Z"
    }
   },
   "outputs": [],
   "source": [
    "#Dataframe with all data from 2019\n",
    "df_2019 = df.loc[:,:][(df.date >= '2019-01-01') & (df.date <= '2019-12-31')]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-12-31T09:32:32.296433Z",
     "start_time": "2019-12-31T09:32:31.568585Z"
    }
   },
   "outputs": [],
   "source": [
    "df_2019.describe()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### _How does completions_selected trend daily over 2019?_"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Outlier Analysis & Removal"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-12-31T08:45:20.241840Z",
     "start_time": "2019-12-31T08:45:16.918249Z"
    }
   },
   "outputs": [],
   "source": [
    "df_2019.plot(y='completions_num_selected', x='date', title=\"Completions Selected 2019\")\n",
    "# This plot shows abnormal spikes on a few dates in Feb, June, July, and Aug"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-12-31T08:45:20.308304Z",
     "start_time": "2019-12-31T08:45:20.249025Z"
    },
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "outlier_rows = df_2019[(stats.zscore(df_2019['completions_num_selected']) > 5)]\n",
    "outlier_dates = pd.Series(outlier_rows['date'].unique()).dt.date.sort_values()\n",
    "\n",
    "outlier_dates.apply(lambda x: x.strftime('%Y-%m-%d')).tolist()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The list of outlier dates was obtained by this method:\n",
    "- For each column, first it computes the Z-score of each value in the column, \n",
    "relative to the column mean and standard deviation.\n",
    "- Then it takes the absolute of Z-score because the direction does not matter.\n",
    "- Z-score must be within 5 (i.e., value not more than 5 standard deviations from col mean)\n",
    "- Below, the result is used to index a new dataframe designated \"cleaned\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-12-31T08:45:20.856070Z",
     "start_time": "2019-12-31T08:45:20.492709Z"
    }
   },
   "outputs": [],
   "source": [
    "df_2019_cleaned = df_2019[(np.abs(stats.zscore(df_2019['completions_num_selected'])) < 5)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-12-31T08:45:21.200314Z",
     "start_time": "2019-12-31T08:45:21.155143Z"
    },
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "df_2019_cleaned"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 2019 Dataframe with outlier rows removed"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-12-31T08:45:53.574080Z",
     "start_time": "2019-12-31T08:45:50.166408Z"
    },
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "df_2019_cleaned.plot(y='completions_num_selected', x='date', title=\"Completions Selected 2019\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-12-31T08:51:27.168014Z",
     "start_time": "2019-12-31T08:51:27.028061Z"
    },
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "df_2019_cleaned['completions_num_selected'].describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-12-31T09:10:40.043465Z",
     "start_time": "2019-12-31T09:10:39.803603Z"
    },
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Sum all activity for each date to a single aggregate value, for each column\n",
    "date_grouped_df = df_2019_cleaned.groupby('date').sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-12-31T09:37:10.962355Z",
     "start_time": "2019-12-31T09:37:09.591125Z"
    }
   },
   "outputs": [],
   "source": [
    "# Rolling 14 day average completions_num_selected\n",
    "date_grouped_df['completions_num_selected'].rolling(window=14).mean().plot(title=\"Rolling 14 Day Average completions_num_selected\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-12-31T09:39:56.159332Z",
     "start_time": "2019-12-31T09:39:55.084351Z"
    },
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Rolling 14 day average completions_num_selected\n",
    "date_grouped_df['completions_num_selected'].rolling(window=1).max().plot(title=\"Daily max completions_num_selected\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-12-31T09:41:23.473299Z",
     "start_time": "2019-12-31T09:41:23.467552Z"
    }
   },
   "outputs": [],
   "source": [
    "# Note: likely more individual outliers to remove from Feb, Jul, Aug based on above chart"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### _What’s the total number of completions_selected by all Kite users in 2019?_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-12-31T09:35:55.162325Z",
     "start_time": "2019-12-31T09:35:55.146261Z"
    },
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "print(\"Total completions_num_selected in 2019: \", df_2019_cleaned['completions_num_selected'].sum())\n",
    "# approx. 70 million before / 50 million after removing some outliers"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Q4 2019\n",
    "Ranking users by completion feature usage"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Dataframe with all data from Q4 2019 only\n",
    "df_q42019 = df_2019_cleaned.loc[:,:][(df_2019_cleaned.date >= '2019-10-01') & (df_2019_cleaned.date <= '2019-12-31')]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### python_edit top ranked users"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_q42019 = df_q42019.sort_values('python_edit', ascending=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Top 10 users ranked by highest single day python_edit total\n",
    "df_q42019.head(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Top 10 users ranked by highest python_edit total (sum of user's activity for dates in Q4)\n",
    "df_q42019.groupby('email').sum().sort_values('python_edit', ascending=False).head(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Just these users' email addresseses as a rank ordered list\n",
    "df_q42019.groupby('email').sum().sort_values('python_edit', ascending=False).index.tolist()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### completions_num_selected top ranked users"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Top 10 users ranked by highest completions_num_selected total (sum of user's activity for dates in Q4)\n",
    "df_q42019.groupby('email').sum().sort_values('completions_num_selected', ascending=False).head(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "#The top 100 users' email addresseses as a rank ordered list\n",
    "top_user_emails = df_q42019.groupby('email').sum().sort_values('completions_num_selected', ascending=False).index.tolist()[:100]\n",
    "for email in top_user_emails:\n",
    "    print(email)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  },
  "pycharm": {
   "stem_cell": {
    "cell_type": "raw",
    "metadata": {
     "collapsed": false
    },
    "source": []
   }
  },
  "toc": {
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": "block",
   "toc_window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}