kiteco-public/jupyter-notebooks/Kite_User_Completions_Analysis-Weekly-Monthly-YTD_CaelanUrquhart_20200504.ipynb
2021-12-31 23:54:19 -08:00

475 lines
12 KiB
Plaintext

{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# PACKAGE IMPORTS #\n",
"from pprint import pprint\n",
"from datetime import datetime, timedelta\n",
"import numpy as np\n",
"import pandas as pd\n",
"import csv\n",
"import json\n",
"pd.set_option('display.float_format', lambda x: '%.2f' % x)\n",
"\n",
"import matplotlib.pyplot as plt\n",
"%matplotlib inline\n",
"\n",
"import bokeh"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Create dataframe object from data in local file (.csv)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def load_and_preprocess_data(filename, emails_req=True):\n",
" \"\"\"Load data from file and apply transformations\"\"\"\n",
" #Load data from csv\n",
" df = pd.read_csv(filename)\n",
" #pre-process data cleaning\n",
" df['date'] = pd.to_datetime(df['date']) # convert to datetime object\n",
" df['week_num'] = df['date'].dt.week # get WEEK number\n",
" df['month_num'] = df['date'].dt.month # get MONTH number\n",
" df = df.dropna(subset=['email']) # remove records with NO email address\n",
" #filter to relevant columns\n",
" col_list = [\n",
" 'user_distinct_id', \n",
" 'email', \n",
" 'date',\n",
" 'week_num',\n",
" 'month_num',\n",
" 'python_edit',\n",
" 'completions_num_selected',\n",
" 'num_events_with_python'\n",
" ]\n",
" df = df[col_list]\n",
" return df"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df_ks1d_all = load_and_preprocess_data('Completions_Analysis_ks1d.csv')\n",
"print(df_ks1d_all.columns.tolist())"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df_ks1d_all.isnull().sum() # check for NULL values"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df_ks1d_all.shape"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df = pd.read_csv('Completions_Analysis_ks1d.csv')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df.shape"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Analyze kite_status_1d events in Dataframe"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#dataframe for ks1d events including all results from JQL query\n",
"df = df_ks1d_all.copy()\n",
"\n",
"#working dataframe object for exploration: df\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# example of how to create new dataframe objects that contain only desired rows, cols\n",
"# df_recent = df.loc[df['date'] > '2020-04-01']\n",
"\n",
"#df_recent.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#df.loc[df['email'] == 'caelan@kite.com'].sort_values('python_edit', ascending=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Example of computing new stats and append a new column to the dataframe\n",
"#df['ratio_compl_triggered_to_pyedit'] = df.completions_triggered / df.python_edit\n",
"#df.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Python_edit total for a prior calendar period"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#df.loc[:,['date', 'python_edit']][df.date < '2019-10-21']"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# use datetime library for relative time operations\n",
"now = datetime.now()\n",
"one_week = timedelta(days=7)\n",
"\n",
"one_week_ago = now - one_week - timedelta(days=0) # use last member in equation to adjust date\n",
"print(one_week_ago.month, one_week_ago.day)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Dataframe with events after a specified date\n",
"# df_lastweek = df.loc[:,['email','date', 'python_edit']][df.date > '2019-10-21']\n",
"\n",
"# Dataframe with events from this calendar year-to-date\n",
"\n",
"df_ytd = df.loc[:,['email','date', 'python_edit']][df.date >= '2020-01-01']"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Counts by week per user (email)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"df_weekly_totals = df.groupby(['email', 'week_num']).sum()\n",
"df_weekly_totals # num_events_with_python"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Counts by month per user (email)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df_monthly_totals = df.groupby(['email', 'month_num']).sum()\n",
"df_monthly_totals # num_events_with_python"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Defining last week and month"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# WEEK LOGIC\n",
"# one_week = timedelta(days=7)\n",
"# one_week_ago = datetime.now() - one_week\n",
"last_full_week_num = one_week_ago.isocalendar()[1] \n",
"\n",
"# MONTH LOGIC\n",
"# one_month = timedelta(month = 1)\n",
"last_full_month_num = datetime.now().month - 1\n",
"\n",
"# Note: ISO calendar weeks begin Monday, end Sunday\n",
"\n",
"# Check dates:\n",
"print(\"One week ago:\", (one_week_ago))\n",
"print(last_full_week_num)\n",
"\n",
"print(\"One month ago:\", last_full_month_num)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Counts for last week per user (email)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# groupby week_num table\n",
"df_last_week = df[df['week_num'] == last_full_week_num] \\\n",
" .groupby(['email', 'week_num']).sum().drop(['month_num'], axis=1)\n",
"\n",
"df_last_week"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"# Weekly: Only include users who had positive weekly completions selected totals \n",
"df_last_week = df_last_week[df_last_week.completions_num_selected > 0]\n",
"df_last_week"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Counts for last month per user (email)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# groupby month_num table\n",
"df_last_month = df[df['month_num'] == last_full_month_num] \\\n",
" .groupby(['email', 'month_num']).sum().drop(['week_num'], axis=1)\n",
"\n",
"df_last_month"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Monthly: Only include users who had positive monthly completions selected totals \n",
"df_last_month = df_last_month[df_last_month.completions_num_selected > 0]\n",
"df_last_month"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Counts year-to-date per user (email) "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df_ytd_totals = df.groupby(['email']).sum().drop(['week_num', 'month_num'], axis=1)\n",
"df_ytd_totals.columns = [str(col) + '_ytd' for col in df_ytd_totals.columns] #rename columns to include ytd\n",
"df_ytd_totals"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Combine Weekly with YTD and export to CSV"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"combine_with_weekly_df = pd.merge(df_last_week, df_ytd_totals, how='inner', on='email')\n",
"\n",
"time_stamp = datetime.now().strftime('%Y%m%d-%H%M%S')\n",
"print(time_stamp)\n",
"\n",
"combine_with_weekly_df.to_csv('completion_stats_per_user_last_week_' + time_stamp + '.csv') # Note: Date of export in the filename\n",
"combine_with_weekly_df"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Combine Monthly with YTD and export to CSV"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"combine_with_monthly_df = pd.merge(df_last_month, df_ytd_totals, how='inner', on='email')\n",
"\n",
"time_stamp = datetime.now().strftime('%Y%m%d-%H%M%S')\n",
"print(time_stamp)\n",
"\n",
"combine_with_monthly_df.to_csv('completion_stats_per_user_last_month_' + time_stamp + '.csv') # Note: Date of export in the filename\n",
"combine_with_monthly_df"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Check final joined table by searching for a user (email)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# For spot-checking data and individual queries\n",
"individual_email = 'alexander.nilsson@evado.se'\n",
"indiv_stats = combine_with_monthly_df[combine_with_monthly_df.index == individual_email]\n",
"\n",
"print(indiv_stats)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Data Import Option: Get JQL query results directly from Mixpanel API (requires config)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#from mixpanel_jql import JQL, Reducer, Events\n",
"\n",
"#mixpanel_api_secret = '...'\n",
"\n",
"# query = JQL(\n",
"# mixpanel_api_secret,\n",
"# events=Events({\n",
"# 'event_selectors': [{'event': \"X\"}],\n",
"# 'from_date': datetime(2016, 5, 1),\n",
"# 'to_date': datetime(2016, 5, 31)\n",
"# })\n",
"# ).group_by(\n",
"# keys=[\n",
"# \"new Date(e.time).toISOString().split('T')[0]\",\n",
"# ],\n",
"# accumulator=Reducer.count()\n",
"# )\n",
"\n",
"### Build dataframe from mixpanel-jql query which streams back results\n",
"#for row in query.send():\n",
"# df"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4,
"nbformat_minor": 4
}