{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# PACKAGE IMPORTS #\n", "from pprint import pprint\n", "from datetime import datetime, timedelta\n", "import numpy as np\n", "import pandas as pd\n", "import csv\n", "import json\n", "pd.set_option('display.float_format', lambda x: '%.2f' % x)\n", "\n", "import matplotlib.pyplot as plt\n", "%matplotlib inline\n", "\n", "import bokeh" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Create dataframe object from data in local file (.csv)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def load_and_preprocess_data(filename, emails_req=True):\n", " \"\"\"Load data from file and apply transformations\"\"\"\n", " #Load data from csv\n", " df = pd.read_csv(filename)\n", " #pre-process data cleaning\n", " df['date'] = pd.to_datetime(df['date']) # convert to datetime object\n", " df['week_num'] = df['date'].dt.week # get WEEK number\n", " df['month_num'] = df['date'].dt.month # get MONTH number\n", " df = df.dropna(subset=['email']) # remove records with NO email address\n", " #filter to relevant columns\n", " col_list = [\n", " 'user_distinct_id', \n", " 'email', \n", " 'date',\n", " 'week_num',\n", " 'month_num',\n", " 'python_edit',\n", " 'completions_num_selected',\n", " 'num_events_with_python'\n", " ]\n", " df = df[col_list]\n", " return df" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df_ks1d_all = load_and_preprocess_data('Completions_Analysis_ks1d.csv')\n", "print(df_ks1d_all.columns.tolist())" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df_ks1d_all.isnull().sum() # check for NULL values" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df_ks1d_all.shape" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df = pd.read_csv('Completions_Analysis_ks1d.csv')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df.shape" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Analyze kite_status_1d events in Dataframe" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#dataframe for ks1d events including all results from JQL query\n", "df = df_ks1d_all.copy()\n", "\n", "#working dataframe object for exploration: df\n", "df.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# example of how to create new dataframe objects that contain only desired rows, cols\n", "# df_recent = df.loc[df['date'] > '2020-04-01']\n", "\n", "#df_recent.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#df.loc[df['email'] == 'caelan@kite.com'].sort_values('python_edit', ascending=False)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Example of computing new stats and append a new column to the dataframe\n", "#df['ratio_compl_triggered_to_pyedit'] = df.completions_triggered / df.python_edit\n", "#df.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Python_edit total for a prior calendar period" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#df.loc[:,['date', 'python_edit']][df.date < '2019-10-21']" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# use datetime library for relative time operations\n", "now = datetime.now()\n", "one_week = timedelta(days=7)\n", "\n", "one_week_ago = now - one_week - timedelta(days=0) # use last member in equation to adjust date\n", "print(one_week_ago.month, one_week_ago.day)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Dataframe with events after a specified date\n", "# df_lastweek = df.loc[:,['email','date', 'python_edit']][df.date > '2019-10-21']\n", "\n", "# Dataframe with events from this calendar year-to-date\n", "\n", "df_ytd = df.loc[:,['email','date', 'python_edit']][df.date >= '2020-01-01']" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Counts by week per user (email)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": true }, "outputs": [], "source": [ "df_weekly_totals = df.groupby(['email', 'week_num']).sum()\n", "df_weekly_totals # num_events_with_python" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Counts by month per user (email)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df_monthly_totals = df.groupby(['email', 'month_num']).sum()\n", "df_monthly_totals # num_events_with_python" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Defining last week and month" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# WEEK LOGIC\n", "# one_week = timedelta(days=7)\n", "# one_week_ago = datetime.now() - one_week\n", "last_full_week_num = one_week_ago.isocalendar()[1] \n", "\n", "# MONTH LOGIC\n", "# one_month = timedelta(month = 1)\n", "last_full_month_num = datetime.now().month - 1\n", "\n", "# Note: ISO calendar weeks begin Monday, end Sunday\n", "\n", "# Check dates:\n", "print(\"One week ago:\", (one_week_ago))\n", "print(last_full_week_num)\n", "\n", "print(\"One month ago:\", last_full_month_num)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Counts for last week per user (email)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# groupby week_num table\n", "df_last_week = df[df['week_num'] == last_full_week_num] \\\n", " .groupby(['email', 'week_num']).sum().drop(['month_num'], axis=1)\n", "\n", "df_last_week" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": true }, "outputs": [], "source": [ "# Weekly: Only include users who had positive weekly completions selected totals \n", "df_last_week = df_last_week[df_last_week.completions_num_selected > 0]\n", "df_last_week" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Counts for last month per user (email)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# groupby month_num table\n", "df_last_month = df[df['month_num'] == last_full_month_num] \\\n", " .groupby(['email', 'month_num']).sum().drop(['week_num'], axis=1)\n", "\n", "df_last_month" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Monthly: Only include users who had positive monthly completions selected totals \n", "df_last_month = df_last_month[df_last_month.completions_num_selected > 0]\n", "df_last_month" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Counts year-to-date per user (email) " ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df_ytd_totals = df.groupby(['email']).sum().drop(['week_num', 'month_num'], axis=1)\n", "df_ytd_totals.columns = [str(col) + '_ytd' for col in df_ytd_totals.columns] #rename columns to include ytd\n", "df_ytd_totals" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Combine Weekly with YTD and export to CSV" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "combine_with_weekly_df = pd.merge(df_last_week, df_ytd_totals, how='inner', on='email')\n", "\n", "time_stamp = datetime.now().strftime('%Y%m%d-%H%M%S')\n", "print(time_stamp)\n", "\n", "combine_with_weekly_df.to_csv('completion_stats_per_user_last_week_' + time_stamp + '.csv') # Note: Date of export in the filename\n", "combine_with_weekly_df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Combine Monthly with YTD and export to CSV" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "combine_with_monthly_df = pd.merge(df_last_month, df_ytd_totals, how='inner', on='email')\n", "\n", "time_stamp = datetime.now().strftime('%Y%m%d-%H%M%S')\n", "print(time_stamp)\n", "\n", "combine_with_monthly_df.to_csv('completion_stats_per_user_last_month_' + time_stamp + '.csv') # Note: Date of export in the filename\n", "combine_with_monthly_df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Check final joined table by searching for a user (email)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# For spot-checking data and individual queries\n", "individual_email = 'alexander.nilsson@evado.se'\n", "indiv_stats = combine_with_monthly_df[combine_with_monthly_df.index == individual_email]\n", "\n", "print(indiv_stats)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Data Import Option: Get JQL query results directly from Mixpanel API (requires config)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#from mixpanel_jql import JQL, Reducer, Events\n", "\n", "#mixpanel_api_secret = '...'\n", "\n", "# query = JQL(\n", "# mixpanel_api_secret,\n", "# events=Events({\n", "# 'event_selectors': [{'event': \"X\"}],\n", "# 'from_date': datetime(2016, 5, 1),\n", "# 'to_date': datetime(2016, 5, 31)\n", "# })\n", "# ).group_by(\n", "# keys=[\n", "# \"new Date(e.time).toISOString().split('T')[0]\",\n", "# ],\n", "# accumulator=Reducer.count()\n", "# )\n", "\n", "### Build dataframe from mixpanel-jql query which streams back results\n", "#for row in query.send():\n", "# df" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.3" } }, "nbformat": 4, "nbformat_minor": 4 }