This is a exploratory data analysis for global reimbursement amount.
Target:
Benefit:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
import time
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')
def fxn():
warnings.warn("deprecated", DeprecationWarning)
with warnings.catch_warnings():
warnings.simplefilter("ignore")
fxn()
%matplotlib inline
sns.set(font_scale = 1.2)
#plotly.offline doesn't push your charts to the clouds
import plotly.offline as pyo
#allows us to create the Data and Figure objects
from plotly.graph_objs import *
import plotly.graph_objs as go
#plotly.plotly pushes your charts to the cloud
import plotly.plotly as py
#lets us see the charts in an iPython Notebook
pyo.offline.init_notebook_mode() # run at the start of every ipython
from IPython.display import HTML
HTML('''<script>
code_show=true;
function code_toggle() {
if (code_show){
$('div.input').hide();
} else {
$('div.input').show();
}
code_show = !code_show
}
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit" value="Click here to toggle on/off the raw code."></form>''')
# read and transfer date
df = pd.read_excel('sfpd_expenseType - Copy.xlsx',header=1)
df.drop(df.tail(1).index,inplace=True)
df['Year'] = df['Sent for Payment Date'].apply(lambda x: x.split('-')[0])
df['Month'] = df['Sent for Payment Date'].apply(lambda x: x.split('-')[1])
def func(x):
return datetime.strptime(x, "%Y/%m/%d")
df['date'] = (df['Year'] + '/' + df['Month'] + '/1').apply(func)
# out of pocket dataframe
oop = df[df['Payment Type'] == 'Out-of-Pocket']
# preparing data
total_df = oop.pivot_table('Expense Amount (rpt)','date',aggfunc=sum)
total_df.drop(total_df.index[0:2],inplace=True)
total_rolling = oop.pivot_table('Expense Amount (rpt)','date',aggfunc=sum).rolling(3).mean()
std = total_df.values.std()
# plot
sns.set_style('white')
plt.subplots(figsize=(18,6))
g = sns.lineplot(total_df.index,total_df['Expense Amount (rpt)'],label='OOP Amount')
g1= sns.lineplot(total_rolling.index,total_rolling['Expense Amount (rpt)'],label='Moving Avr (3)')
plt.fill_between(total_df.index,
total_df['Expense Amount (rpt)'] - std,
total_df['Expense Amount (rpt)'] + std,
alpha=0.1,label='Std')
# format
g.set_title('Global Reimbursement Trend (add moving avr and standard deviation)',fontdict={'fontsize':16},pad=15)
g.get_yaxis().set_major_formatter(
matplotlib.ticker.FuncFormatter(lambda x, p: format(int(x), ',')))
plt.show()
# plot
plt.subplots(figsize=(18,6))
g = sns.barplot(total_df.reset_index()['date'].dt.date,total_df.diff()['Expense Amount (rpt)'].values)
g.set_title('Rolling Differences for Monthly Reimbursement Amount',fontdict={'fontsize':16},pad=15)
plt.setp(g.get_xticklabels(), rotation=35)
g.get_yaxis().set_major_formatter(
matplotlib.ticker.FuncFormatter(lambda x, p: format(int(x), ',')))
sns.despine()
plt.show()
# print out some information
print('Mean fluctuation each month: ${:,.2f}'.format(total_df.diff().apply(abs).mean()[0]))
print('Standard deviation: ${:,.2f}'.format(total_df.diff().std()[0]))
From the historical data, we can know:
Distribution of Total Reimbursement Amount Each Month
total_oop = oop.pivot_table('Expense Amount (rpt)','date',aggfunc=sum)
total_oop.drop(total_oop.index[0:2],inplace=True)
sns.distplot(oop.pivot_table('Expense Amount (rpt)','date',aggfunc=sum))
print(total_oop.describe(percentiles=[.05,.25,.5,.75,.95]).apply(lambda x: '{:,.0f}'.format(x.values[0]),axis=1))
In this section, we break down total expense spending into different expense types. Firstly, let's see the amount of each expense type.
Not surprisingly, transportation and travel expenses (should these two types be together?) are the largest parts in employee reimbursement. Following is the Meal & Entertainment, which seems like 20% of the total. Communication (phone and internets) and HR benefit take another part. Surprisingly to see that marketing expenses only take like 2% or less.
# df
oop2018 = oop[oop['Sent for Payment Date'].apply(lambda x: '2018' in x)]
parent_et_df = (oop2018.pivot_table('Expense Amount (rpt)','Parent Expense Type', aggfunc=sum)
.sort_values('Expense Amount (rpt)',ascending=False))
# plot
plt.subplots(figsize=(18,6))
g = sns.barplot(parent_et_df.index,parent_et_df['Expense Amount (rpt)'],palette='Set2')
# format
plt.setp(g.get_xticklabels(), rotation=35)
sns.despine()
g.set_title('2018 Out-Of-Pocket Spending by Categories',fontdict={'fontsize':18},pad=15)
g.get_yaxis().set_major_formatter(
matplotlib.ticker.FuncFormatter(lambda x, p: format(int(x), ',')))
# annotate
for ind, val in enumerate(parent_et_df.values):
g.annotate('{:,.2f}'.format(val[0]),(ind-.3,val[0]+10000),size=14)
parent_et_df
Let's then take a look at the picture for different subsidiaries. Three colors catch me at the first glance, which are orange, green and blue. We see that different subs have different portion of that content.
To see the different percentage of categories for each sub, I made a stacked bar plot and a heatmap to visualize it. In these charts, we can see it clearly and say that:
tem_df = oop2018.pivot_table('Expense Amount (rpt)','Parent Expense Type','Sub', aggfunc=sum)
# total column
tem_df['Total'] = tem_df.apply(lambda x: x.sum(),axis=1)
# sort this column
tem_df.sort_values('Total',ascending=False,inplace=True)
# total row
tem_df.loc['Total',:] = tem_df.apply(lambda x: x.sum(),axis=0)
# sort this row
tem_df.sort_values('Total',axis=1,ascending=False,inplace=True)
# drop
tem_df.drop('Total',axis=0,inplace=True)
tem_df.drop('Total',axis=1,inplace=True)
# transform it into percentile
for col in tem_df.columns:
total = tem_df[col].sum()
tem_df[col] = tem_df[col].apply(lambda x: round(x/total,2))
# plot
sns.set(font_scale = 1)
plt.subplots(figsize=(12,7))
ax = sns.heatmap(tem_df.fillna(0)*100,cmap='coolwarm',annot=True)
ax.set_title('Percentage Of Expense Type',fontdict={'fontsize':16},pad=15)
for t in ax.texts: t.set_text(t.get_text() + "%")
plt.show()
One of the most important thing to create a model and predict a future value is that, the data you are predicting needs to have either seasonal pattern or a predictable trend.
Line Chart for Amount
oop['date'] = oop.date.dt.date
test = oop.pivot_table('Expense Amount (rpt)','date','Parent Expense Type', aggfunc=sum)
for col in test.columns:
test[col] = test[col].apply(lambda x: round(x,2))
def visible(col):
ls = ['01', '02', '03', '04']
for i in ls:
if i in col:
return True
return 'legendonly'
traces = []
for col in test.columns:
traces.append({'type' : 'scatter',
'x' : test.index,
'y' : test[col],
'name' : col,
'mode' : 'lines',
'visible' : visible(col)})
data = Data(traces)
layout = {'title' : 'Interesting Trend in 2018',
'xaxis' : {'title' : 'Month','tickformat':'%b %y'},
'yaxis' : {'title' : 'Out-of-Pocket','tickformat':'$,0'}}
fig = Figure(traces, layout = layout)
pyo.iplot(fig)
In the line chart above, we can see:
Stacked bar chart and Line chart for percentage
# df
tem_df = oop.pivot_table('Expense Amount (rpt)','date','Parent Expense Type', aggfunc=sum).transpose()
for col in tem_df.columns:
total = tem_df[col].sum()
tem_df[col] = tem_df[col].apply(lambda x: round(x/total,2))
tem_df2 = tem_df.transpose()
tem_df2.index = [str(i)[:7] for i in tem_df2.index]
# plot
sns.set(font_scale = 1.5)
sns.set_style('white')
ax = tem_df2.plot(kind='bar',stacked=True,figsize=(18,8),legend=False,cmap='Set2')
# legend
patches, labels = ax.get_legend_handles_labels()
ax.legend(patches, labels, loc=5)
ax.legend(bbox_to_anchor=(1.02, 1.0))
# format
ax.set_ylabel('Percentage per Month',fontdict={'fontsize':16})
sns.despine()
plt.show()
# trying to use plotly... but the cmap is not cool
# traces = []
# for col in tem_df2.columns:
# traces.append({'type': 'bar',
# 'x': tem_df2.index,
# 'y': tem_df2[col],
# 'opacity' : 0.7,
# 'name':col})
# data = Data(traces)
# layout = {'title' : "Proportion of Parent Expense Types, Jun 2017 to April 2019",
# 'xaxis' : {'title' : 'Month', 'tickformat':'%b %y'},
# 'yaxis' : {'title' : 'Proportion of Month Total','tickformat': '.0%'},
# 'barmode' :'stack'}
# fig = Figure(data = data, layout = layout)
# pyo.iplot(fig)
We can see the most unstable category is Meal & Entertainment. The rest of them are all good.
From the analysis of Parent Expense Types, we know that the three expense types below have most of the spendings. Their trend over time also shows a sudden growth in the end of 2018, and we are interested in this trend and would like to dig deeper into that.
Now, we plot the data on the image for above selected parent types and classify them according to sub categories.
# shorten the list of expense types
mapping_dic = {
# 'Groud': ['Taxi','Public Transport','Parking','Train','Personal Car Mileage','Fuel','Car Rental','Tolls/Road Charges'],
'Meal': ['Meal While Traveling (employee only)','zNOTUSEDLunch',
'Meal out-of-office (client/partners)','zNOTUSEDDinner',
'Team Meal Out-of-office (employee only)','Meal in-office (employee only)'],
'Entertainment': ['Entertainment Out-of-office (employee only)','Entertainment Out-of-office (with Client)']}
def my_func(x):
for category, item in mapping_dic.items():
for i in item:
if i == x:
return category
return x
oop2018['Expense Type'] = oop2018['Expense Type'].map(my_func)
# define a problem dataframe, which contains 3 parent types only
problem_df = (oop2018[oop2018['Parent Expense Type']
.apply(lambda x: x in ('01. Travel Expenses','02. Transportation', '03. Meals & Entertainment'))])
# temporary dataframe to show: amount, date and expense types
tem_df = (problem_df
.pivot_table('Expense Amount (rpt)','date', 'Expense Type', aggfunc=sum)
)
# amount rounded to 2 decimal places
for col in tem_df.columns:
tem_df[col] = tem_df[col].apply(lambda x: round(x,2))
# fill na
tem_df.fillna(0, inplace=True)
# vertical sum and sort by that row
tem_df.loc['Total',:] = tem_df.sum(axis=0)
tem_df = tem_df.sort_values('Total',ascending=False,axis=1)
tem_df.drop('Total', inplace=True)
# create another df for ranking
sup_df = tem_df.transpose()
for col in sup_df.columns:
sup_df[col] = sup_df[col].rank(ascending=False).astype(int)
sup_df = sup_df.transpose()
# change the column names
sup_df.columns = [col + '_rank' for col in sup_df.columns]
# append the original df
sup_df = pd.concat([tem_df,sup_df],axis=1)
# create text support for the chart
for col in tem_df:
sup_df[col + '_text'] = (sup_df
.apply(lambda x: "{date}<br><b>{Type}</b><br>Amount: ${amount:,.2f}<br>Ranking: {rank}"
.format(date=pd.to_datetime(x.name).strftime('%b %Y'),Type=col,amount=x[col],rank=x[col + '_rank']),axis=1))
ls = ['Meal','Taxi', 'Hotel','Entertainment', 'Personal Car Mileage',]
def chooseColour(type):
if type in ls:
return 'Red'
else:
return 'Grey'
def width(type):
if type in ls:
return 2
else:
return 1
traces = []
for col in tem_df.columns:
if col != 'date' and 'rank' not in col:
traces.append({'type' : 'scatter',
'x' : tem_df.index,
'y' : tem_df[col],
'name' : col,
'text' : sup_df[col + '_text'],
'line' : {'width' : width(col)},
'marker' : {'color' : chooseColour(col)},
'mode' : 'lines',
'opacity' : 0.7,
'hoverinfo' : 'text'
})
data = Data(traces)
layout = {'title' : 'Interesting Trend in 2018',
'xaxis' : {'title' : 'Month'},
'yaxis' : {'title' : 'Out-of-Pocket','tickformat':'$,0'},
'hovermode' : 'closest'}
fig = Figure(data = data, layout = layout)
pyo.iplot(fig)
In the first glance, we notice that:
For those types marked RED, I filter them out and create a new plot below for a clearer view.
def visible(col):
ls = ['Meal','Taxi', 'Hotel','Entertainment', 'Personal Car Mileage',]
for i in ls:
if i == col:
return True
return 'legendonly'
traces = []
for col in tem_df.columns:
if col != 'date' and 'rank' not in col:
traces.append({'type' : 'scatter',
'x' : tem_df.index,
'y' : tem_df[col],
'name' : col,
'mode' : 'lines',
'text' : sup_df[col + '_text'],
'hoverinfo' : 'text',
'visible' : visible(col)})
data = Data(traces)
layout = {'title' : 'Interesting Trend in 2018',
'xaxis' : {'title' : 'Month','tickformat':'%b %y'},
'yaxis' : {'title' : 'Out-of-Pocket','tickformat':'$,0'},
'hovermode' : 'closest'}
fig = Figure(data = data, layout = layout)
pyo.iplot(fig)
# get top n expense types
ls_exp_types = (problem_df
.groupby(['Expense Type'])['Expense Amount (rpt)']
.sum()
.sort_values(ascending=False)
.index[0:7])
# group smaller expense types into 'other misc types'
tem_df = problem_df.copy()
tem_df['Expense Type'] = tem_df['Expense Type'].apply(lambda x: x if x in ls_exp_types else 'other misc types')
# get pivot table
tem_df1 = (tem_df
.groupby(['Sent for Payment Date','Expense Type'])['Expense Amount (rpt)']
.sum()
.unstack())
# vertical sum and sort by that row
tem_df1.loc['Total',:] = tem_df1.sum(axis=0)
tem_df1 = tem_df1.sort_values('Total',ascending=False,axis=1)
tem_df1.drop('Total', inplace=True)
# round the numbers for formatting
for col in tem_df1.columns:
tem_df1[col] = tem_df1[col].apply(lambda x: round(x,2))
# plot
traces = []
for col in tem_df1.columns:
if col != 'Total':
traces.append({'type' : 'bar',
'name' : col,
'x' : tem_df1.index,
'y' : tem_df1[col],
'opacity' : 0.7})
layout = {'title' : "Top 6 Child Expense Types in 2018",
'xaxis' : {'title' : 'Employee','tickformat':'%b %y'},
'yaxis' : {'title' : 'Out-of-Pocket','tickformat':'$,0'},
'barmode' : 'stack'}
fig = {'data' : traces,
'layout' : layout}
pyo.iplot(fig)
tem_df1.index = pd.to_datetime(tem_df1.reset_index()['Sent for Payment Date']).dt.date
tem_df1.sort_index().style.background_gradient()
Data mining for the interesting trend
We want to know why there is a sudden increase at the end of the year. In other words, we need the root cause of them. In order to know that, I want to see the things listed:
Business Purposes for Out-Of-Pocket Expenses, Aug 2018 to Dec 2018
From the word cloud, we know that we have a large portion of expense entries in the end of 2018 spent for China GM Summit.
df.info()
df = pd.read_excel('Source Data - Reimbursement EDA - copy.xlsx')
filter_for_text = ['January','February','March','April','May','June','July',
'August','September','October','November','December','Dinner','Lunch','Breakfast','Expense Report','Clara Lim']
import numpy as np
def filter_function(x):
if pd.isna(x):
return np.nan
else:
for i in filter_for_text:
if i.lower() not in x.lower():
pass
else:
return np.nan
return x
import sys
from os import path
from PIL import Image
currdir = path.curdir
mask = np.array(Image.open(path.join(currdir, "cloud.png")))
time_ls = ['2018-12','2018-11','2018-10','2018-9']
endof2018_df = df[df['Sent for Payment Date'].apply(lambda x: x in time_ls)][df['Payment Type'] == 'Out-of-Pocket']
endof2018_df['Expense Type'] = endof2018_df['Expense Type'].map(my_func)
endof2018_df['Report Name'] = endof2018_df['Report Name'].astype(str)
endof2018_df['Purpose'] = endof2018_df['Purpose'].astype(str)
from wordcloud import WordCloud
text = " ".join(endof2018_df['Purpose'].apply(filter_function).dropna()) + \
" ".join(endof2018_df['Report Name'].apply(filter_function).dropna())
plt.subplots(figsize=(20,20))
wordcloud = WordCloud(background_color='white',
# colormap='ocean',
mask=mask,
max_words=100).generate(text)
plt.imshow(wordcloud)
plt.axis('off')
plt.show()
ls_exp_types = (endof2018_df
.groupby(['Expense Type'])['Expense Amount (rpt)']
.sum()
.sort_values(ascending=False)
.index[0:5])
tem_df = endof2018_df.copy()
tem_df['Expense Type'] = tem_df['Expense Type'].apply(lambda x: x if x in ls_exp_types else 'other misc types')
tem_df1 = (tem_df
.groupby(['Employee','Expense Type'])['Expense Amount (rpt)']
.sum()
.unstack())
tem_df1['Total'] = tem_df1.sum(axis=1)
tem_df1 = tem_df1.sort_values('Total',ascending=False).iloc[0:10,:]
for col in tem_df1.columns:
tem_df1[col] = tem_df1[col].apply(lambda x: round(x,2))
traces = []
for col in tem_df1.columns:
if col != 'Total':
traces.append({'type' : 'bar',
'name' : col,
'x' : tem_df1.index,
'y' : tem_df1[col],
'opacity' : 0.7})
layout = {'title' : "Top 10 Employee Spending, Sep 2018 to Dec 2018",
'xaxis' : {'title' : 'Employee','tickformat':'%b %y'},
'yaxis' : {'title' : 'USD Amount','tickformat':'$,0'},
'barmode' : 'stack'}
fig = {'data' : traces,
'layout' : layout}
pyo.iplot(fig)
Modification needed