Python Sql Tableau

Python Sql Tableau Integration

February 25, 2021 · 51 mins read
import pandas as pd
raw_csv_data = pd.read_csv('D:\\OneDrive - office365hubs.com\\!Python + SQL + Tableau\\Course_Resources\\5. Preprocessing\Original\\Absenteeism_data.csv')
raw_csv_data
ID Reason for Absence Date Transportation Expense Distance to Work Age Daily Work Load Average Body Mass Index Education Children Pets Absenteeism Time in Hours
0 11 26 07/07/2015 289 36 33 239.554 30 1 2 1 4
1 36 0 14/07/2015 118 13 50 239.554 31 1 1 0 0
2 3 23 15/07/2015 179 51 38 239.554 31 1 0 0 2
3 7 7 16/07/2015 279 5 39 239.554 24 1 2 0 4
4 11 23 23/07/2015 289 36 33 239.554 30 1 2 1 2
... ... ... ... ... ... ... ... ... ... ... ... ...
695 17 10 23/05/2018 179 22 40 237.656 22 2 2 0 8
696 28 6 23/05/2018 225 26 28 237.656 24 1 1 2 3
697 18 10 24/05/2018 330 16 28 237.656 25 2 0 0 8
698 25 23 24/05/2018 235 16 32 237.656 25 3 0 0 2
699 15 28 31/05/2018 291 31 40 237.656 25 1 1 1 2

700 rows × 12 columns

df = raw_csv_data.copy()
df
ID Reason for Absence Date Transportation Expense Distance to Work Age Daily Work Load Average Body Mass Index Education Children Pets Absenteeism Time in Hours
0 11 26 07/07/2015 289 36 33 239.554 30 1 2 1 4
1 36 0 14/07/2015 118 13 50 239.554 31 1 1 0 0
2 3 23 15/07/2015 179 51 38 239.554 31 1 0 0 2
3 7 7 16/07/2015 279 5 39 239.554 24 1 2 0 4
4 11 23 23/07/2015 289 36 33 239.554 30 1 2 1 2
... ... ... ... ... ... ... ... ... ... ... ... ...
695 17 10 23/05/2018 179 22 40 237.656 22 2 2 0 8
696 28 6 23/05/2018 225 26 28 237.656 24 1 1 2 3
697 18 10 24/05/2018 330 16 28 237.656 25 2 0 0 8
698 25 23 24/05/2018 235 16 32 237.656 25 3 0 0 2
699 15 28 31/05/2018 291 31 40 237.656 25 1 1 1 2

700 rows × 12 columns

type(df)
pandas.core.frame.DataFrame
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 700 entries, 0 to 699
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   ID                         700 non-null    int64  
 1   Reason for Absence         700 non-null    int64  
 2   Date                       700 non-null    object 
 3   Transportation Expense     700 non-null    int64  
 4   Distance to Work           700 non-null    int64  
 5   Age                        700 non-null    int64  
 6   Daily Work Load Average    700 non-null    float64
 7   Body Mass Index            700 non-null    int64  
 8   Education                  700 non-null    int64  
 9   Children                   700 non-null    int64  
 10  Pets                       700 non-null    int64  
 11  Absenteeism Time in Hours  700 non-null    int64  
dtypes: float64(1), int64(10), object(1)
memory usage: 65.8+ KB
display(df)
ID Reason for Absence Date Transportation Expense Distance to Work Age Daily Work Load Average Body Mass Index Education Children Pets Absenteeism Time in Hours
0 11 26 07/07/2015 289 36 33 239.554 30 1 2 1 4
1 36 0 14/07/2015 118 13 50 239.554 31 1 1 0 0
2 3 23 15/07/2015 179 51 38 239.554 31 1 0 0 2
3 7 7 16/07/2015 279 5 39 239.554 24 1 2 0 4
4 11 23 23/07/2015 289 36 33 239.554 30 1 2 1 2
... ... ... ... ... ... ... ... ... ... ... ... ...
695 17 10 23/05/2018 179 22 40 237.656 22 2 2 0 8
696 28 6 23/05/2018 225 26 28 237.656 24 1 1 2 3
697 18 10 24/05/2018 330 16 28 237.656 25 2 0 0 8
698 25 23 24/05/2018 235 16 32 237.656 25 3 0 0 2
699 15 28 31/05/2018 291 31 40 237.656 25 1 1 1 2

700 rows × 12 columns

df.drop(['ID'],axis = 1)
Reason for Absence Date Transportation Expense Distance to Work Age Daily Work Load Average Body Mass Index Education Children Pets Absenteeism Time in Hours
0 26 07/07/2015 289 36 33 239.554 30 1 2 1 4
1 0 14/07/2015 118 13 50 239.554 31 1 1 0 0
2 23 15/07/2015 179 51 38 239.554 31 1 0 0 2
3 7 16/07/2015 279 5 39 239.554 24 1 2 0 4
4 23 23/07/2015 289 36 33 239.554 30 1 2 1 2
... ... ... ... ... ... ... ... ... ... ... ...
695 10 23/05/2018 179 22 40 237.656 22 2 2 0 8
696 6 23/05/2018 225 26 28 237.656 24 1 1 2 3
697 10 24/05/2018 330 16 28 237.656 25 2 0 0 8
698 23 24/05/2018 235 16 32 237.656 25 3 0 0 2
699 28 31/05/2018 291 31 40 237.656 25 1 1 1 2

700 rows × 11 columns

df = df.drop(['ID'],axis=1)
df
Reason for Absence Date Transportation Expense Distance to Work Age Daily Work Load Average Body Mass Index Education Children Pets Absenteeism Time in Hours
0 26 07/07/2015 289 36 33 239.554 30 1 2 1 4
1 0 14/07/2015 118 13 50 239.554 31 1 1 0 0
2 23 15/07/2015 179 51 38 239.554 31 1 0 0 2
3 7 16/07/2015 279 5 39 239.554 24 1 2 0 4
4 23 23/07/2015 289 36 33 239.554 30 1 2 1 2
... ... ... ... ... ... ... ... ... ... ... ...
695 10 23/05/2018 179 22 40 237.656 22 2 2 0 8
696 6 23/05/2018 225 26 28 237.656 24 1 1 2 3
697 10 24/05/2018 330 16 28 237.656 25 2 0 0 8
698 23 24/05/2018 235 16 32 237.656 25 3 0 0 2
699 28 31/05/2018 291 31 40 237.656 25 1 1 1 2

700 rows × 11 columns

raw_csv_data
ID Reason for Absence Date Transportation Expense Distance to Work Age Daily Work Load Average Body Mass Index Education Children Pets Absenteeism Time in Hours
0 11 26 07/07/2015 289 36 33 239.554 30 1 2 1 4
1 36 0 14/07/2015 118 13 50 239.554 31 1 1 0 0
2 3 23 15/07/2015 179 51 38 239.554 31 1 0 0 2
3 7 7 16/07/2015 279 5 39 239.554 24 1 2 0 4
4 11 23 23/07/2015 289 36 33 239.554 30 1 2 1 2
... ... ... ... ... ... ... ... ... ... ... ... ...
695 17 10 23/05/2018 179 22 40 237.656 22 2 2 0 8
696 28 6 23/05/2018 225 26 28 237.656 24 1 1 2 3
697 18 10 24/05/2018 330 16 28 237.656 25 2 0 0 8
698 25 23 24/05/2018 235 16 32 237.656 25 3 0 0 2
699 15 28 31/05/2018 291 31 40 237.656 25 1 1 1 2

700 rows × 12 columns

df['Reason for Absence']
0      26
1       0
2      23
3       7
4      23
       ..
695    10
696     6
697    10
698    23
699    28
Name: Reason for Absence, Length: 700, dtype: int64
#Sorting a list of numbers
sorted(df['Reason for Absence'].unique())
[0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28]
#Splitting a column in multiple dummies
#converts categorical variable into dummy variable get_dummies()
reason_columns = pd.get_dummies(df['Reason for Absence'])
reason_columns.head(10)
0 1 2 3 4 5 6 7 8 9 ... 18 19 21 22 23 24 25 26 27 28
0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 1 0 0
1 1 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
2 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 1 0 0 0 0 0
3 0 0 0 0 0 0 0 1 0 0 ... 0 0 0 0 0 0 0 0 0 0
4 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 1 0 0 0 0 0
5 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 1 0 0 0 0 0
6 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 1 0 0 0 0 0 0
7 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 1 0 0 0 0 0
8 0 0 0 0 0 0 0 0 0 0 ... 0 1 0 0 0 0 0 0 0 0
9 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 1 0 0 0 0 0 0

10 rows × 28 columns

reason_columns['check'] = reason_columns.sum(axis=1)
reason_columns.head(1)
0 1 2 3 4 5 6 7 8 9 ... 19 21 22 23 24 25 26 27 28 check
0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 1 0 0 1

1 rows × 29 columns

reason_columns['check'].sum(axis=0)
700
reason_columns['check'].unique()
array([1], dtype=int64)
reason_columns = reason_columns.drop(['check'],axis=1)
reason_columns.head(2)
0 1 2 3 4 5 6 7 8 9 ... 18 19 21 22 23 24 25 26 27 28
0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 1 0 0
1 1 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

2 rows × 28 columns

#Dummy variable and their statistical importance
#We are gona grop the first column with the reason 0
reason_columns = pd.get_dummies(df['Reason for Absence'],drop_first=True)
reason_columns
1 2 3 4 5 6 7 8 9 10 ... 18 19 21 22 23 24 25 26 27 28
0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 1 0 0
1 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
2 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 1 0 0 0 0 0
3 0 0 0 0 0 0 1 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
4 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 1 0 0 0 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
695 0 0 0 0 0 0 0 0 0 1 ... 0 0 0 0 0 0 0 0 0 0
696 0 0 0 0 0 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
697 0 0 0 0 0 0 0 0 0 1 ... 0 0 0 0 0 0 0 0 0 0
698 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 1 0 0 0 0 0
699 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 1

700 rows × 27 columns

#Grouping-Transforming dummy var in categorical variable
#group the reason of Absence
df.columns.values
array(['Reason for Absence', 'Date', 'Transportation Expense',
       'Distance to Work', 'Age', 'Daily Work Load Average',
       'Body Mass Index', 'Education', 'Children', 'Pets',
       'Absenteeism Time in Hours'], dtype=object)
reason_columns.columns.values
array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 21, 22, 23, 24, 25, 26, 27, 28], dtype=int64)
df = df.drop(['Reason for Absence'],axis=1)
df.head()
Date Transportation Expense Distance to Work Age Daily Work Load Average Body Mass Index Education Children Pets Absenteeism Time in Hours
0 07/07/2015 289 36 33 239.554 30 1 2 1 4
1 14/07/2015 118 13 50 239.554 31 1 1 0 0
2 15/07/2015 179 51 38 239.554 31 1 0 0 2
3 16/07/2015 279 5 39 239.554 24 1 2 0 4
4 23/07/2015 289 36 33 239.554 30 1 2 1 2
#Adding reason_columns data set to df data est
#Clasification or grouping
#Split the reason_columns data frame
reason_type_1 = reason_columns.loc[:,1:14].max(axis=1)
reason_type_2 = reason_columns.loc[:,15:17].max(axis=1)
reason_type_3 = reason_columns.loc[:,18:21].max(axis=1)
reason_type_4 = reason_columns.loc[:,22:].max(axis=1)
#Concatenating columns in Python
df.columns.values
array(['Date', 'Transportation Expense', 'Distance to Work', 'Age',
       'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pets', 'Absenteeism Time in Hours'], dtype=object)
reason_columns.columns.values
array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 21, 22, 23, 24, 25, 26, 27, 28], dtype=int64)
#joinnig the columns 
df = pd.concat([df,reason_type_1,reason_type_2,
                reason_type_3,reason_type_4],axis=1)
df.head()
Date Transportation Expense Distance to Work Age Daily Work Load Average Body Mass Index Education Children Pets Absenteeism Time in Hours 0 1 2 3
0 07/07/2015 289 36 33 239.554 30 1 2 1 4 0 0 0 1
1 14/07/2015 118 13 50 239.554 31 1 1 0 0 0 0 0 0
2 15/07/2015 179 51 38 239.554 31 1 0 0 2 0 0 0 1
3 16/07/2015 279 5 39 239.554 24 1 2 0 4 1 0 0 0
4 23/07/2015 289 36 33 239.554 30 1 2 1 2 0 0 0 1
df.columns.values
array(['Date', 'Transportation Expense', 'Distance to Work', 'Age',
       'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pets', 'Absenteeism Time in Hours', 0, 1, 2, 3],
      dtype=object)
column_names = ['Date', 'Transportation Expense', 'Distance to Work', 'Age',
       'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pets', 'Absenteeism Time in Hours', 'Reason_1', 'Reason_2', 'Reason_3', 'Reason_4']
df.columns =column_names
df.head()
Date Transportation Expense Distance to Work Age Daily Work Load Average Body Mass Index Education Children Pets Absenteeism Time in Hours Reason_1 Reason_2 Reason_3 Reason_4
0 07/07/2015 289 36 33 239.554 30 1 2 1 4 0 0 0 1
1 14/07/2015 118 13 50 239.554 31 1 1 0 0 0 0 0 0
2 15/07/2015 179 51 38 239.554 31 1 0 0 2 0 0 0 1
3 16/07/2015 279 5 39 239.554 24 1 2 0 4 1 0 0 0
4 23/07/2015 289 36 33 239.554 30 1 2 1 2 0 0 0 1
#Reorder the columns of a data frame
column_names_reordered = ['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4',
                          'Date', 'Transportation Expense', 'Distance to Work', 'Age',
       'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pets', 'Absenteeism Time in Hours']
df = df[column_names_reordered]
df.head()
Reason_1 Reason_2 Reason_3 Reason_4 Date Transportation Expense Distance to Work Age Daily Work Load Average Body Mass Index Education Children Pets Absenteeism Time in Hours
0 0 0 0 1 07/07/2015 289 36 33 239.554 30 1 2 1 4
1 0 0 0 0 14/07/2015 118 13 50 239.554 31 1 1 0 0
2 0 0 0 1 15/07/2015 179 51 38 239.554 31 1 0 0 2
3 1 0 0 0 16/07/2015 279 5 39 239.554 24 1 2 0 4
4 0 0 0 1 23/07/2015 289 36 33 239.554 30 1 2 1 2

Create a checkpoint

#Create a checkp[oint]
#For a checkpoint you have to create a copy of the data frame in the state that is now
df_reason_mod = df.copy()
df_reason_mod.head()
Reason_1 Reason_2 Reason_3 Reason_4 Date Transportation Expense Distance to Work Age Daily Work Load Average Body Mass Index Education Children Pets Absenteeism Time in Hours
0 0 0 0 1 07/07/2015 289 36 33 239.554 30 1 2 1 4
1 0 0 0 0 14/07/2015 118 13 50 239.554 31 1 1 0 0
2 0 0 0 1 15/07/2015 179 51 38 239.554 31 1 0 0 2
3 1 0 0 0 16/07/2015 279 5 39 239.554 24 1 2 0 4
4 0 0 0 1 23/07/2015 289 36 33 239.554 30 1 2 1 2
df_reason_mod.columns.values
array(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Date',
       'Transportation Expense', 'Distance to Work', 'Age',
       'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pets', 'Absenteeism Time in Hours'], dtype=object)
df_reason_mod['Date']
0      07/07/2015
1      14/07/2015
2      15/07/2015
3      16/07/2015
4      23/07/2015
          ...    
695    23/05/2018
696    23/05/2018
697    24/05/2018
698    24/05/2018
699    31/05/2018
Name: Date, Length: 700, dtype: object
type(df_reason_mod['Date'][0])
str
#using pandas timestamp
#when doing the conversion from text to date specify the proper 
#format of the date you will be working on
df_reason_mod['Date'] = pd.to_datetime(df_reason_mod['Date'],
                                       format='%d/%m/%Y')
type(df_reason_mod['Date'])
pandas.core.series.Series
df_reason_mod.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 700 entries, 0 to 699
Data columns (total 14 columns):
 #   Column                     Non-Null Count  Dtype         
---  ------                     --------------  -----         
 0   Reason_1                   700 non-null    uint8         
 1   Reason_2                   700 non-null    uint8         
 2   Reason_3                   700 non-null    uint8         
 3   Reason_4                   700 non-null    uint8         
 4   Date                       700 non-null    datetime64[ns]
 5   Transportation Expense     700 non-null    int64         
 6   Distance to Work           700 non-null    int64         
 7   Age                        700 non-null    int64         
 8   Daily Work Load Average    700 non-null    float64       
 9   Body Mass Index            700 non-null    int64         
 10  Education                  700 non-null    int64         
 11  Children                   700 non-null    int64         
 12  Pets                       700 non-null    int64         
 13  Absenteeism Time in Hours  700 non-null    int64         
dtypes: datetime64[ns](1), float64(1), int64(8), uint8(4)
memory usage: 57.5 KB

Extract the Month value

df_reason_mod['Date'][0]
Timestamp('2015-07-07 00:00:00')
df_reason_mod['Date'][0].month
7
df_reason_mod.shape
(700, 14)
list_months = []
for i in range(700):
    list_months.append(df_reason_mod['Date'][i].month)
list_months
[7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5]
len(list_months)
700
df_reason_mod['Month Value'] = list_months
df_reason_mod.head()
Reason_1 Reason_2 Reason_3 Reason_4 Date Transportation Expense Distance to Work Age Daily Work Load Average Body Mass Index Education Children Pets Absenteeism Time in Hours Month Value
0 0 0 0 1 2015-07-07 289 36 33 239.554 30 1 2 1 4 7
1 0 0 0 0 2015-07-14 118 13 50 239.554 31 1 1 0 0 7
2 0 0 0 1 2015-07-15 179 51 38 239.554 31 1 0 0 2 7
3 1 0 0 0 2015-07-16 279 5 39 239.554 24 1 2 0 4 7
4 0 0 0 1 2015-07-23 289 36 33 239.554 30 1 2 1 2 7
df_reason_mod.columns.values
array(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Date',
       'Transportation Expense', 'Distance to Work', 'Age',
       'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pets', 'Absenteeism Time in Hours', 'Month Value'],
      dtype=object)

Extracting the day of the week

df_reason_mod['Date'][699].weekday()
3
df_reason_mod['Date'][699]
Timestamp('2018-05-31 00:00:00')
def date_to_weekday(date_value):
    return date_value.weekday()
df_reason_mod['Day of the Week'] = df_reason_mod['Date'].apply(date_to_weekday)
df_reason_mod.head()
Reason_1 Reason_2 Reason_3 Reason_4 Date Transportation Expense Distance to Work Age Daily Work Load Average Body Mass Index Education Children Pets Absenteeism Time in Hours Month Value Day of the Week
0 0 0 0 1 2015-07-07 289 36 33 239.554 30 1 2 1 4 7 1
1 0 0 0 0 2015-07-14 118 13 50 239.554 31 1 1 0 0 7 1
2 0 0 0 1 2015-07-15 179 51 38 239.554 31 1 0 0 2 7 2
3 1 0 0 0 2015-07-16 279 5 39 239.554 24 1 2 0 4 7 3
4 0 0 0 1 2015-07-23 289 36 33 239.554 30 1 2 1 2 7 3
df_reason_mod = df_reason_mod.drop(['Date'],axis=1)
df_reason_mod.columns.values
array(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4',
       'Transportation Expense', 'Distance to Work', 'Age',
       'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pets', 'Absenteeism Time in Hours', 'Month Value',
       'Day of the Week'], dtype=object)
col_mod = ['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Month Value',
       'Day of the Week','Transportation Expense', 'Distance to Work', 'Age',
       'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pets', 'Absenteeism Time in Hours']
df_reason_mod = df_reason_mod[col_mod]
df_reason_mod.head()
Reason_1 Reason_2 Reason_3 Reason_4 Month Value Day of the Week Transportation Expense Distance to Work Age Daily Work Load Average Body Mass Index Education Children Pets Absenteeism Time in Hours
0 0 0 0 1 7 1 289 36 33 239.554 30 1 2 1 4
1 0 0 0 0 7 1 118 13 50 239.554 31 1 1 0 0
2 0 0 0 1 7 2 179 51 38 239.554 31 1 0 0 2
3 1 0 0 0 7 3 279 5 39 239.554 24 1 2 0 4
4 0 0 0 1 7 3 289 36 33 239.554 30 1 2 1 2

Checkpoint after Date drop

df_reason_date_mod = df_reason_mod.copy()
df_reason_date_mod.head()
Reason_1 Reason_2 Reason_3 Reason_4 Month Value Day of the Week Transportation Expense Distance to Work Age Daily Work Load Average Body Mass Index Education Children Pets Absenteeism Time in Hours
0 0 0 0 1 7 1 289 36 33 239.554 30 1 2 1 4
1 0 0 0 0 7 1 118 13 50 239.554 31 1 1 0 0
2 0 0 0 1 7 2 179 51 38 239.554 31 1 0 0 2
3 1 0 0 0 7 3 279 5 39 239.554 24 1 2 0 4
4 0 0 0 1 7 3 289 36 33 239.554 30 1 2 1 2
type(df_reason_date_mod['Transportation Expense'][0])
numpy.int64
type(df_reason_date_mod['Distance to Work'][0])
numpy.int64
type(df_reason_date_mod['Age'][0])
numpy.int64
type(df_reason_date_mod['Body Mass Index'][0])
numpy.int64
#transform Education into a dummy variable
df_reason_date_mod['Education'].unique()
array([1, 3, 2, 4], dtype=int64)
df_reason_date_mod['Education'].value_counts()
1    583
3     73
2     40
4      4
Name: Education, dtype: int64
df_reason_date_mod['Education'] = df_reason_date_mod['Education'].map({1:0,2:1,3:1,4:1})
df_reason_date_mod['Education'].unique()
array([0, 1], dtype=int64)
df_reason_date_mod['Education'].value_counts()
0    583
1    117
Name: Education, dtype: int64

Final Checkpoint

df_preprocessed = df_reason_date_mod.copy()
df_preprocessed.head(10)
Reason_1 Reason_2 Reason_3 Reason_4 Month Value Day of the Week Transportation Expense Distance to Work Age Daily Work Load Average Body Mass Index Education Children Pets Absenteeism Time in Hours
0 0 0 0 1 7 1 289 36 33 239.554 30 0 2 1 4
1 0 0 0 0 7 1 118 13 50 239.554 31 0 1 0 0
2 0 0 0 1 7 2 179 51 38 239.554 31 0 0 0 2
3 1 0 0 0 7 3 279 5 39 239.554 24 0 2 0 4
4 0 0 0 1 7 3 289 36 33 239.554 30 0 2 1 2
5 0 0 0 1 7 4 179 51 38 239.554 31 0 0 0 2
6 0 0 0 1 7 4 361 52 28 239.554 27 0 1 4 8
7 0 0 0 1 7 4 260 50 36 239.554 23 0 4 0 4
8 0 0 1 0 7 0 155 12 34 239.554 25 0 2 0 40
9 0 0 0 1 7 0 235 11 37 239.554 29 1 1 1 8
df_preprocessed.to_csv('Absenteeism_preprocessed.csv',index=False)
df_preprocessed.to_csv('D:\\OneDrive - office365hubs.com\\!Python + SQL + Tableau\\Absenteeism_preprocessed.csv',index=False)