import pandas as pd
raw_csv_data = pd.read_csv('D:\\OneDrive - office365hubs.com\\!Python + SQL + Tableau\\Course_Resources\\5. Preprocessing\Original\\Absenteeism_data.csv')
raw_csv_data
| ID | Reason for Absence | Date | Transportation Expense | Distance to Work | Age | Daily Work Load Average | Body Mass Index | Education | Children | Pets | Absenteeism Time in Hours | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 11 | 26 | 07/07/2015 | 289 | 36 | 33 | 239.554 | 30 | 1 | 2 | 1 | 4 |
| 1 | 36 | 0 | 14/07/2015 | 118 | 13 | 50 | 239.554 | 31 | 1 | 1 | 0 | 0 |
| 2 | 3 | 23 | 15/07/2015 | 179 | 51 | 38 | 239.554 | 31 | 1 | 0 | 0 | 2 |
| 3 | 7 | 7 | 16/07/2015 | 279 | 5 | 39 | 239.554 | 24 | 1 | 2 | 0 | 4 |
| 4 | 11 | 23 | 23/07/2015 | 289 | 36 | 33 | 239.554 | 30 | 1 | 2 | 1 | 2 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 695 | 17 | 10 | 23/05/2018 | 179 | 22 | 40 | 237.656 | 22 | 2 | 2 | 0 | 8 |
| 696 | 28 | 6 | 23/05/2018 | 225 | 26 | 28 | 237.656 | 24 | 1 | 1 | 2 | 3 |
| 697 | 18 | 10 | 24/05/2018 | 330 | 16 | 28 | 237.656 | 25 | 2 | 0 | 0 | 8 |
| 698 | 25 | 23 | 24/05/2018 | 235 | 16 | 32 | 237.656 | 25 | 3 | 0 | 0 | 2 |
| 699 | 15 | 28 | 31/05/2018 | 291 | 31 | 40 | 237.656 | 25 | 1 | 1 | 1 | 2 |
700 rows × 12 columns
df = raw_csv_data.copy()
df
| ID | Reason for Absence | Date | Transportation Expense | Distance to Work | Age | Daily Work Load Average | Body Mass Index | Education | Children | Pets | Absenteeism Time in Hours | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 11 | 26 | 07/07/2015 | 289 | 36 | 33 | 239.554 | 30 | 1 | 2 | 1 | 4 |
| 1 | 36 | 0 | 14/07/2015 | 118 | 13 | 50 | 239.554 | 31 | 1 | 1 | 0 | 0 |
| 2 | 3 | 23 | 15/07/2015 | 179 | 51 | 38 | 239.554 | 31 | 1 | 0 | 0 | 2 |
| 3 | 7 | 7 | 16/07/2015 | 279 | 5 | 39 | 239.554 | 24 | 1 | 2 | 0 | 4 |
| 4 | 11 | 23 | 23/07/2015 | 289 | 36 | 33 | 239.554 | 30 | 1 | 2 | 1 | 2 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 695 | 17 | 10 | 23/05/2018 | 179 | 22 | 40 | 237.656 | 22 | 2 | 2 | 0 | 8 |
| 696 | 28 | 6 | 23/05/2018 | 225 | 26 | 28 | 237.656 | 24 | 1 | 1 | 2 | 3 |
| 697 | 18 | 10 | 24/05/2018 | 330 | 16 | 28 | 237.656 | 25 | 2 | 0 | 0 | 8 |
| 698 | 25 | 23 | 24/05/2018 | 235 | 16 | 32 | 237.656 | 25 | 3 | 0 | 0 | 2 |
| 699 | 15 | 28 | 31/05/2018 | 291 | 31 | 40 | 237.656 | 25 | 1 | 1 | 1 | 2 |
700 rows × 12 columns
type(df)
pandas.core.frame.DataFrame
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 700 entries, 0 to 699
Data columns (total 12 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 ID 700 non-null int64
1 Reason for Absence 700 non-null int64
2 Date 700 non-null object
3 Transportation Expense 700 non-null int64
4 Distance to Work 700 non-null int64
5 Age 700 non-null int64
6 Daily Work Load Average 700 non-null float64
7 Body Mass Index 700 non-null int64
8 Education 700 non-null int64
9 Children 700 non-null int64
10 Pets 700 non-null int64
11 Absenteeism Time in Hours 700 non-null int64
dtypes: float64(1), int64(10), object(1)
memory usage: 65.8+ KB
display(df)
| ID | Reason for Absence | Date | Transportation Expense | Distance to Work | Age | Daily Work Load Average | Body Mass Index | Education | Children | Pets | Absenteeism Time in Hours | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 11 | 26 | 07/07/2015 | 289 | 36 | 33 | 239.554 | 30 | 1 | 2 | 1 | 4 |
| 1 | 36 | 0 | 14/07/2015 | 118 | 13 | 50 | 239.554 | 31 | 1 | 1 | 0 | 0 |
| 2 | 3 | 23 | 15/07/2015 | 179 | 51 | 38 | 239.554 | 31 | 1 | 0 | 0 | 2 |
| 3 | 7 | 7 | 16/07/2015 | 279 | 5 | 39 | 239.554 | 24 | 1 | 2 | 0 | 4 |
| 4 | 11 | 23 | 23/07/2015 | 289 | 36 | 33 | 239.554 | 30 | 1 | 2 | 1 | 2 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 695 | 17 | 10 | 23/05/2018 | 179 | 22 | 40 | 237.656 | 22 | 2 | 2 | 0 | 8 |
| 696 | 28 | 6 | 23/05/2018 | 225 | 26 | 28 | 237.656 | 24 | 1 | 1 | 2 | 3 |
| 697 | 18 | 10 | 24/05/2018 | 330 | 16 | 28 | 237.656 | 25 | 2 | 0 | 0 | 8 |
| 698 | 25 | 23 | 24/05/2018 | 235 | 16 | 32 | 237.656 | 25 | 3 | 0 | 0 | 2 |
| 699 | 15 | 28 | 31/05/2018 | 291 | 31 | 40 | 237.656 | 25 | 1 | 1 | 1 | 2 |
700 rows × 12 columns
df.drop(['ID'],axis = 1)
| Reason for Absence | Date | Transportation Expense | Distance to Work | Age | Daily Work Load Average | Body Mass Index | Education | Children | Pets | Absenteeism Time in Hours | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 26 | 07/07/2015 | 289 | 36 | 33 | 239.554 | 30 | 1 | 2 | 1 | 4 |
| 1 | 0 | 14/07/2015 | 118 | 13 | 50 | 239.554 | 31 | 1 | 1 | 0 | 0 |
| 2 | 23 | 15/07/2015 | 179 | 51 | 38 | 239.554 | 31 | 1 | 0 | 0 | 2 |
| 3 | 7 | 16/07/2015 | 279 | 5 | 39 | 239.554 | 24 | 1 | 2 | 0 | 4 |
| 4 | 23 | 23/07/2015 | 289 | 36 | 33 | 239.554 | 30 | 1 | 2 | 1 | 2 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 695 | 10 | 23/05/2018 | 179 | 22 | 40 | 237.656 | 22 | 2 | 2 | 0 | 8 |
| 696 | 6 | 23/05/2018 | 225 | 26 | 28 | 237.656 | 24 | 1 | 1 | 2 | 3 |
| 697 | 10 | 24/05/2018 | 330 | 16 | 28 | 237.656 | 25 | 2 | 0 | 0 | 8 |
| 698 | 23 | 24/05/2018 | 235 | 16 | 32 | 237.656 | 25 | 3 | 0 | 0 | 2 |
| 699 | 28 | 31/05/2018 | 291 | 31 | 40 | 237.656 | 25 | 1 | 1 | 1 | 2 |
700 rows × 11 columns
df = df.drop(['ID'],axis=1)
df
| Reason for Absence | Date | Transportation Expense | Distance to Work | Age | Daily Work Load Average | Body Mass Index | Education | Children | Pets | Absenteeism Time in Hours | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 26 | 07/07/2015 | 289 | 36 | 33 | 239.554 | 30 | 1 | 2 | 1 | 4 |
| 1 | 0 | 14/07/2015 | 118 | 13 | 50 | 239.554 | 31 | 1 | 1 | 0 | 0 |
| 2 | 23 | 15/07/2015 | 179 | 51 | 38 | 239.554 | 31 | 1 | 0 | 0 | 2 |
| 3 | 7 | 16/07/2015 | 279 | 5 | 39 | 239.554 | 24 | 1 | 2 | 0 | 4 |
| 4 | 23 | 23/07/2015 | 289 | 36 | 33 | 239.554 | 30 | 1 | 2 | 1 | 2 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 695 | 10 | 23/05/2018 | 179 | 22 | 40 | 237.656 | 22 | 2 | 2 | 0 | 8 |
| 696 | 6 | 23/05/2018 | 225 | 26 | 28 | 237.656 | 24 | 1 | 1 | 2 | 3 |
| 697 | 10 | 24/05/2018 | 330 | 16 | 28 | 237.656 | 25 | 2 | 0 | 0 | 8 |
| 698 | 23 | 24/05/2018 | 235 | 16 | 32 | 237.656 | 25 | 3 | 0 | 0 | 2 |
| 699 | 28 | 31/05/2018 | 291 | 31 | 40 | 237.656 | 25 | 1 | 1 | 1 | 2 |
700 rows × 11 columns
raw_csv_data
| ID | Reason for Absence | Date | Transportation Expense | Distance to Work | Age | Daily Work Load Average | Body Mass Index | Education | Children | Pets | Absenteeism Time in Hours | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 11 | 26 | 07/07/2015 | 289 | 36 | 33 | 239.554 | 30 | 1 | 2 | 1 | 4 |
| 1 | 36 | 0 | 14/07/2015 | 118 | 13 | 50 | 239.554 | 31 | 1 | 1 | 0 | 0 |
| 2 | 3 | 23 | 15/07/2015 | 179 | 51 | 38 | 239.554 | 31 | 1 | 0 | 0 | 2 |
| 3 | 7 | 7 | 16/07/2015 | 279 | 5 | 39 | 239.554 | 24 | 1 | 2 | 0 | 4 |
| 4 | 11 | 23 | 23/07/2015 | 289 | 36 | 33 | 239.554 | 30 | 1 | 2 | 1 | 2 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 695 | 17 | 10 | 23/05/2018 | 179 | 22 | 40 | 237.656 | 22 | 2 | 2 | 0 | 8 |
| 696 | 28 | 6 | 23/05/2018 | 225 | 26 | 28 | 237.656 | 24 | 1 | 1 | 2 | 3 |
| 697 | 18 | 10 | 24/05/2018 | 330 | 16 | 28 | 237.656 | 25 | 2 | 0 | 0 | 8 |
| 698 | 25 | 23 | 24/05/2018 | 235 | 16 | 32 | 237.656 | 25 | 3 | 0 | 0 | 2 |
| 699 | 15 | 28 | 31/05/2018 | 291 | 31 | 40 | 237.656 | 25 | 1 | 1 | 1 | 2 |
700 rows × 12 columns
df['Reason for Absence']
0 26
1 0
2 23
3 7
4 23
..
695 10
696 6
697 10
698 23
699 28
Name: Reason for Absence, Length: 700, dtype: int64
#Sorting a list of numbers
sorted(df['Reason for Absence'].unique())
[0,
1,
2,
3,
4,
5,
6,
7,
8,
9,
10,
11,
12,
13,
14,
15,
16,
17,
18,
19,
21,
22,
23,
24,
25,
26,
27,
28]
#Splitting a column in multiple dummies
#converts categorical variable into dummy variable get_dummies()
reason_columns = pd.get_dummies(df['Reason for Absence'])
reason_columns.head(10)
| 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 18 | 19 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
| 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
| 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
| 5 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
| 6 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
| 7 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
| 8 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 9 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
10 rows × 28 columns
reason_columns['check'] = reason_columns.sum(axis=1)
reason_columns.head(1)
| 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 19 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | check | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 |
1 rows × 29 columns
reason_columns['check'].sum(axis=0)
700
reason_columns['check'].unique()
array([1], dtype=int64)
reason_columns = reason_columns.drop(['check'],axis=1)
reason_columns.head(2)
| 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 18 | 19 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
| 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
2 rows × 28 columns
#Dummy variable and their statistical importance
#We are gona grop the first column with the reason 0
reason_columns = pd.get_dummies(df['Reason for Absence'],drop_first=True)
reason_columns
| 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | ... | 18 | 19 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
| 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
| 3 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 695 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 696 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 697 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 698 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
| 699 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
700 rows × 27 columns
#Grouping-Transforming dummy var in categorical variable
#group the reason of Absence
df.columns.values
array(['Reason for Absence', 'Date', 'Transportation Expense',
'Distance to Work', 'Age', 'Daily Work Load Average',
'Body Mass Index', 'Education', 'Children', 'Pets',
'Absenteeism Time in Hours'], dtype=object)
reason_columns.columns.values
array([ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,
18, 19, 21, 22, 23, 24, 25, 26, 27, 28], dtype=int64)
df = df.drop(['Reason for Absence'],axis=1)
df.head()
| Date | Transportation Expense | Distance to Work | Age | Daily Work Load Average | Body Mass Index | Education | Children | Pets | Absenteeism Time in Hours | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 07/07/2015 | 289 | 36 | 33 | 239.554 | 30 | 1 | 2 | 1 | 4 |
| 1 | 14/07/2015 | 118 | 13 | 50 | 239.554 | 31 | 1 | 1 | 0 | 0 |
| 2 | 15/07/2015 | 179 | 51 | 38 | 239.554 | 31 | 1 | 0 | 0 | 2 |
| 3 | 16/07/2015 | 279 | 5 | 39 | 239.554 | 24 | 1 | 2 | 0 | 4 |
| 4 | 23/07/2015 | 289 | 36 | 33 | 239.554 | 30 | 1 | 2 | 1 | 2 |
#Adding reason_columns data set to df data est
#Clasification or grouping
#Split the reason_columns data frame
reason_type_1 = reason_columns.loc[:,1:14].max(axis=1)
reason_type_2 = reason_columns.loc[:,15:17].max(axis=1)
reason_type_3 = reason_columns.loc[:,18:21].max(axis=1)
reason_type_4 = reason_columns.loc[:,22:].max(axis=1)
#Concatenating columns in Python
df.columns.values
array(['Date', 'Transportation Expense', 'Distance to Work', 'Age',
'Daily Work Load Average', 'Body Mass Index', 'Education',
'Children', 'Pets', 'Absenteeism Time in Hours'], dtype=object)
reason_columns.columns.values
array([ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,
18, 19, 21, 22, 23, 24, 25, 26, 27, 28], dtype=int64)
#joinnig the columns
df = pd.concat([df,reason_type_1,reason_type_2,
reason_type_3,reason_type_4],axis=1)
df.head()
| Date | Transportation Expense | Distance to Work | Age | Daily Work Load Average | Body Mass Index | Education | Children | Pets | Absenteeism Time in Hours | 0 | 1 | 2 | 3 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 07/07/2015 | 289 | 36 | 33 | 239.554 | 30 | 1 | 2 | 1 | 4 | 0 | 0 | 0 | 1 |
| 1 | 14/07/2015 | 118 | 13 | 50 | 239.554 | 31 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2 | 15/07/2015 | 179 | 51 | 38 | 239.554 | 31 | 1 | 0 | 0 | 2 | 0 | 0 | 0 | 1 |
| 3 | 16/07/2015 | 279 | 5 | 39 | 239.554 | 24 | 1 | 2 | 0 | 4 | 1 | 0 | 0 | 0 |
| 4 | 23/07/2015 | 289 | 36 | 33 | 239.554 | 30 | 1 | 2 | 1 | 2 | 0 | 0 | 0 | 1 |
df.columns.values
array(['Date', 'Transportation Expense', 'Distance to Work', 'Age',
'Daily Work Load Average', 'Body Mass Index', 'Education',
'Children', 'Pets', 'Absenteeism Time in Hours', 0, 1, 2, 3],
dtype=object)
column_names = ['Date', 'Transportation Expense', 'Distance to Work', 'Age',
'Daily Work Load Average', 'Body Mass Index', 'Education',
'Children', 'Pets', 'Absenteeism Time in Hours', 'Reason_1', 'Reason_2', 'Reason_3', 'Reason_4']
df.columns =column_names
df.head()
| Date | Transportation Expense | Distance to Work | Age | Daily Work Load Average | Body Mass Index | Education | Children | Pets | Absenteeism Time in Hours | Reason_1 | Reason_2 | Reason_3 | Reason_4 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 07/07/2015 | 289 | 36 | 33 | 239.554 | 30 | 1 | 2 | 1 | 4 | 0 | 0 | 0 | 1 |
| 1 | 14/07/2015 | 118 | 13 | 50 | 239.554 | 31 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2 | 15/07/2015 | 179 | 51 | 38 | 239.554 | 31 | 1 | 0 | 0 | 2 | 0 | 0 | 0 | 1 |
| 3 | 16/07/2015 | 279 | 5 | 39 | 239.554 | 24 | 1 | 2 | 0 | 4 | 1 | 0 | 0 | 0 |
| 4 | 23/07/2015 | 289 | 36 | 33 | 239.554 | 30 | 1 | 2 | 1 | 2 | 0 | 0 | 0 | 1 |
#Reorder the columns of a data frame
column_names_reordered = ['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4',
'Date', 'Transportation Expense', 'Distance to Work', 'Age',
'Daily Work Load Average', 'Body Mass Index', 'Education',
'Children', 'Pets', 'Absenteeism Time in Hours']
df = df[column_names_reordered]
df.head()
| Reason_1 | Reason_2 | Reason_3 | Reason_4 | Date | Transportation Expense | Distance to Work | Age | Daily Work Load Average | Body Mass Index | Education | Children | Pets | Absenteeism Time in Hours | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 0 | 1 | 07/07/2015 | 289 | 36 | 33 | 239.554 | 30 | 1 | 2 | 1 | 4 |
| 1 | 0 | 0 | 0 | 0 | 14/07/2015 | 118 | 13 | 50 | 239.554 | 31 | 1 | 1 | 0 | 0 |
| 2 | 0 | 0 | 0 | 1 | 15/07/2015 | 179 | 51 | 38 | 239.554 | 31 | 1 | 0 | 0 | 2 |
| 3 | 1 | 0 | 0 | 0 | 16/07/2015 | 279 | 5 | 39 | 239.554 | 24 | 1 | 2 | 0 | 4 |
| 4 | 0 | 0 | 0 | 1 | 23/07/2015 | 289 | 36 | 33 | 239.554 | 30 | 1 | 2 | 1 | 2 |
#Create a checkp[oint]
#For a checkpoint you have to create a copy of the data frame in the state that is now
df_reason_mod = df.copy()
df_reason_mod.head()
| Reason_1 | Reason_2 | Reason_3 | Reason_4 | Date | Transportation Expense | Distance to Work | Age | Daily Work Load Average | Body Mass Index | Education | Children | Pets | Absenteeism Time in Hours | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 0 | 1 | 07/07/2015 | 289 | 36 | 33 | 239.554 | 30 | 1 | 2 | 1 | 4 |
| 1 | 0 | 0 | 0 | 0 | 14/07/2015 | 118 | 13 | 50 | 239.554 | 31 | 1 | 1 | 0 | 0 |
| 2 | 0 | 0 | 0 | 1 | 15/07/2015 | 179 | 51 | 38 | 239.554 | 31 | 1 | 0 | 0 | 2 |
| 3 | 1 | 0 | 0 | 0 | 16/07/2015 | 279 | 5 | 39 | 239.554 | 24 | 1 | 2 | 0 | 4 |
| 4 | 0 | 0 | 0 | 1 | 23/07/2015 | 289 | 36 | 33 | 239.554 | 30 | 1 | 2 | 1 | 2 |
df_reason_mod.columns.values
array(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Date',
'Transportation Expense', 'Distance to Work', 'Age',
'Daily Work Load Average', 'Body Mass Index', 'Education',
'Children', 'Pets', 'Absenteeism Time in Hours'], dtype=object)
df_reason_mod['Date']
0 07/07/2015
1 14/07/2015
2 15/07/2015
3 16/07/2015
4 23/07/2015
...
695 23/05/2018
696 23/05/2018
697 24/05/2018
698 24/05/2018
699 31/05/2018
Name: Date, Length: 700, dtype: object
type(df_reason_mod['Date'][0])
str
#using pandas timestamp
#when doing the conversion from text to date specify the proper
#format of the date you will be working on
df_reason_mod['Date'] = pd.to_datetime(df_reason_mod['Date'],
format='%d/%m/%Y')
type(df_reason_mod['Date'])
pandas.core.series.Series
df_reason_mod.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 700 entries, 0 to 699
Data columns (total 14 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Reason_1 700 non-null uint8
1 Reason_2 700 non-null uint8
2 Reason_3 700 non-null uint8
3 Reason_4 700 non-null uint8
4 Date 700 non-null datetime64[ns]
5 Transportation Expense 700 non-null int64
6 Distance to Work 700 non-null int64
7 Age 700 non-null int64
8 Daily Work Load Average 700 non-null float64
9 Body Mass Index 700 non-null int64
10 Education 700 non-null int64
11 Children 700 non-null int64
12 Pets 700 non-null int64
13 Absenteeism Time in Hours 700 non-null int64
dtypes: datetime64[ns](1), float64(1), int64(8), uint8(4)
memory usage: 57.5 KB
df_reason_mod['Date'][0]
Timestamp('2015-07-07 00:00:00')
df_reason_mod['Date'][0].month
7
df_reason_mod.shape
(700, 14)
list_months = []
for i in range(700):
list_months.append(df_reason_mod['Date'][i].month)
list_months
[7,
7,
7,
7,
7,
7,
7,
7,
7,
7,
7,
7,
7,
7,
7,
7,
7,
7,
7,
8,
8,
8,
8,
8,
8,
8,
8,
8,
8,
8,
8,
8,
8,
8,
8,
8,
8,
8,
8,
8,
9,
9,
9,
9,
9,
9,
9,
9,
9,
9,
9,
9,
9,
9,
9,
9,
9,
9,
9,
9,
9,
9,
10,
10,
10,
10,
10,
10,
10,
10,
10,
10,
10,
10,
10,
10,
10,
10,
10,
10,
11,
11,
11,
11,
11,
11,
11,
11,
11,
11,
11,
11,
11,
11,
11,
11,
11,
11,
12,
12,
12,
12,
12,
12,
12,
12,
12,
12,
12,
12,
12,
12,
12,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
3,
3,
3,
3,
3,
3,
3,
3,
3,
3,
3,
3,
3,
3,
3,
3,
3,
3,
3,
3,
3,
3,
3,
3,
3,
3,
3,
3,
3,
4,
4,
4,
4,
4,
4,
4,
4,
4,
4,
4,
4,
4,
4,
4,
4,
4,
4,
4,
4,
5,
5,
5,
5,
5,
5,
5,
5,
5,
5,
5,
5,
5,
5,
5,
5,
6,
6,
6,
6,
6,
6,
6,
6,
6,
6,
6,
6,
6,
6,
6,
6,
7,
7,
7,
7,
7,
7,
7,
7,
7,
7,
7,
7,
7,
7,
7,
7,
8,
8,
8,
8,
8,
8,
8,
8,
8,
8,
8,
8,
8,
8,
8,
8,
8,
8,
9,
9,
9,
9,
9,
9,
9,
9,
9,
9,
9,
9,
9,
9,
9,
9,
9,
9,
9,
10,
10,
10,
10,
10,
10,
10,
10,
10,
10,
10,
10,
10,
10,
10,
10,
10,
10,
10,
10,
10,
10,
10,
10,
10,
10,
10,
10,
11,
11,
11,
11,
11,
11,
11,
11,
11,
11,
11,
11,
11,
11,
11,
11,
11,
11,
11,
11,
11,
11,
12,
12,
12,
12,
12,
12,
12,
12,
12,
12,
12,
12,
12,
12,
12,
12,
12,
12,
12,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
3,
3,
3,
3,
3,
3,
3,
3,
3,
3,
3,
3,
3,
3,
3,
3,
3,
3,
3,
3,
3,
3,
4,
4,
4,
4,
4,
4,
4,
4,
4,
4,
4,
4,
4,
5,
5,
5,
5,
5,
5,
5,
5,
5,
5,
5,
5,
5,
5,
5,
5,
6,
6,
6,
6,
6,
6,
6,
6,
6,
6,
6,
6,
6,
6,
6,
6,
6,
6,
6,
7,
7,
7,
7,
7,
7,
7,
7,
7,
7,
7,
7,
7,
7,
7,
7,
7,
7,
7,
7,
8,
8,
8,
8,
8,
8,
8,
8,
8,
8,
8,
8,
8,
8,
8,
9,
9,
9,
9,
9,
9,
9,
9,
9,
9,
9,
9,
10,
10,
10,
10,
10,
10,
10,
10,
10,
10,
10,
10,
10,
10,
10,
10,
10,
10,
10,
10,
10,
10,
10,
10,
10,
11,
11,
11,
11,
11,
11,
11,
11,
11,
11,
11,
11,
11,
11,
11,
11,
11,
11,
11,
11,
11,
11,
11,
12,
12,
12,
12,
12,
12,
12,
12,
12,
12,
12,
12,
12,
12,
12,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
3,
3,
3,
3,
3,
3,
3,
3,
3,
3,
3,
3,
3,
3,
3,
3,
3,
3,
3,
3,
3,
3,
3,
3,
3,
3,
3,
3,
3,
3,
3,
3,
3,
3,
3,
3,
4,
4,
4,
4,
4,
4,
4,
4,
4,
4,
4,
4,
4,
4,
4,
4,
4,
4,
4,
4,
5,
5,
5,
5,
5,
5,
5,
5,
5,
5,
5,
5,
5,
5,
5,
5,
5,
5,
5,
5,
5,
5,
5,
5,
5,
5]
len(list_months)
700
df_reason_mod['Month Value'] = list_months
df_reason_mod.head()
| Reason_1 | Reason_2 | Reason_3 | Reason_4 | Date | Transportation Expense | Distance to Work | Age | Daily Work Load Average | Body Mass Index | Education | Children | Pets | Absenteeism Time in Hours | Month Value | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 0 | 1 | 2015-07-07 | 289 | 36 | 33 | 239.554 | 30 | 1 | 2 | 1 | 4 | 7 |
| 1 | 0 | 0 | 0 | 0 | 2015-07-14 | 118 | 13 | 50 | 239.554 | 31 | 1 | 1 | 0 | 0 | 7 |
| 2 | 0 | 0 | 0 | 1 | 2015-07-15 | 179 | 51 | 38 | 239.554 | 31 | 1 | 0 | 0 | 2 | 7 |
| 3 | 1 | 0 | 0 | 0 | 2015-07-16 | 279 | 5 | 39 | 239.554 | 24 | 1 | 2 | 0 | 4 | 7 |
| 4 | 0 | 0 | 0 | 1 | 2015-07-23 | 289 | 36 | 33 | 239.554 | 30 | 1 | 2 | 1 | 2 | 7 |
df_reason_mod.columns.values
array(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Date',
'Transportation Expense', 'Distance to Work', 'Age',
'Daily Work Load Average', 'Body Mass Index', 'Education',
'Children', 'Pets', 'Absenteeism Time in Hours', 'Month Value'],
dtype=object)
df_reason_mod['Date'][699].weekday()
3
df_reason_mod['Date'][699]
Timestamp('2018-05-31 00:00:00')
def date_to_weekday(date_value):
return date_value.weekday()
df_reason_mod['Day of the Week'] = df_reason_mod['Date'].apply(date_to_weekday)
df_reason_mod.head()
| Reason_1 | Reason_2 | Reason_3 | Reason_4 | Date | Transportation Expense | Distance to Work | Age | Daily Work Load Average | Body Mass Index | Education | Children | Pets | Absenteeism Time in Hours | Month Value | Day of the Week | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 0 | 1 | 2015-07-07 | 289 | 36 | 33 | 239.554 | 30 | 1 | 2 | 1 | 4 | 7 | 1 |
| 1 | 0 | 0 | 0 | 0 | 2015-07-14 | 118 | 13 | 50 | 239.554 | 31 | 1 | 1 | 0 | 0 | 7 | 1 |
| 2 | 0 | 0 | 0 | 1 | 2015-07-15 | 179 | 51 | 38 | 239.554 | 31 | 1 | 0 | 0 | 2 | 7 | 2 |
| 3 | 1 | 0 | 0 | 0 | 2015-07-16 | 279 | 5 | 39 | 239.554 | 24 | 1 | 2 | 0 | 4 | 7 | 3 |
| 4 | 0 | 0 | 0 | 1 | 2015-07-23 | 289 | 36 | 33 | 239.554 | 30 | 1 | 2 | 1 | 2 | 7 | 3 |
df_reason_mod = df_reason_mod.drop(['Date'],axis=1)
df_reason_mod.columns.values
array(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4',
'Transportation Expense', 'Distance to Work', 'Age',
'Daily Work Load Average', 'Body Mass Index', 'Education',
'Children', 'Pets', 'Absenteeism Time in Hours', 'Month Value',
'Day of the Week'], dtype=object)
col_mod = ['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Month Value',
'Day of the Week','Transportation Expense', 'Distance to Work', 'Age',
'Daily Work Load Average', 'Body Mass Index', 'Education',
'Children', 'Pets', 'Absenteeism Time in Hours']
df_reason_mod = df_reason_mod[col_mod]
df_reason_mod.head()
| Reason_1 | Reason_2 | Reason_3 | Reason_4 | Month Value | Day of the Week | Transportation Expense | Distance to Work | Age | Daily Work Load Average | Body Mass Index | Education | Children | Pets | Absenteeism Time in Hours | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 0 | 1 | 7 | 1 | 289 | 36 | 33 | 239.554 | 30 | 1 | 2 | 1 | 4 |
| 1 | 0 | 0 | 0 | 0 | 7 | 1 | 118 | 13 | 50 | 239.554 | 31 | 1 | 1 | 0 | 0 |
| 2 | 0 | 0 | 0 | 1 | 7 | 2 | 179 | 51 | 38 | 239.554 | 31 | 1 | 0 | 0 | 2 |
| 3 | 1 | 0 | 0 | 0 | 7 | 3 | 279 | 5 | 39 | 239.554 | 24 | 1 | 2 | 0 | 4 |
| 4 | 0 | 0 | 0 | 1 | 7 | 3 | 289 | 36 | 33 | 239.554 | 30 | 1 | 2 | 1 | 2 |
df_reason_date_mod = df_reason_mod.copy()
df_reason_date_mod.head()
| Reason_1 | Reason_2 | Reason_3 | Reason_4 | Month Value | Day of the Week | Transportation Expense | Distance to Work | Age | Daily Work Load Average | Body Mass Index | Education | Children | Pets | Absenteeism Time in Hours | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 0 | 1 | 7 | 1 | 289 | 36 | 33 | 239.554 | 30 | 1 | 2 | 1 | 4 |
| 1 | 0 | 0 | 0 | 0 | 7 | 1 | 118 | 13 | 50 | 239.554 | 31 | 1 | 1 | 0 | 0 |
| 2 | 0 | 0 | 0 | 1 | 7 | 2 | 179 | 51 | 38 | 239.554 | 31 | 1 | 0 | 0 | 2 |
| 3 | 1 | 0 | 0 | 0 | 7 | 3 | 279 | 5 | 39 | 239.554 | 24 | 1 | 2 | 0 | 4 |
| 4 | 0 | 0 | 0 | 1 | 7 | 3 | 289 | 36 | 33 | 239.554 | 30 | 1 | 2 | 1 | 2 |
type(df_reason_date_mod['Transportation Expense'][0])
numpy.int64
type(df_reason_date_mod['Distance to Work'][0])
numpy.int64
type(df_reason_date_mod['Age'][0])
numpy.int64
type(df_reason_date_mod['Body Mass Index'][0])
numpy.int64
#transform Education into a dummy variable
df_reason_date_mod['Education'].unique()
array([1, 3, 2, 4], dtype=int64)
df_reason_date_mod['Education'].value_counts()
1 583
3 73
2 40
4 4
Name: Education, dtype: int64
df_reason_date_mod['Education'] = df_reason_date_mod['Education'].map({1:0,2:1,3:1,4:1})
df_reason_date_mod['Education'].unique()
array([0, 1], dtype=int64)
df_reason_date_mod['Education'].value_counts()
0 583
1 117
Name: Education, dtype: int64
df_preprocessed = df_reason_date_mod.copy()
df_preprocessed.head(10)
| Reason_1 | Reason_2 | Reason_3 | Reason_4 | Month Value | Day of the Week | Transportation Expense | Distance to Work | Age | Daily Work Load Average | Body Mass Index | Education | Children | Pets | Absenteeism Time in Hours | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 0 | 1 | 7 | 1 | 289 | 36 | 33 | 239.554 | 30 | 0 | 2 | 1 | 4 |
| 1 | 0 | 0 | 0 | 0 | 7 | 1 | 118 | 13 | 50 | 239.554 | 31 | 0 | 1 | 0 | 0 |
| 2 | 0 | 0 | 0 | 1 | 7 | 2 | 179 | 51 | 38 | 239.554 | 31 | 0 | 0 | 0 | 2 |
| 3 | 1 | 0 | 0 | 0 | 7 | 3 | 279 | 5 | 39 | 239.554 | 24 | 0 | 2 | 0 | 4 |
| 4 | 0 | 0 | 0 | 1 | 7 | 3 | 289 | 36 | 33 | 239.554 | 30 | 0 | 2 | 1 | 2 |
| 5 | 0 | 0 | 0 | 1 | 7 | 4 | 179 | 51 | 38 | 239.554 | 31 | 0 | 0 | 0 | 2 |
| 6 | 0 | 0 | 0 | 1 | 7 | 4 | 361 | 52 | 28 | 239.554 | 27 | 0 | 1 | 4 | 8 |
| 7 | 0 | 0 | 0 | 1 | 7 | 4 | 260 | 50 | 36 | 239.554 | 23 | 0 | 4 | 0 | 4 |
| 8 | 0 | 0 | 1 | 0 | 7 | 0 | 155 | 12 | 34 | 239.554 | 25 | 0 | 2 | 0 | 40 |
| 9 | 0 | 0 | 0 | 1 | 7 | 0 | 235 | 11 | 37 | 239.554 | 29 | 1 | 1 | 1 | 8 |
df_preprocessed.to_csv('Absenteeism_preprocessed.csv',index=False)
df_preprocessed.to_csv('D:\\OneDrive - office365hubs.com\\!Python + SQL + Tableau\\Absenteeism_preprocessed.csv',index=False)