import pandas as pd

raw_csv_data = pd.read_csv('D:\\OneDrive - office365hubs.com\\!Python + SQL + Tableau\\Course_Resources\\5. Preprocessing\Original\\Absenteeism_data.csv')

raw_csv_data

	ID	Reason for Absence	Date	Transportation Expense	Distance to Work	Age	Daily Work Load Average	Body Mass Index	Education	Children	Pets	Absenteeism Time in Hours
0	11	26	07/07/2015	289	36	33	239.554	30	1	2	1	4
1	36	0	14/07/2015	118	13	50	239.554	31	1	1	0	0
2	3	23	15/07/2015	179	51	38	239.554	31	1	0	0	2
3	7	7	16/07/2015	279	5	39	239.554	24	1	2	0	4
4	11	23	23/07/2015	289	36	33	239.554	30	1	2	1	2
...	...	...	...	...	...	...	...	...	...	...	...	...
695	17	10	23/05/2018	179	22	40	237.656	22	2	2	0	8
696	28	6	23/05/2018	225	26	28	237.656	24	1	1	2	3
697	18	10	24/05/2018	330	16	28	237.656	25	2	0	0	8
698	25	23	24/05/2018	235	16	32	237.656	25	3	0	0	2
699	15	28	31/05/2018	291	31	40	237.656	25	1	1	1	2

700 rows × 12 columns

df = raw_csv_data.copy()

df

	ID	Reason for Absence	Date	Transportation Expense	Distance to Work	Age	Daily Work Load Average	Body Mass Index	Education	Children	Pets	Absenteeism Time in Hours
0	11	26	07/07/2015	289	36	33	239.554	30	1	2	1	4
1	36	0	14/07/2015	118	13	50	239.554	31	1	1	0	0
2	3	23	15/07/2015	179	51	38	239.554	31	1	0	0	2
3	7	7	16/07/2015	279	5	39	239.554	24	1	2	0	4
4	11	23	23/07/2015	289	36	33	239.554	30	1	2	1	2
...	...	...	...	...	...	...	...	...	...	...	...	...
695	17	10	23/05/2018	179	22	40	237.656	22	2	2	0	8
696	28	6	23/05/2018	225	26	28	237.656	24	1	1	2	3
697	18	10	24/05/2018	330	16	28	237.656	25	2	0	0	8
698	25	23	24/05/2018	235	16	32	237.656	25	3	0	0	2
699	15	28	31/05/2018	291	31	40	237.656	25	1	1	1	2

700 rows × 12 columns

type(df)

pandas.core.frame.DataFrame

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 700 entries, 0 to 699
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   ID                         700 non-null    int64  
 1   Reason for Absence         700 non-null    int64  
 2   Date                       700 non-null    object 
 3   Transportation Expense     700 non-null    int64  
 4   Distance to Work           700 non-null    int64  
 5   Age                        700 non-null    int64  
 6   Daily Work Load Average    700 non-null    float64
 7   Body Mass Index            700 non-null    int64  
 8   Education                  700 non-null    int64  
 9   Children                   700 non-null    int64  
 10  Pets                       700 non-null    int64  
 11  Absenteeism Time in Hours  700 non-null    int64  
dtypes: float64(1), int64(10), object(1)
memory usage: 65.8+ KB

display(df)

	ID	Reason for Absence	Date	Transportation Expense	Distance to Work	Age	Daily Work Load Average	Body Mass Index	Education	Children	Pets	Absenteeism Time in Hours
0	11	26	07/07/2015	289	36	33	239.554	30	1	2	1	4
1	36	0	14/07/2015	118	13	50	239.554	31	1	1	0	0
2	3	23	15/07/2015	179	51	38	239.554	31	1	0	0	2
3	7	7	16/07/2015	279	5	39	239.554	24	1	2	0	4
4	11	23	23/07/2015	289	36	33	239.554	30	1	2	1	2
...	...	...	...	...	...	...	...	...	...	...	...	...
695	17	10	23/05/2018	179	22	40	237.656	22	2	2	0	8
696	28	6	23/05/2018	225	26	28	237.656	24	1	1	2	3
697	18	10	24/05/2018	330	16	28	237.656	25	2	0	0	8
698	25	23	24/05/2018	235	16	32	237.656	25	3	0	0	2
699	15	28	31/05/2018	291	31	40	237.656	25	1	1	1	2

700 rows × 12 columns

df.drop(['ID'],axis = 1)

	Reason for Absence	Date	Transportation Expense	Distance to Work	Age	Daily Work Load Average	Body Mass Index	Education	Children	Pets	Absenteeism Time in Hours
0	26	07/07/2015	289	36	33	239.554	30	1	2	1	4
1	0	14/07/2015	118	13	50	239.554	31	1	1	0	0
2	23	15/07/2015	179	51	38	239.554	31	1	0	0	2
3	7	16/07/2015	279	5	39	239.554	24	1	2	0	4
4	23	23/07/2015	289	36	33	239.554	30	1	2	1	2
...	...	...	...	...	...	...	...	...	...	...	...
695	10	23/05/2018	179	22	40	237.656	22	2	2	0	8
696	6	23/05/2018	225	26	28	237.656	24	1	1	2	3
697	10	24/05/2018	330	16	28	237.656	25	2	0	0	8
698	23	24/05/2018	235	16	32	237.656	25	3	0	0	2
699	28	31/05/2018	291	31	40	237.656	25	1	1	1	2

700 rows × 11 columns

df = df.drop(['ID'],axis=1)

df

	Reason for Absence	Date	Transportation Expense	Distance to Work	Age	Daily Work Load Average	Body Mass Index	Education	Children	Pets	Absenteeism Time in Hours
0	26	07/07/2015	289	36	33	239.554	30	1	2	1	4
1	0	14/07/2015	118	13	50	239.554	31	1	1	0	0
2	23	15/07/2015	179	51	38	239.554	31	1	0	0	2
3	7	16/07/2015	279	5	39	239.554	24	1	2	0	4
4	23	23/07/2015	289	36	33	239.554	30	1	2	1	2
...	...	...	...	...	...	...	...	...	...	...	...
695	10	23/05/2018	179	22	40	237.656	22	2	2	0	8
696	6	23/05/2018	225	26	28	237.656	24	1	1	2	3
697	10	24/05/2018	330	16	28	237.656	25	2	0	0	8
698	23	24/05/2018	235	16	32	237.656	25	3	0	0	2
699	28	31/05/2018	291	31	40	237.656	25	1	1	1	2

700 rows × 11 columns

raw_csv_data

	ID	Reason for Absence	Date	Transportation Expense	Distance to Work	Age	Daily Work Load Average	Body Mass Index	Education	Children	Pets	Absenteeism Time in Hours
0	11	26	07/07/2015	289	36	33	239.554	30	1	2	1	4
1	36	0	14/07/2015	118	13	50	239.554	31	1	1	0	0
2	3	23	15/07/2015	179	51	38	239.554	31	1	0	0	2
3	7	7	16/07/2015	279	5	39	239.554	24	1	2	0	4
4	11	23	23/07/2015	289	36	33	239.554	30	1	2	1	2
...	...	...	...	...	...	...	...	...	...	...	...	...
695	17	10	23/05/2018	179	22	40	237.656	22	2	2	0	8
696	28	6	23/05/2018	225	26	28	237.656	24	1	1	2	3
697	18	10	24/05/2018	330	16	28	237.656	25	2	0	0	8
698	25	23	24/05/2018	235	16	32	237.656	25	3	0	0	2
699	15	28	31/05/2018	291	31	40	237.656	25	1	1	1	2

700 rows × 12 columns

df['Reason for Absence']

    26
     0
    23
     7
    23
       ..
  10
   6
  10
  23
  28
Name: Reason for Absence, Length: 700, dtype: int64

#Sorting a list of numbers
sorted(df['Reason for Absence'].unique())

[0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28]

#Splitting a column in multiple dummies
#converts categorical variable into dummy variable get_dummies()
reason_columns = pd.get_dummies(df['Reason for Absence'])
reason_columns.head(10)

	0	7	...	19	22	23	26
0	0	0	...	0	0	0	1
1	1	0	...	0	0	0	0
2	0	0	...	0	0	1	0
3	0	1	...	0	0	0	0
4	0	0	...	0	0	1	0
5	0	0	...	0	0	1	0
6	0	0	...	0	1	0	0
7	0	0	...	0	0	1	0
8	0	0	...	1	0	0	0
9	0	0	...	0	1	0	0

10 rows × 28 columns

reason_columns['check'] = reason_columns.sum(axis=1)

reason_columns.head(1)

	0	1	2	3	4	5	6	7	8	9	...	19	21	22	23	24	25	26	27	28	check
0	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	1	0	0	1

1 rows × 29 columns

reason_columns['check'].sum(axis=0)

reason_columns['check'].unique()

array([1], dtype=int64)

reason_columns = reason_columns.drop(['check'],axis=1)

reason_columns.head(2)

	0	1	2	3	4	5	6	7	8	9	...	18	19	21	22	23	24	25	26	27	28
0	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	1	0	0
1	1	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0

2 rows × 28 columns

#Dummy variable and their statistical importance
#We are gona grop the first column with the reason 0
reason_columns = pd.get_dummies(df['Reason for Absence'],drop_first=True)

reason_columns

	1	2	3	4	5	6	7	8	9	10	...	18	19	21	22	23	24	25	26	27	28
0	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	1	0	0
1	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
2	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	1	0	0	0	0	0
3	0	0	0	0	0	0	1	0	0	0	...	0	0	0	0	0	0	0	0	0	0
4	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	1	0	0	0	0	0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
695	0	0	0	0	0	0	0	0	0	1	...	0	0	0	0	0	0	0	0	0	0
696	0	0	0	0	0	1	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
697	0	0	0	0	0	0	0	0	0	1	...	0	0	0	0	0	0	0	0	0	0
698	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	1	0	0	0	0	0
699	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	1

700 rows × 27 columns

#Grouping-Transforming dummy var in categorical variable
#group the reason of Absence
df.columns.values

array(['Reason for Absence', 'Date', 'Transportation Expense',
       'Distance to Work', 'Age', 'Daily Work Load Average',
       'Body Mass Index', 'Education', 'Children', 'Pets',
       'Absenteeism Time in Hours'], dtype=object)

reason_columns.columns.values

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 21, 22, 23, 24, 25, 26, 27, 28], dtype=int64)

df = df.drop(['Reason for Absence'],axis=1)

df.head()

	Date	Transportation Expense	Distance to Work	Age	Daily Work Load Average	Body Mass Index	Education	Children	Pets	Absenteeism Time in Hours
0	07/07/2015	289	36	33	239.554	30	1	2	1	4
1	14/07/2015	118	13	50	239.554	31	1	1	0	0
2	15/07/2015	179	51	38	239.554	31	1	0	0	2
3	16/07/2015	279	5	39	239.554	24	1	2	0	4
4	23/07/2015	289	36	33	239.554	30	1	2	1	2

#Adding reason_columns data set to df data est
#Clasification or grouping
#Split the reason_columns data frame
reason_type_1 = reason_columns.loc[:,1:14].max(axis=1)
reason_type_2 = reason_columns.loc[:,15:17].max(axis=1)
reason_type_3 = reason_columns.loc[:,18:21].max(axis=1)
reason_type_4 = reason_columns.loc[:,22:].max(axis=1)

#Concatenating columns in Python
df.columns.values

array(['Date', 'Transportation Expense', 'Distance to Work', 'Age',
       'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pets', 'Absenteeism Time in Hours'], dtype=object)

reason_columns.columns.values

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 21, 22, 23, 24, 25, 26, 27, 28], dtype=int64)

#joinnig the columns 
df = pd.concat([df,reason_type_1,reason_type_2,
                reason_type_3,reason_type_4],axis=1)

df.head()

	Date	Transportation Expense	Distance to Work	Age	Daily Work Load Average	Body Mass Index	Education	Children	Pets	Absenteeism Time in Hours	0	3
0	07/07/2015	289	36	33	239.554	30	1	2	1	4	0	1
1	14/07/2015	118	13	50	239.554	31	1	1	0	0	0	0
2	15/07/2015	179	51	38	239.554	31	1	0	0	2	0	1
3	16/07/2015	279	5	39	239.554	24	1	2	0	4	1	0
4	23/07/2015	289	36	33	239.554	30	1	2	1	2	0	1

df.columns.values

array(['Date', 'Transportation Expense', 'Distance to Work', 'Age',
       'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pets', 'Absenteeism Time in Hours', 0, 1, 2, 3],
      dtype=object)

column_names = ['Date', 'Transportation Expense', 'Distance to Work', 'Age',
       'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pets', 'Absenteeism Time in Hours', 'Reason_1', 'Reason_2', 'Reason_3', 'Reason_4']

df.columns =column_names

df.head()

	Date	Transportation Expense	Distance to Work	Age	Daily Work Load Average	Body Mass Index	Education	Children	Pets	Absenteeism Time in Hours	Reason_1	Reason_4
0	07/07/2015	289	36	33	239.554	30	1	2	1	4	0	1
1	14/07/2015	118	13	50	239.554	31	1	1	0	0	0	0
2	15/07/2015	179	51	38	239.554	31	1	0	0	2	0	1
3	16/07/2015	279	5	39	239.554	24	1	2	0	4	1	0
4	23/07/2015	289	36	33	239.554	30	1	2	1	2	0	1

#Reorder the columns of a data frame
column_names_reordered = ['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4',
                          'Date', 'Transportation Expense', 'Distance to Work', 'Age',
       'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pets', 'Absenteeism Time in Hours']

df = df[column_names_reordered]

df.head()

	Reason_1	Reason_4	Date	Transportation Expense	Distance to Work	Age	Daily Work Load Average	Body Mass Index	Education	Children	Pets	Absenteeism Time in Hours
0	0	1	07/07/2015	289	36	33	239.554	30	1	2	1	4
1	0	0	14/07/2015	118	13	50	239.554	31	1	1	0	0
2	0	1	15/07/2015	179	51	38	239.554	31	1	0	0	2
3	1	0	16/07/2015	279	5	39	239.554	24	1	2	0	4
4	0	1	23/07/2015	289	36	33	239.554	30	1	2	1	2

Create a checkpoint

#Create a checkp[oint]
#For a checkpoint you have to create a copy of the data frame in the state that is now
df_reason_mod = df.copy()

df_reason_mod.head()

	Reason_1	Reason_4	Date	Transportation Expense	Distance to Work	Age	Daily Work Load Average	Body Mass Index	Education	Children	Pets	Absenteeism Time in Hours
0	0	1	07/07/2015	289	36	33	239.554	30	1	2	1	4
1	0	0	14/07/2015	118	13	50	239.554	31	1	1	0	0
2	0	1	15/07/2015	179	51	38	239.554	31	1	0	0	2
3	1	0	16/07/2015	279	5	39	239.554	24	1	2	0	4
4	0	1	23/07/2015	289	36	33	239.554	30	1	2	1	2

df_reason_mod.columns.values

array(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Date',
       'Transportation Expense', 'Distance to Work', 'Age',
       'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pets', 'Absenteeism Time in Hours'], dtype=object)

df_reason_mod['Date']

    07/07/2015
    14/07/2015
    15/07/2015
    16/07/2015
    23/07/2015
          ...    
  23/05/2018
  23/05/2018
  24/05/2018
  24/05/2018
  31/05/2018
Name: Date, Length: 700, dtype: object

type(df_reason_mod['Date'][0])

str

#using pandas timestamp
#when doing the conversion from text to date specify the proper 
#format of the date you will be working on
df_reason_mod['Date'] = pd.to_datetime(df_reason_mod['Date'],
                                       format='%d/%m/%Y')

type(df_reason_mod['Date'])

pandas.core.series.Series

df_reason_mod.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 700 entries, 0 to 699
Data columns (total 14 columns):
 #   Column                     Non-Null Count  Dtype         
---  ------                     --------------  -----         
 0   Reason_1                   700 non-null    uint8         
 1   Reason_2                   700 non-null    uint8         
 2   Reason_3                   700 non-null    uint8         
 3   Reason_4                   700 non-null    uint8         
 4   Date                       700 non-null    datetime64[ns]
 5   Transportation Expense     700 non-null    int64         
 6   Distance to Work           700 non-null    int64         
 7   Age                        700 non-null    int64         
 8   Daily Work Load Average    700 non-null    float64       
 9   Body Mass Index            700 non-null    int64         
 10  Education                  700 non-null    int64         
 11  Children                   700 non-null    int64         
 12  Pets                       700 non-null    int64         
 13  Absenteeism Time in Hours  700 non-null    int64         
dtypes: datetime64[ns](1), float64(1), int64(8), uint8(4)
memory usage: 57.5 KB

Extract the Month value

df_reason_mod['Date'][0]

Timestamp('2015-07-07 00:00:00')

df_reason_mod['Date'][0].month

df_reason_mod.shape

(700, 14)

list_months = []
for i in range(700):
    list_months.append(df_reason_mod['Date'][i].month)

list_months

[7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5]

len(list_months)

df_reason_mod['Month Value'] = list_months

df_reason_mod.head()

	Reason_1	Reason_4	Date	Transportation Expense	Distance to Work	Age	Daily Work Load Average	Body Mass Index	Education	Children	Pets	Absenteeism Time in Hours	Month Value
0	0	1	2015-07-07	289	36	33	239.554	30	1	2	1	4	7
1	0	0	2015-07-14	118	13	50	239.554	31	1	1	0	0	7
2	0	1	2015-07-15	179	51	38	239.554	31	1	0	0	2	7
3	1	0	2015-07-16	279	5	39	239.554	24	1	2	0	4	7
4	0	1	2015-07-23	289	36	33	239.554	30	1	2	1	2	7

df_reason_mod.columns.values

array(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Date',
       'Transportation Expense', 'Distance to Work', 'Age',
       'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pets', 'Absenteeism Time in Hours', 'Month Value'],
      dtype=object)

Extracting the day of the week

df_reason_mod['Date'][699].weekday()

df_reason_mod['Date'][699]

Timestamp('2018-05-31 00:00:00')

def date_to_weekday(date_value):
    return date_value.weekday()

df_reason_mod['Day of the Week'] = df_reason_mod['Date'].apply(date_to_weekday)

df_reason_mod.head()

	Reason_1	Reason_4	Date	Transportation Expense	Distance to Work	Age	Daily Work Load Average	Body Mass Index	Education	Children	Pets	Absenteeism Time in Hours	Month Value	Day of the Week
0	0	1	2015-07-07	289	36	33	239.554	30	1	2	1	4	7	1
1	0	0	2015-07-14	118	13	50	239.554	31	1	1	0	0	7	1
2	0	1	2015-07-15	179	51	38	239.554	31	1	0	0	2	7	2
3	1	0	2015-07-16	279	5	39	239.554	24	1	2	0	4	7	3
4	0	1	2015-07-23	289	36	33	239.554	30	1	2	1	2	7	3

df_reason_mod = df_reason_mod.drop(['Date'],axis=1)

df_reason_mod.columns.values

array(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4',
       'Transportation Expense', 'Distance to Work', 'Age',
       'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pets', 'Absenteeism Time in Hours', 'Month Value',
       'Day of the Week'], dtype=object)

col_mod = ['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Month Value',
       'Day of the Week','Transportation Expense', 'Distance to Work', 'Age',
       'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pets', 'Absenteeism Time in Hours']

df_reason_mod = df_reason_mod[col_mod]

df_reason_mod.head()

	Reason_1	Reason_4	Month Value	Day of the Week	Transportation Expense	Distance to Work	Age	Daily Work Load Average	Body Mass Index	Education	Children	Pets	Absenteeism Time in Hours
0	0	1	7	1	289	36	33	239.554	30	1	2	1	4
1	0	0	7	1	118	13	50	239.554	31	1	1	0	0
2	0	1	7	2	179	51	38	239.554	31	1	0	0	2
3	1	0	7	3	279	5	39	239.554	24	1	2	0	4
4	0	1	7	3	289	36	33	239.554	30	1	2	1	2

Checkpoint after Date drop

df_reason_date_mod = df_reason_mod.copy()

df_reason_date_mod.head()

	Reason_1	Reason_4	Month Value	Day of the Week	Transportation Expense	Distance to Work	Age	Daily Work Load Average	Body Mass Index	Education	Children	Pets	Absenteeism Time in Hours
0	0	1	7	1	289	36	33	239.554	30	1	2	1	4
1	0	0	7	1	118	13	50	239.554	31	1	1	0	0
2	0	1	7	2	179	51	38	239.554	31	1	0	0	2
3	1	0	7	3	279	5	39	239.554	24	1	2	0	4
4	0	1	7	3	289	36	33	239.554	30	1	2	1	2

type(df_reason_date_mod['Transportation Expense'][0])

numpy.int64

type(df_reason_date_mod['Distance to Work'][0])

numpy.int64

type(df_reason_date_mod['Age'][0])

numpy.int64

type(df_reason_date_mod['Body Mass Index'][0])

numpy.int64

#transform Education into a dummy variable
df_reason_date_mod['Education'].unique()

array([1, 3, 2, 4], dtype=int64)

df_reason_date_mod['Education'].value_counts()

  583
   73
   40
    4
Name: Education, dtype: int64

df_reason_date_mod['Education'] = df_reason_date_mod['Education'].map({1:0,2:1,3:1,4:1})

df_reason_date_mod['Education'].unique()

array([0, 1], dtype=int64)

df_reason_date_mod['Education'].value_counts()

0    583
1    117
Name: Education, dtype: int64

Final Checkpoint

df_preprocessed = df_reason_date_mod.copy()

df_preprocessed.head(10)

	Reason_1	Reason_3	Reason_4	Month Value	Day of the Week	Transportation Expense	Distance to Work	Age	Daily Work Load Average	Body Mass Index	Education	Children	Pets	Absenteeism Time in Hours
0	0	0	1	7	1	289	36	33	239.554	30	0	2	1	4
1	0	0	0	7	1	118	13	50	239.554	31	0	1	0	0
2	0	0	1	7	2	179	51	38	239.554	31	0	0	0	2
3	1	0	0	7	3	279	5	39	239.554	24	0	2	0	4
4	0	0	1	7	3	289	36	33	239.554	30	0	2	1	2
5	0	0	1	7	4	179	51	38	239.554	31	0	0	0	2
6	0	0	1	7	4	361	52	28	239.554	27	0	1	4	8
7	0	0	1	7	4	260	50	36	239.554	23	0	4	0	4
8	0	1	0	7	0	155	12	34	239.554	25	0	2	0	40
9	0	0	1	7	0	235	11	37	239.554	29	1	1	1	8

df_preprocessed.to_csv('Absenteeism_preprocessed.csv',index=False)

df_preprocessed.to_csv('D:\\OneDrive - office365hubs.com\\!Python + SQL + Tableau\\Absenteeism_preprocessed.csv',index=False)

← Previous Post Next Post →

Python Sql Tableau

Python Sql Tableau Integration

Create a checkpoint

Extract the Month value

Extracting the day of the week

Checkpoint after Date drop

Final Checkpoint

	0	7	...	19	22	23	26
0	0	0	...	0	0	0	1
1	1	0	...	0	0	0	0
2	0	0	...	0	0	1	0
3	0	1	...	0	0	0	0
4	0	0	...	0	0	1	0
5	0	0	...	0	0	1	0
6	0	0	...	0	1	0	0
7	0	0	...	0	0	1	0
8	0	0	...	1	0	0	0
9	0	0	...	0	1	0	0

	1	2	3	4	5	6	7	8	9	10	...	18	19	21	22	23	24	25	26	27	28
0	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	1	0	0
1	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
2	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	1	0	0	0	0	0
3	0	0	0	0	0	0	1	0	0	0	...	0	0	0	0	0	0	0	0	0	0
4	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	1	0	0	0	0	0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
695	0	0	0	0	0	0	0	0	0	1	...	0	0	0	0	0	0	0	0	0	0
696	0	0	0	0	0	1	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
697	0	0	0	0	0	0	0	0	0	1	...	0	0	0	0	0	0	0	0	0	0
698	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	1	0	0	0	0	0
699	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	1

	0	7	...	19	22	23	26
0	0	0	...	0	0	0	1
1	1	0	...	0	0	0	0
2	0	0	...	0	0	1	0
3	0	1	...	0	0	0	0
4	0	0	...	0	0	1	0
5	0	0	...	0	0	1	0
6	0	0	...	0	1	0	0
7	0	0	...	0	0	1	0
8	0	0	...	1	0	0	0
9	0	0	...	0	1	0	0

	1	2	3	4	5	6	7	8	9	10	...	18	19	21	22	23	24	25	26	27	28
0	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	1	0	0
1	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
2	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	1	0	0	0	0	0
3	0	0	0	0	0	0	1	0	0	0	...	0	0	0	0	0	0	0	0	0	0
4	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	1	0	0	0	0	0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
695	0	0	0	0	0	0	0	0	0	1	...	0	0	0	0	0	0	0	0	0	0
696	0	0	0	0	0	1	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
697	0	0	0	0	0	0	0	0	0	1	...	0	0	0	0	0	0	0	0	0	0
698	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	1	0	0	0	0	0
699	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	1

	0	7	...	19	22	23	26
0	0	0	...	0	0	0	1
1	1	0	...	0	0	0	0
2	0	0	...	0	0	1	0
3	0	1	...	0	0	0	0
4	0	0	...	0	0	1	0
5	0	0	...	0	0	1	0
6	0	0	...	0	1	0	0
7	0	0	...	0	0	1	0
8	0	0	...	1	0	0	0
9	0	0	...	0	1	0	0

	1	2	3	4	5	6	7	8	9	10	...	18	19	21	22	23	24	25	26	27	28
0	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	1	0	0
1	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
2	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	1	0	0	0	0	0
3	0	0	0	0	0	0	1	0	0	0	...	0	0	0	0	0	0	0	0	0	0
4	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	1	0	0	0	0	0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
695	0	0	0	0	0	0	0	0	0	1	...	0	0	0	0	0	0	0	0	0	0
696	0	0	0	0	0	1	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
697	0	0	0	0	0	0	0	0	0	1	...	0	0	0	0	0	0	0	0	0	0
698	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	1	0	0	0	0	0
699	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	1