# import the relevant libraries
import pandas as pd
import numpy as np
# load the preprocessed CSV data
data_preprocessed = pd.read_csv('Absenteeism_preprocessed.csv')
# eyeball the data
data_preprocessed.head()
| | Reason_1 | Reason_2 | Reason_3 | Reason_4 | Month Value | Day of the Week | Transportation Expense | Distance to Work | Age | Daily Work Load Average | Body Mass Index | Education | Children | Pets | Absenteeism Time in Hours |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 0 | 1 | 7 | 1 | 289 | 36 | 33 | 239.554 | 30 | 0 | 2 | 1 | 4 |
| 1 | 0 | 0 | 0 | 0 | 7 | 1 | 118 | 13 | 50 | 239.554 | 31 | 0 | 1 | 0 | 0 |
| 2 | 0 | 0 | 0 | 1 | 7 | 2 | 179 | 51 | 38 | 239.554 | 31 | 0 | 0 | 0 | 2 |
| 3 | 1 | 0 | 0 | 0 | 7 | 3 | 279 | 5 | 39 | 239.554 | 24 | 0 | 2 | 0 | 4 |
| 4 | 0 | 0 | 0 | 1 | 7 | 3 | 289 | 36 | 33 | 239.554 | 30 | 0 | 2 | 1 | 2 |
# find the median of 'Absenteeism Time in Hours'
data_preprocessed['Absenteeism Time in Hours'].median()
3.0
# create targets for our logistic regression
# they have to be categorical, so we need a rule that decides whether someone is 'being absent too much' or not
# what we've decided to do is take the median of the dataset as a cut-off line
# in this way the dataset will be balanced (there will be a roughly equal number of 0s and 1s for the logistic regression)
# since class imbalance is a common problem in ML, a balanced dataset works in our favor
# alternatively, if we had more data, we could have found other ways to deal with the issue
# for instance, we could have assigned some arbitrary value as a cut-off line, instead of the median
# note that this line assigns 1 to anyone who has been absent more than 3 hours (i.e. 4 hours or more)
# that is the equivalent of taking half a day off
# initial code from the lecture
# targets = np.where(data_preprocessed['Absenteeism Time in Hours'] > 3, 1, 0)
# parameterized code
targets = np.where(data_preprocessed['Absenteeism Time in Hours'] >
                   data_preprocessed['Absenteeism Time in Hours'].median(), 1, 0)
# eyeball the targets
targets
array([1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0,
1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1,
0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1,
0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0,
1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1,
0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1,
1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1,
0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0,
0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0,
0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1,
1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0,
1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0,
1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1,
1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1,
1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1,
0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1,
1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1,
1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1,
1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1,
1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0,
1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1,
0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0,
0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1,
0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0,
1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1,
1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0])
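# quick aside (a toy sketch, not part of the original lecture): how np.where builds such a 0/1 array,
# using the first five 'Absenteeism Time in Hours' values from the head() above
hours = np.array([4, 0, 2, 4, 2])
np.where(hours > 3, 1, 0)  # -> array([1, 0, 0, 1, 0]), matching the first five targets above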
# create a new column in the original data frame that will contain the targets for the regression
data_preprocessed['Excessive Absenteeism'] = targets
# check what happened
# maybe manually see how the targets were created
data_preprocessed.head()
| | Reason_1 | Reason_2 | Reason_3 | Reason_4 | Month Value | Day of the Week | Transportation Expense | Distance to Work | Age | Daily Work Load Average | Body Mass Index | Education | Children | Pets | Absenteeism Time in Hours | Excessive Absenteeism |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 0 | 1 | 7 | 1 | 289 | 36 | 33 | 239.554 | 30 | 0 | 2 | 1 | 4 | 1 |
| 1 | 0 | 0 | 0 | 0 | 7 | 1 | 118 | 13 | 50 | 239.554 | 31 | 0 | 1 | 0 | 0 | 0 |
| 2 | 0 | 0 | 0 | 1 | 7 | 2 | 179 | 51 | 38 | 239.554 | 31 | 0 | 0 | 0 | 2 | 0 |
| 3 | 1 | 0 | 0 | 0 | 7 | 3 | 279 | 5 | 39 | 239.554 | 24 | 0 | 2 | 0 | 4 | 1 |
| 4 | 0 | 0 | 0 | 1 | 7 | 3 | 289 | 36 | 33 | 239.554 | 30 | 0 | 2 | 1 | 2 | 0 |
# check if the dataset is balanced (i.e. what share of the targets are 1s)
# targets.sum() gives us the number of 1s
# shape[0] gives us the length of the targets array
targets.sum() / targets.shape[0]
0.45571428571428574
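# an equivalent check (a sketch, not from the lecture): pandas can show both class shares at once
pd.Series(targets).value_counts(normalize=True)  # -> roughly 0.54 for the 0s and 0.46 for the 1s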
# create a checkpoint by dropping the unnecessary variables
# also drop the variables we 'eliminated' after exploring the weights
data_with_targets = data_preprocessed.drop(['Absenteeism Time in Hours', 'Day of the Week',
                                            'Daily Work Load Average', 'Distance to Work'], axis=1)
# check that the line above really created a checkpoint :)
# if 'data_with_targets is data_preprocessed' evaluates to True, the two names point to the same object
# if it is False, they are two distinct objects, so this is in fact a checkpoint
data_with_targets is data_preprocessed
False
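# aside (a sketch, not from the lecture): the explicit way to create a checkpoint without dropping
# any columns is the copy() method
checkpoint = data_preprocessed.copy()  # an independent copy of the data frame
checkpoint is data_preprocessed        # -> False, so this, too, is a genuine checkpoint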
# check what's inside
data_with_targets.head()
| | Reason_1 | Reason_2 | Reason_3 | Reason_4 | Month Value | Transportation Expense | Age | Body Mass Index | Education | Children | Pets | Excessive Absenteeism |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 0 | 1 | 7 | 289 | 33 | 30 | 0 | 2 | 1 | 1 |
| 1 | 0 | 0 | 0 | 0 | 7 | 118 | 50 | 31 | 0 | 1 | 0 | 0 |
| 2 | 0 | 0 | 0 | 1 | 7 | 179 | 38 | 31 | 0 | 0 | 0 | 0 |
| 3 | 1 | 0 | 0 | 0 | 7 | 279 | 39 | 24 | 0 | 2 | 0 | 1 |
| 4 | 0 | 0 | 0 | 1 | 7 | 289 | 33 | 30 | 0 | 2 | 1 | 0 |
data_with_targets.shape
(700, 12)
# Selects all rows and all columns until the 14th (excluding); since this checkpoint has only 12 columns,
# that returns everything, including the targets
data_with_targets.iloc[:,:14]
| | Reason_1 | Reason_2 | Reason_3 | Reason_4 | Month Value | Transportation Expense | Age | Body Mass Index | Education | Children | Pets | Excessive Absenteeism |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 0 | 1 | 7 | 289 | 33 | 30 | 0 | 2 | 1 | 1 |
| 1 | 0 | 0 | 0 | 0 | 7 | 118 | 50 | 31 | 0 | 1 | 0 | 0 |
| 2 | 0 | 0 | 0 | 1 | 7 | 179 | 38 | 31 | 0 | 0 | 0 | 0 |
| 3 | 1 | 0 | 0 | 0 | 7 | 279 | 39 | 24 | 0 | 2 | 0 | 1 |
| 4 | 0 | 0 | 0 | 1 | 7 | 289 | 33 | 30 | 0 | 2 | 1 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 695 | 1 | 0 | 0 | 0 | 5 | 179 | 40 | 22 | 1 | 2 | 0 | 1 |
| 696 | 1 | 0 | 0 | 0 | 5 | 225 | 28 | 24 | 0 | 1 | 2 | 0 |
| 697 | 1 | 0 | 0 | 0 | 5 | 330 | 28 | 25 | 1 | 0 | 0 | 1 |
| 698 | 0 | 0 | 0 | 1 | 5 | 235 | 32 | 25 | 1 | 0 | 0 | 0 |
| 699 | 0 | 0 | 0 | 1 | 5 | 291 | 40 | 25 | 0 | 1 | 1 | 0 |
700 rows × 12 columns
# Selects all rows and all columns but the last one - this is what we actually need: the inputs without the targets
data_with_targets.iloc[:,:-1]
| | Reason_1 | Reason_2 | Reason_3 | Reason_4 | Month Value | Transportation Expense | Age | Body Mass Index | Education | Children | Pets |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 0 | 1 | 7 | 289 | 33 | 30 | 0 | 2 | 1 |
| 1 | 0 | 0 | 0 | 0 | 7 | 118 | 50 | 31 | 0 | 1 | 0 |
| 2 | 0 | 0 | 0 | 1 | 7 | 179 | 38 | 31 | 0 | 0 | 0 |
| 3 | 1 | 0 | 0 | 0 | 7 | 279 | 39 | 24 | 0 | 2 | 0 |
| 4 | 0 | 0 | 0 | 1 | 7 | 289 | 33 | 30 | 0 | 2 | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 695 | 1 | 0 | 0 | 0 | 5 | 179 | 40 | 22 | 1 | 2 | 0 |
| 696 | 1 | 0 | 0 | 0 | 5 | 225 | 28 | 24 | 0 | 1 | 2 |
| 697 | 1 | 0 | 0 | 0 | 5 | 330 | 28 | 25 | 1 | 0 | 0 |
| 698 | 0 | 0 | 0 | 1 | 5 | 235 | 32 | 25 | 1 | 0 | 0 |
| 699 | 0 | 0 | 0 | 1 | 5 | 291 | 40 | 25 | 0 | 1 | 1 |
700 rows × 11 columns
# Create a variable that will contain the inputs (everything without the targets)
unscaled_inputs = data_with_targets.iloc[:,:-1]
# standardize the inputs
# standardization is one of the most common preprocessing tools
# since features of very different magnitudes (scales) can bias the model towards the high-valued ones,
# we want all inputs to be of a similar magnitude
# this is a peculiarity of machine learning in general - most (but not all) algorithms do badly with unscaled data
# a very useful class we can use is StandardScaler
# it is more capable than the straightforward functions in the 'preprocessing' module
from sklearn.preprocessing import StandardScaler
# we will create a variable that will contain the scaling information for this particular dataset
# here's the full documentation: http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html
# define scaler as an object
absenteeism_scaler = StandardScaler()
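# aside (a minimal sketch, not from the lecture): standardization subtracts the column mean and
# divides by the column standard deviation, so each scaled column ends up with mean 0 and std 1
x = np.array([289., 118., 179., 279.])  # e.g. a few 'Transportation Expense' values
x_scaled = (x - x.mean()) / x.std()     # this is exactly what StandardScaler computes per column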
# import the libraries needed to create the Custom Scaler
# note that all of them are a part of the sklearn package
# moreover, one of them is actually the StandardScaler module,
# so you can imagine that the Custom Scaler is built on it
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler
# create the Custom Scaler class
class CustomScaler(BaseEstimator, TransformerMixin):

    # init, or what information we need to declare a CustomScaler object
    # and what is calculated/declared as we do
    def __init__(self, columns, copy=True, with_mean=True, with_std=True):
        # store the parameters under their own names, so sklearn's get_params() can retrieve them
        self.columns = columns
        self.copy = copy
        self.with_mean = with_mean
        self.with_std = with_std
        # the scaler is nothing but a StandardScaler object, with a 'columns' twist
        self.scaler = StandardScaler(copy=copy, with_mean=with_mean, with_std=with_std)
        self.mean_ = None
        self.var_ = None

    # the fit method, which, again, is based on StandardScaler
    def fit(self, X, y=None):
        self.scaler.fit(X[self.columns], y)
        self.mean_ = np.mean(X[self.columns])
        self.var_ = np.var(X[self.columns])
        return self

    # the transform method, which does the actual scaling
    def transform(self, X, y=None, copy=None):
        # record the initial order of the columns
        init_col_order = X.columns
        # scale all features that you chose when creating the instance of the class
        # (keep the original index so the concatenation below aligns rows correctly)
        X_scaled = pd.DataFrame(self.scaler.transform(X[self.columns]),
                                columns=self.columns, index=X.index)
        # declare a variable containing all the columns that were not scaled
        X_not_scaled = X.loc[:, ~X.columns.isin(self.columns)]
        # return a data frame which contains all scaled and all 'not scaled' features,
        # restored to the original column order (recorded at the beginning)
        return pd.concat([X_not_scaled, X_scaled], axis=1)[init_col_order]
# check all the columns that we've got
unscaled_inputs.columns.values
array(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Month Value',
'Transportation Expense', 'Age', 'Body Mass Index', 'Education',
'Children', 'Pets'], dtype=object)
# choose the columns to scale
# we initially hard-coded this list (kept below in comments) and later replaced it with the approach underneath
# columns_to_scale = ['Month Value', 'Day of the Week', 'Transportation Expense', 'Distance to Work',
#                     'Age', 'Daily Work Load Average', 'Body Mass Index', 'Children', 'Pets']
# select the columns to omit
columns_to_omit = ['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4','Education']
# create the columns to scale, based on the columns to omit
# use list comprehension to iterate over the list
columns_to_scale = [x for x in unscaled_inputs.columns.values if x not in columns_to_omit]
# declare a scaler object, specifying the columns you want to scale
absenteeism_scaler = CustomScaler(columns_to_scale)
# fit the data (calculate mean and standard deviation); they are automatically stored inside the object
absenteeism_scaler.fit(unscaled_inputs)
CustomScaler(columns=['Month Value', 'Transportation Expense', 'Age',
                      'Body Mass Index', 'Children', 'Pets'],
             copy=True, with_mean=True, with_std=True)
# standardize the data, using the transform method
# in the previous cell, we fitted the data - in other words,
# we found the internal parameters of the model that will be used to transform the data
# transforming applies these parameters to our data
# note that when you get new data, you can simply call the scaler again and transform it in the same way as now
scaled_inputs = absenteeism_scaler.transform(unscaled_inputs)
# note that, thanks to our CustomScaler, scaled_inputs is still a DataFrame;
# a plain StandardScaler would have returned an ndarray, because sklearn works with ndarrays
scaled_inputs
| | Reason_1 | Reason_2 | Reason_3 | Reason_4 | Month Value | Transportation Expense | Age | Body Mass Index | Education | Children | Pets |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 0 | 1 | 0.182726 | 1.005844 | -0.536062 | 0.767431 | 0 | 0.880469 | 0.268487 |
| 1 | 0 | 0 | 0 | 0 | 0.182726 | -1.574681 | 2.130803 | 1.002633 | 0 | -0.019280 | -0.589690 |
| 2 | 0 | 0 | 0 | 1 | 0.182726 | -0.654143 | 0.248310 | 1.002633 | 0 | -0.919030 | -0.589690 |
| 3 | 1 | 0 | 0 | 0 | 0.182726 | 0.854936 | 0.405184 | -0.643782 | 0 | 0.880469 | -0.589690 |
| 4 | 0 | 0 | 0 | 1 | 0.182726 | 1.005844 | -0.536062 | 0.767431 | 0 | 0.880469 | 0.268487 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 695 | 1 | 0 | 0 | 0 | -0.388293 | -0.654143 | 0.562059 | -1.114186 | 1 | 0.880469 | -0.589690 |
| 696 | 1 | 0 | 0 | 0 | -0.388293 | 0.040034 | -1.320435 | -0.643782 | 0 | -0.019280 | 1.126663 |
| 697 | 1 | 0 | 0 | 0 | -0.388293 | 1.624567 | -1.320435 | -0.408580 | 1 | -0.919030 | -0.589690 |
| 698 | 0 | 0 | 0 | 1 | -0.388293 | 0.190942 | -0.692937 | -0.408580 | 1 | -0.919030 | -0.589690 |
| 699 | 0 | 0 | 0 | 1 | -0.388293 | 1.036026 | 0.562059 | -0.408580 | 0 | -0.019280 | 0.268487 |
700 rows × 11 columns
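# aside (a sketch, not from the lecture): this is also how brand-new observations would be scaled
# at deployment time; here the first row of unscaled_inputs stands in for genuinely new data
new_data = unscaled_inputs.iloc[:1].copy()           # hypothetical 'new' observation
new_scaled = absenteeism_scaler.transform(new_data)  # reuses the mean/std stored by fit()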
# check the shape of the inputs
scaled_inputs.shape
(700, 11)
# import train_test_split so we can split our data into train and test
from sklearn.model_selection import train_test_split
# check how this method works (with default arguments it shuffles the data and uses a 75/25 train/test split)
train_test_split(scaled_inputs, targets)
[ Reason_1 Reason_2 Reason_3 Reason_4 Month Value \
208 0 0 1 0 -0.388293
593 0 0 0 1 -1.244823
209 1 0 0 0 -0.388293
11 1 0 0 0 0.182726
290 0 0 0 1 1.039256
.. ... ... ... ... ...
363 0 0 0 1 -1.530333
566 0 0 0 1 1.610276
475 0 0 0 1 0.182726
167 1 0 0 0 -0.959313
351 0 0 0 1 1.610276
Transportation Expense Age Body Mass Index Education Children \
208 0.040034 -1.320435 -0.643782 0 -0.019280
593 -0.654143 0.248310 1.002633 0 -0.919030
209 -0.578689 -1.477309 -1.349389 0 -0.919030
11 0.568211 -0.065439 -0.878984 0 2.679969
290 0.190942 0.091435 0.532229 1 -0.019280
.. ... ... ... ... ...
363 1.005844 -0.536062 0.767431 0 0.880469
566 0.040034 -1.320435 -0.643782 0 -0.019280
475 1.005844 -0.536062 0.767431 0 0.880469
167 -1.016322 -0.379188 -0.408580 0 0.880469
351 -0.654143 0.248310 1.002633 0 -0.919030
Pets
208 1.126663
593 -0.589690
209 -0.589690
11 -0.589690
290 0.268487
.. ...
363 0.268487
566 1.126663
475 0.268487
167 -0.589690
351 -0.589690
[525 rows x 11 columns],
Reason_1 Reason_2 Reason_3 Reason_4 Month Value \
653 0 0 0 1 -0.959313
592 0 0 0 1 -1.244823
396 0 0 0 1 -0.959313
498 0 0 0 1 0.753746
694 0 0 0 1 -0.388293
.. ... ... ... ... ...
18 1 0 0 0 0.182726
327 1 0 0 0 1.324766
3 1 0 0 0 0.182726
437 0 0 0 1 -0.388293
150 0 0 0 1 -1.244823
Transportation Expense Age Body Mass Index Education Children \
653 2.213108 -0.849811 -0.408580 0 1.780219
592 0.040034 0.718933 0.297027 1 0.880469
396 -0.654143 0.562059 -1.114186 1 0.880469
498 -1.016322 -0.379188 -0.408580 0 0.880469
694 1.036026 0.562059 -0.408580 0 -0.019280
.. ... ... ... ... ...
18 -0.503235 -0.536062 -0.408580 0 0.880469
327 -1.574681 0.091435 0.297027 0 -0.919030
3 0.854936 0.405184 -0.643782 0 0.880469
437 -0.654143 -1.006686 -1.819793 1 -0.919030
150 0.040034 -1.320435 -0.643782 0 -0.019280
Pets
653 -0.589690
592 1.126663
396 -0.589690
498 -0.589690
694 0.268487
.. ...
18 1.126663
327 -0.589690
3 -0.589690
437 -0.589690
150 1.126663
[175 rows x 11 columns],
array([1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0,
0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1,
1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,
1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0,
0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1,
0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0,
1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0,
1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1,
0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0,
0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1,
0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1,
1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1,
0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1,
0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1,
0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0,
0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1,
0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0,
0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1,
1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0,
1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0,
1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0]),
array([1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0,
1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0,
1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1,
0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0,
1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0,
0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0,
0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1,
1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0])]
# declare 4 variables for the split
x_train, x_test, y_train, y_test = train_test_split(scaled_inputs, targets, #train_size = 0.8,
                                                    test_size = 0.2, random_state = 20)
# check the shape of the train inputs and targets
print(x_train.shape, y_train.shape)
(560, 11) (560,)
# check the shape of the test inputs and targets
print(x_test.shape, y_test.shape)
(140, 11) (140,)
# import the LogReg model from sklearn
from sklearn.linear_model import LogisticRegression
# import the 'metrics' module, which includes important metrics we may want to use
from sklearn import metrics
# create a logistic regression object
reg = LogisticRegression()
# fit our train inputs
# that is basically the whole training part of the machine learning process
reg.fit(x_train,y_train)
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
intercept_scaling=1, l1_ratio=None, max_iter=100,
multi_class='auto', n_jobs=None, penalty='l2',
random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
warm_start=False)
# assess the train accuracy of the model
reg.score(x_train,y_train)
0.7732142857142857
# get the outputs (predictions) of the model on the train inputs
model_outputs = reg.predict(x_train)
model_outputs
array([0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0,
0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1,
1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0,
0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0,
0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0,
0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0,
1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0,
0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1,
1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1,
1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1,
0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1,
0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0,
0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0,
1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1,
0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1,
0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0,
1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0,
0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0,
0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0,
0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0,
1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0,
0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0,
0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1,
0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0,
0, 1, 0, 1, 1, 1, 0, 0, 0, 0])
# compare them with the targets
y_train
array([0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0,
1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1,
1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0,
0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1,
1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0,
0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1,
0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0,
0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1,
1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0,
1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0,
0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0,
0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0,
1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0,
0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1,
1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1,
0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0,
1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0,
0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1,
0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0,
1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0,
1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1,
0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0,
0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0,
0, 0, 0, 1, 1, 1, 1, 0, 1, 0])
# ACTUALLY compare the two variables
model_outputs == y_train
array([ True, True, True, True, True, True, True, True, True,
True, False, True, False, False, True, True, True, True,
False, True, False, True, False, False, True, True, True,
False, True, True, True, True, True, True, True, True,
False, False, False, False, True, True, True, True, True,
True, True, True, True, True, False, True, True, True,
True, True, True, True, True, False, True, True, True,
True, True, True, True, True, True, False, True, True,
True, True, True, False, True, True, True, True, True,
False, True, False, True, True, False, False, False, True,
True, True, True, True, True, True, True, False, True,
True, True, True, True, True, True, True, True, True,
True, True, True, True, False, True, True, True, True,
False, True, True, True, True, False, True, True, True,
True, True, True, True, True, False, True, True, True,
True, False, True, True, True, True, True, True, False,
True, False, True, False, True, True, True, True, False,
False, False, True, True, False, True, False, True, True,
True, False, True, False, True, False, True, False, True,
True, True, True, True, True, True, True, True, True,
True, True, True, True, True, False, True, True, True,
True, False, True, True, True, True, True, True, True,
True, True, True, True, True, True, False, True, False,
False, True, True, True, True, True, True, True, False,
True, False, True, False, True, True, True, True, False,
True, False, False, True, True, True, True, True, False,
False, False, True, False, True, True, True, False, True,
True, True, True, True, True, True, False, True, True,
True, True, True, True, True, True, False, True, True,
True, False, False, True, True, True, True, True, True,
False, True, True, True, True, True, True, False, False,
False, True, True, True, True, False, True, False, True,
True, True, True, True, True, True, False, True, False,
False, True, True, True, True, True, False, True, True,
True, True, False, False, True, False, True, True, True,
True, True, True, True, True, False, True, True, False,
False, True, True, True, True, True, False, True, False,
True, True, True, False, False, True, True, True, False,
True, False, True, True, True, True, True, True, True,
True, True, True, True, True, True, True, True, True,
False, True, True, False, True, False, True, True, True,
True, True, True, True, True, True, True, True, True,
True, True, True, False, True, True, True, False, True,
True, False, True, False, True, True, True, False, True,
True, True, True, True, True, False, True, True, True,
True, True, True, True, True, True, True, True, True,
False, True, True, False, True, False, True, True, True,
True, True, True, False, True, True, False, True, False,
True, True, True, True, True, False, False, True, True,
True, True, False, True, True, True, True, False, True,
False, True, True, True, False, False, True, True, True,
True, False, True, True, True, True, True, True, True,
True, True, False, True, True, False, False, True, True,
False, True, True, True, True, True, True, False, False,
True, True, False, True, True, True, True, False, True,
True, True, True, True, False, True, True, True, True,
True, False, True, True, False, False, True, True, True,
False, True, True, True, True, True, False, True, True,
False, False, False, True, True, False, True, True, True,
False, True, True, True, True, True, True, True, True,
True, False, True, True, True, True, True, True, True,
True, True, False, True, True, True, True, False, True,
False, True])
# find out in how many instances we predicted correctly
np.sum((model_outputs==y_train))
433
# get the total number of instances
model_outputs.shape[0]
560
# calculate the accuracy of the model
np.sum((model_outputs==y_train)) / model_outputs.shape[0]
0.7732142857142857
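# equivalently (a one-line sketch): np.mean on the boolean array gives the accuracy directly,
# since each True counts as 1 and each False as 0
np.mean(model_outputs == y_train)  # -> 0.7732142857142857, same as reg.score(x_train, y_train)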
# get the intercept (bias) of our model
reg.intercept_
array([-1.6474549])
# get the coefficients (weights) of our model
reg.coef_
array([[ 2.80019733, 0.95188356, 3.11555338, 0.83900082, 0.1589299 ,
0.60528415, -0.16989096, 0.27981088, -0.21053312, 0.34826214,
-0.27739602]])
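# aside (a sketch with a hypothetical helper, not from the lecture): these numbers plug into the
# logistic regression equation log-odds = intercept + sum(coef_i * x_i);
# the sigmoid then turns the log-odds into a probability of being class 1
def manual_proba(x_row):
    # probability of excessive absenteeism for one (standardized) observation
    log_odds = reg.intercept_[0] + np.dot(reg.coef_[0], x_row)
    return 1 / (1 + np.exp(-log_odds))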
# check what were the names of our columns
unscaled_inputs.columns.values
array(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Month Value',
'Transportation Expense', 'Age', 'Body Mass Index', 'Education',
'Children', 'Pets'], dtype=object)
# save the names of the columns in an ad-hoc variable
feature_name = unscaled_inputs.columns.values
# create a summary table with the coefficients (they will be exported later and used in Tableau)
# transpose the model coefficients (reg.coef_) and throw them into a df (a vertical organization, so that they can be
# multiplied by certain matrices later)
summary_table = pd.DataFrame(columns=['Feature name'], data=feature_name)
# add the coefficient values to the summary table
summary_table['Coefficient'] = np.transpose(reg.coef_)
# display the summary table
summary_table
| | Feature name | Coefficient |
|---|---|---|
| 0 | Reason_1 | 2.800197 |
| 1 | Reason_2 | 0.951884 |
| 2 | Reason_3 | 3.115553 |
| 3 | Reason_4 | 0.839001 |
| 4 | Month Value | 0.158930 |
| 5 | Transportation Expense | 0.605284 |
| 6 | Age | -0.169891 |
| 7 | Body Mass Index | 0.279811 |
| 8 | Education | -0.210533 |
| 9 | Children | 0.348262 |
| 10 | Pets | -0.277396 |
# do a little Python trick to move the intercept to the top of the summary table
# move all indices by 1
summary_table.index = summary_table.index + 1
# add the intercept at index 0
summary_table.loc[0] = ['Intercept', reg.intercept_[0]]
# sort the df by index
summary_table = summary_table.sort_index()
summary_table
| | Feature name | Coefficient |
|---|---|---|
| 0 | Intercept | -1.647455 |
| 1 | Reason_1 | 2.800197 |
| 2 | Reason_2 | 0.951884 |
| 3 | Reason_3 | 3.115553 |
| 4 | Reason_4 | 0.839001 |
| 5 | Month Value | 0.158930 |
| 6 | Transportation Expense | 0.605284 |
| 7 | Age | -0.169891 |
| 8 | Body Mass Index | 0.279811 |
| 9 | Education | -0.210533 |
| 10 | Children | 0.348262 |
| 11 | Pets | -0.277396 |
# create a new column called 'Odds_ratio', which will show the odds ratio of each feature
summary_table['Odds_ratio'] = np.exp(summary_table.Coefficient)
# display the df
summary_table
| | Feature name | Coefficient | Odds_ratio |
|---|---|---|---|
| 0 | Intercept | -1.647455 | 0.192539 |
| 1 | Reason_1 | 2.800197 | 16.447892 |
| 2 | Reason_2 | 0.951884 | 2.590585 |
| 3 | Reason_3 | 3.115553 | 22.545903 |
| 4 | Reason_4 | 0.839001 | 2.314054 |
| 5 | Month Value | 0.158930 | 1.172256 |
| 6 | Transportation Expense | 0.605284 | 1.831773 |
| 7 | Age | -0.169891 | 0.843757 |
| 8 | Body Mass Index | 0.279811 | 1.322880 |
| 9 | Education | -0.210533 | 0.810152 |
| 10 | Children | 0.348262 | 1.416604 |
| 11 | Pets | -0.277396 | 0.757754 |
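# aside (an interpretation sketch, not from the lecture): for a standardized feature, a coefficient c
# means a one-standard-deviation increase multiplies the odds of excessive absenteeism by exp(c)
odds_multiplier = np.exp(0.605284)  # 'Transportation Expense': ~1.83, i.e. ~83% higher odds per std dev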
# sort the table by odds ratio
# note that by default, the sort_values method sorts in ascending order, so we pass ascending=False
summary_table.sort_values('Odds_ratio', ascending=False)
| | Feature name | Coefficient | Odds_ratio |
|---|---|---|---|
| 3 | Reason_3 | 3.115553 | 22.545903 |
| 1 | Reason_1 | 2.800197 | 16.447892 |
| 2 | Reason_2 | 0.951884 | 2.590585 |
| 4 | Reason_4 | 0.839001 | 2.314054 |
| 6 | Transportation Expense | 0.605284 | 1.831773 |
| 10 | Children | 0.348262 | 1.416604 |
| 8 | Body Mass Index | 0.279811 | 1.322880 |
| 5 | Month Value | 0.158930 | 1.172256 |
| 7 | Age | -0.169891 | 0.843757 |
| 9 | Education | -0.210533 | 0.810152 |
| 11 | Pets | -0.277396 | 0.757754 |
| 0 | Intercept | -1.647455 | 0.192539 |
# assess the test accuracy of the model
reg.score(x_test,y_test)
0.75
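# aside (a sketch, not from the lecture): the 'metrics' module we imported earlier offers richer
# diagnostics than plain accuracy, e.g. a confusion matrix and per-class precision/recall
print(metrics.confusion_matrix(y_test, reg.predict(x_test)))
print(metrics.classification_report(y_test, reg.predict(x_test)))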
# find the predicted probabilities of each class
# the first column shows the probability that a given observation is a 0, and the second column - that it is a 1
predicted_proba = reg.predict_proba(x_test)
# let's check that out
predicted_proba
array([[0.71340413, 0.28659587],
[0.58724228, 0.41275772],
[0.44020821, 0.55979179],
[0.78159464, 0.21840536],
[0.08410854, 0.91589146],
[0.33487603, 0.66512397],
[0.29984576, 0.70015424],
[0.13103971, 0.86896029],
[0.78625404, 0.21374596],
[0.74903632, 0.25096368],
[0.49397598, 0.50602402],
[0.22484913, 0.77515087],
[0.07129151, 0.92870849],
[0.73178133, 0.26821867],
[0.30934135, 0.69065865],
[0.5471671 , 0.4528329 ],
[0.55052275, 0.44947725],
[0.5392707 , 0.4607293 ],
[0.40201117, 0.59798883],
[0.05361575, 0.94638425],
[0.7003009 , 0.2996991 ],
[0.78159464, 0.21840536],
[0.42037128, 0.57962872],
[0.42037128, 0.57962872],
[0.24783565, 0.75216435],
[0.74566259, 0.25433741],
[0.51017274, 0.48982726],
[0.85690195, 0.14309805],
[0.20349733, 0.79650267],
[0.78159464, 0.21840536],
[0.63043442, 0.36956558],
[0.32093965, 0.67906035],
[0.31497433, 0.68502567],
[0.47131917, 0.52868083],
[0.78159464, 0.21840536],
[0.46493449, 0.53506551],
[0.77852919, 0.22147081],
[0.26307895, 0.73692105],
[0.59501956, 0.40498044],
[0.39494012, 0.60505988],
[0.78924152, 0.21075848],
[0.54775534, 0.45224466],
[0.76248708, 0.23751292],
[0.60166502, 0.39833498],
[0.17244553, 0.82755447],
[0.43202425, 0.56797575],
[0.30886675, 0.69113325],
[0.71340413, 0.28659587],
[0.78064733, 0.21935267],
[0.7966903 , 0.2033097 ],
[0.42371744, 0.57628256],
[0.6705336 , 0.3294664 ],
[0.33487603, 0.66512397],
[0.73050501, 0.26949499],
[0.16678032, 0.83321968],
[0.56508475, 0.43491525],
[0.11625388, 0.88374612],
[0.76872928, 0.23127072],
[0.66584142, 0.33415858],
[0.65567061, 0.34432939],
[0.30090655, 0.69909345],
[0.34505737, 0.65494263],
[0.70755059, 0.29244941],
[0.20799242, 0.79200758],
[0.79249724, 0.20750276],
[0.73159442, 0.26840558],
[0.91291434, 0.08708566],
[0.77852919, 0.22147081],
[0.26754583, 0.73245417],
[0.69469781, 0.30530219],
[0.77852919, 0.22147081],
[0.70985592, 0.29014408],
[0.09561809, 0.90438191],
[0.53938703, 0.46061297],
[0.39825313, 0.60174687],
[0.78159464, 0.21840536],
[0.22645293, 0.77354707],
[0.26837292, 0.73162708],
[0.25831165, 0.74168835],
[0.32855571, 0.67144429],
[0.75417184, 0.24582816],
[0.92387058, 0.07612942],
[0.76872928, 0.23127072],
[0.24741155, 0.75258845],
[0.56558369, 0.43441631],
[0.87775043, 0.12224957],
[0.29572951, 0.70427049],
[0.42037128, 0.57962872],
[0.75881641, 0.24118359],
[0.32093965, 0.67906035],
[0.82488593, 0.17511407],
[0.84950183, 0.15049817],
[0.77374985, 0.22625015],
[0.73159442, 0.26840558],
[0.74903632, 0.25096368],
[0.14944055, 0.85055945],
[0.70403751, 0.29596249],
[0.23713867, 0.76286133],
[0.75645005, 0.24354995],
[0.78203264, 0.21796736],
[0.37671823, 0.62328177],
[0.32093965, 0.67906035],
[0.30409309, 0.69590691],
[0.25542876, 0.74457124],
[0.55063861, 0.44936139],
[0.51677447, 0.48322553],
[0.72259112, 0.27740888],
[0.14944055, 0.85055945],
[0.2220274 , 0.7779726 ],
[0.85521156, 0.14478844],
[0.93001302, 0.06998698],
[0.09640551, 0.90359449],
[0.33736668, 0.66263332],
[0.66154998, 0.33845002],
[0.47623876, 0.52376124],
[0.43903262, 0.56096738],
[0.21378105, 0.78621895],
[0.17783793, 0.82216207],
[0.45811539, 0.54188461],
[0.6906917 , 0.3093083 ],
[0.74041049, 0.25958951],
[0.8393858 , 0.1606142 ],
[0.18777875, 0.81222125],
[0.54775534, 0.45224466],
[0.78159464, 0.21840536],
[0.64457904, 0.35542096],
[0.78924152, 0.21075848],
[0.89029009, 0.10970991],
[0.25937462, 0.74062538],
[0.7003009 , 0.2996991 ],
[0.38415023, 0.61584977],
[0.78924152, 0.21075848],
[0.70403751, 0.29596249],
[0.64457904, 0.35542096],
[0.73490357, 0.26509643],
[0.47623876, 0.52376124],
[0.5392707 , 0.4607293 ],
[0.68498978, 0.31501022],
[0.75645005, 0.24354995],
[0.53938703, 0.46061297]])
predicted_proba.shape
(140, 2)
# select ONLY the probabilities referring to 1s
predicted_proba[:,1]
array([0.28659587, 0.41275772, 0.55979179, 0.21840536, 0.91589146,
0.66512397, 0.70015424, 0.86896029, 0.21374596, 0.25096368,
0.50602402, 0.77515087, 0.92870849, 0.26821867, 0.69065865,
0.4528329 , 0.44947725, 0.4607293 , 0.59798883, 0.94638425,
0.2996991 , 0.21840536, 0.57962872, 0.57962872, 0.75216435,
0.25433741, 0.48982726, 0.14309805, 0.79650267, 0.21840536,
0.36956558, 0.67906035, 0.68502567, 0.52868083, 0.21840536,
0.53506551, 0.22147081, 0.73692105, 0.40498044, 0.60505988,
0.21075848, 0.45224466, 0.23751292, 0.39833498, 0.82755447,
0.56797575, 0.69113325, 0.28659587, 0.21935267, 0.2033097 ,
0.57628256, 0.3294664 , 0.66512397, 0.26949499, 0.83321968,
0.43491525, 0.88374612, 0.23127072, 0.33415858, 0.34432939,
0.69909345, 0.65494263, 0.29244941, 0.79200758, 0.20750276,
0.26840558, 0.08708566, 0.22147081, 0.73245417, 0.30530219,
0.22147081, 0.29014408, 0.90438191, 0.46061297, 0.60174687,
0.21840536, 0.77354707, 0.73162708, 0.74168835, 0.67144429,
0.24582816, 0.07612942, 0.23127072, 0.75258845, 0.43441631,
0.12224957, 0.70427049, 0.57962872, 0.24118359, 0.67906035,
0.17511407, 0.15049817, 0.22625015, 0.26840558, 0.25096368,
0.85055945, 0.29596249, 0.76286133, 0.24354995, 0.21796736,
0.62328177, 0.67906035, 0.69590691, 0.74457124, 0.44936139,
0.48322553, 0.27740888, 0.85055945, 0.7779726 , 0.14478844,
0.06998698, 0.90359449, 0.66263332, 0.33845002, 0.52376124,
0.56096738, 0.78621895, 0.82216207, 0.54188461, 0.3093083 ,
0.25958951, 0.1606142 , 0.81222125, 0.45224466, 0.21840536,
0.35542096, 0.21075848, 0.10970991, 0.74062538, 0.2996991 ,
0.61584977, 0.21075848, 0.29596249, 0.35542096, 0.26509643,
0.52376124, 0.4607293 , 0.31501022, 0.24354995, 0.46061297])
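# sanity check (a sketch, assuming the default 0.5 decision threshold): these probabilities are
# exactly what reg.predict() thresholds to produce the 0/1 predictions
manual_predictions = (predicted_proba[:, 1] > 0.5).astype(int)
np.array_equal(manual_predictions, reg.predict(x_test))  # -> True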
# import the relevant module
import pickle
# pickle the model file
with open('model', 'wb') as file:
pickle.dump(reg, file)
# pickle the scaler file
with open('scaler','wb') as file:
pickle.dump(absenteeism_scaler, file)
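# aside (a sketch of the deployment side, not part of this notebook): the two files can later be
# loaded back with pickle.load, e.g. in the module that will make predictions on brand-new data
with open('model', 'rb') as model_file:
    reg_loaded = pickle.load(model_file)
with open('scaler', 'rb') as scaler_file:
    scaler_loaded = pickle.load(scaler_file)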