Logistic Regression In Python

Developing a Logistic Regression Model

February 23, 2021 · 61 mins read

Creating a logistic regression to predict absenteeism

Import the relevant libraries

# import the relevant libraries
import pandas as pd
import numpy as np

Load the data

# load the preprocessed CSV data
data_preprocessed = pd.read_csv('Absenteeism_preprocessed.csv')
# eyeball the data
data_preprocessed.head()
Reason_1 Reason_2 Reason_3 Reason_4 Month Value Day of the Week Transportation Expense Distance to Work Age Daily Work Load Average Body Mass Index Education Children Pets Absenteeism Time in Hours
0 0 0 0 1 7 1 289 36 33 239.554 30 0 2 1 4
1 0 0 0 0 7 1 118 13 50 239.554 31 0 1 0 0
2 0 0 0 1 7 2 179 51 38 239.554 31 0 0 0 2
3 1 0 0 0 7 3 279 5 39 239.554 24 0 2 0 4
4 0 0 0 1 7 3 289 36 33 239.554 30 0 2 1 2

Create the targets

# find the median of 'Absenteeism Time in Hours'
data_preprocessed['Absenteeism Time in Hours'].median()
3.0
# create targets for our logistic regression
# they have to be categorical, so we need a way to say whether someone is 'being absent too much' or not
# what we've decided to do is take the median of the dataset as a cut-off line
# in this way the dataset will be balanced (there will be a roughly equal number of 0s and 1s for the logistic regression)
# since class imbalance is a common problem in ML, this balance works in our favor
# alternatively, if we had more data, we could have found other ways to deal with the issue
# for instance, we could have assigned some arbitrary value as a cut-off line, instead of the median

# note that what this line does is assign 1 to anyone who has been absent 4 hours or more (i.e. more than 3 hours)
# that is the equivalent of taking half a day off

# initial code from the lecture
# targets = np.where(data_preprocessed['Absenteeism Time in Hours'] > 3, 1, 0)

# parameterized code
targets = np.where(data_preprocessed['Absenteeism Time in Hours'] > 
                   data_preprocessed['Absenteeism Time in Hours'].median(), 1, 0)
# eyeball the targets
targets
array([1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0,
       1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0,
       0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0,
       0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0,
       1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0,
       1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1,
       1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1,
       1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1,
       0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1,
       1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1,
       1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0,
       1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0,
       1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1,
       1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0])
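As mentioned above, the median is not the only possible cut-off. A minimal sketch of the arbitrary-cut-off alternative (illustration only; this hypothetical 8-hour threshold is not used anywhere below):

# e.g. flag only full-day absences as excessive
targets_alt = np.where(data_preprocessed['Absenteeism Time in Hours'] >= 8, 1, 0)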
# create a new column (Series) in the original data frame that will contain the targets for the regression
data_preprocessed['Excessive Absenteeism'] = targets
# check what happened
# maybe manually see how the targets were created
data_preprocessed.head()
Reason_1 Reason_2 Reason_3 Reason_4 Month Value Day of the Week Transportation Expense Distance to Work Age Daily Work Load Average Body Mass Index Education Children Pets Absenteeism Time in Hours Excessive Absenteeism
0 0 0 0 1 7 1 289 36 33 239.554 30 0 2 1 4 1
1 0 0 0 0 7 1 118 13 50 239.554 31 0 1 0 0 0
2 0 0 0 1 7 2 179 51 38 239.554 31 0 0 0 2 0
3 1 0 0 0 7 3 279 5 39 239.554 24 0 2 0 4 1
4 0 0 0 1 7 3 289 36 33 239.554 30 0 2 1 2 0

A comment on the targets

# check if dataset is balanced (what % of targets are 1s)
# targets.sum() will give us the number of 1s that there are
# the shape[0] will give us the length of the targets array
targets.sum() / targets.shape[0]
0.45571428571428574
# create a checkpoint by dropping the unnecessary variables
# also drop the variables we 'eliminated' after exploring the weights
data_with_targets = data_preprocessed.drop(['Absenteeism Time in Hours','Day of the Week',
                                            'Daily Work Load Average','Distance to Work'],axis=1)
# check whether the line above really created a checkpoint :)

# if 'data_with_targets is data_preprocessed' evaluates to True, the two names point to the same object
# if it evaluates to False, the two variables are completely separate objects, so this is in fact a checkpoint
data_with_targets is data_preprocessed
False
# check what's inside
data_with_targets.head()
Reason_1 Reason_2 Reason_3 Reason_4 Month Value Transportation Expense Age Body Mass Index Education Children Pets Excessive Absenteeism
0 0 0 0 1 7 289 33 30 0 2 1 1
1 0 0 0 0 7 118 50 31 0 1 0 0
2 0 0 0 1 7 179 38 31 0 0 0 0
3 1 0 0 0 7 279 39 24 0 2 0 1
4 0 0 0 1 7 289 33 30 0 2 1 0

Select the inputs for the regression

data_with_targets.shape
(700, 12)
# Select all rows and all columns up to the 14th (excluded)
# since this data frame only has 12 columns, this returns everything, including the target
data_with_targets.iloc[:,:14]
Reason_1 Reason_2 Reason_3 Reason_4 Month Value Transportation Expense Age Body Mass Index Education Children Pets Excessive Absenteeism
0 0 0 0 1 7 289 33 30 0 2 1 1
1 0 0 0 0 7 118 50 31 0 1 0 0
2 0 0 0 1 7 179 38 31 0 0 0 0
3 1 0 0 0 7 279 39 24 0 2 0 1
4 0 0 0 1 7 289 33 30 0 2 1 0
... ... ... ... ... ... ... ... ... ... ... ... ...
695 1 0 0 0 5 179 40 22 1 2 0 1
696 1 0 0 0 5 225 28 24 0 1 2 0
697 1 0 0 0 5 330 28 25 1 0 0 1
698 0 0 0 1 5 235 32 25 1 0 0 0
699 0 0 0 1 5 291 40 25 0 1 1 0

700 rows × 12 columns

# Select all rows and all columns but the last one (the target), which is what we actually need for the inputs
data_with_targets.iloc[:,:-1]
Reason_1 Reason_2 Reason_3 Reason_4 Month Value Transportation Expense Age Body Mass Index Education Children Pets
0 0 0 0 1 7 289 33 30 0 2 1
1 0 0 0 0 7 118 50 31 0 1 0
2 0 0 0 1 7 179 38 31 0 0 0
3 1 0 0 0 7 279 39 24 0 2 0
4 0 0 0 1 7 289 33 30 0 2 1
... ... ... ... ... ... ... ... ... ... ... ...
695 1 0 0 0 5 179 40 22 1 2 0
696 1 0 0 0 5 225 28 24 0 1 2
697 1 0 0 0 5 330 28 25 1 0 0
698 0 0 0 1 5 235 32 25 1 0 0
699 0 0 0 1 5 291 40 25 0 1 1

700 rows × 11 columns

# Create a variable that will contain the inputs (everything without the targets)
unscaled_inputs = data_with_targets.iloc[:,:-1]

Standardize the data

# standardize the inputs

# standardization is one of the most common preprocessing tools
# since features of different magnitudes (scales) can bias a model towards the high-valued ones,
# we want all inputs to be of a similar magnitude
# this is a peculiarity of machine learning in general - most (but not all) algorithms do badly with unscaled data

# a very useful class we can use is StandardScaler
# it has many more capabilities than the straightforward 'preprocessing.scale' function
from sklearn.preprocessing import StandardScaler


# we will create a variable that will contain the scaling information for this particular dataset
# here's the full documentation: http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html

# define scaler as an object
absenteeism_scaler = StandardScaler()
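Under the hood, the scaler simply subtracts the column mean and divides by the column standard deviation: z = (x - mean) / std. A tiny hand-rolled sketch of that formula (illustration only; x and z are throwaway names):

# what StandardScaler computes for each column
x = np.array([1., 2., 3., 4., 5.])
z = (x - x.mean()) / x.std()   # note: StandardScaler uses the population std (ddof=0)
z                              # the result has mean ~0 and standard deviation ~1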
# import the libraries needed to create the Custom Scaler
# note that all of them are a part of the sklearn package
# moreover, one of them is actually the StandardScaler class,
# so you can imagine that the Custom Scaler is built on top of it

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler

# create the Custom Scaler class

class CustomScaler(BaseEstimator,TransformerMixin): 
    
    # init or what information we need to declare a CustomScaler object
    # and what is calculated/declared as we do
    
    def __init__(self,columns,copy=True,with_mean=True,with_std=True):
        
        # scaler is nothing but a StandardScaler object
        # (pass the parameters by keyword; newer versions of sklearn no longer accept them positionally)
        self.scaler = StandardScaler(copy=copy, with_mean=with_mean, with_std=with_std)
        # with some columns 'twist'
        self.columns = columns
        self.mean_ = None
        self.var_ = None
        
    
    # the fit method, which, again, is based on StandardScaler
    
    def fit(self, X, y=None):
        self.scaler.fit(X[self.columns], y)
        self.mean_ = np.mean(X[self.columns])
        self.var_ = np.var(X[self.columns])
        return self
    
    # the transform method which does the actual scaling

    def transform(self, X, y=None, copy=None):
        
        # record the initial order of the columns
        init_col_order = X.columns
        
        # scale all features that you chose when creating the instance of the class
        # (keep the original index so the concat below aligns rows correctly)
        X_scaled = pd.DataFrame(self.scaler.transform(X[self.columns]), columns=self.columns, index=X.index)
        
        # declare a variable containing all information that was not scaled
        X_not_scaled = X.loc[:,~X.columns.isin(self.columns)]
        
        # return a data frame which contains all scaled features and all 'not scaled' features
        # use the original order (that you recorded in the beginning)
        return pd.concat([X_not_scaled, X_scaled], axis=1)[init_col_order]
# check all the columns we've got
unscaled_inputs.columns.values
array(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Month Value',
       'Transportation Expense', 'Age', 'Body Mass Index', 'Education',
       'Children', 'Pets'], dtype=object)
# choose the columns to scale
# we later augmented this code and put it in comments
# columns_to_scale = ['Month Value', 'Day of the Week', 'Transportation Expense', 'Distance to Work',
#                     'Age', 'Daily Work Load Average', 'Body Mass Index', 'Children', 'Pets']
# select the columns to omit
columns_to_omit = ['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4','Education']
# create the columns to scale, based on the columns to omit
# use list comprehension to iterate over the list
columns_to_scale = [x for x in unscaled_inputs.columns.values if x not in columns_to_omit]
# declare a scaler object, specifying the columns you want to scale
absenteeism_scaler = CustomScaler(columns_to_scale)
# fit the data (calculate mean and standard deviation); they are automatically stored inside the object 
absenteeism_scaler.fit(unscaled_inputs)
CustomScaler(columns=['Month Value', 'Transportation Expense', 'Age',
                      'Body Mass Index', 'Children', 'Pets'])
# standardizes the data, using the transform method 
# in the last line, we fitted the data - in other words
# we found the internal parameters of a model that will be used to transform data. 
# transforming applies these parameters to our data
# note that when you get new data, you can simply use 'absenteeism_scaler' to transform it in the same way as now
scaled_inputs = absenteeism_scaler.transform(unscaled_inputs)
# thanks to our CustomScaler, scaled_inputs is still a DataFrame (a plain StandardScaler would have returned an ndarray, since sklearn works with ndarrays)
scaled_inputs
Reason_1 Reason_2 Reason_3 Reason_4 Month Value Transportation Expense Age Body Mass Index Education Children Pets
0 0 0 0 1 0.182726 1.005844 -0.536062 0.767431 0 0.880469 0.268487
1 0 0 0 0 0.182726 -1.574681 2.130803 1.002633 0 -0.019280 -0.589690
2 0 0 0 1 0.182726 -0.654143 0.248310 1.002633 0 -0.919030 -0.589690
3 1 0 0 0 0.182726 0.854936 0.405184 -0.643782 0 0.880469 -0.589690
4 0 0 0 1 0.182726 1.005844 -0.536062 0.767431 0 0.880469 0.268487
... ... ... ... ... ... ... ... ... ... ... ...
695 1 0 0 0 -0.388293 -0.654143 0.562059 -1.114186 1 0.880469 -0.589690
696 1 0 0 0 -0.388293 0.040034 -1.320435 -0.643782 0 -0.019280 1.126663
697 1 0 0 0 -0.388293 1.624567 -1.320435 -0.408580 1 -0.919030 -0.589690
698 0 0 0 1 -0.388293 0.190942 -0.692937 -0.408580 1 -0.919030 -0.589690
699 0 0 0 1 -0.388293 1.036026 0.562059 -0.408580 0 -0.019280 0.268487

700 rows × 11 columns

# check the shape of the inputs
scaled_inputs.shape
(700, 11)

Split the data into train & test and shuffle

Import the relevant module

# import train_test_split so we can split our data into train and test
from sklearn.model_selection import train_test_split

Split

# check how this method works
train_test_split(scaled_inputs, targets)
[     Reason_1  Reason_2  Reason_3  Reason_4  Month Value  \
 208         0         0         1         0    -0.388293   
 593         0         0         0         1    -1.244823   
 209         1         0         0         0    -0.388293   
 11          1         0         0         0     0.182726   
 290         0         0         0         1     1.039256   
 ..        ...       ...       ...       ...          ...   
 363         0         0         0         1    -1.530333   
 566         0         0         0         1     1.610276   
 475         0         0         0         1     0.182726   
 167         1         0         0         0    -0.959313   
 351         0         0         0         1     1.610276   
 
      Transportation Expense       Age  Body Mass Index  Education  Children  \
 208                0.040034 -1.320435        -0.643782          0 -0.019280   
 593               -0.654143  0.248310         1.002633          0 -0.919030   
 209               -0.578689 -1.477309        -1.349389          0 -0.919030   
 11                 0.568211 -0.065439        -0.878984          0  2.679969   
 290                0.190942  0.091435         0.532229          1 -0.019280   
 ..                      ...       ...              ...        ...       ...   
 363                1.005844 -0.536062         0.767431          0  0.880469   
 566                0.040034 -1.320435        -0.643782          0 -0.019280   
 475                1.005844 -0.536062         0.767431          0  0.880469   
 167               -1.016322 -0.379188        -0.408580          0  0.880469   
 351               -0.654143  0.248310         1.002633          0 -0.919030   
 
          Pets  
 208  1.126663  
 593 -0.589690  
 209 -0.589690  
 11  -0.589690  
 290  0.268487  
 ..        ...  
 363  0.268487  
 566  1.126663  
 475  0.268487  
 167 -0.589690  
 351 -0.589690  
 
 [525 rows x 11 columns],
      Reason_1  Reason_2  Reason_3  Reason_4  Month Value  \
 653         0         0         0         1    -0.959313   
 592         0         0         0         1    -1.244823   
 396         0         0         0         1    -0.959313   
 498         0         0         0         1     0.753746   
 694         0         0         0         1    -0.388293   
 ..        ...       ...       ...       ...          ...   
 18          1         0         0         0     0.182726   
 327         1         0         0         0     1.324766   
 3           1         0         0         0     0.182726   
 437         0         0         0         1    -0.388293   
 150         0         0         0         1    -1.244823   
 
      Transportation Expense       Age  Body Mass Index  Education  Children  \
 653                2.213108 -0.849811        -0.408580          0  1.780219   
 592                0.040034  0.718933         0.297027          1  0.880469   
 396               -0.654143  0.562059        -1.114186          1  0.880469   
 498               -1.016322 -0.379188        -0.408580          0  0.880469   
 694                1.036026  0.562059        -0.408580          0 -0.019280   
 ..                      ...       ...              ...        ...       ...   
 18                -0.503235 -0.536062        -0.408580          0  0.880469   
 327               -1.574681  0.091435         0.297027          0 -0.919030   
 3                  0.854936  0.405184        -0.643782          0  0.880469   
 437               -0.654143 -1.006686        -1.819793          1 -0.919030   
 150                0.040034 -1.320435        -0.643782          0 -0.019280   
 
          Pets  
 653 -0.589690  
 592  1.126663  
 396 -0.589690  
 498 -0.589690  
 694  0.268487  
 ..        ...  
 18   1.126663  
 327 -0.589690  
 3   -0.589690  
 437 -0.589690  
 150  1.126663  
 
 [175 rows x 11 columns],
 array([1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0,
        0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
        1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,
        1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0,
        0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1,
        0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0,
        1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0,
        1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1,
        0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0,
        0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1,
        0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1,
        1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1,
        0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1,
        0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1,
        0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0,
        0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1,
        0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
        0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0,
        0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1,
        1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0,
        1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0,
        1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
        1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0]),
 array([1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0,
        1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0,
        1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1,
        0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0,
        1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0,
        0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0,
        0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1,
        1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0])]
# declare 4 variables for the split
x_train, x_test, y_train, y_test = train_test_split(scaled_inputs, targets, # train_size = 0.8,
                                                    test_size = 0.2, random_state = 20)
# check the shape of the train inputs and targets
print (x_train.shape, y_train.shape)
(560, 11) (560,)
# check the shape of the test inputs and targets
print (x_test.shape, y_test.shape)
(140, 11) (140,)
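A side note: train_test_split also accepts a 'stratify' argument, which keeps the share of 1s in the train and test subsets equal to that of the full dataset. A sketch of that variation (not used in this lecture; the '_s' names are just placeholders):

# stratified version of the same split (optional alternative)
x_train_s, x_test_s, y_train_s, y_test_s = train_test_split(scaled_inputs, targets,
                                                            test_size = 0.2, random_state = 20,
                                                            stratify = targets)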

Logistic regression with sklearn

# import the LogReg model from sklearn
from sklearn.linear_model import LogisticRegression

# import the 'metrics' module, which includes important metrics we may want to use
from sklearn import metrics

Training the model

# create a logistic regression object
reg = LogisticRegression()
# fit our train inputs
# this is basically the entire training part of the machine learning process
reg.fit(x_train,y_train)
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)
# assess the train accuracy of the model
reg.score(x_train,y_train)
0.7732142857142857

Manually check the accuracy

# get the outputs (predictions) of our model on the train inputs
model_outputs = reg.predict(x_train)
model_outputs
array([0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1,
       1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1,
       1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1,
       0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0,
       1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1,
       0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1,
       0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0,
       1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0,
       0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0,
       1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0,
       0, 1, 0, 1, 1, 1, 0, 0, 0, 0])
# compare them with the targets
y_train
array([0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1,
       1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1,
       1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1,
       1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1,
       1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1,
       0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0,
       1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1,
       0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0,
       1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1,
       0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0,
       0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0,
       0, 0, 0, 1, 1, 1, 1, 0, 1, 0])
# ACTUALLY compare the two variables
model_outputs == y_train
array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True, False,  True, False, False,  True,  True,  True,  True,
       False,  True, False,  True, False, False,  True,  True,  True,
       False,  True,  True,  True,  True,  True,  True,  True,  True,
       False, False, False, False,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,  True,  True,  True,  True, False,  True,  True,
        True,  True,  True, False,  True,  True,  True,  True,  True,
       False,  True, False,  True,  True, False, False, False,  True,
        True,  True,  True,  True,  True,  True,  True, False,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True, False,  True,  True,  True,  True,
       False,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,  True,  True,  True, False,  True,  True,  True,
        True, False,  True,  True,  True,  True,  True,  True, False,
        True, False,  True, False,  True,  True,  True,  True, False,
       False, False,  True,  True, False,  True, False,  True,  True,
        True, False,  True, False,  True, False,  True, False,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True, False,  True,  True,  True,
        True, False,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True, False,  True, False,
       False,  True,  True,  True,  True,  True,  True,  True, False,
        True, False,  True, False,  True,  True,  True,  True, False,
        True, False, False,  True,  True,  True,  True,  True, False,
       False, False,  True, False,  True,  True,  True, False,  True,
        True,  True,  True,  True,  True,  True, False,  True,  True,
        True,  True,  True,  True,  True,  True, False,  True,  True,
        True, False, False,  True,  True,  True,  True,  True,  True,
       False,  True,  True,  True,  True,  True,  True, False, False,
       False,  True,  True,  True,  True, False,  True, False,  True,
        True,  True,  True,  True,  True,  True, False,  True, False,
       False,  True,  True,  True,  True,  True, False,  True,  True,
        True,  True, False, False,  True, False,  True,  True,  True,
        True,  True,  True,  True,  True, False,  True,  True, False,
       False,  True,  True,  True,  True,  True, False,  True, False,
        True,  True,  True, False, False,  True,  True,  True, False,
        True, False,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
       False,  True,  True, False,  True, False,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True, False,  True,  True,  True, False,  True,
        True, False,  True, False,  True,  True,  True, False,  True,
        True,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
       False,  True,  True, False,  True, False,  True,  True,  True,
        True,  True,  True, False,  True,  True, False,  True, False,
        True,  True,  True,  True,  True, False, False,  True,  True,
        True,  True, False,  True,  True,  True,  True, False,  True,
       False,  True,  True,  True, False, False,  True,  True,  True,
        True, False,  True,  True,  True,  True,  True,  True,  True,
        True,  True, False,  True,  True, False, False,  True,  True,
       False,  True,  True,  True,  True,  True,  True, False, False,
        True,  True, False,  True,  True,  True,  True, False,  True,
        True,  True,  True,  True, False,  True,  True,  True,  True,
        True, False,  True,  True, False, False,  True,  True,  True,
       False,  True,  True,  True,  True,  True, False,  True,  True,
       False, False, False,  True,  True, False,  True,  True,  True,
       False,  True,  True,  True,  True,  True,  True,  True,  True,
        True, False,  True,  True,  True,  True,  True,  True,  True,
        True,  True, False,  True,  True,  True,  True, False,  True,
       False,  True])
# find out in how many instances we predicted correctly
np.sum((model_outputs==y_train))
433
# get the total number of instances
model_outputs.shape[0]
560
# calculate the accuracy of the model
np.sum((model_outputs==y_train)) / model_outputs.shape[0]
0.7732142857142857
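The same number can be obtained with the 'metrics' module we imported earlier; a quick cross-check, equivalent to the manual computation above:

# accuracy_score compares the true targets with the predictions
metrics.accuracy_score(y_train, model_outputs)
0.7732142857142857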

Finding the intercept and coefficients

# get the intercept (bias) of our model
reg.intercept_
array([-1.6474549])
# get the coefficients (weights) of our model
reg.coef_
array([[ 2.80019733,  0.95188356,  3.11555338,  0.83900082,  0.1589299 ,
         0.60528415, -0.16989096,  0.27981088, -0.21053312,  0.34826214,
        -0.27739602]])
# check the names of our columns
unscaled_inputs.columns.values
array(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Month Value',
       'Transportation Expense', 'Age', 'Body Mass Index', 'Education',
       'Children', 'Pets'], dtype=object)
# save the names of the columns in an ad-hoc variable
feature_name = unscaled_inputs.columns.values
# use the coefficients from this table (they will be exported later and used in Tableau)
# transpose the model coefficients (model.coef_) and throw them into a df (a vertical organization, so that they can be
# multiplied by certain matrices later)
summary_table = pd.DataFrame (columns=['Feature name'], data = feature_name)

# add the coefficient values to the summary table
summary_table['Coefficient'] = np.transpose(reg.coef_)

# display the summary table
summary_table
Feature name Coefficient
0 Reason_1 2.800197
1 Reason_2 0.951884
2 Reason_3 3.115553
3 Reason_4 0.839001
4 Month Value 0.158930
5 Transportation Expense 0.605284
6 Age -0.169891
7 Body Mass Index 0.279811
8 Education -0.210533
9 Children 0.348262
10 Pets -0.277396
# do a little Python trick to move the intercept to the top of the summary table
# move all indices by 1
summary_table.index = summary_table.index + 1

# add the intercept at index 0
summary_table.loc[0] = ['Intercept', reg.intercept_[0]]

# sort the df by index
summary_table = summary_table.sort_index()
summary_table
Feature name Coefficient
0 Intercept -1.647455
1 Reason_1 2.800197
2 Reason_2 0.951884
3 Reason_3 3.115553
4 Reason_4 0.839001
5 Month Value 0.158930
6 Transportation Expense 0.605284
7 Age -0.169891
8 Body Mass Index 0.279811
9 Education -0.210533
10 Children 0.348262
11 Pets -0.277396

Interpreting the coefficients

# create a new Series called 'Odds_ratio', which will show the odds ratio of each feature
summary_table['Odds_ratio'] = np.exp(summary_table.Coefficient)
# display the df
summary_table
Feature name Coefficient Odds_ratio
0 Intercept -1.647455 0.192539
1 Reason_1 2.800197 16.447892
2 Reason_2 0.951884 2.590585
3 Reason_3 3.115553 22.545903
4 Reason_4 0.839001 2.314054
5 Month Value 0.158930 1.172256
6 Transportation Expense 0.605284 1.831773
7 Age -0.169891 0.843757
8 Body Mass Index 0.279811 1.322880
9 Education -0.210533 0.810152
10 Children 0.348262 1.416604
11 Pets -0.277396 0.757754
# sort the table according to the odds ratio
# note that by default, the sort_values method sorts in ascending order
summary_table.sort_values('Odds_ratio', ascending=False)
Feature name Coefficient Odds_ratio
3 Reason_3 3.115553 22.545903
1 Reason_1 2.800197 16.447892
2 Reason_2 0.951884 2.590585
4 Reason_4 0.839001 2.314054
6 Transportation Expense 0.605284 1.831773
10 Children 0.348262 1.416604
8 Body Mass Index 0.279811 1.322880
5 Month Value 0.158930 1.172256
7 Age -0.169891 0.843757
9 Education -0.210533 0.810152
11 Pets -0.277396 0.757754
0 Intercept -1.647455 0.192539
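To read this table: for a feature with coefficient b, a one-unit increase in that (standardized) feature multiplies the odds p/(1-p) of excessive absenteeism by e^b. A quick worked example, using the Transportation Expense coefficient from the table above:

# odds multiplier for a one-standard-deviation increase in Transportation Expense
np.exp(0.605284)   # ~1.83, i.e. the odds rise by roughly 83%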

Testing the model

# assess the test accuracy of the model
reg.score(x_test,y_test)
0.75
# find the predicted probabilities of each class
# the first column shows the probability of a given observation being a 0, while the second shows the probability of it being a 1
predicted_proba = reg.predict_proba(x_test)

# let's check that out
predicted_proba
array([[0.71340413, 0.28659587],
       [0.58724228, 0.41275772],
       [0.44020821, 0.55979179],
       [0.78159464, 0.21840536],
       [0.08410854, 0.91589146],
       [0.33487603, 0.66512397],
       [0.29984576, 0.70015424],
       [0.13103971, 0.86896029],
       [0.78625404, 0.21374596],
       [0.74903632, 0.25096368],
       [0.49397598, 0.50602402],
       [0.22484913, 0.77515087],
       [0.07129151, 0.92870849],
       [0.73178133, 0.26821867],
       [0.30934135, 0.69065865],
       [0.5471671 , 0.4528329 ],
       [0.55052275, 0.44947725],
       [0.5392707 , 0.4607293 ],
       [0.40201117, 0.59798883],
       [0.05361575, 0.94638425],
       [0.7003009 , 0.2996991 ],
       [0.78159464, 0.21840536],
       [0.42037128, 0.57962872],
       [0.42037128, 0.57962872],
       [0.24783565, 0.75216435],
       [0.74566259, 0.25433741],
       [0.51017274, 0.48982726],
       [0.85690195, 0.14309805],
       [0.20349733, 0.79650267],
       [0.78159464, 0.21840536],
       [0.63043442, 0.36956558],
       [0.32093965, 0.67906035],
       [0.31497433, 0.68502567],
       [0.47131917, 0.52868083],
       [0.78159464, 0.21840536],
       [0.46493449, 0.53506551],
       [0.77852919, 0.22147081],
       [0.26307895, 0.73692105],
       [0.59501956, 0.40498044],
       [0.39494012, 0.60505988],
       [0.78924152, 0.21075848],
       [0.54775534, 0.45224466],
       [0.76248708, 0.23751292],
       [0.60166502, 0.39833498],
       [0.17244553, 0.82755447],
       [0.43202425, 0.56797575],
       [0.30886675, 0.69113325],
       [0.71340413, 0.28659587],
       [0.78064733, 0.21935267],
       [0.7966903 , 0.2033097 ],
       [0.42371744, 0.57628256],
       [0.6705336 , 0.3294664 ],
       [0.33487603, 0.66512397],
       [0.73050501, 0.26949499],
       [0.16678032, 0.83321968],
       [0.56508475, 0.43491525],
       [0.11625388, 0.88374612],
       [0.76872928, 0.23127072],
       [0.66584142, 0.33415858],
       [0.65567061, 0.34432939],
       [0.30090655, 0.69909345],
       [0.34505737, 0.65494263],
       [0.70755059, 0.29244941],
       [0.20799242, 0.79200758],
       [0.79249724, 0.20750276],
       [0.73159442, 0.26840558],
       [0.91291434, 0.08708566],
       [0.77852919, 0.22147081],
       [0.26754583, 0.73245417],
       [0.69469781, 0.30530219],
       [0.77852919, 0.22147081],
       [0.70985592, 0.29014408],
       [0.09561809, 0.90438191],
       [0.53938703, 0.46061297],
       [0.39825313, 0.60174687],
       [0.78159464, 0.21840536],
       [0.22645293, 0.77354707],
       [0.26837292, 0.73162708],
       [0.25831165, 0.74168835],
       [0.32855571, 0.67144429],
       [0.75417184, 0.24582816],
       [0.92387058, 0.07612942],
       [0.76872928, 0.23127072],
       [0.24741155, 0.75258845],
       [0.56558369, 0.43441631],
       [0.87775043, 0.12224957],
       [0.29572951, 0.70427049],
       [0.42037128, 0.57962872],
       [0.75881641, 0.24118359],
       [0.32093965, 0.67906035],
       [0.82488593, 0.17511407],
       [0.84950183, 0.15049817],
       [0.77374985, 0.22625015],
       [0.73159442, 0.26840558],
       [0.74903632, 0.25096368],
       [0.14944055, 0.85055945],
       [0.70403751, 0.29596249],
       [0.23713867, 0.76286133],
       [0.75645005, 0.24354995],
       [0.78203264, 0.21796736],
       [0.37671823, 0.62328177],
       [0.32093965, 0.67906035],
       [0.30409309, 0.69590691],
       [0.25542876, 0.74457124],
       [0.55063861, 0.44936139],
       [0.51677447, 0.48322553],
       [0.72259112, 0.27740888],
       [0.14944055, 0.85055945],
       [0.2220274 , 0.7779726 ],
       [0.85521156, 0.14478844],
       [0.93001302, 0.06998698],
       [0.09640551, 0.90359449],
       [0.33736668, 0.66263332],
       [0.66154998, 0.33845002],
       [0.47623876, 0.52376124],
       [0.43903262, 0.56096738],
       [0.21378105, 0.78621895],
       [0.17783793, 0.82216207],
       [0.45811539, 0.54188461],
       [0.6906917 , 0.3093083 ],
       [0.74041049, 0.25958951],
       [0.8393858 , 0.1606142 ],
       [0.18777875, 0.81222125],
       [0.54775534, 0.45224466],
       [0.78159464, 0.21840536],
       [0.64457904, 0.35542096],
       [0.78924152, 0.21075848],
       [0.89029009, 0.10970991],
       [0.25937462, 0.74062538],
       [0.7003009 , 0.2996991 ],
       [0.38415023, 0.61584977],
       [0.78924152, 0.21075848],
       [0.70403751, 0.29596249],
       [0.64457904, 0.35542096],
       [0.73490357, 0.26509643],
       [0.47623876, 0.52376124],
       [0.5392707 , 0.4607293 ],
       [0.68498978, 0.31501022],
       [0.75645005, 0.24354995],
       [0.53938703, 0.46061297]])
predicted_proba.shape
(140, 2)
# select ONLY the probabilities referring to 1s
predicted_proba[:,1]
array([0.28659587, 0.41275772, 0.55979179, 0.21840536, 0.91589146,
       0.66512397, 0.70015424, 0.86896029, 0.21374596, 0.25096368,
       0.50602402, 0.77515087, 0.92870849, 0.26821867, 0.69065865,
       0.4528329 , 0.44947725, 0.4607293 , 0.59798883, 0.94638425,
       0.2996991 , 0.21840536, 0.57962872, 0.57962872, 0.75216435,
       0.25433741, 0.48982726, 0.14309805, 0.79650267, 0.21840536,
       0.36956558, 0.67906035, 0.68502567, 0.52868083, 0.21840536,
       0.53506551, 0.22147081, 0.73692105, 0.40498044, 0.60505988,
       0.21075848, 0.45224466, 0.23751292, 0.39833498, 0.82755447,
       0.56797575, 0.69113325, 0.28659587, 0.21935267, 0.2033097 ,
       0.57628256, 0.3294664 , 0.66512397, 0.26949499, 0.83321968,
       0.43491525, 0.88374612, 0.23127072, 0.33415858, 0.34432939,
       0.69909345, 0.65494263, 0.29244941, 0.79200758, 0.20750276,
       0.26840558, 0.08708566, 0.22147081, 0.73245417, 0.30530219,
       0.22147081, 0.29014408, 0.90438191, 0.46061297, 0.60174687,
       0.21840536, 0.77354707, 0.73162708, 0.74168835, 0.67144429,
       0.24582816, 0.07612942, 0.23127072, 0.75258845, 0.43441631,
       0.12224957, 0.70427049, 0.57962872, 0.24118359, 0.67906035,
       0.17511407, 0.15049817, 0.22625015, 0.26840558, 0.25096368,
       0.85055945, 0.29596249, 0.76286133, 0.24354995, 0.21796736,
       0.62328177, 0.67906035, 0.69590691, 0.74457124, 0.44936139,
       0.48322553, 0.27740888, 0.85055945, 0.7779726 , 0.14478844,
       0.06998698, 0.90359449, 0.66263332, 0.33845002, 0.52376124,
       0.56096738, 0.78621895, 0.82216207, 0.54188461, 0.3093083 ,
       0.25958951, 0.1606142 , 0.81222125, 0.45224466, 0.21840536,
       0.35542096, 0.21075848, 0.10970991, 0.74062538, 0.2996991 ,
       0.61584977, 0.21075848, 0.29596249, 0.35542096, 0.26509643,
       0.52376124, 0.4607293 , 0.31501022, 0.24354995, 0.46061297])
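For a binary logistic regression, reg.predict is equivalent to thresholding the probability of class 1 at 0.5. A quick sanity check (not part of the original lecture):

# rebuild the predictions by hand from the probabilities and compare them with reg.predict
manual_predictions = (predicted_proba[:, 1] > 0.5).astype(int)
(manual_predictions == reg.predict(x_test)).all()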

Save the model

# import the relevant module
import pickle
# pickle the model file
with open('model', 'wb') as file:
    pickle.dump(reg, file)
# pickle the scaler file
with open('scaler','wb') as file:
    pickle.dump(absenteeism_scaler, file)
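To reuse the model later (e.g. in a deployment module), the two files can simply be loaded back with pickle.load; a minimal sketch, assuming the CustomScaler class definition is importable wherever you unpickle:

# read the saved model and scaler back into memory
with open('model', 'rb') as model_file:
    model = pickle.load(model_file)
with open('scaler', 'rb') as scaler_file:
    scaler = pickle.load(scaler_file)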