
# Python - 50 Startups
# - 31st Aug, 2021
# - 16:30
#!/usr/bin/env python
# coding: utf-8
"""Multiple linear regression on the 50 Startups dataset.

Loads the startup data, one-hot encodes the categorical "State" column,
fits a multiple linear regression on a 75/25 train/test split, reports
R-square / MSE / RMSE / MAE, min-max scales the features, and compares
against a Ridge model (the Lasso comparison follows in the next section).
"""

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# loading the data
df = pd.read_csv("50_Startups-Assignment 5 (1).csv")

print(df.shape)
df.describe()

# Splitting the data into Regressor (independent variables) and Target
# (dependent variable: Profit, the last column).
Regressor = df.iloc[:, :-1].values
Target = df.iloc[:, 4].values

# "State" is a string-typed column, so it cannot be fed into the model
# directly. Create dummy variables for it with OneHotEncoder, applied to
# column index 3 via a ColumnTransformer (all other columns pass through).
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

ct = ColumnTransformer(
    [('one_hot_encodder', OneHotEncoder(), [3])],
    remainder='passthrough'
)
# BUGFIX: `np.float` was deprecated in NumPy 1.20 and removed in 1.24;
# the builtin `float` is the documented replacement (same float64 result).
Regressor = np.array(ct.fit_transform(Regressor), dtype=float)

# Drop the first dummy column to avoid the dummy-variable trap
# (perfect multicollinearity between the one-hot columns).
Regressor = Regressor[:, 1:]

# Splitting the dataset into the Training and Test dataset
from sklearn.model_selection import train_test_split
Regressor_train, Regressor_test, Target_train, Target_test = train_test_split(
    Regressor, Target, test_size=0.25, random_state=0)

# Fitting Multiple Linear Regression to the Training Set
from sklearn.linear_model import LinearRegression
Model = LinearRegression()
Model.fit(Regressor_train, Target_train)

# Predicted vector of the test set.
# BUGFIX: the original called `First_eq.predict(...)`, but `First_eq` is
# never defined anywhere in this script (NameError at runtime); the fitted
# estimator above is named `Model`.
Target_pred = Model.predict(Regressor_test)
Target_pred

# R-square
from sklearn.metrics import r2_score
r2_score(Target_test, Target_pred)

# Now we will be looking at the measured errors with different metrics.

# mean squared error
from sklearn.metrics import mean_squared_error
mean_squared_error(Target_test, Target_pred)

# RMSE
from math import sqrt
rms = sqrt(mean_squared_error(Target_test, Target_pred))
rms

# MAE
from sklearn.metrics import mean_absolute_error
mean_absolute_error(Target_test, Target_pred)

# MinMax scaler: fit on the training features only, then apply the same
# scaling to the test features (avoids leaking test-set statistics).
from sklearn import preprocessing
mm_scaler = preprocessing.MinMaxScaler()
Regressor_train_minmax = mm_scaler.fit_transform(Regressor_train)
mm_scaler.transform(Regressor_test)

# Ridge and Lasso
# Ridge regression is an extension of linear regression where the loss
# function is modified to minimize the complexity of the model. This is done
# by adding a penalty parameter equivalent to the square of the magnitude of
# the coefficients. The loss function for Ridge regression is given by:
#   Loss function = OLS + alpha * summation (squared coefficient values)
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso

rr = Ridge(alpha=0.01)
rr.fit(Regressor_train, Target_train)
pred_train_rr = rr.predict(Regressor_train)
print(np.sqrt(mean_squared_error(Target_train, pred_train_rr)))
print(r2_score(Target_train, pred_train_rr))

pred_test_rr = rr.predict(Regressor_test)
print(np.sqrt(mean_squared_error(Target_test, pred_test_rr)))
print(r2_score(Target_test, pred_test_rr))

# Lasso regression, or the Least Absolute Shrinkage and Selection Operator,
# is also a modification of linear regression. In Lasso, the loss function
# is modified to limit the sum of the absolute values of the model
# coefficients (also called the l1-norm).
The loss function for Lasso Regression is given by: # Loss function = OLS + alpha * summation (absolute values of the magnitude of the coefficients) # In[90]: model_lasso = Lasso(alpha=0.01) model_lasso.fit(Regressor_train, Target_train) pred_train_lasso= model_lasso.predict(Regressor_train) print(np.sqrt(mean_squared_error(Target_train,pred_train_lasso))) print(r2_score(Target_train, pred_train_lasso)) pred_test_lasso= model_lasso.predict(Regressor_test) print(np.sqrt(mean_squared_error(Target_test,pred_test_lasso))) print(r2_score(Target_test, pred_test_lasso)) # Conclusion: # Linear Regression Model: Test set RMSE of 35064.2005 thousand and R-square of 35.27 percent. # # Ridge Regression Model: Test set RMSE of 35061.0113 thousand and R-square of 35.28 percent. # # Lasso Regression Model: Test set RMSE of 35064.1886 thousand and R-square of 35.27 percent. # # In[ ]: { "cells": [ { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import matplotlib.pyplot as plt\n", "import pandas as pd" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [], "source": [ "# loading the data\n", "df = pd.read_csv(\"50_Startups-Assignment 5 (1).csv\")" ] }, { "cell_type": "code", "execution_count": 83, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(50, 5)\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
| R&D Spend | Administration | Marketing Spend | Profit |
---|---|---|---|---|
count | 50.000000 | 50.00000 | 50.000000 | 50.000000 |
mean | 95405.964000 | 157588.45116 | 271597.767660 | 146350.607120 |
std | 59925.318556 | 38608.90335 | 157969.099247 | 56580.157734 |
min | 0.000000 | 62091.78000 | 0.000000 | 16149.540000 |
25% | 52284.300250 | 128550.68550 | 160745.698500 | 112362.504000 |
50% | 93677.806000 | 167104.70400 | 275226.401500 | 144088.595000 |
75% | 128704.215500 | 178316.72850 | 391928.623500 | 181438.118500 |
max | 213161.010000 | 219174.67200 | 549252.630000 | 288392.745000 |
\n", "
" ], "text/plain": [ " R&D Spend Administration Marketing Spend Profit\n", "count 50.000000 50.00000 50.000000 50.000000\n", "mean 95405.964000 157588.45116 271597.767660 146350.607120\n", "std 59925.318556 38608.90335 157969.099247 56580.157734\n", "min 0.000000 62091.78000 0.000000 16149.540000\n", "25% 52284.300250 128550.68550 160745.698500 112362.504000\n", "50% 93677.806000 167104.70400 275226.401500 144088.595000\n", "75% 128704.215500 178316.72850 391928.623500 181438.118500\n", "max 213161.010000 219174.67200 549252.630000 288392.745000" ] }, "execution_count": 83, "metadata": {}, "output_type": "execute_result" } ], "source": [ "print(df.shape)\n", "df.describe()" ] }, { "cell_type": "code", "execution_count": 68, "metadata": {}, "outputs": [], "source": [ "#Splitting the data into Regressor(independent variables) and Target(dependent variable)\n", "Regressor = df.iloc[:, :-1].values\n", "Target = df.iloc[:, 4].values" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [], "source": [ "#Looking at the Dataset we can see that \"State\" is a String type variable, thus we cannot feed String type variables\n", "#into our Machine Learning model.To overcome this problem we use the Label Encoder object and create Dummy Variables\n", "#using the OneHotEncoder object. 
So we import ColumnTransformer along with OneHotEncoder\n", "\n" ] }, { "cell_type": "code", "execution_count": 70, "metadata": {}, "outputs": [], "source": [ "from sklearn.preprocessing import OneHotEncoder\n", "from sklearn.compose import ColumnTransformer\n", "\n", "ct =ColumnTransformer(\n", " [('one_hot_encodder',OneHotEncoder(),[3])],\n", " remainder='passthrough'\n", " )\n", "Regressor = np.array(ct.fit_transform(Regressor),dtype = float)" ] }, { "cell_type": "code", "execution_count": 72, "metadata": {}, "outputs": [], "source": [ "Regressor=Regressor[:, 1:]" ] }, { "cell_type": "code", "execution_count": 73, "metadata": {}, "outputs": [], "source": [ "#Splitting the dataset into the Training and Test dataset\n", "from sklearn.model_selection import train_test_split\n", "Regressor_train, Regressor_test, Target_train, Target_test = train_test_split( Regressor, Target, test_size=0.25, random_state=0)" ] }, { "cell_type": "code", "execution_count": 74, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)" ] }, "execution_count": 74, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#Fitting Multiple Linear Regression to the Training Set\n", "from sklearn.linear_model import LinearRegression\n", "Model = LinearRegression()\n", "Model.fit(Regressor_train, Target_train)" ] }, { "cell_type": "code", "execution_count": 75, "metadata": {}, "outputs": [], "source": [ "#Predicted vector of the test set\n", "Target_pred = Model.predict(Regressor_test)" ] }, { "cell_type": "code", "execution_count": 76, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([151477.04933493, 148373.55906895, 145612.04691522, 73551.78938889,\n", " 204859.29909382, 168287.41891873, 71694.86703419, 146474.04078268,\n", " 146705.22744893, 199856.05396648, 110629.61038141, 104083.82566741,\n", " 141706.54794344])" ] }, "execution_count": 76, "metadata": {}, "output_type": 
"execute_result" } ], "source": [ "Target_pred" ] }, { "cell_type": "code", "execution_count": 86, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.35268545008708907" ] }, "execution_count": 86, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#R-square\n", "from sklearn.metrics import r2_score\n", "r2_score(Target_test, Target_pred)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#Now we will be looking at the measure errors with different metrics" ] }, { "cell_type": "code", "execution_count": 78, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "1229498158.0398107" ] }, "execution_count": 78, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#mean squared error\n", "from sklearn.metrics import mean_squared_error\n", "mean_squared_error(Target_test,Target_pred)" ] }, { "cell_type": "code", "execution_count": 79, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "35064.20051904521" ] }, "execution_count": 79, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#RMSE\n", "from math import sqrt\n", "rms = sqrt(mean_squared_error(Target_test,Target_pred))\n", "rms" ] }, { "cell_type": "code", "execution_count": 80, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "28760.20722757087" ] }, "execution_count": 80, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#MAE\n", "from sklearn.metrics import mean_absolute_error\n", "mean_absolute_error(Target_test,Target_pred)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#MinMax scaler" ] }, { "cell_type": "code", "execution_count": 81, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[1. , 0. , 0.45750791, 1.02540534, 0.30832024],\n", " [0. , 0. , 0.59769212, 0.43354417, 0.55862988],\n", " [1. , 0. , 0.55463896, 0.67758139, 0.46987268],\n", " [1. , 0. , 0.16560101, 0.3688421 , 0.36788892],\n", " [1. , 0. 
, 0.98690188, 0.45301242, 0.98850922],\n", " [0. , 1. , 0.46378014, 0.67975351, 0.85583695],\n", " [0. , 1. , 0.14012089, 0.11127112, 0.4834689 ],\n", " [0. , 1. , 0.36296832, 0.99020756, 0.19732699],\n", " [1. , 0. , 0.47591665, 0.63662866, 0.7350049 ],\n", " [1. , 0. , 1.05461885, 0.25092311, 1.02380923],\n", " [1. , 0. , 0.38438058, 0.40196356, 0.56011227],\n", " [0. , 1. , 0.25042093, 0.42743615, 0.42139433],\n", " [1. , 0. , 0.44722951, 0.91192415, 0.29984388]])" ] }, "execution_count": 81, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from sklearn import preprocessing\n", "mm_scaler = preprocessing.MinMaxScaler()\n", "Regressor_train_minmax = mm_scaler.fit_transform(Regressor_train)\n", "mm_scaler.transform(Regressor_test)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#Ridge and Lasso" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Ridge regression is an extension of linear regression where the loss function is modified to minimize the complexity of \n", "the model. This modification is done by adding a penalty parameter that is equivalent to the square of the magnitude of \n", "the coefficients. 
The loss function for Ridge regression is given by:\n", "Loss function = OLS + alpha * summation (squared coefficient values)" ] }, { "cell_type": "code", "execution_count": 82, "metadata": {}, "outputs": [], "source": [ "from sklearn.linear_model import Ridge\n", "from sklearn.linear_model import Lasso" ] }, { "cell_type": "code", "execution_count": 88, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "25700.802829932003\n", "0.813462230059857\n", "35061.01135727592\n", "0.3528031939343419\n" ] } ], "source": [ "rr = Ridge(alpha=0.01)\n", "rr.fit(Regressor_train, Target_train) \n", "pred_train_rr= rr.predict(Regressor_train)\n", "print(np.sqrt(mean_squared_error(Target_train,pred_train_rr)))\n", "print(r2_score(Target_train, pred_train_rr))\n", "\n", "pred_test_rr= rr.predict(Regressor_test)\n", "print(np.sqrt(mean_squared_error(Target_test,pred_test_rr))) \n", "print(r2_score(Target_test, pred_test_rr))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Lasso regression, or the Least Absolute Shrinkage and Selection Operator, is also a modification of linear regression.\n", "In Lasso, the loss function is modified to minimize the complexity of the model by limiting the sum of the absolute values\n", "of the model coefficients (also called the l1-norm). 
The loss function for Lasso Regression is given by:\n", "Loss function = OLS + alpha * summation (absolute values of the magnitude of the coefficients)" ] }, { "cell_type": "code", "execution_count": 90, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "25700.801702551787\n", "0.8134622464250256\n", "35064.18862097854\n", "0.35268588938359413\n" ] } ], "source": [ "model_lasso = Lasso(alpha=0.01)\n", "model_lasso.fit(Regressor_train, Target_train) \n", "pred_train_lasso= model_lasso.predict(Regressor_train)\n", "print(np.sqrt(mean_squared_error(Target_train,pred_train_lasso)))\n", "print(r2_score(Target_train, pred_train_lasso))\n", "\n", "pred_test_lasso= model_lasso.predict(Regressor_test)\n", "print(np.sqrt(mean_squared_error(Target_test,pred_test_lasso))) \n", "print(r2_score(Target_test, pred_test_lasso))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Conclusion:\n", "Linear Regression Model: Test set RMSE of 35064.2005 thousand and R-square of 35.27 percent.\n", "\n", "Ridge Regression Model: Test set RMSE of 35061.0113 thousand and R-square of 35.28 percent.\n", "\n", "Lasso Regression Model: Test set RMSE of 35064.1886 thousand and R-square of 35.27 percent.\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.4" } }, "nbformat": 4, "nbformat_minor": 2 }