Python - 50 Startups

31st Aug 2021
04:00 am

#!/usr/bin/env python
# coding: utf-8

# In[10]:

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# In[25]:

# loading the data
df = pd.read_csv("50_Startups-Assignment 5 (1).csv")

# In[83]:

print(df.shape)
df.describe()

# In[68]:

#Splitting the data into Regressor(independent variables) and Target(dependent variable)
Regressor = df.iloc[:, :-1].values
Target = df.iloc[:, 4].values

# In[27]:

#Looking at the Dataset we can see that "State" is a String type variable, thus we cannot feed String type variables
#into our Machine Learning model.To overcome this problem we use the Label Encoder object and create Dummy Variables
#using the OneHotEncoder object. So we import ColumnTransformer along with OneHotEncoder

# In[70]:

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

ct =ColumnTransformer(
    [('one_hot_encodder',OneHotEncoder(),[3])],
    remainder='passthrough'
    )
Regressor = np.array(ct.fit_transform(Regressor),dtype = np.float)

# In[72]:

Regressor=Regressor[:, 1:]

# In[73]:

#Splitting the dataset into the Training and Test dataset
from sklearn.model_selection import train_test_split
Regressor_train, Regressor_test, Target_train, Target_test = train_test_split( Regressor, Target, test_size=0.25, random_state=0)

# In[74]:

#Fitting Multiple Linear Regression to the Training Set
from sklearn.linear_model import LinearRegression
Model = LinearRegression()
Model.fit(Regressor_train, Target_train)

# In[75]:

#Predicted vector of the test set
Target_pred = First_eq.predict(Regressor_test)

# In[76]:

Target_pred

# In[86]:

#R-square
from sklearn.metrics import r2_score
r2_score(Target_test, Target_pred)

# In[ ]:

#Now we will be looking at the measure errors with different metrics

# In[78]:

#mean squared error
from sklearn.metrics import mean_squared_error
mean_squared_error(Target_test,Target_pred)

# In[79]:

#RMSE
from math import sqrt
rms = sqrt(mean_squared_error(Target_test,Target_pred))
rms

# In[80]:

#MAE
from sklearn.metrics import mean_absolute_error
mean_absolute_error(Target_test,Target_pred)

# In[ ]:

#MinMax scaler

# In[81]:

from sklearn import preprocessing
mm_scaler = preprocessing.MinMaxScaler()
Regressor_train_minmax = mm_scaler.fit_transform(Regressor_train)
mm_scaler.transform(Regressor_test)

# In[ ]:

#Ridge and Lasso

# Ridge regression is an extension of linear regression where the loss function is modified to minimize the complexity of 
# the model. This modification is done by adding a penalty parameter that is equivalent to the square of the magnitude of 
# the coefficients. The loss function for Ridge regression is given by:
# Loss function = OLS + alpha * summation (squared coefficient values)

# In[82]:

from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso

# In[88]:

rr = Ridge(alpha=0.01)
rr.fit(Regressor_train, Target_train) 
pred_train_rr= rr.predict(Regressor_train)
print(np.sqrt(mean_squared_error(Target_train,pred_train_rr)))
print(r2_score(Target_train, pred_train_rr))

pred_test_rr= rr.predict(Regressor_test)
print(np.sqrt(mean_squared_error(Target_test,pred_test_rr))) 
print(r2_score(Target_test, pred_test_rr))

# Lasso regression, or the Least Absolute Shrinkage and Selection Operator, is also a modification of linear regression.
# In Lasso, the loss function is modified to minimize the complexity of the model by limiting the sum of the absolute values
# of the model coefficients (also called the l1-norm). The loss function for Lasso Regression is given by:
# Loss function = OLS + alpha * summation (absolute values of the magnitude of the coefficients)

# In[90]:

model_lasso = Lasso(alpha=0.01)
model_lasso.fit(Regressor_train, Target_train) 
pred_train_lasso= model_lasso.predict(Regressor_train)
print(np.sqrt(mean_squared_error(Target_train,pred_train_lasso)))
print(r2_score(Target_train, pred_train_lasso))

pred_test_lasso= model_lasso.predict(Regressor_test)
print(np.sqrt(mean_squared_error(Target_test,pred_test_lasso))) 
print(r2_score(Target_test, pred_test_lasso))

# Conclusion:
# Linear Regression Model: Test set RMSE of 35064.2005 thousand and R-square of 35.27 percent.
# 
# Ridge Regression Model: Test set RMSE of 35061.0113 thousand and R-square of 35.28 percent.
# 
# Lasso Regression Model: Test set RMSE of 35064.1886 thousand and R-square of 35.27 percent.
# 

# In[ ]:

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "# loading the data\n",
    "df = pd.read_csv(\"50_Startups-Assignment 5 (1).csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 83,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(50, 5)\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "

\n",
       "\n",
       "\n",
       "  \n",
       "    \n",
       "      \n",
       "      \n",
       "      \n",
       "      \n",
       "      \n",
       "    \n",
       "  \n",
       "  \n",
       "    \n",
       "      \n",
       "      \n",
       "      \n",
       "      \n",
       "      \n",
       "    \n",
       "    \n",
       "      \n",
       "      \n",
       "      \n",
       "      \n",
       "      \n",
       "    \n",
       "    \n",
       "      \n",
       "      \n",
       "      \n",
       "      \n",
       "      \n",
       "    \n",
       "    \n",
       "      \n",
       "      \n",
       "      \n",
       "      \n",
       "      \n",
       "    \n",
       "    \n",
       "      \n",
       "      \n",
       "      \n",
       "      \n",
       "      \n",
       "    \n",
       "    \n",
       "      \n",
       "      \n",
       "      \n",
       "      \n",
       "      \n",
       "    \n",
       "    \n",
       "      \n",
       "      \n",
       "      \n",
       "      \n",
       "      \n",
       "    \n",
       "    \n",
       "      \n",
       "      \n",
       "      \n",
       "      \n",
       "      \n",
       "    \n",
       "  \n",
       "

	R&D Spend	Administration	Marketing Spend	Profit
count	50.000000	50.00000	50.000000	50.000000
mean	95405.964000	157588.45116	271597.767660	146350.607120
std	59925.318556	38608.90335	157969.099247	56580.157734
min	0.000000	62091.78000	0.000000	16149.540000
25%	52284.300250	128550.68550	160745.698500	112362.504000
50%	93677.806000	167104.70400	275226.401500	144088.595000
75%	128704.215500	178316.72850	391928.623500	181438.118500
max	213161.010000	219174.67200	549252.630000	288392.745000

\n",
       "

"
      ],
      "text/plain": [
       "           R&D Spend  Administration  Marketing Spend         Profit\n",
       "count      50.000000        50.00000        50.000000      50.000000\n",
       "mean    95405.964000    157588.45116    271597.767660  146350.607120\n",
       "std     59925.318556     38608.90335    157969.099247   56580.157734\n",
       "min         0.000000     62091.78000         0.000000   16149.540000\n",
       "25%     52284.300250    128550.68550    160745.698500  112362.504000\n",
       "50%     93677.806000    167104.70400    275226.401500  144088.595000\n",
       "75%    128704.215500    178316.72850    391928.623500  181438.118500\n",
       "max    213161.010000    219174.67200    549252.630000  288392.745000"
      ]
     },
     "execution_count": 83,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "print(df.shape)\n",
    "df.describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Splitting the data into Regressor(independent variables) and Target(dependent variable)\n",
    "Regressor = df.iloc[:, :-1].values\n",
    "Target = df.iloc[:, 4].values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Looking at the Dataset we can see that \"State\" is a String type variable, thus we cannot feed String type variables\n",
    "#into our Machine Learning model.To overcome this problem we use the Label Encoder object and create Dummy Variables\n",
    "#using the OneHotEncoder object. So we import ColumnTransformer along with OneHotEncoder\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.preprocessing import OneHotEncoder\n",
    "from sklearn.compose import ColumnTransformer\n",
    "\n",
    "ct =ColumnTransformer(\n",
    "    [('one_hot_encodder',OneHotEncoder(),[3])],\n",
    "    remainder='passthrough'\n",
    "    )\n",
    "Regressor = np.array(ct.fit_transform(Regressor),dtype = np.float)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 72,
   "metadata": {},
   "outputs": [],
   "source": [
    "Regressor=Regressor[:, 1:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 73,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Splitting the dataset into the Training and Test dataset\n",
    "from sklearn.model_selection import train_test_split\n",
    "Regressor_train, Regressor_test, Target_train, Target_test = train_test_split( Regressor, Target, test_size=0.25, random_state=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 74,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)"
      ]
     },
     "execution_count": 74,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#Fitting Multiple Linear Regression to the Training Set\n",
    "from sklearn.linear_model import LinearRegression\n",
    "Model = LinearRegression()\n",
    "Model.fit(Regressor_train, Target_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 75,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Predicted vector of the test set\n",
    "Target_pred = First_eq.predict(Regressor_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 76,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([151477.04933493, 148373.55906895, 145612.04691522,  73551.78938889,\n",
       "       204859.29909382, 168287.41891873,  71694.86703419, 146474.04078268,\n",
       "       146705.22744893, 199856.05396648, 110629.61038141, 104083.82566741,\n",
       "       141706.54794344])"
      ]
     },
     "execution_count": 76,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "Target_pred"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 86,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.35268545008708907"
      ]
     },
     "execution_count": 86,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#R-square\n",
    "from sklearn.metrics import r2_score\n",
    "r2_score(Target_test, Target_pred)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Now we will be looking at the measure errors with different metrics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 78,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1229498158.0398107"
      ]
     },
     "execution_count": 78,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#mean squared error\n",
    "from sklearn.metrics import mean_squared_error\n",
    "mean_squared_error(Target_test,Target_pred)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 79,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "35064.20051904521"
      ]
     },
     "execution_count": 79,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#RMSE\n",
    "from math import sqrt\n",
    "rms = sqrt(mean_squared_error(Target_test,Target_pred))\n",
    "rms"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 80,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "28760.20722757087"
      ]
     },
     "execution_count": 80,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#MAE\n",
    "from sklearn.metrics import mean_absolute_error\n",
    "mean_absolute_error(Target_test,Target_pred)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#MinMax scaler"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 81,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[1.        , 0.        , 0.45750791, 1.02540534, 0.30832024],\n",
       "       [0.        , 0.        , 0.59769212, 0.43354417, 0.55862988],\n",
       "       [1.        , 0.        , 0.55463896, 0.67758139, 0.46987268],\n",
       "       [1.        , 0.        , 0.16560101, 0.3688421 , 0.36788892],\n",
       "       [1.        , 0.        , 0.98690188, 0.45301242, 0.98850922],\n",
       "       [0.        , 1.        , 0.46378014, 0.67975351, 0.85583695],\n",
       "       [0.        , 1.        , 0.14012089, 0.11127112, 0.4834689 ],\n",
       "       [0.        , 1.        , 0.36296832, 0.99020756, 0.19732699],\n",
       "       [1.        , 0.        , 0.47591665, 0.63662866, 0.7350049 ],\n",
       "       [1.        , 0.        , 1.05461885, 0.25092311, 1.02380923],\n",
       "       [1.        , 0.        , 0.38438058, 0.40196356, 0.56011227],\n",
       "       [0.        , 1.        , 0.25042093, 0.42743615, 0.42139433],\n",
       "       [1.        , 0.        , 0.44722951, 0.91192415, 0.29984388]])"
      ]
     },
     "execution_count": 81,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn import preprocessing\n",
    "mm_scaler = preprocessing.MinMaxScaler()\n",
    "Regressor_train_minmax = mm_scaler.fit_transform(Regressor_train)\n",
    "mm_scaler.transform(Regressor_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Ridge and Lasso"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Ridge regression is an extension of linear regression where the loss function is modified to minimize the complexity of \n",
    "the model. This modification is done by adding a penalty parameter that is equivalent to the square of the magnitude of \n",
    "the coefficients. The loss function for Ridge regression is given by:\n",
    "Loss function = OLS + alpha * summation (squared coefficient values)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 82,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.linear_model import Ridge\n",
    "from sklearn.linear_model import Lasso"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 88,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "25700.802829932003\n",
      "0.813462230059857\n",
      "35061.01135727592\n",
      "0.3528031939343419\n"
     ]
    }
   ],
   "source": [
    "rr = Ridge(alpha=0.01)\n",
    "rr.fit(Regressor_train, Target_train) \n",
    "pred_train_rr= rr.predict(Regressor_train)\n",
    "print(np.sqrt(mean_squared_error(Target_train,pred_train_rr)))\n",
    "print(r2_score(Target_train, pred_train_rr))\n",
    "\n",
    "pred_test_rr= rr.predict(Regressor_test)\n",
    "print(np.sqrt(mean_squared_error(Target_test,pred_test_rr))) \n",
    "print(r2_score(Target_test, pred_test_rr))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Lasso regression, or the Least Absolute Shrinkage and Selection Operator, is also a modification of linear regression.\n",
    "In Lasso, the loss function is modified to minimize the complexity of the model by limiting the sum of the absolute values\n",
    "of the model coefficients (also called the l1-norm). The loss function for Lasso Regression is given by:\n",
    "Loss function = OLS + alpha * summation (absolute values of the magnitude of the coefficients)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 90,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "25700.801702551787\n",
      "0.8134622464250256\n",
      "35064.18862097854\n",
      "0.35268588938359413\n"
     ]
    }
   ],
   "source": [
    "model_lasso = Lasso(alpha=0.01)\n",
    "model_lasso.fit(Regressor_train, Target_train) \n",
    "pred_train_lasso= model_lasso.predict(Regressor_train)\n",
    "print(np.sqrt(mean_squared_error(Target_train,pred_train_lasso)))\n",
    "print(r2_score(Target_train, pred_train_lasso))\n",
    "\n",
    "pred_test_lasso= model_lasso.predict(Regressor_test)\n",
    "print(np.sqrt(mean_squared_error(Target_test,pred_test_lasso))) \n",
    "print(r2_score(Target_test, pred_test_lasso))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Conclusion:\n",
    "Linear Regression Model: Test set RMSE of 35064.2005 thousand and R-square of 35.27 percent.\n",
    "\n",
    "Ridge Regression Model: Test set RMSE of 35061.0113 thousand and R-square of 35.28 percent.\n",
    "\n",
    "Lasso Regression Model: Test set RMSE of 35064.1886 thousand and R-square of 35.27 percent.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}

Python - 50 Startups

Python - 50 Startups

Share this post

Recent Blogs

The Programming Assignment Help - Instant and Affordable Coding Help from the Top-Rated Tutors!