Login
Order Now
Support
Python - 50 Startups

Python - 50 Startups

  • 31st Aug, 2021
  • 16:30

#!/usr/bin/env python
# coding: utf-8

# In[10]:


import numpy as np
import matplotlib.pyplot as plt
import pandas as pd


# In[25]:


# Load the 50 Startups dataset shipped with the assignment.
csv_path = "50_Startups-Assignment 5 (1).csv"
df = pd.read_csv(csv_path)


# In[83]:


# Quick look at the data: (rows, columns) count, then summary statistics
# for the numeric columns (the trailing expression renders as cell output).
print(df.shape)
df.describe()


# In[68]:


# Separate the independent variables (all columns except the last) from the
# dependent variable (column index 4, "Profit").
Regressor = df.iloc[:, :-1].to_numpy()
Target = df.iloc[:, 4].to_numpy()


# In[27]:


#Looking at the dataset we can see that "State" is a string-valued column, and a
#regression model cannot consume strings directly. To overcome this we create
#dummy variables with OneHotEncoder, applied to that single column through a
#ColumnTransformer, so we import ColumnTransformer along with OneHotEncoder.


# In[70]:


from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# One-hot encode the categorical "State" column (index 3); every other column
# passes through unchanged.
ct = ColumnTransformer(
    [('one_hot_encodder', OneHotEncoder(), [3])],
    remainder='passthrough'
)
# BUG FIX: `np.float` was deprecated in NumPy 1.20 and removed in 1.24; the
# builtin `float` is the equivalent (float64) dtype and works on all versions.
Regressor = np.array(ct.fit_transform(Regressor), dtype=float)


# In[72]:


Regressor=Regressor[:, 1:]


# In[73]:


# Hold out 25% of the rows for testing; random_state pins the shuffle so the
# split (and every downstream metric) is reproducible.
from sklearn.model_selection import train_test_split
Regressor_train, Regressor_test, Target_train, Target_test = train_test_split(
    Regressor, Target, test_size=0.25, random_state=0
)


# In[74]:


# Fit an ordinary least-squares multiple linear regression on the training
# partition. fit() returns the estimator itself, so fitting can be chained
# onto construction.
from sklearn.linear_model import LinearRegression
Model = LinearRegression().fit(Regressor_train, Target_train)


# In[75]:


# Predicted Profit values for the test set.
# BUG FIX: the fitted estimator is named `Model` (see the fitting cell);
# `First_eq` was never defined anywhere and raised a NameError here.
Target_pred = Model.predict(Regressor_test)


# In[76]:


Target_pred


# In[86]:


# Coefficient of determination on the held-out data: the fraction of Profit
# variance explained by the model.
from sklearn.metrics import r2_score
r2_test = r2_score(Target_test, Target_pred)
r2_test


# In[ ]:


#Now we will be looking at the measure errors with different metrics


# In[78]:


# Mean squared error on the test set (units: Profit squared).
from sklearn.metrics import mean_squared_error
mse_test = mean_squared_error(Target_test, Target_pred)
mse_test


# In[79]:


# Root-mean-squared error: the square root brings the error back into Profit
# units, making it directly comparable to the target values.
from math import sqrt
test_mse = mean_squared_error(Target_test, Target_pred)
rms = sqrt(test_mse)
rms


# In[80]:


# Mean absolute error: average magnitude of the residuals; less sensitive to
# large outlier errors than (R)MSE.
from sklearn.metrics import mean_absolute_error
mae_test = mean_absolute_error(Target_test, Target_pred)
mae_test


# In[ ]:


#MinMax scaler


# In[81]:


# Min-max scale the features into [0, 1]. The scaler is fit on the training
# partition only (correct practice: no test-set statistics leak into the fit)
# and then applied to the test partition.
from sklearn import preprocessing
mm_scaler = preprocessing.MinMaxScaler()
Regressor_train_minmax = mm_scaler.fit_transform(Regressor_train)
# FIX: the scaled test set was computed but immediately discarded; keep it in
# a variable. The trailing expression still renders it as the cell output.
Regressor_test_minmax = mm_scaler.transform(Regressor_test)
Regressor_test_minmax


# In[ ]:


#Ridge and Lasso


# Ridge regression is an extension of linear regression where the loss function is modified to minimize the complexity of 
# the model. This modification is done by adding a penalty parameter that is equivalent to the square of the magnitude of 
# the coefficients. The loss function for Ridge regression is given by:
# Loss function = OLS + alpha * summation (squared coefficient values)

# In[82]:


from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso


# In[88]:


# Ridge regression with a very light L2 penalty (alpha=0.01). Report RMSE and
# R-square on both partitions to gauge over-/under-fitting.
rr = Ridge(alpha=0.01)
rr.fit(Regressor_train, Target_train)

pred_train_rr = rr.predict(Regressor_train)
ridge_train_rmse = np.sqrt(mean_squared_error(Target_train, pred_train_rr))
print(ridge_train_rmse)
print(r2_score(Target_train, pred_train_rr))

pred_test_rr = rr.predict(Regressor_test)
ridge_test_rmse = np.sqrt(mean_squared_error(Target_test, pred_test_rr))
print(ridge_test_rmse)
print(r2_score(Target_test, pred_test_rr))


# Lasso regression, or the Least Absolute Shrinkage and Selection Operator, is also a modification of linear regression.
# In Lasso, the loss function is modified to minimize the complexity of the model by limiting the sum of the absolute values
# of the model coefficients (also called the l1-norm). The loss function for Lasso Regression is given by:
# Loss function = OLS + alpha * summation (absolute values of the magnitude of the coefficients)

# In[90]:


# Lasso regression with a very light L1 penalty (alpha=0.01). Same train/test
# RMSE and R-square report as the Ridge cell, for side-by-side comparison.
model_lasso = Lasso(alpha=0.01)
model_lasso.fit(Regressor_train, Target_train)

pred_train_lasso = model_lasso.predict(Regressor_train)
lasso_train_rmse = np.sqrt(mean_squared_error(Target_train, pred_train_lasso))
print(lasso_train_rmse)
print(r2_score(Target_train, pred_train_lasso))

pred_test_lasso = model_lasso.predict(Regressor_test)
lasso_test_rmse = np.sqrt(mean_squared_error(Target_test, pred_test_lasso))
print(lasso_test_rmse)
print(r2_score(Target_test, pred_test_lasso))


# Conclusion:
# Linear Regression Model: Test set RMSE of 35064.2005 (in Profit units) and R-square of 35.27 percent.

# Ridge Regression Model: Test set RMSE of 35061.0113 (in Profit units) and R-square of 35.28 percent.

# Lasso Regression Model: Test set RMSE of 35064.1886 (in Profit units) and R-square of 35.27 percent.

# In[ ]:


{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "# loading the data\n",
    "df = pd.read_csv(\"50_Startups-Assignment 5 (1).csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 83,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(50, 5)\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "

\n",
       "\n",
       "\n",
       "  \n",
       "    \n",
       "      \n",
       "      \n",
       "      \n",
       "      \n",
       "      \n",
       "    \n",
       "  \n",
       "  \n",
       "    \n",
       "      \n",
       "      \n",
       "      \n",
       "      \n",
       "      \n",
       "    \n",
       "    \n",
       "      \n",
       "      \n",
       "      \n",
       "      \n",
       "      \n",
       "    \n",
       "    \n",
       "      \n",
       "      \n",
       "      \n",
       "      \n",
       "      \n",
       "    \n",
       "    \n",
       "      \n",
       "      \n",
       "      \n",
       "      \n",
       "      \n",
       "    \n",
       "    \n",
       "      \n",
       "      \n",
       "      \n",
       "      \n",
       "      \n",
       "    \n",
       "    \n",
       "      \n",
       "      \n",
       "      \n",
       "      \n",
       "      \n",
       "    \n",
       "    \n",
       "      \n",
       "      \n",
       "      \n",
       "      \n",
       "      \n",
       "    \n",
       "    \n",
       "      \n",
       "      \n",
       "      \n",
       "      \n",
       "      \n",
       "    \n",
       "  \n",
       "
  R&D Spend Administration Marketing Spend Profit
count 50.000000 50.00000 50.000000 50.000000
mean 95405.964000 157588.45116 271597.767660 146350.607120
std 59925.318556 38608.90335 157969.099247 56580.157734
min 0.000000 62091.78000 0.000000 16149.540000
25% 52284.300250 128550.68550 160745.698500 112362.504000
50% 93677.806000 167104.70400 275226.401500 144088.595000
75% 128704.215500 178316.72850 391928.623500 181438.118500
max 213161.010000 219174.67200 549252.630000 288392.745000
\n",
       "

"
      ],
      "text/plain": [
       "           R&D Spend  Administration  Marketing Spend         Profit\n",
       "count      50.000000        50.00000        50.000000      50.000000\n",
       "mean    95405.964000    157588.45116    271597.767660  146350.607120\n",
       "std     59925.318556     38608.90335    157969.099247   56580.157734\n",
       "min         0.000000     62091.78000         0.000000   16149.540000\n",
       "25%     52284.300250    128550.68550    160745.698500  112362.504000\n",
       "50%     93677.806000    167104.70400    275226.401500  144088.595000\n",
       "75%    128704.215500    178316.72850    391928.623500  181438.118500\n",
       "max    213161.010000    219174.67200    549252.630000  288392.745000"
      ]
     },
     "execution_count": 83,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "print(df.shape)\n",
    "df.describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Splitting the data into Regressor(independent variables) and Target(dependent variable)\n",
    "Regressor = df.iloc[:, :-1].values\n",
    "Target = df.iloc[:, 4].values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Looking at the Dataset we can see that \"State\" is a String type variable, thus we cannot feed String type variables\n",
    "#into our Machine Learning model.To overcome this problem we use the Label Encoder object and create Dummy Variables\n",
    "#using the OneHotEncoder object. So we import ColumnTransformer along with OneHotEncoder\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.preprocessing import OneHotEncoder\n",
    "from sklearn.compose import ColumnTransformer\n",
    "\n",
    "ct =ColumnTransformer(\n",
    "    [('one_hot_encodder',OneHotEncoder(),[3])],\n",
    "    remainder='passthrough'\n",
    "    )\n",
    "Regressor = np.array(ct.fit_transform(Regressor),dtype = np.float)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 72,
   "metadata": {},
   "outputs": [],
   "source": [
    "Regressor=Regressor[:, 1:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 73,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Splitting the dataset into the Training and Test dataset\n",
    "from sklearn.model_selection import train_test_split\n",
    "Regressor_train, Regressor_test, Target_train, Target_test = train_test_split( Regressor, Target, test_size=0.25, random_state=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 74,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)"
      ]
     },
     "execution_count": 74,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#Fitting Multiple Linear Regression to the Training Set\n",
    "from sklearn.linear_model import LinearRegression\n",
    "Model = LinearRegression()\n",
    "Model.fit(Regressor_train, Target_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 75,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Predicted vector of the test set\n",
    "Target_pred = First_eq.predict(Regressor_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 76,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([151477.04933493, 148373.55906895, 145612.04691522,  73551.78938889,\n",
       "       204859.29909382, 168287.41891873,  71694.86703419, 146474.04078268,\n",
       "       146705.22744893, 199856.05396648, 110629.61038141, 104083.82566741,\n",
       "       141706.54794344])"
      ]
     },
     "execution_count": 76,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "Target_pred"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 86,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.35268545008708907"
      ]
     },
     "execution_count": 86,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#R-square\n",
    "from sklearn.metrics import r2_score\n",
    "r2_score(Target_test, Target_pred)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Now we will be looking at the measure errors with different metrics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 78,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1229498158.0398107"
      ]
     },
     "execution_count": 78,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#mean squared error\n",
    "from sklearn.metrics import mean_squared_error\n",
    "mean_squared_error(Target_test,Target_pred)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 79,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "35064.20051904521"
      ]
     },
     "execution_count": 79,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#RMSE\n",
    "from math import sqrt\n",
    "rms = sqrt(mean_squared_error(Target_test,Target_pred))\n",
    "rms"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 80,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "28760.20722757087"
      ]
     },
     "execution_count": 80,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#MAE\n",
    "from sklearn.metrics import mean_absolute_error\n",
    "mean_absolute_error(Target_test,Target_pred)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#MinMax scaler"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 81,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[1.        , 0.        , 0.45750791, 1.02540534, 0.30832024],\n",
       "       [0.        , 0.        , 0.59769212, 0.43354417, 0.55862988],\n",
       "       [1.        , 0.        , 0.55463896, 0.67758139, 0.46987268],\n",
       "       [1.        , 0.        , 0.16560101, 0.3688421 , 0.36788892],\n",
       "       [1.        , 0.        , 0.98690188, 0.45301242, 0.98850922],\n",
       "       [0.        , 1.        , 0.46378014, 0.67975351, 0.85583695],\n",
       "       [0.        , 1.        , 0.14012089, 0.11127112, 0.4834689 ],\n",
       "       [0.        , 1.        , 0.36296832, 0.99020756, 0.19732699],\n",
       "       [1.        , 0.        , 0.47591665, 0.63662866, 0.7350049 ],\n",
       "       [1.        , 0.        , 1.05461885, 0.25092311, 1.02380923],\n",
       "       [1.        , 0.        , 0.38438058, 0.40196356, 0.56011227],\n",
       "       [0.        , 1.        , 0.25042093, 0.42743615, 0.42139433],\n",
       "       [1.        , 0.        , 0.44722951, 0.91192415, 0.29984388]])"
      ]
     },
     "execution_count": 81,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn import preprocessing\n",
    "mm_scaler = preprocessing.MinMaxScaler()\n",
    "Regressor_train_minmax = mm_scaler.fit_transform(Regressor_train)\n",
    "mm_scaler.transform(Regressor_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Ridge and Lasso"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Ridge regression is an extension of linear regression where the loss function is modified to minimize the complexity of \n",
    "the model. This modification is done by adding a penalty parameter that is equivalent to the square of the magnitude of \n",
    "the coefficients. The loss function for Ridge regression is given by:\n",
    "Loss function = OLS + alpha * summation (squared coefficient values)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 82,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.linear_model import Ridge\n",
    "from sklearn.linear_model import Lasso"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 88,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "25700.802829932003\n",
      "0.813462230059857\n",
      "35061.01135727592\n",
      "0.3528031939343419\n"
     ]
    }
   ],
   "source": [
    "rr = Ridge(alpha=0.01)\n",
    "rr.fit(Regressor_train, Target_train) \n",
    "pred_train_rr= rr.predict(Regressor_train)\n",
    "print(np.sqrt(mean_squared_error(Target_train,pred_train_rr)))\n",
    "print(r2_score(Target_train, pred_train_rr))\n",
    "\n",
    "pred_test_rr= rr.predict(Regressor_test)\n",
    "print(np.sqrt(mean_squared_error(Target_test,pred_test_rr))) \n",
    "print(r2_score(Target_test, pred_test_rr))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Lasso regression, or the Least Absolute Shrinkage and Selection Operator, is also a modification of linear regression.\n",
    "In Lasso, the loss function is modified to minimize the complexity of the model by limiting the sum of the absolute values\n",
    "of the model coefficients (also called the l1-norm). The loss function for Lasso Regression is given by:\n",
    "Loss function = OLS + alpha * summation (absolute values of the magnitude of the coefficients)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 90,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "25700.801702551787\n",
      "0.8134622464250256\n",
      "35064.18862097854\n",
      "0.35268588938359413\n"
     ]
    }
   ],
   "source": [
    "model_lasso = Lasso(alpha=0.01)\n",
    "model_lasso.fit(Regressor_train, Target_train) \n",
    "pred_train_lasso= model_lasso.predict(Regressor_train)\n",
    "print(np.sqrt(mean_squared_error(Target_train,pred_train_lasso)))\n",
    "print(r2_score(Target_train, pred_train_lasso))\n",
    "\n",
    "pred_test_lasso= model_lasso.predict(Regressor_test)\n",
    "print(np.sqrt(mean_squared_error(Target_test,pred_test_lasso))) \n",
    "print(r2_score(Target_test, pred_test_lasso))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Conclusion:\n",
    "Linear Regression Model: Test set RMSE of 35064.2005 thousand and R-square of 35.27 percent.\n",
    "\n",
    "Ridge Regression Model: Test set RMSE of 35061.0113 thousand and R-square of 35.28 percent.\n",
    "\n",
    "Lasso Regression Model: Test set RMSE of 35064.1886 thousand and R-square of 35.27 percent.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
 

 

Share this post

assignment helpassignment helperassignment expertsassignment writing services