library(MASS)

# Load the Cars93 data set from the MASS package and show its first rows.
data("Cars93", package = "MASS")
head(Cars93, n = 6L)

# Open the full table in the interactive data viewer.
View(Cars93)

# Descriptive statistics for a single numeric vector.
#
# Missing values are counted and then excluded from every other statistic.
#
# Args:
#   x: a numeric vector, possibly containing NA.
#
# Returns:
#   A named numeric vector holding the non-missing count (n), the missing
#   count (nmiss), an outlier flag (1 when the maximum lies above
#   mean + 3 * sd or the minimum lies below mean - 3 * sd, otherwise 0),
#   the mean, standard deviation, minimum, selected quantiles, maximum and
#   the two 3-sigma limits (UC, LC). Because quantile() names its own
#   result, the quantile entries carry combined names such as "p1.1%" and
#   "q2.50%".
mystats <- function(x) {
  missing_mask <- is.na(x)
  observed <- x[!missing_mask]

  centre <- mean(observed)
  spread <- sd(observed)
  lowest <- min(observed)
  highest <- max(observed)

  # 3-sigma control limits around the mean.
  upper_limit <- centre + 3 * spread
  lower_limit <- centre - 3 * spread

  c(
    n = length(observed),
    nmiss = sum(missing_mask),
    # Logical value; c() coerces it to 0/1 alongside the numeric entries.
    outlier_flag = highest > upper_limit | lowest < lower_limit,
    mean = centre,
    stdev = spread,
    min = lowest,
    p1 = quantile(observed, 0.01),
    p5 = quantile(observed, 0.05),
    p10 = quantile(observed, 0.10),
    q1 = quantile(observed, 0.25),
    q2 = quantile(observed, 0.5),
    q3 = quantile(observed, 0.75),
    p90 = quantile(observed, 0.90),
    p95 = quantile(observed, 0.95),
    p99 = quantile(observed, 0.99),
    max = highest,
    UC = upper_limit,
    LC = lower_limit
  )
}

# Print the structure of the data frame (column types and sample values).
str(Cars93)

# Add the natural log of highway mileage as a new column.
Cars93[["ln_mpg"]] <- log(Cars93[["MPG.highway"]])

# Numeric columns to profile.
vars <- c(
  "Min.Price", "Price", "Max.Price", "MPG.city", "MPG.highway",
  "EngineSize", "Horsepower", "RPM", "Rev.per.mile", "Fuel.tank.capacity",
  "Passengers", "Length", "Wheelbase", "Width", "Turn.circle",
  "Rear.seat.room", "Luggage.room", "Weight"
)

# Run mystats() on every selected column and stack the results into a matrix
# with one row per variable and one column per statistic.
diag_stats <- do.call(rbind, lapply(Cars93[vars], mystats))
View(diag_stats)


# Outlier capping: any value above a column's ceiling is replaced by the
# ceiling itself. The ceilings are hard-coded; presumably they were read off
# the diagnostic statistics table -- TODO confirm.
Cars93$Min.Price <- pmin(Cars93$Min.Price, 43.928)
Cars93$Price <- pmin(Cars93$Price, 49.20)
Cars93$Max.Price <- pmin(Cars93$Max.Price, 52.768)
Cars93$MPG.city <- pmin(Cars93$MPG.city, 42.320)
Cars93$MPG.highway <- pmin(Cars93$MPG.highway, 46.320)
Cars93$Fuel.tank.capacity <- pmin(Cars93$Fuel.tank.capacity, 23.32)

# Correlation matrix of the profiled numeric columns.
#
# cor() defaults to use = "everything", under which a single missing value in
# a column turns every coefficient involving that column into NA. Cars93 has
# missing values in some of these columns (e.g. Luggage.room), so each pair is
# computed from the rows where both variables are observed instead.
corrm <- cor(Cars93[, vars], use = "pairwise.complete.obs")
View(corrm)

# Save the matrix (row names included) to corr.csv in the working directory.
write.csv(corrm, file = "corr.csv")

# Variable reduction: stepwise AIC search (both directions) over the candidate
# predictors of highway mileage.
#
# MPG.highway is the response, so it is not listed among the predictors; when
# it appears on both sides lm() drops it from the right-hand side with
# warnings.
reduction_formula <- MPG.highway ~ Min.Price + Price + Max.Price + MPG.city +
  EngineSize + Horsepower + RPM + Rev.per.mile + Fuel.tank.capacity +
  Passengers + Length + Wheelbase + Width + Turn.circle + Rear.seat.room +
  Luggage.room + Weight + Type

# step() has to refit every candidate model on the same rows. With the default
# na.action, dropping a predictor that contains missing values changes the
# rows in use and step() stops with "number of rows in use has changed".
# Restricting the data to complete cases of the model variables up front keeps
# the row set fixed throughout the search.
reduction_data <- na.omit(Cars93[all.vars(reduction_formula)])

fitt <- step(lm(reduction_formula, data = reduction_data), direction = "both")

# Sequential ANOVA table of the selected model. The table is printed as is:
# summary() on it would only report quantiles of the table's own columns.
fitt_an <- anova(fitt)
fitt_an

# Fix the RNG seed so the random split below is reproducible.
set.seed(123)

# Split the rows into a training set (70%, rounded down) and a testing set
# (the remaining rows).
train_ind <- sample(seq_len(nrow(Cars93)), size = floor(0.70 * nrow(Cars93)))

training <- Cars93[train_ind, ]
testing <- Cars93[-train_ind, ]

# Fit the linear model for highway mileage on the training rows only.
fit <- lm(
  MPG.highway ~ Price + Max.Price + MPG.city + Rev.per.mile + Passengers +
    Length + Wheelbase + Luggage.room + Weight,
  data = training
)

# Coefficient table and overall fit statistics.
summary(fit)


# Stepwise AIC selection (both directions) starting from the training model.
# library() is used instead of require(): require() merely returns FALSE with
# a warning when the package is missing, so the failure would only surface
# later as "could not find function stepAIC".
library(MASS)
step1 <- stepAIC(fit, direction = "both")

# Names of the components stored in the selected model object.
ls(step1)

# Record of the terms added/removed at each step of the search.
step1$anova

# Final model on the training data: same response as before, with the
# predictor set reduced to the terms listed here.
fit2 <- lm(
  MPG.highway ~ Price + Max.Price + MPG.city + Passengers + Length +
    Wheelbase + Luggage.room + Weight,
  data = training
)

summary(fit2)


# Multicollinearity check: variance inflation factor of each predictor.
library(car)
vif(fit2)

