## Jash Poonatar
## R Assignment-1
## Roll no:- 25
## Load the marks data set from a CSV file picked through the interactive
## file-chooser dialog.
marks_csv_path <- file.choose()
df_marks <- read.csv(marks_csv_path)

## Exploratory data analysis: first rows, dimensions (rows, columns) and the
## structure/type of every column.
head(df_marks)
dim(df_marks)
str(df_marks)

## The final score is a numeric variable, so we model it with linear
## regression and fall back to a regression tree if the linear model predicts
## poorly.
## The grade is a categorical variable with 4 levels, so we model it with a
## classification tree.

## Check for missing values: first the total count over the whole data frame,
## then the count per column.
sum(is.na(df_marks))
## The callback must test its own column argument; testing the whole data
## frame inside it would make every column report the overall total.
vapply(df_marks, function(column) sum(is.na(column)), integer(1))
## Reported result: the data set does not have any missing values, so we can
## proceed with the investigation.

## Outlier check: count the points that boxplot() flags as outliers in each
## column. Columns 6 and 7 are excluded (presumably the non-numeric columns
## such as Grade -- confirm against the column order of the CSV). boxplot()
## also draws one plot per column as a side effect.
marks_outliers <- vapply(
  df_marks[, -c(6, 7)],
  function(column) length(boxplot(column)$out),
  integer(1)
)
marks_outliers
## Reported result: 14 outliers in the Exam3 column, which need to be
## replaced, and 1 outlier in the final score column, which can be ignored.

## Replacing outliers: the Exam3 outliers are computed once and reused for
## both the printout and the replacement mask.
exam3_outliers <- boxplot(df_marks$Exam3)$out
exam3_outliers
is_exam3_outlier <- df_marks$Exam3 %in% exam3_outliers
## NOTE(review): the mean is evaluated before the replacement, so the outliers
## themselves still contribute to it -- confirm this is intended.
df_marks$Exam3[is_exam3_outlier] <- mean(df_marks$Exam3, na.rm = TRUE)
## Redraw the boxplot to inspect the column after the replacement.
boxplot(df_marks$Exam3)


## Train-test split: draw 80% of the row indices at random (fixed seed so the
## split is reproducible) for training; the remaining rows are for testing.
set.seed(100)
n_rows <- nrow(df_marks)
s <- sample(n_rows, size = 0.80 * n_rows)
s

## Rows selected by the sampled indices form the training set.
df_training <- df_marks[s, ]
df_training

## Every other row goes to the testing set.
df_testing <- df_marks[-s, ]
df_testing

## Size of each partition.
nrow(df_training)
nrow(df_testing)

## 1) For the prediction of Final Score:
## a) Linear Regression:

## model1: full model, every other column is a predictor.
model1 <- lm(Final_score ~ ., data = df_training)
summary(model1)
## Reported result: the R-squared is 89%, which is fairly good.
## Exam 1 and the grades B, C and D have p-values less than 0.05 and are thus
## significant variables. However, since the final score does not intuitively
## depend on the grade, only the exam scores are used as predictors below.

## Depending on how the CSV was read, the first exam column is named either
## "Exam1" or a byte-order-mark-mangled variant ending in "..Exam1". Look the
## name up in the data so the formulas work in both cases (other parts of this
## script refer to the column as plain Exam1).
exam1_col <- grep("Exam1$", names(df_training), value = TRUE)[1]
stopifnot(!is.na(exam1_col))
## Backticks keep the term valid even if the name is not syntactic.
exam1_term <- sprintf("`%s`", exam1_col)

## model2: the four exam scores as predictors.
model2_formula <- reformulate(
  c(exam1_term, "Exam2", "Exam3", "Exam4"),
  response = "Final_score"
)
model2 <- lm(model2_formula, data = df_training)
summary(model2)
## Reported result: the R-squared is now just 53%, which is poor.
## The p-values of Exam1, 2 and 3 are less than 0.05, making them significant
## parameters. The F-statistic however is just 22.84.

AIC(model2)
## Reported result: the AIC of the model is 687.

## model3: Exam4 dropped, leaving the three significant exam scores.
model3_formula <- reformulate(
  c(exam1_term, "Exam2", "Exam3"),
  response = "Final_score"
)
model3 <- lm(model3_formula, data = df_training)
summary(model3)
## Reported result: the R-squared is again just 53%, which is poor.
## The p-values of Exam1, 2 and 3 are all less than 0.05, making them
## significant parameters. The F-statistic has increased to 30.11, which the
## author reads as model3 being better than model2.
## NOTE(review): a larger F-statistic after dropping a predictor is not by
## itself evidence of a better fit; the AIC comparison is the more direct one.

AIC(model3)
## Reported result: the AIC of the model is now 686.

## Checking the assumption of linearity with pairwise plots of all columns:
plot(df_marks)
## Reported result: the Final Score has a linear relationship with Exam1,
## Exam2, Exam3 and very slightly with Exam4, which is in line with our linear
## model.

## Checking for the absence of multicollinearity (variance inflation factors).
## vif() is namespace-qualified because no library() call is visible in this
## script; it is presumably the car implementation -- confirm.
car::vif(model3)
## Reported result: the VIF of all the variables is much less than 5, so it is
## safe to assume that there is no multicollinearity present in the model.

## Checking the assumption of normally distributed residuals (Shapiro-Wilk):
shapiro.test(model3$residuals)
## Reported result: since the p-value is less than 0.05 (0.0006), we reject H0
## and conclude that the residuals do not follow a normal distribution.

plot(density(model3$residuals))
## Reported result: the residual density is only very slightly skewed (almost
## a normal distribution).

## Checking the assumption of homoscedasticity (Breusch-Pagan test).
## bptest() is namespace-qualified for the same reason as vif() above;
## presumably the lmtest implementation -- confirm.
lmtest::bptest(model3)
## Reported result: since the p-value is far below 0.05 (1.339e-05), we reject
## H0 and conclude that the residuals (error terms) are not homoscedastic.

## Printing diagnostic plots for model3 in a 2 x 2 grid.
## Close the current graphics device, if one is open, to start from a clean
## layout; calling dev.off() with only the null device open is an error.
if (dev.cur() > 1) {
  dev.off()
}
## par() returns the previous settings; they are restored afterwards so that
## later plots are not squeezed into one cell of the 2 x 2 layout.
old_par <- par(mfrow = c(2, 2))
plot(model3)
par(old_par)
## Reported result: the residuals vs fitted plot follows a slight pattern with
## a shallow U-curve; the author still considers a linear regression usable.
## NOTE(review): that plot speaks to linearity rather than to normality.
## Reported result: the Q-Q plot follows a straight line but deviates slightly
## towards the upper and lower tails, so the residuals are almost normally
## distributed except in the tails.
## Reported result: the Scale-Location plot has a skewed red line, so the
## residuals are not homoscedastic.

## Predict Final_score for the testing rows with model3.
p <- predict(model3, newdata = df_testing)
p
## Squared correlation between the actual and the predicted scores.
actual_vs_predicted <- cor(df_testing$Final_score, p)
actual_vs_predicted^2
## Reported result: the prediction shows an accuracy of 59%.
## Since linear regression gives a low predictive accuracy, a regression tree
## is tried next.

## b) Creating a decision (regression) tree for Final_score on the first three
## exam scores.
## The first exam column is named either "Exam1" or a byte-order-mark-mangled
## variant ending in "..Exam1", depending on how the CSV was read; look the
## name up so the formula matches the data in both cases.
tree_exam1_col <- grep("Exam1$", names(df_training), value = TRUE)[1]
stopifnot(!is.na(tree_exam1_col))
dtree_formula <- reformulate(
  c(sprintf("`%s`", tree_exam1_col), "Exam2", "Exam3"),
  response = "Final_score"
)
## rpart() and fancyRpartPlot() are namespace-qualified because no library()
## call is visible in this script (fancyRpartPlot presumably from rattle --
## confirm).
dtree <- rpart::rpart(dtree_formula, data = df_training, method = "anova")
## Author's rationale: the data set is considered large enough that the tree
## hyperparameters are left at their defaults.
rattle::fancyRpartPlot(dtree)

## Predict Final_score for the testing rows with the regression tree.
p <- predict(dtree, newdata = df_testing)
p

## R-squared, taken as the squared correlation of actual vs predicted scores.
r_squared <- cor(df_testing$Final_score, p)^2
r_squared
## Reported result: the accuracy is now 69%, so applying a regression tree
## instead of linear regression is the right choice.

## Actual and predicted scores side by side.
df_predict <- data.frame("actual values" = df_testing$Final_score, "predicted values" = p)
df_predict

## Min-max accuracy: mean over the rows of (smaller value / larger value).
row_min <- pmin(df_predict[[1]], df_predict[[2]])
row_max <- pmax(df_predict[[1]], df_predict[[2]])
min_max_accuracy <- mean(row_min / row_max)
min_max_accuracy
## Reported result: an accuracy of 89%, which is a good value.


## 2) For the prediction of Grade:
## a) Creating a decision (classification) tree with every other column as a
## predictor. The rpart and plotting functions are namespace-qualified because
## no library() call is visible in this script (fancyRpartPlot presumably from
## rattle -- confirm).
dtree1 <- rpart::rpart(Grade ~ ., data = df_training, method = "class")
rattle::fancyRpartPlot(dtree1)
## Reported result: the tree splits only on the final score although the full
## model was supplied.
## NOTE(review): this suggests Grade is derived directly from Final_score, in
## which case the accuracy reflects that rule rather than predictive skill --
## confirm whether Final_score should be excluded from the predictors.

## Getting the complexity parameter table and its cross-validation plot:
rpart::printcp(dtree1)
rpart::plotcp(dtree1)



## Predict the grade class for the testing rows with dtree1.
p1 <- predict(dtree1, newdata = df_testing, type = "class")
p1

## Confusion matrix (rows = actual grade, columns = predicted grade).
## Both vectors are put on one shared set of factor levels so the table is
## square and its diagonal holds exactly the correct predictions, even when
## Grade is a character column or a grade is absent from the testing rows.
grade_levels <- union(levels(p1), as.character(df_testing$Grade))
actual_grade <- factor(df_testing$Grade, levels = grade_levels)
predicted_grade <- factor(p1, levels = grade_levels)
t <- table(actual_grade, predicted_grade)
t
## Accuracy: share of testing rows on the diagonal of the confusion matrix.
accuracy <- sum(diag(t)) / sum(t)
accuracy
## Reported result: the accuracy is 95%, which is a very good value.