#install.packages("forecast")
#install.packages('tidyverse')
library(forecast)
library(tseries)
library(dplyr)
# ---- Load and aggregate the raw data ----
# Read the COVID-19 CSV from its fixed local path.
data <- read.csv(
  "D:/RICHA/Semester 6/SRM 4/Project/covid_19_india.csv",
  header = TRUE
)

head(data)
str(data)

# Convert the Date column from character to Date (day-month-year expected).
data$Date <- as.Date(data$Date, format = "%d-%m-%Y")

# as.Date() yields NA instead of failing when a value does not match the
# format, and aggregate() below drops rows whose grouping value is NA, so
# surface the problem here rather than silently losing observations.
if (anyNA(data$Date)) {
  warning(
    sum(is.na(data$Date)),
    " row(s) have a missing Date or one that does not match '%d-%m-%Y'; ",
    "they will be dropped by the aggregation.",
    call. = FALSE
  )
}

# Extract the two columns the analysis needs.
Date <- data$Date
Cases <- data$Confirmed
covid_df <- data.frame(Date, Cases)
head(covid_df)

# Sum the confirmed cases of all rows that share the same date.
# Naming the grouping list gives the output column its final name ("date")
# directly; the summed column is still called "x" and is renamed below.
covid_df_agg <- aggregate(
  covid_df$Cases,
  by = list(date = covid_df$Date),
  FUN = sum
)
names(covid_df_agg)[names(covid_df_agg) == "x"] <- "conf_cases"

# Sanity check: 0 means every date appears exactly once after aggregation.
anyDuplicated(covid_df_agg$date)

colnames(covid_df_agg)
head(covid_df_agg)


# ---- Build the time series and inspect stationarity ----
# Frequency is 365 because the data is daily. The start is derived from the
# first observed date as c(year, day-of-year) so the time axis lines up with
# the data instead of always beginning on 1 January 2020.
first_obs_date <- min(covid_df_agg$date)
cases_ts <- ts(
  covid_df_agg$conf_cases,
  start = c(
    as.integer(format(first_obs_date, "%Y")),
    as.integer(format(first_obs_date, "%j"))
  ),
  frequency = 365
)

# Plot the series to see the trend.
autoplot(cases_ts)
#install.packages("TSstudio")
library(TSstudio)
# NOTE: ts.plot() comes from the stats package; TSstudio's own plotting
# function is ts_plot().
ts.plot(cases_ts)
# There is a clear trend in the data, so the series is not stationary.

acf(cases_ts)
# High autocorrelation across lags indicates that the data is not stationary.
pacf(cases_ts)
# This seems fine, as not many spikes cross the blue dotted line.

# Log-transform before modelling to dampen the trend.
cases_ts_log <- log(cases_ts)

acf(cases_ts_log)
# High autocorrelation remains, so the logged series is still not stationary.
pacf(cases_ts_log)
# This seems fine, as not many spikes cross the blue dotted line.

# Second-order differencing of the logged series. The original call used
# `difference = 5`, which partially matched `differences` and took fifth
# differences, contradicting the `_d2` name and the d = 2 picked by
# auto.arima(); the argument is now spelled out and set to 2.
cases_ts_log_d2 <- diff(cases_ts_log, differences = 2)
acf(cases_ts_log_d2)

# Let auto.arima() search for the model with the lowest AIC on the logged
# series; trace = TRUE lists every candidate model it tries.
covidmodel <- auto.arima(
  cases_ts_log,
  trace = TRUE,
  ic = "aic"
)
# Result recorded from an earlier run: ARIMA(4,2,1) with AIC of -1296.698.
covidmodel

# Correlograms of the model residuals.
acf(ts(covidmodel$residuals))
pacf(ts(covidmodel$residuals))

# ---- Forecast and build the forecast table ----
# Number of days to forecast ahead.
forecast_horizon <- 30

# Forecast on the log scale with a 95% prediction interval.
covid_forecast <- forecast(covidmodel, level = c(95), h = forecast_horizon)
str(covid_forecast)
# A forecast object is a list, so names() (not colnames()) lists its parts.
names(covid_forecast)
head(covid_forecast$mean)
plot(covid_forecast)
as.numeric(covid_forecast$mean)

# Back-transform the point forecasts from the log scale to case counts.
forecasted_cases <- exp(covid_forecast$mean)

# Forecast dates follow directly after the last observed date. They are kept
# as Date objects so they match the class of covid_df_agg$date; the earlier
# `format = "%d-%m-%Y"` was passed to seq() rather than format() and had no
# effect, and the hard-coded date range only fit one particular data file.
last_obs_date <- max(covid_df_agg$date)
forecasted_dates <- seq(
  last_obs_date + 1,
  by = "day",
  length.out = length(forecasted_cases)
)
forecasted_dates

# Use the same column names as covid_df_agg so the two tables can be stacked;
# as.numeric() drops the ts class from the forecast vector.
forecasted_df <- data.frame(
  date = forecasted_dates,
  conf_cases = as.numeric(forecasted_cases)
)
head(forecasted_df)

# ---- Ljung-Box tests on the model residuals ----
# For residuals of a fitted ARMA model the Box.test() documentation recommends
# subtracting the number of estimated AR/MA coefficients via `fitdf`, which is
# only valid when lag > fitdf. `arma[1:4]` holds the counts of AR, MA,
# seasonal AR and seasonal MA coefficients.
resid_fitdf <- sum(covidmodel$arma[1:4])

for (test_lag in c(5, 15)) {
  # Fall back to the unadjusted test when the lag is too short for fitdf.
  lag_fitdf <- if (test_lag > resid_fitdf) resid_fitdf else 0
  print(
    Box.test(
      covidmodel$residuals,
      lag = test_lag,
      type = "Ljung-Box",
      fitdf = lag_fitdf
    )
  )
}
# If the p-value is greater than 0.05, there is no evidence of autocorrelation
# in the residuals and the model is considered a good fit.

# Stack the observed daily totals on top of the forecast rows and show the
# combined table.
forecasted_covid_df_agg <- rbind(
  covid_df_agg,
  forecasted_df
)
forecasted_covid_df_agg

# Export the combined table to an Excel workbook.
#install.packages("writexl")
library(writexl)
write_xlsx(
  x = forecasted_covid_df_agg,
  path = "D:/RICHA/Semester 6/SRM 4/Project/covid_cases_forecast1.xlsx"
)
