# -*- coding: utf-8 -*
"""
Created on Mon Mar 28 11:05:44 2022

@author: harsh
"""
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib as plt


"""Q1"""

#"""Student 1:"""

PROBLEM1 = pd.read_excel(r"C:\Users\harsh\Downloads\Problem 1.xlsx")
PROBLEM1

#a.) I stay in Mumbai. What if there are only a few jobs available in Mumbai?
# for resolving the student's issue, we first filter out the job availability in Mumbai

mumbai_jobs = PROBLEM1[PROBLEM1["Location"].astype("str").str.contains("Mumbai")]

#Ans - '''There are 3731 jobs in Mumbai'''

#b.) What if I wish to settle in other metro cities moving further and 
#how many companies do I have to select from?


#We are assuming the metro cities to be the following-
#Mumbai, Bengaluru, Kolkata, Delhi, Chennai, Gurgaon, Pune

metro_cities = ["Bengaluru", "Kolkata", "Delhi", "Chennai", "Gurgaon", "Pune"]

#by using loops
metro_jobs = 0
for i in metro_cities:
    metro_jobs = metro_jobs + PROBLEM1[PROBLEM1["Location"].astype("str").str.contains(i)].shape[0]
metro_jobs
#Ans - The student can look out for 13805 companies.


#"""Student 2"""

#a.) Can you tell me that the majority openings require how many years of experience? 
#Do we have high intake for freshers? 

sb.displot(PROBLEM1["Experience"], kind = "hist", color = "cyan")        
sb.histplot(PROBLEM1["Experience"], kde = True, color = "cyan")

#since there are high number of unique values, the x axis is not distinguishable

#now we plot top 10 ranges with highest occurence
major_openings = pd.value_counts(PROBLEM1.Experience).reset_index()

#renaming columns
major_openings.columns = ["Experience", "Frequency"]

#therby we locate top 10 rows
top10 = major_openings.iloc[0:10,:]

import matplotlib.pyplot as plt
plt.bar(top10.Experience, top10.Frequency, color = "grey")

#The bar chart shows the years of experience that the majority openings require

# sorting the majority openings for jobs open for freshers:
major_openings = major_openings.sort_values(by = 'Experience')
fresher_openings = major_openings.iloc[0:6,:]
# total fresher openings:
total_fresheropenings = sum(fresher_openings.Frequency)
total_fresheropenings
#Ans -  Total fresher openings are 1653       


"""Q2"""

PROBLEM2 = pd.read_excel(r"C:\Users\harsh\Downloads\Problem 2.xlsx")
PROBLEM2

# Fitting regression models

import numpy as np
import statsmodels.api as sm
import pandas as pd
import statsmodels.api as smf


X1 = np.array(PROBLEM2["X1"]).reshape(-1, 1)
X1 = sm.add_constant(X1)
X2 = np.array(PROBLEM2["X2"]).reshape(-1, 1)
X2 = sm.add_constant(X2)
X3 = np.array(PROBLEM2["X3"]).reshape(-1, 1)
X3 = sm.add_constant(X3)
X4 = np.array(PROBLEM2["X4"]).reshape(-1, 1)
X4 = sm.add_constant(X4)

Y1 = np.array(PROBLEM2["Y1"]).reshape(-1, 1)
Y2 = np.array(PROBLEM2["Y2"]).reshape(-1, 1)
Y3 = np.array(PROBLEM2["Y3"]).reshape(-1, 1)
Y4 = np.array(PROBLEM2["Y4"]).reshape(-1, 1)

model1 = smf.OLS(Y1, X1)
print(model1.fit().summary())
# here Durbin-Watson Test indicates that there is negative autocorrelation

model2 = smf.OLS(Y2, X2)
print(model2.fit().summary())
# here Durbin-Watson Test indicates that there is negative autocorrelation

model3 = smf.OLS(Y3, X3)
print(model3.fit().summary())
# here Durbin-Watson Test indicates that there is negative autocorrelation

model4 = smf.OLS(Y4, X4)
print(model4.fit().summary())
# here Durbin-Watson Test indicates that there is positive autocorrelation
# and Jarque-Bera test statistic of 0.524, indicates that errors are normally distributed 

##All 4 models have similar AIC and R squared values, 
##and the first three models have similar goodness of fit, relating to negative correlation
##fourth model statisfies all the test statistics and goodness of fit tests and seems to be the best model.