# -*- coding: utf-8 -*-
"""
Created on Wed Mar 16 16:16:59 2022

@author: Kshitij
"""

import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib as plt

Problem =pd.read_excel ( r"C:\Users\Kshitij\Desktop\Kshitij\IAQS\sem 4\python\Problem 1.xlsx")

"""Student 1:"""

#a.) I stay in Mumbai. What if there are only a few jobs available in Mumbai?
# for resolving the student's issue, we first filter out the job availability
# in Mumbai

mumbai_jobs = Problem[Problem["Location"].astype("str").str.contains("Mumbai")]

'''There are 3731 jobs in Mumbai'''

#b.) What if I wish to settle in other metro cities moving further and how many companies do
#    I have to select from?


#We are assuming the metro cities to be the following-
#Mumbai, Bengaluru, Kolkata, Delhi, Chennai, Gurgaon, Pune

metros = ["Bengaluru", "Kolkata", "Delhi", "Chennai", "Gurgaon", "Pune"]

metro_jobs = 0
for i in metros:
    metro_jobs = metro_jobs + Problem[Problem["Location"].astype("str").str.contains(i)].shape[0]
metro_jobs

list = ["Bengaluru", "Kolkata", "Delhi", "Chennai", "Gurgaon", "Pune"]
bengaluru_jobs = 0
kolkata_jobs = 0
delhi_jobs = 0
chennai_jobs = 0
gurgaon_jobs = 0
pune_jobs = 0
for i in list:
    if (i == "Bengaluru"): 
        bengaluru_jobs = bengaluru_jobs + Problem[Problem["Location"].astype("str").str.contains(i)].shape[0]
    if (i == "Kolkata"): 
        kolkata_jobs = kolkata_jobs + Problem[Problem["Location"].astype("str").str.contains(i)].shape[0]
    if (i == "Delhi"): 
        delhi_jobs = delhi_jobs + Problem[Problem["Location"].astype("str").str.contains(i)].shape[0]
    if (i == "Chennai"): 
        chennai_jobs = chennai_jobs + Problem[Problem["Location"].astype("str").str.contains(i)].shape[0]
    if (i == "Gurgaon"): 
        gurgaon_jobs = gurgaon_jobs + Problem[Problem["Location"].astype("str").str.contains(i)].shape[0]
    if (i == "Pune"): 
        pune_jobs = pune_jobs + Problem[Problem["Location"].astype("str").str.contains(i)].shape[0]

'''
bengaluru_jobs 
5529

kolkata_jobs 
373

delhi_jobs 
1834

chennai_jobs 
1518

gurgaon_jobs 
2770

pune_jobs
1781

metro_jobs
13805
'''        
#Student 2

#a.) Can you tell me that the majority openings require how many years of experience? Do
#    we have high intake for freshers?

sb.displot(Problem["Experience"], kind = "hist")        
sb.histplot(Problem["Experience"], kde = True)

#Due to a high number of unique values, the x axis is not distinguishable

openings = pd.value_counts(Problem.Experience).reset_index()
openings.columns = ["Experience", "Frequency"]
top5 = openings.iloc[0:5,:]

import matplotlib.pyplot as plt
plt.bar(top5.Experience, top5.Frequency, color = "yellow")

#The bar chart shows the years of experience that the majority openings require#

"""
  Experience  Frequency
0   5-10 yrs       1274
1    2-5 yrs       1188
2    3-8 yrs        922
3    2-7 yrs        832
4    4-9 yrs        678
"""
#Manually modifying the dataframe in variable explorer
#to sort ascending values of Experience

fresher_jobs = openings.iloc[0:6, :]

"""
   Experience  Frequency
38    0-0 yrs        151
26    0-1 yrs        296
21    0-2 yrs        386
28    0-3 yrs        269
39    0-4 yrs        130
"""

total_fresher_jobs = sum(fresher_jobs.Frequency)

"""There are 1653 total fresher jobs"""


#Q2The data contains 4 Independent (Xi) and 4 Dependent variables (Yi)
#Fit a Simple Linear Regression Model in all the 4 cases and note down your observations
#w.r.t the Model(s) built.
#Do you see any discrepancies? If yes, comment and draw final conclusion and measures.

q2 = pd.read_excel(r"C:\Users\Kshitij\Desktop\Kshitij\IAQS\sem 4\python\Problem 2.xlsx")

# Fitting regression models

import numpy as np
import statsmodels.formula.api as smf
import statsmodels.tools as sm

lm1 = smf.ols(formula='Y1 ~ X1', data=q2)
model1 = lm1.fit()
model1.summary()

lm2 = smf.ols(formula='Y2 ~ X2', data=q2)
model2 = lm2.fit()
model2.summary()

lm3 = smf.ols(formula='Y3 ~ X3', data=q2)
model3 = lm3.fit()
model3.summary()

lm4 = smf.ols(formula='Y4 ~ X4', data=q2)
model4 = lm4.fit()
model4.summary()

"""
All the models have similar R-squared and coefficient values. The summary shows the detailed
attributes of all the models. The discrepancy is that the models have a similar line of best fit even
though the data is different.
"""