import pandas as pd
import numpy as np 


#Q1)
problem1 = pd.read_excel("C:\\Users\\abhinav\\Downloads\\Problem 1.xlsx")
#Student1
#a)
#finding no of jobs in Mumbai
problem1[problem1["Location"].astype("str").str.contains("Mumbai")].shape[0]
#There are ample of job opportunities in Mumbai (3731 in total)

#b)
#finding no of jobs in Mumbai
#We take 6 other metropolitan cities other than mumbai and find no of job 
#opportunities present
list = ["Bengaluru", "Kolkata", "Delhi", "Chennai", "Gurgaon", "Pune"]
metro_jobs = 0
for i in list:
    metro_jobs = metro_jobs + problem1[problem1["Location"].astype("str").str.contains(i)].shape[0]
metro_jobs
#There are 13805 job opportunities in the 6 other metro cities which you can
#opt for so again there is ample of jobs to choose from if at all he moves out 
#of mumbai to other metro cities so the student will
#not face any problem of lack of job availablity.

#Student2
#a)Can you tell me that the majority openings require how many years of
#experience? Do we have high intake for freshers?
#Because there are 129 unique values in Experience column, we select top 7
#values on the basis of count and plot it to see which range has highest demand
#and answer accordingly
freqTable = pd.value_counts(problem1.Experience).reset_index()
freqTable.columns = ['Experience', 'frequency']
Top7 = freqTable.iloc[0:7,:]
import matplotlib.pyplot as plt 
plt.bar(Top7.Experience,Top7.frequency)
#from the plot, we can see that experience of 5-10 years have highest count and
#therefore higher demand among other ranges.
#To look for freshers, we will extract fields 0-0 yrs, 0-1 yrs, 0-2 yrs, 
#0-3 yrs, 0-4 yrs and 0-5 yrs as these feilds means that the job appoints
#freshers as well. for doing this, we will sort the frequency table in 
#ascending order of experience and extract top 6 rows and ass the counts to 
#check how many job openings are ready to appoint freshers
freqTableascsorted = freqTable.sort_values(by = 'Experience')
freshersfreq = freqTableascsorted.iloc[0:6,:]
freshers_job_openings = sum(freshersfreq.frequency)
freshers_job_openings  
#is equal to 1653
#	Experience	frequency
#38	 0-0 yrs	     151
#26	 0-1 yrs    	 296
#21  0-2 yrs	     386
#28	 0-3 yrs	     269
#39	 0-4 yrs	     130
#18	 0-5 yrs	     421
#we can see that there are total 1653 job openings which take
#in freshers so there are enough jobs for freshers to start with. 

#b)But what about my salary if I am unwilling to shift to a new location in the
#later stage of my career? Which city will give me the best salary?
#we find the place having best salary for a fresher as the student is a 
#fresher 
list = ["Experience","Location","Salary"]
data = problem1[list]
data = data.sort_values(by = 'Experience')
freshersdata = data.iloc[0:1653,:]
freshersdata = freshersdata.sort_values(by = 'Salary', ascending = False)
#highest salary range is 6-10, so we extract jobs with salary 6-10 and check
#which city has most jobs with this particular salary range for freshers
index = np.where(freshersdata.Salary == '6to10', True, False)
highestsalarytable = freshersdata[index]
highestsalarytable['Location'].value_counts()
#Bengaluru                              11
#Gurgaon                                 9
#Noida                                   5
#Delhi                                   4
#Mumbai                                  4
#Chennai                                 3
#Delhi, Gurgaon                          2
#Gurgaon(Udyog Vihar)                    2
#Pune                                    2
#Kolkata                                 2
#Bengaluru, Chennai, Hyderabad           1
#Delhi(Vasant Vihar)                     1
#Noida(Sector-8 Noida)                   1
#Mumbai, Navi Mumbai, Mumbai Suburbs     1
#Gurgaon, Noida                          1
#Mumbai, Pune                            1
#Ahmedabad(Makarba)                      1
#Ahmedabad(Sola)                         1
#Lucknow                                 1
#Mumbai, Mumbai Suburbs, Navi Mumbai     1
#Mumbai Suburbs                          1
#Gurgaon(Cyber City)                     1
#Nasik, Mumbai                           1
#Navi Mumbai                             1
#Pune(Hadapsar)                          1
#Delhi, Bengaluru, Hyderabad             1
#Bengaluru(Hebbal)                       1
#Delhi, Bengaluru, Gurgaon               1
#Noida, Greater Noida                    1
#Hyderabad, Chennai, Karimnagar          1
#Mumbai Suburbs, Mumbai                  1
#Bengaluru(Sector 1 HSR Layout+7)        1
#Ahmedabad, Gandhinagar                  1
#from the frequency table above we can see that Gurgaon has highest job 
#openings at highest slalry range for freshers (16) and so is the best city
#for a fresher to start at if he/she aims for a good CTC.

 
#Q2)
problem2 = pd.read_excel("C:\\Users\\abhinav\\Downloads\\Problem 2.xlsx")
import statsmodels.formula.api as smf
lr1 = smf.ols(formula='Y1 ~ X1', data=problem2)
fitted_model_1 = lr1.fit()
fitted_model_1.summary()

"""
                coef      std err       t        P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      8.2721      2.335      3.543      0.006       2.991      13.553
X1             0.5001      0.118      4.241      0.002       0.233       0.767

R-squared:                       0.667
Adj. R-squared:                  0.629
F-statistic:                     17.99

"""

lr2 = smf.ols(formula='Y2 ~ X2', data=problem2)
fitted_model_2 = lr2.fit()
fitted_model_2.summary()

"""
                coef      std err       t        P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      8.2739      2.336      3.542      0.006       2.990      13.558
X2             0.5000      0.118      4.239      0.002       0.233       0.767

R-squared:                       0.666
Adj. R-squared:                  0.629
F-statistic:                     17.97

"""

lr3 = smf.ols(formula='Y3 ~ X3', data=problem2)
fitted_model_3 = lr3.fit()
fitted_model_3.summary()

"""
                coef      std err       t        P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      8.2783      2.334      3.547      0.006       2.998      13.558
X3             0.4997      0.118      4.239      0.002       0.233       0.766

R-squared:                       0.666
Adj. R-squared:                  0.629
F-statistic:                     17.97

"""

lr4 = smf.ols(formula='Y4 ~ X4', data=problem2)
fitted_model_4 = lr4.fit()
fitted_model_4.summary()

"""
                coef      std err       t        P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      8.2757      2.333      3.547      0.006       2.998      13.553
X4             0.4999      0.118      4.243      0.002       0.233       0.766

R-squared:                       0.667
Adj. R-squared:                  0.630
F-statistic:                     18.00

"""

"""
The values of coefficients, r squred value, adj. r squared value, f statistic,
p values for wald test are the same for all the 4 models even though they are 
different datasets. This means that same model is fitted for all the datasets 
so we can conclude that the datasets given are very similar and is hinting 
towards some disrepancy which has to be rectified through review of the dataset
and understanding the variables given and performing changes accordingly.
"""