# -*- coding: utf-8 -*-
"""
Created on Mon Mar 28 13:33:50 2022

@author: kshri
"""

import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib as plt
import matplotlib.pyplot as ply

Problem =pd.read_excel ( r"D:\shriya\class 13 on\class 14 (second yr)\sem4\python\data\Problem 1.xlsx")

"""Student 1:"""

#a.) I stay in Mumbai. What if there are only a few jobs available in Mumbai?
# for resolving the student's issue, we first filter out the job availability
# in Mumbai

mumbai_jobs = Problem[Problem["Location"].astype("str").str.contains("Mumbai")]

'''There are 3731 jobs in Mumbai'''

#b.) What if I wish to settle in other metro cities moving further and how many companies do
#    I have to select from?


#We are assuming the metro cities to be the following-
#Mumbai, Bengaluru, Kolkata, Delhi, Chennai, Gurgaon, Pune

metros = ["Bengaluru", "Kolkata", "Delhi", "Chennai", "Gurgaon", "Pune"]

metro_job = 0
for i in metros:
    metro_job = metro_job + Problem[Problem["Location"].astype("str").str.contains(i)].shape[0]
metro_job

list = ["Bengaluru", "Kolkata", "Delhi", "Chennai", "Gurgaon", "Pune"]
bengaluru_job = 0
kolkata_job = 0
delhi_job = 0
chennai_job = 0
gurgaon_job = 0
pune_job = 0
for i in list:
    if (i == "Bengaluru"): 
        bengaluru_job = bengaluru_job + Problem[Problem["Location"].astype("str").str.contains(i)].shape[0]
    if (i == "Kolkata"): 
        kolkata_job = kolkata_job + Problem[Problem["Location"].astype("str").str.contains(i)].shape[0]
    if (i == "Delhi"): 
        delhi_job = delhi_job + Problem[Problem["Location"].astype("str").str.contains(i)].shape[0]
    if (i == "Chennai"): 
        chennai_job = chennai_job + Problem[Problem["Location"].astype("str").str.contains(i)].shape[0]
    if (i == "Gurgaon"): 
        gurgaon_job = gurgaon_job + Problem[Problem["Location"].astype("str").str.contains(i)].shape[0]
    if (i == "Pune"): 
        pune_job = pune_job + Problem[Problem["Location"].astype("str").str.contains(i)].shape[0]

'''
bengaluru_job 
5529

kolkata_job 
373

delhi_job 
1834

chennai_job 
1518

gurgaon_job 
2770

pune_job
1781

metro_job
13805
'''        
#Student 2

#a.) Can you tell me that the majority openings require how many years of experience? Do
#    we have high intake for freshers?

sb.displot(Problem["Experience"], kind = "hist")        
sb.histplot(Problem["Experience"], kde = True)


experience_problem=Problem["Experience"].value_counts()    
ply.pie(experience_problem)

#Due to a high number of unique values, visual representation through graphs
#like histogram and pie chart are not suirtable

openings = pd.value_counts(Problem.Experience).reset_index()
openings.columns = ["Experience", "Frequency"]
top5 = openings.iloc[0:5,:]
plt.bar(top5.Experience, top5.Frequency, color = "yellow")

#The bar chart shows the years of experience that the majority openings require
#Bar chart is the best form of visualization

# an alternate form of could be a distribution table like
experience_problem=Problem["Experience"].value_counts()
experience_problem
"""
  Experience  Frequency
0   5-10 yrs       1274
1    2-5 yrs       1188
2    3-8 yrs        922
3    2-7 yrs        832
4    4-9 yrs        678
"""
#Manually modifying the dataframe in variable explorer
#to sort ascending values of Experience

fresher_jobs = openings.iloc[0:6, :]

"""
   Experience  Frequency
38    0-0 yrs        151
26    0-1 yrs        296
21    0-2 yrs        386
28    0-3 yrs        269
39    0-4 yrs        130
"""

total_fresher_jobs = sum(fresher_jobs.Frequency)

"""There are 1653 total fresher jobs"""

### PART B:
        
# Which city will give me the best salary if I only wish to shift now?

# Given the student is a fresher, we find the best salary giving city as follows:
new_columns = ["Experience","Location","Salary"]

# adding the new colums to "Problem" dataset into a new variable
Best_salary = Problem[new_columns]

# sorting the above for "ONLY" freshers
Best_salary = Best_salary.sort_values(by = "Experience")
freshers_salary = Best_salary.iloc[0:1653,:]
freshers_salary = freshers_salary.sort_values(by = "Salary", ascending = False )


# since 6-10 is the highest salary range, we extract those cities which provide 
# this salary the most no. of times

highest_salary = np.where(freshers_salary.Salary == "6to10", True, False)
salary_table = freshers_salary[highest_salary]
salary_table["Location"].value_counts()

# comment - upon analysing the outcome we realise that ""Gurgaon"" has the highest
# job opening in the "6to10" range at '16' followed by Mumbai and Bangalore.




problem2 =pd.read_csv ( r"D:\shriya\class 13 on\class 14 (second yr)\sem4\python\data\prob-2.csv")


import statsmodels.formula.api as smf

lr1 = smf.ols(formula='Y1 ~ X1', data=problem2)
fitted_model1 = lr1.fit()
fitted_model1.summary()

lr2 = smf.ols(formula='Y2 ~ X2', data=problem2)
fitted_model2 = lr2.fit()
fitted_model2.summary()

lr3 = smf.ols(formula='Y3 ~ X3', data=problem2)
fitted_model3 = lr3.fit()
fitted_model2.summary()


lr4 = smf.ols(formula='Y4 ~ X4', data=problem2)
fitted_model4= lr4.fit()
fitted_model4.summary()