"Roll.No _ 432-AOI-Assignment 2"

# Importing the required libraries and datasets

import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib as mt

"------------------------""QUESTION 1--------------------------------"""
Problem =pd.read_excel ( r"P:\Complete Actuarial Science Stuff\Python\assignment 2\Problem 1.xlsx")

"""Student 1:"""
# for resolving the student's issue, we first filter out the job availability
# in Mumbai

mumbai_jobs = Problem[Problem["Location"].astype("str").str.contains("Mumbai")]

# COMMENT - there are ample number of job opportuniites in the region. ["3731 to
# be specific"]

# for job opportunities in other metro cities, we consider the following Metro 
# cities - "Mumbai, Bengaluru, Delhi, Gurgaon, Chennai, Pune"


metro_list = ["Bengaluru", "Kolkata", "Delhi", "Chennai", "Gurgaon", "Pune"]

# using loops:
metro_jobs = 0
for i in metro_list:
    metro_jobs =metro_jobs + Problem[Problem["Location"].astype("str").str.contains(i)].shape[0]

"Alternative way: getting job availabilties for individual metro cities"

metro_list = ["Bengaluru", "Kolkata", "Delhi", "Chennai", "Gurgaon", "Pune"]
bengaluru_jobs = 0
kolkata_jobs = 0
delhi_jobs = 0
chennai_jobs = 0
gurgaon_jobs = 0
pune_jobs = 0
for i in metro_list:
    if (i == "Bengaluru"): 
        bengaluru_jobs = bengaluru_jobs + Problem[Problem["Location"].astype("str").str.contains(i)].shape[0]
    if (i == "Kolkata"): 
        kolkata_jobs = kolkata_jobs + Problem[Problem["Location"].astype("str").str.contains(i)].shape[0]
    if (i == "Delhi"): 
        delhi_jobs = delhi_jobs + Problem[Problem["Location"].astype("str").str.contains(i)].shape[0]
    if (i == "Chennai"): 
        chennai_jobs = chennai_jobs + Problem[Problem["Location"].astype("str").str.contains(i)].shape[0]
    if (i == "Gurgaon"): 
        gurgaon_jobs = gurgaon_jobs + Problem[Problem["Location"].astype("str").str.contains(i)].shape[0]
    if (i == "Pune"): 
        pune_jobs = pune_jobs + Problem[Problem["Location"].astype("str").str.contains(i)].shape[0]

#  the student will have to look out from 13805 job opportunities

"""Student 2:"""
### PART A:
# Required: Can you tell me that the majority openings require how many years of experience? Do
# we have high intake for freshers?

sb.displot(Problem["Experience"],kind = "hist",
            color = "violet")

# due to too many unique experience ranges, the plot gets too clumsy rendering 
# it inapproriate
# Thus, we will create a list containing value count for each unique range. 
# Furthermore, only plotting the top 10 ranges with highest occurence

Majority_opening = Problem["Experience"].value_counts().reset_index()

# renaming the columns of the above list
Majority_opening.columns = ['Experience', 'frequency']

# locating the top 10 rows of the above list for ease of plotting and visualisation
Top_10 = Majority_opening.iloc[0:10,:]

#plotting the top 10 openings:
import matplotlib.pyplot as plt 
plt.bar(Top_10.Experience,Top_10.frequency, color = "violet")

# sorting the majority openings for jobs open for freshers:
Majority_opening = Majority_opening.sort_values(by = 'Experience')
fresher_opening = Majority_opening.iloc[0:6,:]
# total fresher openings:
total_fresh_opening = sum(fresher_opening.frequency)
total_fresh_opening 
        # total freq = 1653

### PART B:
        
# Which city will give me the best salary if I only wish to shift now?

# Given the student is a fresher, we find the best salary giving city as follows:
new_columns = ["Experience","Location","Salary"]

# storing the required columns from the "Problem" dataset into a new variable
Best_salary = Problem[new_columns]

# sorting the above for "ONLY" freshers
Best_salary = Best_salary.sort_values(by = "Experience")
freshers_salary = Best_salary.iloc[0:1653,:]
freshers_salary = freshers_salary.sort_values(by = "Salary", ascending = False )

# since 6-10 is the highest salary range, we extract those cities which provide 
# this salary the most no. of times
import numpy as np
highest_salary = np.where(freshers_salary.Salary == "6to10", True, False)
salary_table = freshers_salary[highest_salary]
salary_table["Location"].value_counts()

# comment - upon analysing the outcome we realise that ""Gurgaon"" has the highest
# job opening in the "6to10" range at '16' followed by Mumbai and Bangalore.

"------------------------""QUESTION 2""--------------------------------"

# importing the given dataset:

Problem_2 =pd.read_excel ( r"P:\Complete Actuarial Science Stuff\Python\assignment 2\Problem 2 (1).xlsx")

# running EDA over the provided dataset - 
Problem_2.head()
Problem_2.tail()
Problem_2.describe()

# Fitting regression models
import statsmodels.api as smf
import statsmodels.tools as sm

X1 = np.array(Problem_2["X1"]).reshape(-1, 1)
X1 = sm.add_constant(X1)
X2 = np.array(Problem_2["X2"]).reshape(-1, 1)
X2 = sm.add_constant(X2)
X3 = np.array(Problem_2["X3"]).reshape(-1, 1)
X3 = sm.add_constant(X3)
X4 = np.array(Problem_2["X4"]).reshape(-1, 1)
X4 = sm.add_constant(X4)

Y1 = np.array(Problem_2["Y1"]).reshape(-1, 1)
Y2 = np.array(Problem_2["Y2"]).reshape(-1, 1)
Y3 = np.array(Problem_2["Y3"]).reshape(-1, 1)
Y4 = np.array(Problem_2["Y4"]).reshape(-1, 1)

model1 = smf.OLS(Y1, X1)
print(model1.fit().summary())

model2 = smf.OLS(Y2, X2)
print(model2.fit().summary())

model3 = smf.OLS(Y3, X3)
print(model3.fit().summary())

model4 = smf.OLS(Y4, X4)
print(model4.fit().summary())

"""INFERENCE FROM THE OUTPUT OF THE MODEL:"""

#~ On observation we realise that evenafter the datasets for each model being varying
#  in nature, they have a synonymous R-squared and coefficient values.
#~ The model summary printed can be analysed for detailed test results/ output.
#~ The possible discrepancy is that all the four models have a pretty similar
# line of best fit provided that the data is  non-identical.

"------------------------------------THE END--------------------------------"