# -*- coding: utf-8 -*-
"""
Created on Mon Mar 28 15:48:11 2022

@author: DELL
"""

# Importing the required libraries and datasets

import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib as mt

"------------------------""QUESTION 1--------------------------------"""
problem1 = pd.read_excel(r"C:\Users\DELL\Downloads\problem11.xlsx")
"""Student 1:"""

jobs_mumbai = problem1[problem1["Location"].astype("str").str.contains("Mumbai")]

# for job opportunities in other metro cities, we consider the following Metro 
# cities - "Mumbai, Bengaluru, Delhi, Gurgaon, Chennai, Pune"
list_metro = ["Bengaluru", "Kolkata", "Delhi", "Chennai", "Gurgaon", "Pune"]

# using loops:
metro_jobs = 0
for i in list_metro:
    list_metro =list_metro + problem1[problem1["Location"].astype("str").str.contains(i)].shape[0]

#  the student will have to look out from 13805 job opportunities

"""Student 2:"""
### PART A:
# Required: Can you tell me that the majority openings require how many years of experience? Do
# we have high intake for freshers?

sb.displot(problem1["Experience"],kind = "hist",
            color = "violet")

# due to too many unique experience ranges, the plot gets too clumsy rendering 
# it inapproriate
# Thus, we will create a list containing value count for each unique range. 
# Furthermore, only plotting the top 10 ranges with highest occurence

Majority_op = problem1["Experience"].value_counts().reset_index()

# renaming the columns of the above list
Majority_op.columns = ['Experience', 'frequency']

# locating the top 10 rows of the above list for ease of plotting and visualisation
Top_10 = Majority_op.iloc[0:10,:]

#plotting the top 10 openings:
import matplotlib.pyplot as plt 
plt.bar(Top_10.Experience,Top_10.frequency, color = "violet")

# sorting the majority openings for jobs open for freshers:
Majority_op = Majority_op.sort_values(by = 'Experience')
fresher_op = Majority_op.iloc[0:6,:]
# total fresher openings:
tot_fresh_op = sum(fresher_op.frequency)
tot_fresh_op 
        # total freq = 1653

### PART B:
        
# Which city will give me the best salary if I only wish to shift now?

# Given the student is a fresher, we find the best salary giving city as follows:
new_cols = ["Experience","Location","Salary"]

# storing the required columns from the "problem1" dataset into a new variable
top_salary = problem1[new_cols]

# sorting the above for "ONLY" freshers
top_salary = top_salary.sort_values(by = "Experience")
freshy_sal = top_salary.iloc[0:1653,:]
freshy_sal = freshy_sal.sort_values(by = "Salary", ascending = False )

                                    

"------------------------""QUESTION 1--------------------------------"""

problem2 = pd.read_excel(r"‪C:\Users\DELL\Downloads\Problem2.xlsx")
import statsmodels.formula.api as smf

lr1 = smf.ols(formula='Y1 ~ X1', data=problem2)
fitted_model1 = lr1.fit()
fitted_model1.summary()


lr2 = smf.ols(formula='Y2 ~ X2', data=problem2)
fitted_model2 = lr2.fit()
fitted_model2.summary()


lr3 = smf.ols(formula='Y3 ~ X3', data=problem2)
fitted_model3 = lr3.fit()
fitted_model3.summary()


lr4 = smf.ols(formula='Y4 ~ X4', data=problem2)
fitted_model4 = lr4.fit()
fitted_model4.summary()