# -*- coding: utf-8 -*-
"""
Created on Tue Mar 29 15:31:04 2022

@author: Asus
"""

## Python Assignment 2 
## Made By Sabyasachi Rathore
## Roll No 434

import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib as plt

##Question 1

Data1 = pd.read_excel(r" D:\College Stuff\Python\problem1.xlsx")
##Student 1:

MumbaiJobs = Data1[Data1["Location"].astype("str").str.contains("Mumbai")]

## Metro Cities include - "Mumbai, Bengaluru, Delhi, Gurgaon, Chennai & Pune"
MetroList = ["Bengaluru", "Kolkata", "Delhi", "Chennai", "Gurgaon", "Pune"]

## Loop:
MetroJobs = 0
for i in MetroList:
    MetroList =MetroList + Data1[Data1["Location"].astype("str").str.contains(i)].shape[0]

## 13805 job opportunities

##Student 2:
## (A)
# Required: Years of experience for majority of job openings, Fresher Intake.

sb.displot(Data1["Experience"],kind = "hist",
            color = "violet")

## The plot is too complex due to the high number of unique experience ranges.
## Hence, We will plot 10 ranges with the highest occurrence and 
## make a list of the unique ranges as a prerequisite step.

MajorityList = Data1["Experience"].value_counts().reset_index()

## Renaming the columns 
MajorityList.columns = ['Experience', 'frequency']

## 10 Ranges with the highest occurrence:
RelevantTen = MajorityList.iloc[0:10,:]

## Plotting:
import matplotlib.pyplot as plt 
plt.bar(RelevantTen.Experience,RelevantTen.frequency, color = "violet")

## Checking the jobs open for freshers:
MajorityList = MajorityList.sort_values(by = 'Experience')
OpenForFreshers = MajorityList.iloc[0:6,:]
# total fresher openings:
TotalOpeningsFreshers= sum(OpenForFreshers.frequency)
TotalOpeningsFreshers
## 1653 Job Openings for Freshers

## (B)
        
## City with the Highest salary for a fresher if I were to shit instantaneously:

RelevantColumns = ["Experience","Location","Salary"]

# storing the required columns from the "Data1" dataset into a new variable
HighestSalary = Data1[RelevantColumns]

# sorting the above for "ONLY" freshers
HighestSalary = HighestSalary.sort_values(by = "Experience")
FresherSalary = HighestSalary.iloc[0:1653,:]
FresherSalary = FresherSalary.sort_values(by = "Salary", ascending = False )

                                    

##Question 2

Data2 = pd.read_excel(r" D:\College Stuff\Python\Problem2.xlsx")

import statsmodels.formula.api as smf

Model1 = smf.ols(formula='Y1 ~ X1', data=Data2)
Fitted1 = Model1.fit()
Fitted1.summary()


Model2 = smf.ols(formula='Y2 ~ X2', data=Data2)
Fitted2 = Model2.fit()
Fitted2.summary()


Model3 = smf.ols(formula='Y3 ~ X3', data=Data2)
Fitted3 = Model3.fit()
Fitted3.summary()


Model4 = smf.ols(formula='Y4 ~ X4', data=Data2)
Fitted4 = Model4.fit()
Fitted4.summary()