# -*- coding: utf-8 -*-
"""
Created on Tue Mar 22 01:26:12 2022

@author: Rohil
"""

import pandas as pd
import numpy as np
import seaborn as sns
import statistics as stats
import statsmodels.api as sm

# 1)
"""
Dataset description:
    The data set gives information about various people working in different cities.
    Columns:
        Experience - Years of work experience
        Key_Skills: Key skills required for the job.
        Location: Location of work.
        Salary: Annual CTC of the employee
        Company_name_encoded: Company name in a numeric format.
"""
data = pd.read_excel(
    r"C:\Users\Rohil\Desktop\IAQS\Semester 4\Python\Problem 1.xlsx"
)

"""
Feature Engineering Tasks:
    1. Splitting the locaiton column into their primary locations 
    (considering the first location given as the primary location).
"""

data["loc_temp"] = data.Location.str.split(",")
data["PrimaryLocation"] = data["loc_temp"].apply(lambda x: x[0].split("(")[0])
data["PrimaryLocation"] = data["PrimaryLocation"].str.strip()
data.drop(columns=["loc_temp"], inplace=True)

# Data Cleaning
data.loc[data["PrimaryLocation"] == "Mumbai Suburbs", "PrimaryLocation"] = "Mumbai"
data.loc[
    (data["PrimaryLocation"] == "Bangalore")
    | (data["PrimaryLocation"] == "Bengaluru Bangalore"),
    "PrimaryLocation",
] = "Bengaluru"
data.loc[data["PrimaryLocation"] == "Doha", "PrimaryLocation"] = "Qatar"
data.loc[data["PrimaryLocation"] == "Dubai/ UAE", "PrimaryLocation"] = "Dubai"
data.reset_index(inplace=True, drop=True)

# Problem 1:
# a)
"""
To answer student 1's question, I believe that a bar plot based on location and
count of people would give the best answer. 
To make the barplot, a little bit of data manipulation would be required, 
1. Groupby city, and aggregate by count. 
2. Keep only relevant metro cities in the dataset. I have considered the following:
    Delhi, Bengaluru, Gurgaon, Noida, Pune, Ahmedabad, Mumbai, Chennai, Hyderabad, 
    Dubai, Kolkata, Australia, Germany, Qatar, Canada, Sinagapore, and Malaysia. 
"""
data["PrimaryLocation"].nunique()
location_data = data[
    data["PrimaryLocation"].isin(
        [
            "Delhi",
            "Bengaluru",
            "Gurgaon",
            "Noida",
            "Pune",
            "Ahmedabad",
            "Mumbai",
            "Chennai",
            "Hyderabad",
            "Dubai",
            "Kolkata",
            "Australia",
            "Canada",
            "Sinagapore",
            "Malaysia",
        ]
    )
]

location_data = location_data.groupby("PrimaryLocation").count()
p1 = sns.barplot(data=location_data, x=location_data.index, y="Experience")
p1.set_ylabel("Number of Jobs")
p1.set_xticklabels(labels=list(location_data.index), rotation=90)
p1.set(title="Number of Jobs per City")

# b)
location_data1 = data[
    data["PrimaryLocation"].isin(
        [
            "Delhi",
            "Bengaluru",
            "Gurgaon",
            "Noida",
            "Pune",
            "Ahmedabad",
            "Mumbai",
            "Chennai",
            "Hyderabad",
            "Dubai",
            "Kolkata",
            "Australia",
            "Canada",
            "Singapore",
            "Malaysia",
        ]
    )
]
location_data1 = location_data1.groupby("PrimaryLocation").nunique()
location_data1.columns = ["1", "2", "3", "4", "Number of Companies"]
p2 = sns.barplot(data=location_data1, x=location_data1.index, y="Number of Companies")
p2.set_xticklabels(labels=location_data1.index, rotation=90)
p2.set(title="Number of Companies per City")

"""
Answers to Student 1:
    a) As seen in the plot, Mumbai is only second to Bangalore in terms of number of 
    jobs available. There are no shortage of jobs in Bombay.
    
    b) Moving to Bangalore would be a good bet moving ahead from a job 
    perspective given that it has the most number of jobs and companies to choose from. 
"""

# Problem 2:

# a)
"""
Feature Engineering:
    Given that the work experience is given as a range, I have considered the 
    average to give an estimate for work experience per person.
    
"""
work_data = data
work_data["temp1"] = work_data["Experience"].str.replace("yrs", "").str.strip()
work_data["temp2"] = work_data["temp1"].str.split("-")
work_data["Average Experience"] = work_data["temp2"].apply(
    lambda x: stats.mean([float(a) for a in x])
)
work_data.drop(columns=["temp1", "temp2"], inplace=True)


# Boxplot to check summary statistics of experience required.
sns.boxplot(data=work_data, y="Average Experience")


# City wise average salary
work_data_sal = data
work_data_sal["temp1"] = work_data_sal["Salary"].str.split("to")
work_data_sal["Average Salary"] = work_data_sal["temp1"].apply(
    lambda x: stats.mean([float(a) for a in x])
)
final_df = work_data_sal.groupby("PrimaryLocation").mean()
finaldf = final_df.loc[
    [
        "Delhi",
        "Bengaluru",
        "Gurgaon",
        "Noida",
        "Pune",
        "Ahmedabad",
        "Mumbai",
        "Chennai",
        "Hyderabad",
        "Dubai",
        "Kolkata",
        "Australia",
        "Canada",
        "Singapore",
        "Malaysia",
    ],
    :,
]
p3 = sns.barplot(data=finaldf, x=finaldf.index, y="Average Salary")
p3.set_xticklabels(labels=finaldf.index, rotation=90)
p3.set(title="Average Salary by Location")

"""
Answers to Student 2:
    As seen in the boxplot that has been plotted, the majority of positions
    require between 3 to 7 years of experience with the average being around 5. 
    The plot seems to be skewed towrds a higher number of work experience, but there 
    do seem to be opportunities present for freshers in the lower quartile of the 
    plot as well.
    
    Mumbai ,Delhi and Bengaluru seem to have the highest average salary amongst Indian cities.
    So, if one wants to stay in India, they can target these cities. 
    However, cities abroad offer much higher average salaries. With Canada being a good bet.
"""

# 2)
data1 = pd.read_excel(
    r"C:\Users\Rohil\Desktop\IAQS\Semester 4\Python\Problem 2.xlsx"
)

# Fitting regression models

X1 = np.array(data1["X1"]).reshape(-1, 1)
X1 = sm.add_constant(X1)
X2 = np.array(data1["X2"]).reshape(-1, 1)
X2 = sm.add_constant(X2)
X3 = np.array(data1["X3"]).reshape(-1, 1)
X3 = sm.add_constant(X3)
X4 = np.array(data1["X4"]).reshape(-1, 1)
X4 = sm.add_constant(X4)

Y1 = np.array(data1["Y1"]).reshape(-1, 1)
Y2 = np.array(data1["Y2"]).reshape(-1, 1)
Y3 = np.array(data1["Y3"]).reshape(-1, 1)
Y4 = np.array(data1["Y4"]).reshape(-1, 1)

model1 = sm.OLS(Y1, X1)
print(model1.fit().summary())

model2 = sm.OLS(Y2, X2)
print(model2.fit().summary())

model3 = sm.OLS(Y3, X3)
print(model3.fit().summary())

model4 = sm.OLS(Y4, X4)
print(model4.fit().summary())

"""
Although the data are different for all the models, they seem to have a similar
R-squared and coefficient values. The detailed testing results and values can be 
seen in the summaries that have been printed above. All the models will have a very similar
line of best fit, which seems like a discrepency given that the data is different.
"""