# -*- coding: utf-8 -*-
"""
Created on Wed Mar 23 13:30:47 2022

@author: samyaks.ext
"""

import seaborn as sns
import statistics as stats
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
import numpy as np
import statsmodels.api as sm
import pandas as pd
import matplotlib as plt


# 1)
df = pd.read_excel(r"C:\Users\samyakworld\Python\Problem 1.xlsx")

df["loc_temp"] = df.Location.str.split(",")
df["PrimaryLocation"] = df["loc_temp"].apply(lambda x: x[0].split("(")[0])
df["PrimaryLocation"] = df["PrimaryLocation"].str.strip()
df.drop(columns=["loc_temp"], inplace=True)
df.loc[df["PrimaryLocation"] == "Mumbai Suburbs", "PrimaryLocation"] 
df.loc[(df["PrimaryLocation"] == "Bangalore")|(df["PrimaryLocation"] == "Bengaluru Bangalore"),"PrimaryLocation",] = "Bengaluru"
df.loc[df["PrimaryLocation"] == "Doha", "PrimaryLocation"] = "Qatar"
df.loc[df["PrimaryLocation"] == "Dubai/ UAE", "PrimaryLocation"] = "Dubai"
df.reset_index(inplace=True, drop=True)

df["PrimaryLocation"].nunique()
location_data = df[
    df["PrimaryLocation"].isin(
        [
            "Delhi",
            "Bengaluru",
            "Gurgaon",
            "Noida",
            "Pune",
            "Ahmedabad",
            "Mumbai",
            "Chennai",
            "Hyderabad",
            "Dubai",
            "Kolkata",
            "Australia",
            "Canada",
            "Sinagapore",
            "Malaysia",
        ]
    )
]

location_data = location_data.groupby("PrimaryLocation").count()
p1 = sns.barplot(data=location_data, x=location_data.index, y="Experience")
p1.set_ylabel("Number of Jobs")
p1.set_xticklabels(labels=list(location_data.index), rotation=90)
p1.set(title="Number of Jobs per City")

location_data1 = df[
    df["PrimaryLocation"].isin(
        [
            "Delhi",
            "Bengaluru",
            "Gurgaon",
            "Noida",
            "Pune",
            "Ahmedabad",
            "Mumbai",
            "Chennai",
            "Hyderabad",
            "Dubai",
            "Kolkata",
            "Australia",
            "Canada",
            "Singapore",
            "Malaysia",
        ]
    )
]
location_data1 = location_data1.groupby("PrimaryLocation").nunique()
location_data1.columns = ["1", "2", "3", "4", "Number of Companies"]
p2 = sns.barplot(data=location_data1, x=location_data1.index, y="Number of Companies")
p2.set_xticklabels(labels=location_data1.index, rotation=90)
p2.set(title="Number of Companies per City")


# Problem 2:

# a)
work = df
work["temp1"] = work["Experience"].str.replace("yrs", "").str.strip()
work["temp2"] = work["temp1"].str.split("-")
work["Average Experience"] = work["temp2"].apply(
    lambda x: stats.mean([float(a) for a in x])
)
work.drop(columns=["temp1", "temp2"], inplace=True)

sns.boxplot(data=work, y="Average Experience")

work_sal = df
work_sal["temp1"] = work_sal["Salary"].str.split("to")
work_sal["Average Salary"] = work_sal["temp1"].apply(
    lambda x: stats.mean([float(a) for a in x])
)
final_ = work_sal.groupby("PrimaryLocation").mean()
final = final_.loc[
    [
        "Delhi",
        "Bengaluru",
        "Gurgaon",
        "Noida",
        "Pune",
        "Ahmedabad",
        "Mumbai",
        "Chennai",
        "Hyderabad",
        "Dubai",
        "Kolkata",
        "Australia",
        "Canada",
        "Singapore",
        "Malaysia",
    ],
    :,
]
p3 = sns.barplot(data=final, x=final.index, y="Average Salary")
p3.set_xticklabels(labels=final.index, rotation=90)
p3.set(title="Average Salary by Location")


# 2)
data1 = pd.read_excel(
    r"C:\Users\Sahil\Sem 4\Problem 2.xlsx"
)

X1 = np.array(data1["X1"]).reshape(-1, 1)
X1 = sm.add_constant(X1)
X2 = np.array(data1["X2"]).reshape(-1, 1)
X2 = sm.add_constant(X2)
X3 = np.array(data1["X3"]).reshape(-1, 1)
X3 = sm.add_constant(X3)
X4 = np.array(data1["X4"]).reshape(-1, 1)
X4 = sm.add_constant(X4)

Y1 = np.array(data1["Y1"]).reshape(-1, 1)
Y2 = np.array(data1["Y2"]).reshape(-1, 1)
Y3 = np.array(data1["Y3"]).reshape(-1, 1)
Y4 = np.array(data1["Y4"]).reshape(-1, 1)

model1 = sm.OLS(Y1, X1)
print(model1.fit().summary())

model2 = sm.OLS(Y2, X2)
print(model2.fit().summary())

model3 = sm.OLS(Y3, X3)
print(model3.fit().summary())

model4 = sm.OLS(Y4, X4)
print(model4.fit().summary())