# Download the dataset from https://drive.google.com/file/d/1ig6Zedw618UPrVLlr7XPYlD9jGfTlU4d/view?usp=sharing
import pandas as pd
import numpy as np
# Load the dataset; "toy_dataset.csv" is read from the working directory.
df = pd.read_csv("toy_dataset.csv")
import numpy as np

# Add a synthetic "Qualification" column: one degree per row, drawn at random.
# The draw is unseeded, so the column differs from run to run.
degrees = ['BBA', 'BCOM', 'MBA', 'MCOM']
df["Qualification"] = np.random.choice(degrees, size=len(df))
df.columns

# Wrap the table in a second DataFrame object and export it to Excel.
data = pd.DataFrame(data=df)
data.to_excel("toydataset_XL.xlsx")

# The bare expressions below only produce visible output when evaluated
# interactively (e.g. as the last statement of a notebook cell).

# First and last rows of the table.
df.head()
df.tail()

# Column dtypes, distinct values per column, missing values per column.
df.dtypes
df.nunique()
df.isna().sum()

# Rows that have a missing value in at least one column.
df.loc[df.isna().any(axis=1)]

# Structural summary followed by descriptive statistics.
df.info()
df.describe()
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# One narrow figure per column, each holding a vertical box plot of that
# column's values.
for column in ("Age", "Salary_in_rs"):
    plt.figure(figsize=(3, 5))
    sns.boxplot(data=df, y=column)
import seaborn as sb
import matplotlib.pyplot as plt
from matplotlib import rcParams
from collections import Counter
import warnings
# Suppress every warning from here on.
warnings.filterwarnings("ignore")

# Bar chart of row counts per city: counts on the x-axis, city names on the
# y-axis.
rcParams['figure.figsize'] = (10, 5)
city_counts = df['City'].value_counts()
sb.barplot(x=city_counts.values, y=city_counts.index)
plt.xlabel('Counts')
plt.ylabel('Cities')
plt.title('City wise')
plt.show()
# Bar chart of how many rows fall into each 'Illness' category.
illness_counts = df['Illness'].value_counts()
sb.barplot(x=illness_counts.index, y=illness_counts.values)
plt.title('Illness count')
# The categories are plotted on the x-axis and their counts on the y-axis,
# so the axis labels have to follow that orientation.
plt.xlabel('Illness')
plt.ylabel('Counts')
plt.show()
# Bar chart of row counts per gender: gender on the x-axis, counts on the
# y-axis.
rcParams['figure.figsize'] = (15, 5)
gender_counts = df['Gender'].value_counts()
sb.barplot(x=gender_counts.index, y=gender_counts.values)
plt.title('Gender Distribution overall')
plt.show()
# Row counts split by gender, first per city and then per qualification.
# Each chart is shown on its own before the next one is drawn.
rcParams['figure.figsize'] = (15, 5)
for column, chart_title in (
    ('City', 'Gender Distribution by city'),
    ('Qualification', 'Gender Distribution by Qualification'),
):
    sb.countplot(data=df, x=column, hue='Gender', palette=['c', 'b'])
    plt.title(chart_title)
    plt.show()
# Male and female subsets of the table (all columns kept).
m, f = (df.loc[df['Gender'] == label] for label in ('Male', 'Female'))

# Salary column of each subset.
xi = pd.Series(m['Salary_in_rs'])
yi = pd.Series(f['Salary_in_rs'])

# Two semi-transparent histograms drawn on the same axes so the salary
# distributions can be compared directly.
rcParams['figure.figsize'] = (15, 5)
plt.hist(xi, label='Male', alpha=0.7)
plt.hist(yi, label='Female', alpha=0.4)
plt.xlabel('Salary')
plt.ylabel('Count')
plt.title('Salary Distribution by Gender')
plt.legend()
plt.show()
# Male and female subsets of the table (all columns kept).
m, f = (df.loc[df['Gender'] == label] for label in ('Male', 'Female'))

# Age column of each subset.
xa = pd.Series(m['Age'])
ya = pd.Series(f['Age'])

# Two semi-transparent histograms drawn on the same axes so the age
# distributions can be compared directly.
rcParams['figure.figsize'] = (15, 5)
plt.hist(xa, label='Male', alpha=0.7)
plt.hist(ya, label='Female', alpha=0.4)
plt.xlabel('Age')
plt.ylabel('Count')
plt.title('Age Distribution by Gender')
plt.legend()
plt.show()
# Qualification counts broken down by city: once for all rows, then for the
# male subset `m` and the female subset `f`.
rcParams['figure.figsize'] = (15, 5)

# Eight distinct colours, one per city hue level. A repeated colour would make
# two cities indistinguishable in the bars and in the legend.
city_palette = ['c', 'b', 'r', 'g', 'y', 'k', 'm', 'orange']

# Pin the hue order so that a given city keeps the same colour in all three
# charts, regardless of the order in which cities first appear in each subset.
city_order = sorted(df['City'].dropna().unique())

for subset, chart_title in (
    (df, 'city wise qualifications'),
    (m, 'city wise qualifications of males'),
    (f, 'city wise qualifications of females'),
):
    sb.countplot(
        x='Qualification',
        hue='City',
        hue_order=city_order,
        data=subset,
        palette=city_palette,
    )
    plt.title(chart_title)
    plt.show()
# Bar chart of the number of rows per city, drawn through pandas' plotting
# interface on the `data` frame.
city_totals = data['City'].value_counts()
city_totals.plot(kind='bar')
plt.xlabel('city')
plt.ylabel('workers count')
plt.title("count of workers in each city")
plt.show()
# One subset of the table per city.
delhi_df = df.loc[df['City'] == 'Delhi']
bangalore_df = df.loc[df['City'] == 'Bangalore']
kolkata_df = df.loc[df['City'] == 'Kolkatta']
jaipur_df = df.loc[df['City'] == 'Jaipur']
mumbai_df = df.loc[df['City'] == 'Mumbai']
hyd_df = df.loc[df['City'] == 'Hyderabad']
chennai_df = df.loc[df['City'] == 'Chennai']
lucknow_df = df.loc[df['City'] == 'Lucknow']

# Salary column of each city subset, in the same order as above.
# NOTE(review): `f` is also used elsewhere in this file for the female
# subset; from here on it holds the Hyderabad salaries until it is reassigned.
a, b, c, d, e, f, g, h = (
    pd.Series(city_df['Salary_in_rs'])
    for city_df in (
        delhi_df, bangalore_df, kolkata_df, jaipur_df,
        mumbai_df, hyd_df, chennai_df, lucknow_df,
    )
)

plt.figure(figsize=(16, 7))
# 200 evenly spaced bin edges between 0 and 175000, shared by all cities.
bins = np.linspace(0, 175000, 200)

# (salaries, legend label, colour). A colour of None leaves the choice to
# matplotlib's default colour cycle.
city_series = (
    (a, 'Delhi', None),
    (b, 'Bangalore', None),
    (c, 'Kolkatta', 'cyan'),
    (d, 'Jaipur', 'crimson'),
    (e, 'Mumbai', 'Black'),
    (f, 'Hyderabad', 'Gold'),
    (g, 'Chennai', 'DarkBlue'),
    (h, 'Lucknow', 'Lime'),
)
for salaries, city_label, colour in city_series:
    plt.hist(salaries, bins, alpha=0.5, label=city_label, color=colour)

plt.legend(loc='upper right', prop={'size': 22})
plt.xlabel('Income')
plt.ylabel('Frequency', rotation=0)

# plt.rc writes to matplotlib's global rcParams, so these font sizes stay in
# effect for figures drawn afterwards as well.
plt.rc('axes', labelsize=10)
plt.rc('axes', titlesize=30)
plt.rc('xtick', labelsize=20)  # font size of the x tick labels
plt.rc('ytick', labelsize=20)  # font size of the y tick labels
plt.title('Salary Distribution by City')

# To save the figure, uncomment:
#plt.savefig('Income_Dist_City')
plt.show()
# Row counts split by gender, first per age value and then per illness
# category. Each chart is shown on its own before the next one is drawn.
rcParams['figure.figsize'] = (15, 5)
for column, chart_title in (
    ('Age', 'Gender Distribution by Age'),
    ('Illness', 'Gender Classification by illness'),
):
    sb.countplot(data=df, x=column, hue='Gender', palette=['c', 'b'])
    plt.title(chart_title)
    plt.show()
# Two-column frame holding salary ('sal') and age ('age') for every row.
y = df['Age']
x = df['Salary_in_rs']
df2 = pd.DataFrame({'sal': x, 'age': y})
df2.info()

# Scatter plot with age on the x-axis and salary on the y-axis.
rcParams['figure.figsize'] = (10, 5)
sb.scatterplot(y=df2['sal'].values, x=df2['age'].values)
plt.xlabel('Age')
plt.ylabel('Salary')
plt.title('Correlation between age and salary')
plt.show()
import numpy as np
import matplotlib.pyplot as plt
# Grouped bar chart: mean salary of men and women in Hyderabad, Bangalore and
# Chennai.
m = df[df['Gender'] == 'Male']      # all columns, male rows only
f = df[df['Gender'] == 'Female']    # all columns, female rows only

# For each city: the male/female subsets and their mean salary. The means are
# kept as one-element Series (read back with [0]) because other code in this
# file indexes xa[0], ya[0], ... as well.
mh = m[m['City'] == 'Hyderabad']
fh = f[f['City'] == 'Hyderabad']
xa = pd.Series(mh['Salary_in_rs'].mean())
ya = pd.Series(fh['Salary_in_rs'].mean())
mb = m[m['City'] == 'Bangalore']
fb = f[f['City'] == 'Bangalore']
xb = pd.Series(mb['Salary_in_rs'].mean())
yb = pd.Series(fb['Salary_in_rs'].mean())
mc = m[m['City'] == 'Chennai']
fc = f[f['City'] == 'Chennai']
xc = pd.Series(mc['Salary_in_rs'].mean())
yc = pd.Series(fc['Salary_in_rs'].mean())

# data to plot, ordered Hyderabad, Bangalore, Chennai
n_groups = 3
men_means = (xa[0], xb[0], xc[0])
women_means = (ya[0], yb[0], yc[0])

# create plot
fig, ax = plt.subplots()
index = np.arange(n_groups)
bar_width = 0.2
opacity = 0.8
rects1 = plt.bar(index, men_means, bar_width,
                 alpha=opacity,
                 color='g',
                 label='Men')
rects2 = plt.bar(index + bar_width, women_means, bar_width,
                 alpha=opacity,
                 color='r',
                 label='Women')
plt.xlabel('Working place')
plt.ylabel('Salary')
plt.title('Salaries of working men and women in different places')
# Bars are centre-aligned by default, so the two bars of a group are centred
# at `index` and `index + bar_width`; the middle of the pair is half a bar
# width to the right of `index`. Put the city label there.
plt.xticks(index + bar_width / 2, ('Hyderabad', 'Bangalore', 'Chennai'))
plt.legend()
plt.tight_layout()
plt.show()
from pandas._libs.tslibs.offsets import YearEnd
from matplotlib.axis import YAxis
import numpy as np
import matplotlib.pyplot as plt
# data to plot: mean salaries taken from the one-element Series xa..yc, which
# this file computes for Hyderabad (xa, ya), Bangalore (xb, yb) and
# Chennai (xc, yc), in that order.
n_groups = 3
men_means = (xa[0], xb[0], xc[0])
women_means = (ya[0], yb[0], yc[0])
# create plot
import matplotlib.pyplot as plt
import numpy as np
# Wedge labels are matched to the values by position, so they must follow the
# order of women_means: Hyderabad, Bangalore, Chennai.
mylabels = ["Hyderabad", "Bangalore", "Chennai"]
plt.pie(women_means, labels=mylabels, autopct='%1.1f%%')
# NOTE(review): the wedges are each city's share of the summed mean female
# salaries, not employment counts; the legend title may need rewording.
plt.legend(title="Citywise employment percentage of females")
plt.show()