Dataset¶

Download the dataset from https://drive.google.com/file/d/1ig6Zedw618UPrVLlr7XPYlD9jGfTlU4d/view?usp=sharing

Importing the libraries¶

import pandas as pd
import numpy as np

Importing the data¶

# Load the toy dataset; the relative path means the CSV must sit in the
# current working directory.
df = pd.read_csv("toy_dataset.csv")

Adding a column to the existing dataframe¶

import numpy as np
# Synthetic "Qualification" column: one degree is drawn at random for every
# row. The draw is unseeded, so the column differs from run to run.
degrees = ['BBA', 'BCOM', 'MBA', 'MCOM']
df["Qualification"] = np.random.choice(degrees, size=df.shape[0])

Display column names¶

# Show the column labels of the dataframe.
df.columns

Index(['Number', 'City', 'Gender', 'Age', 'Salary_in_rs', 'Illness',
       'Qualification'],
      dtype='object')

Saving data into an excel sheet¶

# `data` is a second DataFrame built from `df`; later cells refer to it.
data = pd.DataFrame(data=df)

# Write the frame to an Excel workbook in the working directory.
data.to_excel(excel_writer="toydataset_XL.xlsx")

Top 5 rows¶

# Preview the first rows (head() returns 5 rows by default).
df.head()

Last 5 rows¶

# Preview the last rows (tail() returns 5 rows by default).
df.tail()

Data types¶

# Show the dtype of each column.
df.dtypes

Number            int64
City             object
Gender           object
Age               int64
Salary_in_rs      int64
Illness          object
Qualification    object
dtype: object

Number of unique values in each column¶

# Count the distinct values in each column.
df.nunique()

Number           150000
City                  8
Gender                2
Age                  41
Salary_in_rs      71761
Illness               2
Qualification         4
dtype: int64

Checking whether there are any missing values¶

# Count the missing (null) values in each column.
df.isnull().sum()

Number           0
City             0
Gender           0
Age              0
Salary_in_rs     0
Illness          0
Qualification    0
dtype: int64

Display the rows with missing values, if there are any¶

# Keep only the rows in which at least one column is null.
df.loc[df.isna().any(axis="columns")]

Information about the data¶

# Print the index range, per-column non-null counts and dtypes, and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150000 entries, 0 to 149999
Data columns (total 7 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   Number         150000 non-null  int64 
 1   City           150000 non-null  object
 2   Gender         150000 non-null  object
 3   Age            150000 non-null  int64 
 4   Salary_in_rs   150000 non-null  int64 
 5   Illness        150000 non-null  object
 6   Qualification  150000 non-null  object
dtypes: int64(3), object(4)
memory usage: 8.0+ MB

Descriptive Statistics¶

Statistical description about the data¶

# Summary statistics (count, mean, std, min, quartiles, max) for the
# numeric columns.
df.describe()

Importing libraries required for plotting the graphs¶

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

Boxplot¶

# Narrow, tall canvas for a single vertical box plot of Age.
plt.figure(figsize=(3, 5))
sns.boxplot(data=df, y="Age")

<Axes: ylabel='Age'>

# Narrow, tall canvas for a single vertical box plot of salary.
plt.figure(figsize=(3, 5))
sns.boxplot(data=df, y="Salary_in_rs")

<Axes: ylabel='Salary_in_rs'>

Horizontal Bar chart¶

import seaborn as sb
import matplotlib.pyplot as plt
from matplotlib import rcParams
from collections import Counter
import warnings
# Suppress every warning for the rest of the session. This is a process-wide
# filter, so deprecation and runtime warnings from any library are hidden too.
warnings.filterwarnings("ignore") ##To ignore the warnings if any

# Horizontal bars: one bar per city, bar length = number of rows for that city.
rcParams.update({'figure.figsize': (10, 5)})
city_counts = df['City'].value_counts()
sb.barplot(x=city_counts.values, y=city_counts.index)
plt.title('City wise')
plt.xlabel('Counts')
plt.ylabel('Cities')
plt.show()

Bar chart¶

# Vertical bars: one bar per Illness category, bar height = number of rows.
illness_counts = df['Illness'].value_counts()
sb.barplot(x=illness_counts.index, y=illness_counts.values)
plt.title('Illness count')
# The categories are on the x axis and the counts on the y axis, so the
# axis labels must follow that orientation.
plt.xlabel('Illness')
plt.ylabel('Counts')
plt.show()

# Vertical bars: one bar per gender, bar height = number of rows.
rcParams.update({'figure.figsize': (15, 5)})
gender_counts = df['Gender'].value_counts()
sb.barplot(x=gender_counts.index, y=gender_counts.values)
plt.title('Gender Distribution overall')
plt.show()

Gender Distribution by city¶

# Grouped counts: one cluster of bars per city, split by gender.
rcParams.update({'figure.figsize': (15, 5)})
city_gender_ax = sb.countplot(data=df, x='City', hue='Gender', palette=['c', 'b'])
city_gender_ax.set_title('Gender Distribution by city')
plt.show()

Gender Distribution by Qualification¶

# Grouped counts: one cluster of bars per qualification, split by gender.
rcParams.update({'figure.figsize': (15, 5)})
qual_gender_ax = sb.countplot(data=df, x='Qualification', hue='Gender', palette=['c', 'b'])
qual_gender_ax.set_title('Gender Distribution by Qualification')
plt.show()

Salary Distribution by Gender¶

# Per-gender subsets of the data; later cells reuse `m` and `f`.
m = df.loc[df['Gender'].eq('Male')]
f = df.loc[df['Gender'].eq('Female')]
xi = m['Salary_in_rs']
yi = f['Salary_in_rs']

# Overlay the two salary histograms; the female layer is more transparent
# so the male layer stays visible underneath.
rcParams.update({'figure.figsize': (15, 5)})
for salaries, transparency, who in ((xi, 0.7, 'Male'), (yi, 0.4, 'Female')):
    plt.hist(salaries, alpha=transparency, label=who)
plt.title('Salary Distribution by Gender')
plt.xlabel('Salary')
plt.ylabel('Count')
plt.legend()
plt.show()

Age Distribution by Gender¶

# Per-gender subsets of the data; later cells reuse `m` and `f`.
m = df.loc[df['Gender'].eq('Male')]
f = df.loc[df['Gender'].eq('Female')]
xa = m['Age']
ya = f['Age']

# Overlay the two age histograms; the female layer is more transparent
# so the male layer stays visible underneath.
rcParams.update({'figure.figsize': (15, 5)})
for ages, transparency, who in ((xa, 0.7, 'Male'), (ya, 0.4, 'Female')):
    plt.hist(ages, alpha=transparency, label=who)
plt.title('Age Distribution by Gender')
plt.xlabel('Age')
plt.ylabel('Count')
plt.legend()
plt.show()

City wise Qualifications¶

# Grouped counts: one cluster of bars per qualification, one bar per city.
rcParams['figure.figsize'] = 15, 5
# One distinct colour per city (the data has 8 cities). The last entry used
# to repeat 'c', which drew two cities in the same cyan.
city_palette = ['c', 'b', 'r', 'g', 'y', 'k', 'm', 'orange']
sb.countplot(x='Qualification', hue='City', data=df, palette=city_palette)
plt.title('city wise qualifications')
plt.show()

City wise Qualifications of males¶

# Same chart as the overall city-wise view, restricted to the male subset `m`.
rcParams['figure.figsize'] = 15, 5
# One distinct colour per city (the data has 8 cities). The last entry used
# to repeat 'c', which drew two cities in the same cyan.
city_palette = ['c', 'b', 'r', 'g', 'y', 'k', 'm', 'orange']
sb.countplot(x='Qualification', hue='City', data=m, palette=city_palette)
plt.title('city wise qualifications of males')
plt.show()

City wise Qualifications of females¶

# Same chart as the overall city-wise view, restricted to the female subset `f`.
rcParams['figure.figsize'] = 15, 5
# One distinct colour per city (the data has 8 cities). The last entry used
# to repeat 'c', which drew two cities in the same cyan.
city_palette = ['c', 'b', 'r', 'g', 'y', 'k', 'm', 'orange']
sb.countplot(x='Qualification', hue='City', data=f, palette=city_palette)
plt.title('city wise qualifications of females')
plt.show()

Number of employees in each city¶

# Vertical bar chart of the row count per city, most frequent city first.
workers_per_city = data['City'].value_counts()
workers_per_city.plot(kind='bar')
plt.title("count of workers in each city")
plt.xlabel('city')
plt.ylabel('workers count')
plt.show()

Histogram¶

# One frame per city. The labels must match the spelling used in the data
# (the dataset spells the city 'Kolkatta').
delhi_df = df[df['City'] == 'Delhi']
bangalore_df = df[df['City'] == 'Bangalore']
kolkata_df = df[df['City'] == 'Kolkatta']
jaipur_df = df[df['City'] == 'Jaipur']
mumbai_df = df[df['City'] == 'Mumbai']
hyd_df = df[df['City'] == 'Hyderabad']
chennai_df = df[df['City'] == 'Chennai']
lucknow_df = df[df['City'] == 'Lucknow']

# (legend label, frame, colour) in drawing order. A colour of None lets
# matplotlib take the next colour from its default cycle. Looping over this
# table avoids single-letter temporaries, one of which used to overwrite the
# female dataframe `f` defined by earlier cells.
city_layers = [
    ('Delhi', delhi_df, None),
    ('Bangalore', bangalore_df, None),
    ('Kolkatta', kolkata_df, 'cyan'),
    ('Jaipur', jaipur_df, 'crimson'),
    ('Mumbai', mumbai_df, 'Black'),
    ('Hyderabad', hyd_df, 'Gold'),
    ('Chennai', chennai_df, 'DarkBlue'),
    ('Lucknow', lucknow_df, 'Lime'),
]

# matplotlib reads rc settings when plot elements are created, so set the
# font sizes before building the figure.
# NOTE: plt.rc changes global settings; these sizes also persist for every
# figure drawn after this cell.
plt.rc('axes', labelsize=10)
plt.rc('axes', titlesize=30)
plt.rc('xtick', labelsize=20)    # fontsize of the tick labels
plt.rc('ytick', labelsize=20)

plt.figure(figsize=(16,7))

# Shared bin edges keep the per-city histograms comparable. The range covers
# at least 0..175000 and is widened to the observed minimum/maximum, because
# plt.hist ignores values that fall outside the given bin edges.
all_salaries = df['Salary_in_rs']
bins = np.linspace(min(0, all_salaries.min()), max(175000, all_salaries.max()), 200)

for city_label, city_frame, city_colour in city_layers:
    plt.hist(city_frame['Salary_in_rs'], bins, alpha=0.5,
             label=city_label, color=city_colour)

plt.legend(loc='upper right', prop={'size' : 22})
plt.xlabel('Income')
plt.ylabel('Frequency', rotation=0)
plt.title('Salary Distribution by City')
# Save
#plt.savefig('Income_Dist_City')

plt.show()

Bar chart¶

Age Distribution by Gender¶

# Grouped counts: one cluster of bars per age value, split by gender.
rcParams.update({'figure.figsize': (15, 5)})
age_gender_ax = sb.countplot(data=df, x='Age', hue='Gender', palette=['c', 'b'])
age_gender_ax.set_title('Gender Distribution by Age')
plt.show()

# Grouped counts: one cluster of bars per Illness value, split by gender.
rcParams.update({'figure.figsize': (15, 5)})
illness_gender_ax = sb.countplot(data=df, x='Illness', hue='Gender', palette=['c', 'b'])
illness_gender_ax.set_title('Gender Classification by illness')
plt.show()

# Two-column working frame (salary, age) used by the scatter plot below.
x = df['Salary_in_rs']
y = df['Age']
df2 = pd.DataFrame({'sal': x, 'age': y})
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150000 entries, 0 to 149999
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype
---  ------  --------------   -----
 0   sal     150000 non-null  int64
 1   age     150000 non-null  int64
dtypes: int64(2)
memory usage: 2.3 MB

Scatter plot¶

# Scatter of salary (y axis) against age (x axis), one point per row of df2.
rcParams.update({'figure.figsize': (10, 5)})
ages_arr = df2['age'].values
salaries_arr = df2['sal'].values
sb.scatterplot(x=ages_arr, y=salaries_arr)
plt.title('Correlation between age and salary')
plt.xlabel('Age')
plt.ylabel('Salary')
plt.show()

import numpy as np
import matplotlib.pyplot as plt

# Per-gender subsets; each keeps every column of df.
m = df[df['Gender'] == 'Male']      # males only
f = df[df['Gender'] == 'Female']    # females only

# Mean salary per gender for three cities. Each mean is wrapped in a
# one-element Series because the values are read back as xa[0], ya[0], ...
mh = m[m['City']=='Hyderabad']
fh = f[f['City']=='Hyderabad']
xa = pd.Series(mh['Salary_in_rs'].mean())
ya = pd.Series(fh['Salary_in_rs'].mean())

mb = m[m['City']=='Bangalore']
fb = f[f['City']=='Bangalore']
xb = pd.Series(mb['Salary_in_rs'].mean())
yb = pd.Series(fb['Salary_in_rs'].mean())

mc = m[m['City']=='Chennai']
fc = f[f['City']=='Chennai']
xc = pd.Series(mc['Salary_in_rs'].mean())
yc = pd.Series(fc['Salary_in_rs'].mean())

# data to plot, in the order Hyderabad, Bangalore, Chennai
n_groups = 3
men_means = (xa[0], xb[0], xc[0])
women_means = (ya[0], yb[0], yc[0])

# create plot
fig, ax = plt.subplots()
index = np.arange(n_groups)
bar_width = 0.2
opacity = 0.8

# Men's bars are centred on `index`, women's bars sit directly to the right.
rects1 = plt.bar(index, men_means, bar_width,
                 alpha=opacity,
                 color='g',
                 label='Men')

rects2 = plt.bar(index + bar_width, women_means, bar_width,
                 alpha=opacity,
                 color='r',
                 label='Women')

plt.xlabel('Working place')
plt.ylabel('Salary')
plt.title('Salaries of working men and women in different places')
# Bars are centre-aligned by default, so the midpoint of each men/women pair
# is index + bar_width / 2; the tick goes there rather than under the
# women's bar.
plt.xticks(index + bar_width / 2, ('Hyderabad', 'Bangalore', 'Chennai'))
plt.legend()

plt.tight_layout()
plt.show()

Pie chart¶

from pandas._libs.tslibs.offsets import YearEnd
from matplotlib.axis import YAxis
import numpy as np
import matplotlib.pyplot as plt


# data to plot: xa/ya, xb/yb and xc/yc hold the Hyderabad, Bangalore and
# Chennai mean salaries (men / women) computed in the previous cell.
n_groups = 3
men_means = (xa[0], xb[0], xc[0])
women_means = (ya[0], yb[0], yc[0])

# create plot
import matplotlib.pyplot as plt
import numpy as np

# The labels must follow the order of `women_means` (Hyderabad, Bangalore,
# Chennai); any other order attaches the wrong city to every slice.
mylabels = ["Hyderabad", "Bangalore", "Chennai"]

plt.pie(women_means, labels = mylabels, autopct='%1.1f%%')
# NOTE(review): the slices are shares of the summed mean female salaries,
# not employment counts, so this legend title does not describe the data.
# Confirm whether the title or the plotted values should change.
plt.legend(title = "Citywise employment percentage of females")
plt.show()

	Number	Age	Salary_in_rs
count	150000.000000	150000.000000	150000.000000
mean	75000.500000	44.950200	91252.798273
std	43301.414527	11.572486	24989.500948
min	1.000000	25.000000	-654.000000
25%	37500.750000	35.000000	80867.750000
50%	75000.500000	45.000000	93655.000000
75%	112500.250000	55.000000	104519.000000
max	150000.000000	65.000000	177157.000000

	Number	City	Gender	Age	Salary_in_rs	Illness	Qualification
0	116407	Kolkatta	Female	57	87004	No	BBA
1	116408	Kolkatta	Male	60	93196	No	BCOM
2	116409	Kolkatta	Male	25	112492	No	BCOM
3	116410	Kolkatta	Male	28	91910	No	BCOM
4	116411	Kolkatta	Female	51	85497	No	BCOM

	Number	City	Gender	Age	Salary_in_rs	Illness	Qualification
149995	116402	Chennai	Female	59	131543	No	BBA
149996	116403	Chennai	Female	52	135768	Yes	MCOM
149997	116404	Chennai	Female	57	148654	No	BBA
149998	116405	Chennai	Male	62	125842	No	MCOM
149999	116406	Chennai	Male	64	139082	No	BCOM