Importing the libraries

In [1]:
import pandas as pd
import numpy as np

Importing the data

In [2]:
# Load the raw dataset from a CSV file (path is relative to the notebook's working directory).
df = pd.read_csv("toy_dataset.csv")

Adding a column to the existing dataframe

In [3]:
import numpy as np
# Qualification is a synthetic column: every row gets one degree drawn uniformly
# at random. A fixed seed keeps the column (and every chart built on it)
# identical across Restart & Run All.
degrees = ['BBA', 'BCOM', 'MBA', 'MCOM']
rng = np.random.default_rng(42)
df["Qualification"] = rng.choice(degrees, size=len(df))

Display column names

In [4]:
# List the column labels of the frame.
df.columns
Out[4]:
Index(['Number', 'City', 'Gender', 'Age', 'Salary_in_rs', 'Illness',
       'Qualification'],
      dtype='object')

Saving data into an excel sheet

In [5]:
# Build a separate DataFrame object named `data` from df.
data = pd.DataFrame(df) 
  
# Write that frame to an Excel workbook (relative path).
data.to_excel("toydataset_XL.xlsx")

Top 5 rows

In [6]:
# Preview the first five rows.
df.head()
Out[6]:
Number City Gender Age Salary_in_rs Illness Qualification
0 116407 Kolkatta Female 57 87004 No BBA
1 116408 Kolkatta Male 60 93196 No BCOM
2 116409 Kolkatta Male 25 112492 No BCOM
3 116410 Kolkatta Male 28 91910 No BCOM
4 116411 Kolkatta Female 51 85497 No BCOM

Last 5 rows

In [7]:
# Preview the last five rows.
df.tail()
Out[7]:
Number City Gender Age Salary_in_rs Illness Qualification
149995 116402 Chennai Female 59 131543 No BBA
149996 116403 Chennai Female 52 135768 Yes MCOM
149997 116404 Chennai Female 57 148654 No BBA
149998 116405 Chennai Male 62 125842 No MCOM
149999 116406 Chennai Male 64 139082 No BCOM

Data types

In [8]:
# Show the dtype pandas assigned to each column.
df.dtypes
Out[8]:
Number            int64
City             object
Gender           object
Age               int64
Salary_in_rs      int64
Illness          object
Qualification    object
dtype: object

Number of unique values in each column

In [9]:
# Count distinct values per column (a quick way to spot identifiers vs. categorical columns).
df.nunique()
Out[9]:
Number           150000
City                  8
Gender                2
Age                  41
Salary_in_rs      71761
Illness               2
Qualification         4
dtype: int64

Checking whether there are any missing values

In [10]:
# Per-column count of missing entries.
df.isna().sum()
Out[10]:
Number           0
City             0
Gender           0
Age              0
Salary_in_rs     0
Illness          0
Qualification    0
dtype: int64

Display rows with missing values, if there are any

In [11]:
# Keep only the rows in which at least one column is missing.
df.loc[df.isna().any(axis="columns")]
Out[11]:
Number City Gender Age Salary_in_rs Illness Qualification

Information about the data

In [12]:
# Print index range, per-column non-null counts, dtypes and memory usage.
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150000 entries, 0 to 149999
Data columns (total 7 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   Number         150000 non-null  int64 
 1   City           150000 non-null  object
 2   Gender         150000 non-null  object
 3   Age            150000 non-null  int64 
 4   Salary_in_rs   150000 non-null  int64 
 5   Illness        150000 non-null  object
 6   Qualification  150000 non-null  object
dtypes: int64(3), object(4)
memory usage: 8.0+ MB

Descriptive Statistics

Statistical description about the data

In [13]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
# NOTE(review): the output reports a negative minimum Salary_in_rs (-654) —
# confirm whether that is valid data before relying on salary charts.
df.describe()
Out[13]:
Number Age Salary_in_rs
count 150000.000000 150000.000000 150000.000000
mean 75000.500000 44.950200 91252.798273
std 43301.414527 11.572486 24989.500948
min 1.000000 25.000000 -654.000000
25% 37500.750000 35.000000 80867.750000
50% 75000.500000 45.000000 93655.000000
75% 112500.250000 55.000000 104519.000000
max 150000.000000 65.000000 177157.000000

Importing libraries required for plotting the graphs

In [14]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

Boxplot

In [15]:
# Narrow, tall canvas for a single vertical box of the Age column.
age_fig, age_ax = plt.subplots(figsize=(3, 5))
sns.boxplot(data=df, y="Age", ax=age_ax)
Out[15]:
<Axes: ylabel='Age'>
In [16]:
# Narrow, tall canvas for a single vertical box of the salary column.
salary_fig, salary_ax = plt.subplots(figsize=(3, 5))
sns.boxplot(data=df, y="Salary_in_rs", ax=salary_ax)
Out[16]:
<Axes: ylabel='Salary_in_rs'>

Horizontal Bar chart

In [17]:
import seaborn as sb
import matplotlib.pyplot as plt
from matplotlib import rcParams
from collections import Counter
import warnings
# Silence library warnings; this filter is global and stays active for the rest of the session.
warnings.filterwarnings("ignore")

rcParams['figure.figsize'] = (10, 5)

# Rows per city, drawn as horizontal bars (counts on x, city names on y).
city_counts = df['City'].value_counts()
ax = sb.barplot(x=city_counts.values, y=city_counts.index)
ax.set_title('City wise')
ax.set_xlabel('Counts')
ax.set_ylabel('Cities')
plt.show()

Bar chart

In [18]:
# Illness categories go on the x-axis and their row counts on the y-axis,
# so the axis labels have to follow that orientation.
illness_counts = df['Illness'].value_counts()
sb.barplot(x=illness_counts.index, y=illness_counts.values)
plt.title('Illness count')
plt.xlabel('Illness')
plt.ylabel('Counts')
plt.show()
In [19]:
rcParams['figure.figsize'] = (15, 5)

# Rows per gender: categories on x, counts on y.
gender_counts = df['Gender'].value_counts()
ax = sb.barplot(x=gender_counts.index, y=gender_counts.values)
ax.set_title('Gender Distribution overall')
plt.show()

Gender Distribution by city

In [20]:
rcParams['figure.figsize'] = (15, 5)

# One bar group per city, split by gender.
ax = sb.countplot(data=df, x='City', hue='Gender', palette=['c', 'b'])
ax.set_title('Gender Distribution by city')
plt.show()

Gender Distribution by Qualification

In [21]:
rcParams['figure.figsize'] = (15, 5)

# One bar group per qualification, split by gender.
ax = sb.countplot(data=df, x='Qualification', hue='Gender', palette=['c', 'b'])
ax.set_title('Gender Distribution by Qualification')
plt.show()

Salary Distribution by Gender

In [22]:
rcParams['figure.figsize'] = (15, 5)

# Gender subsets; `m` and `f` are also read by later cells.
m = df.loc[df['Gender'] == 'Male']
f = df.loc[df['Gender'] == 'Female']

# Overlay the two salary histograms; the lower alpha on the second keeps the first visible.
for subset, opacity, gender in ((m, 0.7, 'Male'), (f, 0.4, 'Female')):
    plt.hist(subset['Salary_in_rs'], alpha=opacity, label=gender)

plt.title('Salary Distribution by Gender')
plt.xlabel('Salary')
plt.ylabel('Count')
plt.legend()
plt.show()

Age Distribution by Gender

In [23]:
rcParams['figure.figsize'] = (15, 5)

# Gender subsets; `m` and `f` are also read by later cells.
m = df.loc[df['Gender'] == 'Male']
f = df.loc[df['Gender'] == 'Female']

# Overlay the two age histograms; the lower alpha on the second keeps the first visible.
for subset, opacity, gender in ((m, 0.7, 'Male'), (f, 0.4, 'Female')):
    plt.hist(subset['Age'], alpha=opacity, label=gender)

plt.title('Age Distribution by Gender')
plt.xlabel('Age')
plt.ylabel('Count')
plt.legend()
plt.show()

City wise Qualifications

In [24]:
rcParams['figure.figsize'] = 15,5

# The City hue has eight levels, so the palette needs eight *distinct* colours;
# a repeated entry would make two cities indistinguishable in bars and legend.
city_palette = ['c', 'b', 'r', 'g', 'y', 'k', 'm', 'orange']
sb.countplot(x='Qualification', hue='City', data=df, palette=city_palette)
plt.title('city wise qualifications')
plt.show()

City wise Qualifications of males

In [25]:
rcParams['figure.figsize'] = 15,5

# Filter here instead of relying on a global subset left behind by another cell,
# so the cell gives the same result regardless of execution order.
males = df[df['Gender'] == 'Male']

# The City hue has eight levels, so the palette needs eight *distinct* colours;
# a repeated entry would make two cities indistinguishable in bars and legend.
city_palette = ['c', 'b', 'r', 'g', 'y', 'k', 'm', 'orange']
sb.countplot(x='Qualification', hue='City', data=males, palette=city_palette)
plt.title('city wise qualifications of males')
plt.show()

City wise Qualifications of females

In [26]:
rcParams['figure.figsize'] = 15,5

# Filter here instead of relying on the global `f`: the salary-histogram cell
# rebinds `f` to a Series, which breaks this plot when cells run out of order.
females = df[df['Gender'] == 'Female']

# The City hue has eight levels, so the palette needs eight *distinct* colours;
# a repeated entry would make two cities indistinguishable in bars and legend.
city_palette = ['c', 'b', 'r', 'g', 'y', 'k', 'm', 'orange']
sb.countplot(x='Qualification', hue='City', data=females, palette=city_palette)
plt.title('city wise qualifications of females')
plt.show()

Number of employees in each city

In [27]:
# Count rows per city straight from df (the frame every other chart reads),
# so this cell does not depend on the Excel-export cell having been run.
df['City'].value_counts().plot.bar()
plt.title("count of workers in each city")
plt.xlabel('city')
plt.ylabel('workers count')
plt.show()

Histogram

In [28]:
# (city, colour) in drawing order. None lets matplotlib pick the next colour
# from its default cycle, as the first two histograms always did.
city_colors = [
    ('Delhi', None),
    ('Bangalore', None),
    ('Kolkatta', 'cyan'),
    ('Jaipur', 'crimson'),
    ('Mumbai', 'Black'),
    ('Hyderabad', 'Gold'),
    ('Chennai', 'DarkBlue'),
    ('Lucknow', 'Lime'),
]

plt.figure(figsize=(16,7))

# Shared bin edges spanning the full observed salary range. A fixed 0..175000
# range would silently drop rows: describe() shows min -654 and max 177157.
salaries = df['Salary_in_rs']
bins = np.linspace(salaries.min(), salaries.max(), 200)

# One semi-transparent histogram per city on the same axes. Looping also avoids
# single-letter globals such as `f`, which other cells use for the female subset.
for city, color in city_colors:
    plt.hist(salaries[df['City'] == city], bins, alpha=0.5, label=city, color=color)

plt.legend(loc='upper right', prop={'size' : 22})
plt.xlabel('Income')
plt.ylabel('Frequency', rotation=0)
# NOTE: plt.rc mutates matplotlib's global defaults, so these sizes also carry
# over to every figure drawn later in the session.
plt.rc('axes', labelsize=10) 
plt.rc('axes', titlesize=30) 
plt.rc('xtick', labelsize=20)    # fontsize of the tick labels
plt.rc('ytick', labelsize=20) 
plt.title('Salary Distribution by City')
# Save
#plt.savefig('Income_Dist_City')

plt.show()

Bar chart

Age Distribution by Gender

In [29]:
rcParams['figure.figsize'] = (15, 5)

# One bar group per age value, split by gender.
ax = sb.countplot(data=df, x='Age', hue='Gender', palette=['c', 'b'])
ax.set_title('Gender Distribution by Age')
plt.show()
In [30]:
rcParams['figure.figsize'] = (15, 5)

# One bar group per illness flag, split by gender.
ax = sb.countplot(data=df, x='Illness', hue='Gender', palette=['c', 'b'])
ax.set_title('Gender Classification by illness')
plt.show()
In [31]:
# Two-column frame (salary, age) used by the scatter plot that follows.
y = df['Age']
x = df['Salary_in_rs']
df2 = pd.DataFrame({'sal': x, 'age': y})
df2.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150000 entries, 0 to 149999
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype
---  ------  --------------   -----
 0   sal     150000 non-null  int64
 1   age     150000 non-null  int64
dtypes: int64(2)
memory usage: 2.3 MB

Scatter plot

In [32]:
rcParams['figure.figsize'] = (10, 5)

# Age on x, salary on y; plain arrays are passed so the axes carry no automatic labels.
ages = df2['age'].to_numpy()
pay = df2['sal'].to_numpy()
ax = sb.scatterplot(x=ages, y=pay)
ax.set_title('Correlation between age and salary')
ax.set_xlabel('Age')
ax.set_ylabel('Salary')
plt.show()
In [33]:
import numpy as np
import matplotlib.pyplot as plt

# Cities compared, in plotting order (left to right).
cities = ('Hyderabad', 'Bangalore', 'Chennai')

m = df[df['Gender'] == 'Male']      # all 7 columns, male rows only
f = df[df['Gender'] == 'Female']    # all 7 columns, female rows only

# Mean salary per city and gender. Each mean stays wrapped in a one-element
# Series because the pie-chart cell reads xa[0], ya[0], ... directly.
xa, xb, xc = (pd.Series(m.loc[m['City'] == city, 'Salary_in_rs'].mean()) for city in cities)
ya, yb, yc = (pd.Series(f.loc[f['City'] == city, 'Salary_in_rs'].mean()) for city in cities)

# data to plot
n_groups = len(cities)
men_means = (xa[0], xb[0], xc[0])
women_means = (ya[0], yb[0], yc[0])

# create plot
fig, ax = plt.subplots()
index = np.arange(n_groups)
bar_width = 0.2
opacity = 0.8

# Bars are centred on their x position: men at `index`, women one bar width to the right.
rects1 = ax.bar(index, men_means, bar_width,
                alpha=opacity,
                color='g',
                label='Men')

rects2 = ax.bar(index + bar_width, women_means, bar_width,
                alpha=opacity,
                color='r',
                label='Women')

ax.set_xlabel('Working place')
ax.set_ylabel('Salary')
ax.set_title('Salaries of working men and women in different places')
# Put each city label under the middle of its pair of bars
# (index + bar_width would sit under the Women bar only).
ax.set_xticks(index + bar_width / 2)
ax.set_xticklabels(cities)
ax.legend()

fig.tight_layout()
plt.show()

Pie chart

In [34]:
from pandas._libs.tslibs.offsets import YearEnd
from matplotlib.axis import YAxis
import numpy as np
import matplotlib.pyplot as plt


# data to plot
n_groups = 3
men_means = (xa[0], xb[0], xc[0])
women_means = (ya[0], yb[0], yc[0])

# create plot
import matplotlib.pyplot as plt
import numpy as np

# Labels must follow the order of women_means: ya, yb and yc are the
# Hyderabad, Bangalore and Chennai means respectively.
mylabels = ["Hyderabad", "Bangalore", "Chennai"]

plt.pie(women_means, labels = mylabels, autopct='%1.1f%%')
# NOTE(review): the wedges are shares of *mean female salary* per city, not
# head counts, so this legend title looks inaccurate — confirm the intended metric.
plt.legend(title = "Citywise employment percentage of females")
plt.show()