Get Number of NULLS in a DataFrame
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display
%matplotlib inline
train_data = pd.read_csv('./data/train.csv')
null_in_train_csv = train_data.isnull().sum()
null_in_train_csv = train_data.isnull().sum()
null_in_train_csv = null_in_train_csv[null_in_train_csv > 0]
null_in_train_csv.sort_values(inplace=True)
null_in_train_csv.plot.bar()
null_in_train_csv
Filter Columns based on Correlation:
sns.heatmap(train_csv.corr(), vmax=.8, square=True);
arr_train_cor = train_csv.corr()['SalePrice']
idx_train_cor_gt0 = arr_train_cor[arr_train_cor > 0].sort_values(ascending=False).index.tolist()
arr_train_cor[idx_train_cor_gt0]
Find count of zeroes in a Column:
zero_in_masvnrarea = train_meta['MasVnrArea'][train_meta['MasVnrArea'] == 0].index.tolist()
Replace all NULL with 0 if %of 0 in a column is more than 50%
null_in_masvnrarea = train_meta[train_meta['MasVnrArea'].isnull()].index.tolist()
zero_in_masvnrarea = train_meta['MasVnrArea'][train_meta['MasVnrArea'] == 0].index.tolist()
print("How many null value in MasVnrArea? %d / 1460" % len(null_in_masvnrarea))
print("How many zero value in MasVnrArea? %d / 1460" % len(zero_in_masvnrarea))
train_meta['MasVnrArea'][null_in_masvnrarea] = 0
Create a new column with value as 0 if 0 else for non zero value 1:
nonzero_in_masvnrarea = train_clean['MasVnrArea'][train_clean['MasVnrArea'] != 0].index.tolist()
train_clean['has_MasVnrArea'] = 0
train_clean['has_MasVnrArea'][nonzero_in_masvnrarea] = 1
Create a new binned column:
digitize: This is a function. First define the bins, i.e value for a range.
In the below example -1,1 -> 1, 1,1004 -> 2, 1004,4000 -> 3
When you run this output column will have 1/2/3 based on the value lies in which range.
Create a random Age list based on mean and std and the data for missing column records:
age_avg = concat_set['Age'].mean()
age_std = concat_set['Age'].std()
age_null_count = concat_set['Age'].isnull().sum()
age_null_random_list = np.random.randint(age_avg - age_std, age_avg + age_std, size = age_null_count)
concat_set['Age'][np.isnan(concat_set['Age'])] = age_null_random_list
concat_set['Age'] = concat_set['Age'].astype(int)
Create a Categorical Column:
concat_set['CategoricalAge'] = pd.cut(concat_set['Age'], 5)
Create a new column by Splitting a column and extract values from it:
concat_set['Title'] = concat_set['Name'].map(lambda name:name.split(',')[1].split('.')[0].strip())
concat_set['Title'].value_counts()
Fillna example:
concat_set['Fare'].fillna((concat_set['Fare'].median()), inplace=True)
If else one liner:
concat_set['IsAlone'] = 0
concat_set.loc[concat_set['FamilySize'] == 1, 'IsAlone'] = 1
Use of categorical function in pandas to create numericals:
for feature in concat_set.keys():
concat_set[feature] = pd.Categorical(concat_set[feature]).codes
Display all the columns in a df as output:
pd.set_option('display.max_columns', None)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display
%matplotlib inline
train_data = pd.read_csv('./data/train.csv')
null_in_train_csv = train_data.isnull().sum()
null_in_train_csv = train_data.isnull().sum()
null_in_train_csv = null_in_train_csv[null_in_train_csv > 0]
null_in_train_csv.sort_values(inplace=True)
null_in_train_csv.plot.bar()
null_in_train_csv
Filter Columns based on Correlation:
sns.heatmap(train_csv.corr(), vmax=.8, square=True);
arr_train_cor = train_csv.corr()['SalePrice']
idx_train_cor_gt0 = arr_train_cor[arr_train_cor > 0].sort_values(ascending=False).index.tolist()
arr_train_cor[idx_train_cor_gt0]
Find count of zeroes in a Column:
zero_in_masvnrarea = train_meta['MasVnrArea'][train_meta['MasVnrArea'] == 0].index.tolist()
Replace all NULL with 0 if %of 0 in a column is more than 50%
null_in_masvnrarea = train_meta[train_meta['MasVnrArea'].isnull()].index.tolist()
zero_in_masvnrarea = train_meta['MasVnrArea'][train_meta['MasVnrArea'] == 0].index.tolist()
print("How many null value in MasVnrArea? %d / 1460" % len(null_in_masvnrarea))
print("How many zero value in MasVnrArea? %d / 1460" % len(zero_in_masvnrarea))
train_meta['MasVnrArea'][null_in_masvnrarea] = 0
Create a new column with value as 0 if 0 else for non zero value 1:
nonzero_in_masvnrarea = train_clean['MasVnrArea'][train_clean['MasVnrArea'] != 0].index.tolist()
train_clean['has_MasVnrArea'] = 0
train_clean['has_MasVnrArea'][nonzero_in_masvnrarea] = 1
Create a new binned column:
digitize: This is a function. First define the bins, i.e value for a range.
In the below example -1,1 -> 1, 1,1004 -> 2, 1004,4000 -> 3
When you run this output column will have 1/2/3 based on the value lies in which range.
bins_totalbsmtsf = [-1, 1, 1004, 4000]
train_clean['binned_TotalBsmtSF'] = np.digitize(train_clean['TotalBsmtSF'], bins_totalbsmtsf)
Display in Ipython:
from IPython.display import display
-----------------------------------------------------------------------------
Merge Test and Train Data by removing to be predicted column:
concat_set = pd.concat((train_data, pd.read_csv('test.csv'))).reset_index()
age_avg = concat_set['Age'].mean()
age_std = concat_set['Age'].std()
age_null_count = concat_set['Age'].isnull().sum()
age_null_random_list = np.random.randint(age_avg - age_std, age_avg + age_std, size = age_null_count)
concat_set['Age'][np.isnan(concat_set['Age'])] = age_null_random_list
concat_set['Age'] = concat_set['Age'].astype(int)
Create a Categorical Column:
concat_set['CategoricalAge'] = pd.cut(concat_set['Age'], 5)
Create a new column by Splitting a column and extract values from it:
concat_set['Title'] = concat_set['Name'].map(lambda name:name.split(',')[1].split('.')[0].strip())
concat_set['Title'].value_counts()
Fillna example:
concat_set['Fare'].fillna((concat_set['Fare'].median()), inplace=True)
If else one liner:
concat_set['IsAlone'] = 0
concat_set.loc[concat_set['FamilySize'] == 1, 'IsAlone'] = 1
Use of categorical function in pandas to create numericals:
for feature in concat_set.keys():
concat_set[feature] = pd.Categorical(concat_set[feature]).codes
Display all the columns in a df as output:
pd.set_option('display.max_columns', None)
No comments:
Post a Comment