1) Explore the data in an exploratory notebook to learn about the dataset (a data-loading sketch follows this list).
2) Engineer useful features for our machine learning models by manipulating tabular data with pandas and numpy.
3) Create three machine learning models: Logistic Regression, Random Forest, and a Multilayer Perceptron neural network.
4) Use Kaggle to score our models and measure progress.
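Before any of these steps, the competition files have to be loaded. Below is a minimal setup sketch, assuming the Home Credit Default Risk files application_train.csv and application_test.csv sit in the working directory; the file paths are assumptions, while the apptrain/apptest names match the variables used throughout.

import numpy as np
import pandas as pd

# Load the Kaggle application data (paths are assumptions)
apptrain = pd.read_csv('application_train.csv')
apptest = pd.read_csv('application_test.csv')

# Quick sanity check: shapes and the label distribution
print('Train shape:', apptrain.shape)
print('Test shape:', apptest.shape)
print(apptrain['TARGET'].value_counts(normalize=True))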
# MISSING VALUE CHECK
# Define a function that summarizes missing data per column
def check_missing_value(df):
    # Count how many values are missing in each column
    missing_values = df.isnull().sum()
    # Convert the counts to percentages of the total rows
    missing_values_percent = 100 * missing_values / len(df)
    # Combine counts and percentages into one table
    missing_values_table = pd.concat([missing_values, missing_values_percent], axis=1)
    # Rename the columns to descriptive names
    missing_values_table = missing_values_table.rename(
        columns={0: 'Missing values', 1: '% of total values'})
    # Return the summary table
    return missing_values_table
# Build a dataframe with the function defined above to summarize the missing data
expl_missing_values_df = check_missing_value(apptrain)
# Keep only the columns that actually have missing values, then sort by percentage missing
mvdf = expl_missing_values_df.loc[~(expl_missing_values_df == 0).all(axis=1)]
mvdf.sort_values(by=['% of total values'], ascending=False).head(30)
# Correlate every numeric column with the TARGET label, sorted ascending
correlations = apptrain.corr()['TARGET'].sort_values()
print('Top 15 positive correlations:\n', correlations.tail(15))
print('Top 15 negative correlations:\n', correlations.head(15))
# Create copies of the train and test data before adding features
apptrain_domain = apptrain.copy()
apptest_domain = apptest.copy()
# Creating domain variables for the train data
apptrain_domain['CREDIT_INCOME_PERCENT'] = apptrain_domain['AMT_CREDIT'] / apptrain_domain['AMT_INCOME_TOTAL']
apptrain_domain['ANNUITY_INCOME_PERCENT'] = apptrain_domain['AMT_ANNUITY'] / apptrain_domain['AMT_INCOME_TOTAL']
apptrain_domain['CREDIT_TERM'] = apptrain_domain['AMT_ANNUITY'] / apptrain_domain['AMT_CREDIT']
apptrain_domain['DAYS_EMPLOYED_PERCENT'] = apptrain_domain['DAYS_EMPLOYED'] / apptrain_domain['DAYS_BIRTH']
# Creating the same domain variables for the test data (the formulas must match
# the train set exactly, or the model would see features on a different scale)
apptest_domain['CREDIT_INCOME_PERCENT'] = apptest_domain['AMT_CREDIT'] / apptest_domain['AMT_INCOME_TOTAL']
apptest_domain['ANNUITY_INCOME_PERCENT'] = apptest_domain['AMT_ANNUITY'] / apptest_domain['AMT_INCOME_TOTAL']
apptest_domain['CREDIT_TERM'] = apptest_domain['AMT_ANNUITY'] / apptest_domain['AMT_CREDIT']
apptest_domain['DAYS_EMPLOYED_PERCENT'] = apptest_domain['DAYS_EMPLOYED'] / apptest_domain['DAYS_BIRTH']
Even with our limited knowledge of the banking/financial industry, we were able to create features whose relationship to default risk is intuitive: loan size relative to income, annuity relative to income, the effective term of the loan, and the fraction of the applicant's life spent employed. These domain features rank near the top of the correlation list produced by pandas, which the sketch below verifies.
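A quick way to check that claim is to correlate just the new columns with the label. This is a minimal sketch, assuming apptrain_domain still carries the TARGET column copied from apptrain:

# Correlate only the engineered features with the label
domain_features = ['CREDIT_INCOME_PERCENT', 'ANNUITY_INCOME_PERCENT',
                   'CREDIT_TERM', 'DAYS_EMPLOYED_PERCENT']
print(apptrain_domain[domain_features].corrwith(apptrain_domain['TARGET']))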
from sklearn.linear_model import LogisticRegression

# Declare the model; the small C applies strong regularization
logistic_regression = LogisticRegression(C=0.0001, verbose=2, n_jobs=-1)
# Train the model on the scaled training data
logistic_regression.fit(training_data_scaled, train_labels)
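To get a Kaggle score out of the fitted model, we predict the probability of default for the test set and write a submission file. A minimal sketch, assuming testing_data_scaled is the test matrix preprocessed the same way as training_data_scaled, and that apptest still holds the SK_ID_CURR identifiers:

# Probability of the positive class (default) for each test applicant
log_reg_pred = logistic_regression.predict_proba(testing_data_scaled)[:, 1]

# Kaggle expects a CSV with SK_ID_CURR and the predicted TARGET probability
submission = apptest[['SK_ID_CURR']].copy()
submission['TARGET'] = log_reg_pred
submission.to_csv('log_reg_baseline.csv', index=False)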
from sklearn.ensemble import RandomForestClassifier

# Declare the model with 1000 trees; max_features='sqrt' is what the old
# 'auto' setting meant for classifiers (newer scikit-learn removed 'auto')
random_forest = RandomForestClassifier(n_estimators=1000, verbose=1, n_jobs=-1,
                                       max_features='sqrt')
random_forest.fit(training_data, train_labels)
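One advantage of the random forest is that it reports how useful each feature was to the trees. A small sketch, where feature_names is an assumed list holding the column names of training_data in order (e.g. the dataframe's columns before the matrix was built):

# Pair each feature with its importance and show the strongest ones
importances = pd.DataFrame({'feature': feature_names,
                            'importance': random_forest.feature_importances_})
print(importances.sort_values('importance', ascending=False).head(15))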
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler

# Fill missing values with the column median, learned from the training data only
# (SimpleImputer replaces the old sklearn.preprocessing.Imputer, which was removed)
imputer = SimpleImputer(strategy='median')
fe_training_data = imputer.fit_transform(fe_training_data)
fe_testing_data = imputer.transform(fe_testing_data)

# Scale every feature to the [0, 1] range, again fitting on the training data only
scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(fe_training_data)
fe_training_data = scaler.transform(fe_training_data)
fe_testing_data = scaler.transform(fe_testing_data)
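The fit-on-train, transform-only-on-test discipline above is exactly what scikit-learn's Pipeline packages up, which makes it harder to accidentally leak test-set statistics into preprocessing. The sketch below is an equivalent alternative formulation of the same two steps (it would replace them, not run after them):

from sklearn.pipeline import Pipeline

# Chain imputation and scaling so both are fit on the training data in one call
preprocess = Pipeline([('impute', SimpleImputer(strategy='median')),
                       ('scale', MinMaxScaler(feature_range=(0, 1)))])
fe_training_data = preprocess.fit_transform(fe_training_data)
fe_testing_data = preprocess.transform(fe_testing_data)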
from sklearn.neural_network import MLPClassifier

# Declare the network: ReLU activations and up to 1000 training iterations
# (note: learning_rate='adaptive' only takes effect with solver='sgd';
# the default solver here is 'adam', which ignores it)
mlp = MLPClassifier(activation='relu', learning_rate='adaptive', max_iter=1000, verbose=True)
mlp.fit(fe_training_data, train_labels)
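Kaggle scores this competition on ROC AUC, so it is worth estimating the same metric locally before submitting. A minimal sketch, assuming we carve a validation split out of the training data (the split below is an assumption, not something done above):

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

# Hold out 20% of the training data to estimate the leaderboard metric
X_tr, X_val, y_tr, y_val = train_test_split(fe_training_data, train_labels,
                                            test_size=0.2, random_state=42)
mlp.fit(X_tr, y_tr)
val_pred = mlp.predict_proba(X_val)[:, 1]
print('Validation ROC AUC:', roc_auc_score(y_val, val_pred))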