1) Explore the data in a notebook to learn about the dataset.
2) Engineer useful features for our machine learning models by manipulating tabular data with pandas and NumPy.
3) Train three machine learning models: logistic regression, random forest, and a multilayer perceptron neural network.
4) Use Kaggle to score our models and measure progress (see the submission sketch after this list).
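Scoring on Kaggle means uploading a CSV of predicted probabilities. Below is a minimal sketch of building that file; `model` and `testing_data` are placeholder names for whichever trained estimator and prepared test matrix are in scope, and the `SK_ID_CURR` ID column is an assumption based on the competition's submission format.

import pandas as pd

# Hypothetical sketch: build a Kaggle submission file from a fitted model.
# `model` and `testing_data` are placeholders; `SK_ID_CURR` is assumed to be
# the applicant ID column the competition expects.
submission = pd.DataFrame({
    'SK_ID_CURR': apptest['SK_ID_CURR'],
    'TARGET': model.predict_proba(testing_data)[:, 1],  # probability of default
})
submission.to_csv('submission.csv', index=False)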
# MISSING VALUE CHECK
import pandas as pd

# Define function
def check_missing_value(df):
    # Count how many values are missing in each column
    missing_values = df.isnull().sum()
    # Percentage of each column that is missing
    missing_values_percent = 100 * missing_values / len(df)
    # Combine counts and percentages into one table
    missing_values_table = pd.concat([missing_values, missing_values_percent], axis=1)
    # Rename the columns
    missing_values_table = missing_values_table.rename(
        columns={0: 'Missing values', 1: '% of total values'})
    # Return the summary table
    return missing_values_table

# Build a dataframe summarizing the missing data in the training set
expl_missing_values_df = check_missing_value(apptrain)

# Keep only the columns that have missing values, then sort by percentage missing
mvdf = expl_missing_values_df.loc[~(expl_missing_values_df == 0).all(axis=1)]
mvdf.sort_values(by=['% of total values'], ascending=False).head(30)
# Correlation of every numeric feature with the target
correlations = apptrain.corr()['TARGET'].sort_values()
print('Top 15 positive correlations', correlations.tail(15))
print('Top 15 negative correlations', correlations.head(15))
# Create copies of the train and test data
apptrain_domain = apptrain.copy()
apptest_domain = apptest.copy()

# Create domain features for the train data
# (kept as plain ratios so train and test features share the same scale;
# the original train version multiplied CREDIT_INCOME_PERCENT by 100, which
# would have put train and test on different scales)
apptrain_domain['CREDIT_INCOME_PERCENT'] = apptrain_domain['AMT_CREDIT'] / apptrain_domain['AMT_INCOME_TOTAL']
apptrain_domain['ANNUITY_INCOME_PERCENT'] = apptrain_domain['AMT_ANNUITY'] / apptrain_domain['AMT_INCOME_TOTAL']
apptrain_domain['CREDIT_TERM'] = apptrain_domain['AMT_ANNUITY'] / apptrain_domain['AMT_CREDIT']
apptrain_domain['DAYS_EMPLOYED_PERCENT'] = apptrain_domain['DAYS_EMPLOYED'] / apptrain_domain['DAYS_BIRTH']

# Create the same domain features for the test data
apptest_domain['CREDIT_INCOME_PERCENT'] = apptest_domain['AMT_CREDIT'] / apptest_domain['AMT_INCOME_TOTAL']
apptest_domain['ANNUITY_INCOME_PERCENT'] = apptest_domain['AMT_ANNUITY'] / apptest_domain['AMT_INCOME_TOTAL']
apptest_domain['CREDIT_TERM'] = apptest_domain['AMT_ANNUITY'] / apptest_domain['AMT_CREDIT']
apptest_domain['DAYS_EMPLOYED_PERCENT'] = apptest_domain['DAYS_EMPLOYED'] / apptest_domain['DAYS_BIRTH']
With our limited knowledge of the banking/financial industry, we created features whose relationship to default risk is intuitive. When correlated with the target using pandas, these domain features rank near the top of the list.
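To check that claim directly, here is a small sketch, assuming only the column names created above and the `TARGET` label column, that correlates just the new domain features with the target:

# Correlate the engineered domain features with the target
domain_corrs = apptrain_domain[
    ['CREDIT_INCOME_PERCENT', 'ANNUITY_INCOME_PERCENT',
     'CREDIT_TERM', 'DAYS_EMPLOYED_PERCENT', 'TARGET']
].corr()['TARGET'].sort_values()
print(domain_corrs)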
from sklearn.linear_model import LogisticRegression

# Build the model; the small C applies strong regularization
logistic_regression = LogisticRegression(C=0.0001, verbose=2, n_jobs=-1)

# Train the model on the scaled training data
logistic_regression.fit(training_data_scaled, train_labels)
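Assuming the leaderboard metric is ROC AUC (as in this competition), a quick hold-out check is useful before submitting. A minimal sketch, where `valid_data_scaled` and `valid_labels` are hypothetical names for a validation split made before fitting:

from sklearn.metrics import roc_auc_score

# Hypothetical hold-out evaluation; valid_data_scaled / valid_labels are
# assumed names for a validation split, not variables from the original code
valid_probs = logistic_regression.predict_proba(valid_data_scaled)[:, 1]
print('Validation ROC AUC:', roc_auc_score(valid_labels, valid_probs))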
from sklearn.ensemble import RandomForestClassifier

# Declare the model and tune parameters
# ('sqrt' replaces the old 'auto' alias, which newer scikit-learn removed;
# for classifiers both draw sqrt(n_features) candidate features per split)
Random_Forest = RandomForestClassifier(n_estimators=1000, verbose=1, n_jobs=-1, max_features='sqrt')

# Fit the forest to the training data
Random_Forest.fit(training_data, train_labels)
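A natural follow-up is to see which inputs drive the forest. A small sketch, where `feature_names` is a hypothetical list holding the column names of `training_data`:

import pandas as pd

# Rank features by the forest's impurity-based importances;
# feature_names is an assumed list of training_data's column names
importances = pd.DataFrame({
    'feature': feature_names,
    'importance': Random_Forest.feature_importances_,
}).sort_values('importance', ascending=False)
print(importances.head(15))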
from sklearn.impute import SimpleImputer  # replaces the removed sklearn.preprocessing.Imputer
from sklearn.preprocessing import MinMaxScaler

# Fill missing values with each column's median
imputer = SimpleImputer(strategy='median')
fe_training_data = imputer.fit_transform(fe_training_data)
fe_testing_data = imputer.transform(fe_testing_data)

# Scale every feature to the range [0, 1]
scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(fe_training_data)
fe_training_data = scaler.transform(fe_training_data)
fe_testing_data = scaler.transform(fe_testing_data)

from sklearn.neural_network import MLPClassifier

# Multilayer perceptron with ReLU activations and an adaptive learning rate
mlp = MLPClassifier(activation='relu', learning_rate='adaptive', max_iter=1000, verbose=True)
mlp.fit(fe_training_data, train_labels)
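Scaling to [0, 1] before fitting matters here because MLP training is sensitive to feature ranges. As a closing usage sketch, the trained network scores the test set that was imputed and scaled above:

# Predicted default probabilities for the prepared test set;
# predict_proba returns one column per class, and column 1 is P(TARGET = 1)
mlp_test_probs = mlp.predict_proba(fe_testing_data)[:, 1]
print(mlp_test_probs[:5])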