In this Jupyter notebook, I use the PhD applications dataset provided by my master's thesis advisor and apply standard, well-known machine learning methods to predict whether a student is accepted, waitlisted, or rejected.
Table of Contents
import warnings
warnings.filterwarnings('ignore')
warnings.filterwarnings(action='ignore',category=DeprecationWarning)
warnings.filterwarnings(action='ignore',category=FutureWarning)
import pandas as pd
import numpy as np
import xgboost as xgb
import lightgbm as lgb
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as stats
# For data visualization
%matplotlib inline
from matplotlib import pyplot as plt
from matplotlib import style
# All the models we will apply
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.pipeline import make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.metrics import mean_squared_error
from sklearn.metrics import confusion_matrix
#from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import accuracy_score
from scipy.stats import skew # for some statistics
from scipy.special import boxcox1p
from scipy.stats import boxcox_normmax
from sklearn import tree
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import GaussianNB
from xgboost.sklearn import XGBClassifier
from sklearn.linear_model import ElasticNet, Lasso, BayesianRidge, LassoLarsIC
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import TheilSenRegressor
from sklearn.linear_model import RANSACRegressor
from sklearn.linear_model import HuberRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import ElasticNetCV, LassoCV, RidgeCV
from sklearn.svm import SVR
from mlxtend.regressor import StackingCVRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.model_selection import GridSearchCV
from scipy.stats import uniform
# To support both python 2 and python 3
from __future__ import division, print_function, unicode_literals
# Common imports
import numpy as np
import os
# to make this notebook's output stable across runs
np.random.seed(42)
# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)
# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "MStat"
def save_fig(fig_id, tight_layout=True):
    path = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID, fig_id + ".png")
    os.makedirs(os.path.dirname(path), exist_ok=True)  # make sure the target folder exists
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format='png', dpi=300)
# Ignore useless warnings (see SciPy issue #5998)
import warnings
warnings.filterwarnings(action="ignore", message="^internal gelsd")
The data used for this analysis was collected from a major university's graduate mathematics application system for students applying to the Mathematics PhD program. The information is used by the Department of Mathematics to determine which applicants will be admitted into the graduate program. Each year, members of the department review each graduate application and give the prospective student a rating score between one and five, five being the best, with all values in between possible. This rating score determines whether an applicant is accepted, rejected, or put on a waitlist for the university's mathematics graduate program.
The rating score (RATING) and whether an applicant is accepted, rejected, or put on a waitlist (DECISION) are the variables of interest for this project. The purpose of this research is to create both regression and classification models that can accurately predict RATING and DECISION based on the data submitted by the student. The models we use include Random Forest, Gradient Boosting, generalized linear models, stacked ensembles, XGBoost, and deep learning.
The data is collected in a spreadsheet for easy visual inspection. Each row represents a single applicant identified by a unique identification number, and each application consists of the qualitative and quantitative data described in the table below; these variables make up the columns of the spreadsheet. Note that some of these fields are optional for the student to submit, so not every field has an entry for every student. This creates an issue of missing data, and later on we will discuss how it was dealt with.
from IPython.display import HTML, display
import tabulate
table = [["Applicant Client ID","Application ID"],
["Emphasis Area",'First choice of study area'],
["Emphasis Area 2",'Secondary choice of study area'],
["Emphasis Area 3",'Tertiary choice of area they would like to study area'],
["UU_APPL_CITIZEN","US Citizen (Yes or No)"],
["AGE",'Age of the applicant in years'],
["SEX",'Gender of the applicant: Male or Female'],
["LOW_INCOME",'If the applicant is coming from low income family'],
["UU_FIRSTGEN",'If the appicant is the first generation attending grad school'],
["UU_APPL_NTV_LANG","Applicant's native language"],
["HAS_LANGUAGE_TEST",'Foreign Language Exam, if applicable (TOEFL IBT, IELTS, or blank)'],
["TEST_READ",'Score on the reading part of TOEFL'],
["TEST_SPEAK",'Score on the speaking part of TOEFL'],
["TEST_WRITE",'Score on the writing part of TOEFL'],
["TEST_LISTEN",'Score on the listening part of TOEFL'],
["MAJOR",'Applicants undergraduate major'],
["GPA",'Applicants GPA'],
["NUM_PREV_INSTS",'Number of the previous instituions student studied'],
["HAS_GRE_GEN",'If applicant has taken GRE General exam'],
["GRE_VERB",'Raw score on verbal part of the GRE'],
["GRE_QUANT",'Raw score on the quantitative part of the GRE'],
["GRE_AW",'Raw score on analytical writing part of the GRE'],
["HAS_GRE_SUBJECT",'If applicant has taken GRE Subject exam'],
["GRE_SUB",'Raw score on Math subject GRE'],
["NUM_RECOMMENDS",'Number of recommenders of the applicant'],
["R_AVG_ORAL","Average of recommender's for applicants oral excellence" ],
["R_AVG_WRITTEN","Average of recommender's for applicants oral excellence"],
["R_AVG_ACADEMIC","Average of recommender's for applicants oral excellence"],
["R_AVG_KNOWLEDGE","Average of recommender's for applicants oral excellence"],
["R_AVG_EMOT","Average of recommender's for applicants oral excellence"],
["R_AVG_MOT","Average of recommender's for applicants oral excellence"],
["R_AVG_RES","Average of recommender's for applicants oral excellence"],
["R_AVG_RATING","Average of recommender's for applicants oral excellence"],
["RATING",'Rating score (any value between 1-5)'],
["DECISION",'Faculty application decision (Accept, Reject, or Waitlist)'], ]
headers = ["Variable", "Description"]
display(HTML(tabulate.tabulate(table, headers, tablefmt='html')))
The data set includes 759 graduate applications submitted for admission in Fall 2016, Fall 2017, Fall 2018, and Fall 2019. There are missing data points throughout the dataset. The missingno bar chart shown later describes the number of missing values for each variable across the whole data set. Missing data is represented by shorter bars: the bottom of the chart lists the variable names, the left axis gives the percentage of missing data, and the numbers on the right record the count of non-missing entries for each variable. For example, TEST_READ, TEST_SPEAK, TEST_WRITE, and TEST_LISTEN have shorter bars.
The applicant's age (AGE) was calculated from the applicant's birthday and is accurate as of 1 January of the year in which they applied. Also, since not all universities use the same GPA scale, GPA values over four were reviewed and rescaled based on information deduced from the applicant's resume.
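As an illustration of the kind of conversion involved (the actual adjustments were made by hand from resume information, so this linear rescaling is only a sketch):
# Hypothetical linear rescaling from a non-4.0 GPA scale (illustrative only)
def rescale_gpa(gpa, source_max):
    return round(gpa / source_max * 4.0, 2)
print(rescale_gpa(8.5, 10.0))  # a GPA of 8.5 on a 10-point scale becomes 3.4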
Load the data and look at the first four rows.
students = pd.read_csv('Sergazy_last.csv')
#students.head(4)
Let us get an overview of the data and see what kinds of variables we have.
students.info()
There are 759 observations with 36 columns.
Let us count the number of missing values for each variable.
students.isnull().sum()
Now let us visualize the relationships between the variables.
# to make this notebook's output identical at every run
np.random.seed(42)
students['DECISION'].value_counts()
students['AGE'].plot(kind = 'hist')
students.plot(kind = 'scatter',x = 'GPA',y = 'RATING')
students.plot(kind="scatter", x="GPA", y="RATING", alpha=0.4,
s=students["AGE"], label="AGE", figsize=(10,7),
c="R_AVG_RATING", cmap=plt.get_cmap("jet"), colorbar=True,
sharex=False)
plt.legend()
save_fig("students_rating_GPA_scatterplot")
Let us see how DECISION breaks down by SEX.
sns.countplot(data = students, x = 'DECISION', hue = 'SEX')
#sns.countplot(data = students, x = 'DECISION', hue = 'MAJOR')
sns.catplot(data = students, x = 'DECISION', hue = 'MAJOR', kind = 'count', col = 'SEX')
These count plots do not tell us much, except that applicants with unspecified sex were accepted, rejected, and waitlisted in roughly equal numbers.
sns.scatterplot(x = 'AGE',y = 'GPA', data =students,hue = 'DECISION')
sns.barplot(y = 'R_AVG_RATING',x = 'DECISION', data = students)
sns.countplot(data = students, x = 'DECISION', hue = 'LOW_INCOME')
#UU_FIRSTGEN
#UU_APPL_NTV_LANG
sns.countplot(data = students, x = 'DECISION', hue = 'UU_FIRSTGEN')
sns.countplot(data = students, x = 'DECISION', hue = 'NUM_PREV_INSTS')
These histograms suggest that low-income status, being the first generation in your family to attend grad school, and the number of previous institutions attended are not very relevant to getting accepted.
bins = np.arange(0, 80, 5)
g = sns.FacetGrid(students, row='SEX', col='UU_FIRSTGEN', hue='DECISION', margin_titles=True, height=3, aspect=1.1)
g.map(sns.distplot, 'AGE', kde=False, bins=bins, hist_kws=dict(alpha=0.7))
g.add_legend()
plt.show()
Here we do not see any blue in the bottom-left corner. Unspecified first-generation status and unspecified sex appear to be associated with a higher chance of being admitted.
The plot below shows that waitlisted applicants have a higher average GPA than admitted applicants.
sns.catplot(x='GPA', y='DECISION', data=students, aspect=2.0, kind='point')
Below we see that applicants who attended a higher number of previous institutions have a better chance of being admitted.
sns.catplot(x='NUM_PREV_INSTS', y='DECISION', data=students, aspect=2.0, kind='point')
print(type(students.groupby('SEX').mean()))
students.groupby('SEX').mean()
#students.groupby('SEX').sum().plot(kind='bar');
students.groupby(['SEX', 'DECISION'] )['DECISION'].count().plot.bar(figsize=(8, 6));
students.groupby(['SEX', 'DECISION'] )['DECISION'].count().unstack('SEX')
students.groupby(['SEX', 'DECISION','UU_FIRSTGEN'] )['DECISION'].count().unstack('SEX')
#students.groupby(['SEX', 'DECISION','UU_FIRSTGEN'] )['DECISION'].count().unstack(1).plot.bar(figsize=(8, 6));
students.isnull().sum()
import missingno as msno
ax = msno.bar(students)
ax.get_figure().savefig('missing_data.png')
First we drop Applicant_Client_ID.
students = students.drop('Applicant_Client_ID',axis =1)
Looking at this, we can either drop the eight variables that have missing values, or fill them in with mean, median, or most common values.
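Both options are sketched below on copies of the DataFrame; in what follows we impute with group means instead.
# Option 1: drop every column that contains missing values
dropped = students.dropna(axis=1)
# Option 2: fill numeric missing values with the column mean
filled = students.fillna(students.mean(numeric_only=True))
print(dropped.shape, filled.shape)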
First, let us take a look at GPA. A GPA should not be higher than 4, so let us check whether any values exceed it.
students['GPA'].nlargest(6)
This shows that two students entered a GPA higher than 4. To be consistent, we will cap both at 4.00.
students.loc[733,'GPA'] = 4.00
students.loc[105,'GPA'] = 4.00
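Equivalently, the same cap can be applied in one step, without hard-coding row indices:
# Cap all GPA values at 4.0 in one step (same effect as the two assignments above)
students['GPA'] = students['GPA'].clip(upper=4.0)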
students.columns
According to https://www.prepscholar.com/toefl/blog/what-is-the-average-toefl-score/, the average TOEFL scores for the United States are 21 for Reading, 23 for Listening, 23 for Speaking, and 22 for Writing, a total of 89. We will first look at the average TOEFL section scores of our applicants.
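For a quick comparison, we can compute the overall TOEFL section averages among our applicants:
# Average TOEFL section scores in our data (missing values are skipped automatically)
toefl_cols = ['TEST_READ', 'TEST_LISTEN', 'TEST_SPEAK', 'TEST_WRITE']
print(students[toefl_cols].mean())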
# with pd.option_context('display.max_rows', None, 'display.max_columns', None):
# print(students.head())
students.groupby('SEX').mean()
g_mean = students.groupby('SEX').mean()
# Fill each missing test score with the mean score of the applicant's SEX group
score_cols = ['TEST_READ', 'TEST_SPEAK', 'TEST_LISTEN', 'TEST_WRITE',
              'GRE_VERB', 'GRE_QUANT', 'GRE_AW', 'GRE_SUB']
for col in score_cols:
    for sex in ['Male', 'Female', 'Unspecified']:
        students.loc[students[col].isnull() & (students.SEX == sex), col] = g_mean[col][sex]
We have replaced the missing values of each test-score variable with the mean of the other observations in the same SEX group.
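For reference, the same group-mean imputation can be written more compactly with groupby/transform; this sketch produces exactly the same fills as the loop above.
# Equivalent group-mean imputation via groupby/transform
for col in score_cols:
    students[col] = students[col].fillna(students.groupby('SEX')[col].transform('mean'))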
students.isnull().sum()
#with pd.option_context('display.max_rows', None, 'display.max_columns', None):
# print(citizenship.get_group('UNITED STATES'))
#students.info()
Let us now look at the histogram of each variable.
%matplotlib inline
import matplotlib.pyplot as plt
students.hist(bins=50, figsize=(24,19))
save_fig("attribute_histogram_plots")
plt.show()
from scipy import stats
from scipy.stats import norm, skew #for some statistics
sns.distplot(students['RATING'] , fit=norm);
# Get the fitted parameters used by the function
(mu, sigma) = norm.fit(students['RATING'])
print( '\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))
#Now plot the distribution
plt.legend(['Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)],
loc='best')
plt.ylabel('Frequency')
plt.title('Rating distribution')
#Get also the QQ-plot
fig = plt.figure()
res = stats.probplot(students['RATING'], plot=plt)
plt.show()
The distribution is slightly left-skewed, but we will keep it as it is.
Now the data is almost ready. We need to convert the categorical variables to numeric ones.
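For comparison, plain one-hot encoding is sketched below; we use binary encoding instead because it keeps the column count manageable for high-cardinality variables such as MAJOR.
# One-hot alternative (illustrative only, on a small subset of the categorical columns)
demo = pd.get_dummies(students[['SEX', 'LOW_INCOME']])
print(demo.head())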
#students.head(12)
students_ce = students.copy()
import category_encoders as ce
encoder = ce.BinaryEncoder(cols=['Emphasis Area','Emphasis Area 2','Emphasis Area 3','UU_APPL_CITIZEN',
'CTZNSHP','SEX','LOW_INCOME','UU_FIRSTGEN','UU_APPL_NTV_LANG',
'HAS_LANGUAGE_TEST','HAS_GRE_GEN','HAS_GRE_SUBJECT','MAJOR'
])
df_binary = encoder.fit_transform(students_ce)
df_binary.head()
df = df_binary
#df_binary.shape
#df_binary.info()
#df.info()
First, we will predict DECISION, with RATING among the predictors.
# let us divide this into train and test data
X_train, X_test, Y_train, Y_test = train_test_split(
df.loc[:, df.columns != 'DECISION'],
df['DECISION'], test_size = 0.2, random_state=0)
print(X_train.shape)
print(Y_train.shape)
print(X_test.shape)
print(Y_test.shape)
#X_train.info()
#Y_train
#cleanup_nums = {"DECISION": {"Admit": 1, "Waitlist": 2, "Reject": 3 }}
#df.replace(cleanup_nums, inplace=True)
#df["DECISION"] = df["DECISION"].astype('category')
#df.info()
#This is a crucial step in rescaling input data so that all the features are mean zero with a unit variance.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
classifier = AdaBoostClassifier(base_estimator = None,
algorithm = 'SAMME',n_estimators=100, learning_rate=1.0)
classifier.fit(X_train, Y_train)
print('Accuracy on a train set: ',classifier.score(X_train, Y_train))
#how about on a test set
Y_pred = classifier.predict(X_test)
cm = confusion_matrix(Y_test, Y_pred)
print(cm)
adaboost = cm.trace()/cm.sum()
print('Accuracy on a test set: ', adaboost)
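The test accuracy above is the trace of the confusion matrix divided by its sum, i.e. correct predictions over all predictions. The small check below confirms this matches sklearn's accuracy_score.
# trace/sum of the confusion matrix equals the overall accuracy
assert np.isclose(cm.trace()/cm.sum(), accuracy_score(Y_test, Y_pred))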
classifier = DecisionTreeClassifier(criterion = 'entropy',random_state = 0)
classifier.fit(X_train, Y_train)
print('Accuracy on a train set: ',classifier.score(X_train, Y_train))
#how about on a test set
Y_pred = classifier.predict(X_test)
cm = confusion_matrix(Y_test, Y_pred)
print(cm)
#print(accuracy_score(Y_test, Y_pred))
d_tree = cm.trace()/cm.sum()
print('Accuracy on a test set: ', d_tree)
classifier = GaussianNB()
classifier.fit(X_train, Y_train)
print('Accuracy on a train set: ',classifier.score(X_train, Y_train))
#how about on a test set
Y_pred = classifier.predict(X_test)
cm = confusion_matrix(Y_test, Y_pred)
print(cm)
gnb=cm.trace()/cm.sum()
print('Accuracy on a test set: ', gnb)
classifier = KNeighborsClassifier(leaf_size = 100, p=2)
classifier.fit(X_train, Y_train)
print('Accuracy on a train set: ',classifier.score(X_train, Y_train))
#how about on a test set
Y_pred = classifier.predict(X_test)
cm = confusion_matrix(Y_test, Y_pred)
print(cm)
knn = cm.trace()/cm.sum()
print('Accuracy on a test set: ', knn)
classifier = LinearSVC(max_iter = 10000, class_weight = 'balanced', random_state = 0)
classifier.fit(X_train, Y_train)
print('Accuracy on a train set: ',classifier.score(X_train, Y_train))
#how about on a test set
Y_pred = classifier.predict(X_test)
cm = confusion_matrix(Y_test, Y_pred)
print(cm)
lsvm = cm.trace()/cm.sum()
print('Accuracy on a test set: ', lsvm)
classifier = LogisticRegression(C=0.65,random_state = 0, solver='lbfgs',
multi_class='multinomial',class_weight='balanced', max_iter = 10000)
classifier.fit(X_train, Y_train)
print('Accuracy on a train set: ',classifier.score(X_train, Y_train))
#how about on a test set
Y_pred = classifier.predict(X_test)
cm = confusion_matrix(Y_test, Y_pred)
print(cm)
log_regr= cm.trace()/cm.sum()
print('Accuracy on a test set: ', log_regr)
#classifier = RandomForestClassifier(n_estimators = 100, criterion = 'gini', random_state = 0)
classifier = RandomForestClassifier(n_estimators = 100, criterion = 'entropy',
class_weight='balanced',random_state = 0,
bootstrap=True)
classifier.fit(X_train, Y_train)
print('Accuracy on a train set: ',classifier.score(X_train, Y_train))
#how about on a test set
Y_pred = classifier.predict(X_test)
cm = confusion_matrix(Y_test, Y_pred)
print(cm)
rf = cm.trace()/cm.sum()
print('Accuracy on a test set: ', rf)
classifier = Perceptron(random_state = 0, class_weight = 'balanced')
classifier.fit(X_train, Y_train)
print('Accuracy on a train set: ',classifier.score(X_train, Y_train))
#how about on a test set
Y_pred = classifier.predict(X_test)
cm = confusion_matrix(Y_test, Y_pred)
print(cm)
perc = cm.trace()/cm.sum()
print('Accuracy on a test set: ', perc)
classifier = linear_model.SGDClassifier(max_iter=5, tol=None,random_state = 0,
alpha=0.0001, average=True,class_weight = 'balanced')
classifier.fit(X_train, Y_train)
print('Accuracy on a train set: ',classifier.score(X_train, Y_train))
#how about on a test set
Y_pred = classifier.predict(X_test)
cm = confusion_matrix(Y_test, Y_pred)
print(cm)
sgd = cm.trace()/cm.sum()
print('Accuracy on a test set: ', sgd)
classifier = SVC(max_iter = 10000, random_state = 0, gamma = 'auto')
classifier.fit(X_train, Y_train)
print('Accuracy on a train set: ',classifier.score(X_train, Y_train))
#how about on a test set
Y_pred = classifier.predict(X_test)
cm = confusion_matrix(Y_test, Y_pred)
print(cm)
svm = cm.trace()/cm.sum()
print('Accuracy on a test set: ', svm)
classifier = XGBClassifier(learning_rate =0.1, n_estimators=1000, max_depth=5, min_child_weight=1, gamma=0,
subsample=0.8, colsample_bytree=0.8,
nthread=4, scale_pos_weight=1, seed=27)
classifier.fit(X_train, Y_train)
print('Accuracy on a train set: ', classifier.score(X_train, Y_train))
#how about on a test set
Y_pred = classifier.predict(X_test)
cm = confusion_matrix(Y_test, Y_pred)
print(cm)
xgboost = cm.trace()/cm.sum()
print('Accuracy on a test set: ', xgboost)
table = pd.DataFrame({
'Model': ['Support Vector Machines', 'Linear Support Vector Machines', 'KNN', 'Logistic Regression',
'Random Forest', 'Naive Bayes', 'Perceptron', 'Stochastic Gradient Descent',
'Decision Tree', 'Adaptive Boosting Classifier','XG Boost Classifier'],
'Score': [svm,lsvm, knn, log_regr,
rf, gnb, perc, sgd,
d_tree, adaboost, xgboost]})
table_df = table.sort_values(by='Score', ascending=False)
table_df = table_df.set_index('Model')
table_df
## We will use this function to see which parameters give the best score
def best_score(classifier):
    print('Accuracy on a train set: ', classifier.best_score_)
    print('Best parameters: ', classifier.best_params_)
    print('Best Estimator: ', classifier.best_estimator_)
    return classifier.best_score_
param_grid = {'n_estimators': [30, 50, 100], 'learning_rate': [0.08, 0.1, 1, 5, 10]}
ada_grid = GridSearchCV(AdaBoostClassifier(algorithm = 'SAMME'), param_grid, cv=10, refit=True, verbose=1)
ada_grid.fit(X_train,Y_train)
score_ada_grid = best_score(ada_grid)
y_pred_grid = ada_grid.predict(X_test)
cm_grid = confusion_matrix(Y_test, y_pred_grid)
print(cm_grid)
ada_grid_accuracy = cm_grid.trace()/cm_grid.sum()
print('Gridsearch accuracy on a test set is: ', ada_grid_accuracy)
decs_tree = DecisionTreeClassifier(random_state = 0)
#param_grid = {'min_samples_split': [4,7,10,12], criterion : ['entropy','gini']}
param_grid = {'min_samples_split' : range(10,500,20),'max_depth': range(1,20,2)}
decs_tree_grid = GridSearchCV(decs_tree, param_grid, cv=10, refit=True, verbose=1)
decs_tree_grid.fit(X_train, Y_train)
#print(decs_tree_grid.score(X_train_sc, Y_train))
score_decs_tree_grid= best_score(decs_tree_grid)
y_pred_grid = decs_tree_grid.predict(X_test)
cm_grid = confusion_matrix(Y_test, y_pred_grid)
print(cm_grid)
decs_tree_grid_accuracy = cm_grid.trace()/cm_grid.sum()
print('Gridsearch accuracy on a test set is: ', decs_tree_grid_accuracy)
knn = KNeighborsClassifier()
param_grid = {'leaf_size':list(range(3,15,1)), 'n_neighbors':list(range(1, 15, 1)),
'weights':['uniform', 'distance']}
knn_grid = GridSearchCV(knn, param_grid, cv=10, verbose=1, scoring='accuracy')
knn_grid.fit(X_train, Y_train)
score_knn_grid = best_score(knn_grid)
y_pred_grid = knn_grid.predict(X_test)
cm_grid = confusion_matrix(Y_test, y_pred_grid)
print(cm_grid)
knn_grid_accuracy = cm_grid.trace()/cm_grid.sum()
print('Gridsearch accuracy on a test set is: ', knn_grid_accuracy)
param_grid = {"max_depth": [8,10,15], "learning_rate" : [0.008,0.01,0.012],
"num_leaves": [80,100,120], "n_estimators": [200,250] }
lgbm_grid = GridSearchCV(lgb.LGBMClassifier(silent=False), param_grid, cv=10, refit=True, verbose=1)
lgbm_grid.fit(X_train,Y_train, verbose=True)
score_lgbm_grid = best_score(lgbm_grid)
y_pred_grid = lgbm_grid.predict(X_test)
cm_grid = confusion_matrix(Y_test, y_pred_grid)
print(cm_grid)
lgbm_grid_accuracy = cm_grid.trace()/cm_grid.sum()
print('Gridsearch accuracy on a test set is: ', lgbm_grid_accuracy)
param_grid = {'C': [0.1,10, 100, 1000,5000]}
lsvm_grid = GridSearchCV(LinearSVC(max_iter = 10000), param_grid, cv=10, refit=True, verbose=1)
lsvm_grid.fit(X_train,Y_train)
score_lsvm_grid= best_score(lsvm_grid)
y_pred_grid = lsvm_grid.predict(X_test)
cm_grid = confusion_matrix(Y_test, y_pred_grid)
print(cm_grid)
lsvm_grid_accuracy = cm_grid.trace()/cm_grid.sum()
print('Gridsearch accuracy on a test set is: ', lsvm_grid_accuracy)
param_grid = {'C': np.logspace(0, 4, 1), 'penalty' : [ 'l2']
, 'multi_class':['multinomial','auto','ovr'] }
log_reg = LogisticRegression(random_state = 42,class_weight='balanced',solver ='lbfgs', max_iter = 10000)
log_reg_grid = GridSearchCV(log_reg, param_grid, cv=10, refit=True, verbose=1)
log_reg_grid.fit(X_train,Y_train)
score_log_reg_grid = best_score(log_reg_grid)
y_pred_grid = log_reg_grid.predict(X_test)
cm_grid = confusion_matrix(Y_test, y_pred_grid)
print(cm_grid)
log_reg_grid_accuracy = cm_grid.trace()/cm_grid.sum()
print('Gridsearch accuracy on a test set is: ', log_reg_grid_accuracy)
param_grid = {'max_depth': [3, 5, 6, 7, 8], 'max_features': [6,7,8,9,10],
'min_samples_split': [5, 6, 7, 8],'n_estimators':[30,50,100]}
rf_grid = GridSearchCV(RandomForestClassifier(criterion = 'entropy',class_weight='balanced',bootstrap=True), param_grid, cv=10, refit=True, verbose=1)
rf_grid.fit(X_train,Y_train)
score_rf_grid = best_score(rf_grid)
y_pred_grid = rf_grid.predict(X_test)
cm_grid = confusion_matrix(Y_test, y_pred_grid)
print(cm_grid)
rf_grid_accuracy = cm_grid.trace()/cm_grid.sum()
print('Gridsearch accuracy on a test set is: ', rf_grid_accuracy)
param_grid = {
'loss': ['log'],
'penalty': ['elasticnet'],
'alpha': [10 ** x for x in range(-5, 1)],
'l1_ratio': [0, 0.05, 0.1, 0.5, 0.8, 0.9, 1],
}
sgd = linear_model.SGDClassifier(random_state=0, class_weight='balanced')
sgd_grid = GridSearchCV(sgd, param_grid=param_grid,
n_jobs=1 )
sgd_grid.fit(X_train, Y_train)
score_sgd_grid = best_score(sgd_grid)
y_pred_grid = sgd_grid.predict(X_test)
cm_grid = confusion_matrix(Y_test, y_pred_grid)
print(cm_grid)
sgd_grid_accuracy = cm_grid.trace()/cm_grid.sum()
print('Gridsearch accuracy on a test set is: ', sgd_grid_accuracy)
param_grid = {'C': [0.1,10, 100, 1000,5000]}
svm_grid = GridSearchCV(SVC(max_iter = 10000), param_grid, cv=10, refit=True, verbose=1)
svm_grid.fit(X_train,Y_train)
score_lsvm_grid= best_score(svm_grid)
y_pred_grid = svm_grid.predict(X_test)
cm_grid = confusion_matrix(Y_test, y_pred_grid)
print(cm_grid)
svm_grid_accuracy = cm_grid.trace()/cm_grid.sum()
print('Gridsearch accuracy on a test set is: ', svm_grid_accuracy)
# DECISION has three classes, so let XGBClassifier infer the multi-class
# objective instead of forcing 'binary:logistic'
xgb = XGBClassifier(learning_rate=0.1, min_child_weight=1,
                    subsample=0.8, colsample_bytree=0.8,
                    nthread=4, scale_pos_weight=1, seed=27)
#gbc = GradientBoostingClassifier()
# min_samples_split is not an XGBoost parameter; tune min_child_weight instead
param_grid = {'n_estimators': [50, 100, 500],
              'min_child_weight': [3, 4, 5, 6, 7],
              'max_depth': [3, 4, 5, 6]}
xgb_grid = GridSearchCV(xgb, param_grid, cv=10, refit=True, verbose=1)
xgb_grid.fit(X_train, Y_train)
score_xgb_grid = best_score(xgb_grid)
y_pred_grid = xgb_grid.predict(X_test)
cm_grid = confusion_matrix(Y_test, y_pred_grid)
print(cm_grid)
xgb_grid_accuracy = cm_grid.trace()/cm_grid.sum()
print('Gridsearch accuracy on a test set is: ', xgb_grid_accuracy)
table2 = pd.DataFrame({
'Model': ['Support Vector Machines', 'Linear Support Vector Machines', 'KNN', 'Logistic Regression',
'Random Forest', 'Stochastic Gradient Descent', 'Decision Tree',
'Adaptive Boosting Classifier','XG Boost Classifier','Light GBM'],
'Score': [svm_grid_accuracy,lsvm_grid_accuracy, knn_grid_accuracy, log_reg_grid_accuracy,
rf_grid_accuracy, sgd_grid_accuracy, decs_tree_grid_accuracy,
ada_grid_accuracy, xgb_grid_accuracy,lgbm_grid_accuracy]})
table_df2 = table2.sort_values(by='Score', ascending=False)
table_df2 = table_df2.set_index('Model')
table_df2
#type(table_df2)
#table_df2.shape
#table_df.shape
Let us combine these two tables.
pd.concat([table_df, table_df2],axis =1, sort=False, keys = ['Score','Score with grid search']) # concat dataframes
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from mlxtend.classifier import StackingCVClassifier
import numpy as np
import warnings
warnings.simplefilter('ignore')
# Set up models
ada = AdaBoostClassifier(base_estimator = None,
algorithm = 'SAMME',n_estimators=100, learning_rate=1.0)
dtree = DecisionTreeClassifier(criterion = 'entropy',random_state = 0)
gnb = GaussianNB()
knn = KNeighborsClassifier(leaf_size = 100, p=2)
#cls5 = LinearSVC(max_iter = 10000, class_weight = 'balanced', random_state = 0)
logreg = LogisticRegression(C=0.65,random_state = 0, solver='lbfgs',
multi_class='multinomial',class_weight='balanced', max_iter = 10000)
rf = RandomForestClassifier(n_estimators = 100, criterion = 'entropy',
class_weight='balanced',random_state = 0,
bootstrap=True)
#cls8 = Perceptron(random_state = 0, class_weight = 'balanced')
sgd = linear_model.SGDClassifier(max_iter=5, tol=None,random_state = 0,
alpha=0.0001, average=True,
class_weight = 'balanced',loss ='modified_huber')
svm = SVC(max_iter = 10000, random_state = 0, gamma = 'auto',probability=True)
xgb = XGBClassifier(learning_rate =0.1, n_estimators=1000, max_depth=5, min_child_weight=1, gamma=0,
subsample=0.8, colsample_bytree=0.8,
nthread=4, scale_pos_weight=1, seed=27)
# AdaBoost will be used as the meta-classifier for stacking
sclf = StackingCVClassifier(classifiers=[ada, dtree, gnb,knn, logreg, rf,sgd,svm,xgb],
meta_classifier=ada, use_probas=True, cv=5)
# Do CV
for clf, label in zip([ada, dtree, gnb, knn, logreg, rf, sgd, svm, xgb, sclf],
                      ['ADA Boost', 'Decision tree', 'Gaussian Naive Bayes',
                       'KNN', 'Logistic Regression',
                       'Random Forest', 'SGD', 'SVM', 'XGB',
                       'StackingClassifier']):
    scores = model_selection.cross_val_score(clf, X_train, Y_train, cv=5)
    print("Accuracy: %0.4f (+/- %0.3f) [%s]" % (scores.mean(), scores.std(), label))
# Fit on train data / predict on test data
#sclf_fit = sclf.fit(X_train, Y_train)
#mypreds = sclf_fit.predict_proba(X_test)
def accuracy(y_true, y_pred):
    cm = confusion_matrix(y_true, y_pred)
    return cm.trace()/cm.sum()
X = X_train
y = Y_train
stack_gen_model = sclf.fit(np.array(X), np.array(y))
ada_model_full_data = ada.fit(X, y)
dtree_model_full_data = dtree.fit(X, y)
gnb_model_full_data = gnb.fit(X, y)
knn_model_full_data = knn.fit(X, y)
logreg_model_full_data = logreg.fit(X, y)
rf_model_full_data = rf.fit(X, y)
sgd_model_full_data = sgd.fit(X,y)
svm_model_full_data = svm.fit(X, y)
xgb_model_full_data = xgb.fit(X, y)
#stack_gen_model.predict(X_test)
import statistics
from statistics import mode
def most_common(preds):
    # Majority vote: the most common prediction for each sample across all models
    # (note: statistics.mode raises on ties before Python 3.8)
    return [mode(sample) for sample in zip(*preds)]
def mixed_models_predict(X):
    # Collect the predictions of every fitted model (one array per model)
    return [ada_model_full_data.predict(X),
            dtree_model_full_data.predict(X),
            gnb_model_full_data.predict(X),
            knn_model_full_data.predict(X),
            logreg_model_full_data.predict(X),
            rf_model_full_data.predict(X),
            sgd_model_full_data.predict(X),
            svm_model_full_data.predict(X),
            xgb_model_full_data.predict(X),
            stack_gen_model.predict(np.array(X))]
print(accuracy(Y_test, most_common(mixed_models_predict(X_test))))
mixed_models_predict(X_test)
import h2o
from h2o.automl import H2OAutoML
h2o.init()
type(df)
X_train_hf = h2o.H2OFrame(X_train)
#X_train.head()
Y_train_df = Y_train.to_frame()
Y_train_hf = h2o.H2OFrame(Y_train_df)
#Y_train_hf.head()
X_test_hf = h2o.H2OFrame(X_test)
Y_test_df = Y_test.to_frame()
Y_test_hf = h2o.H2OFrame(Y_test_df)
type(Y_test_hf)
train = X_train_hf.cbind(Y_train_hf)
test = X_test_hf.cbind(Y_test_hf)
y = "DECISION"
x = X_train_hf.columns
#train, test = df_hf.split_frame(ratios=[.8], seed=1)
# For binary classification, response should be a factor
#train[y] = train[y].asfactor()
#test[y] = test[y].asfactor()
aml_first = H2OAutoML(max_models = 30, max_runtime_secs=300, seed = 1)
aml_first.train(x = x, y = y, training_frame = train)
lb = aml_first.leaderboard
lb.head()
lb.head(rows=lb.nrows) # Entire leaderboard
preds_first_train = aml_first.predict(test)
preds_first_train.describe()
performance = aml_first.leader.model_performance(test)
performance.show()
cm = performance.confusion_matrix()
cm
df.head()
Next, we will estimate RATING. An estimated RATING column could then be used to estimate DECISION.
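As a hedged sketch of that two-stage idea (the cells below instead drop DECISION and regress RATING directly), one could fit a regressor for RATING, append its predictions as a feature, and then classify DECISION; the column name RATING_PRED is illustrative.
# Two-stage sketch (illustrative only; fit on all rows, without a train/test split)
stage1_X = df.drop(['RATING', 'DECISION'], axis=1)
stage1 = RandomForestRegressor(n_estimators=100, random_state=42).fit(stage1_X, df['RATING'])
stage2_X = stage1_X.assign(RATING_PRED=stage1.predict(stage1_X))  # predicted RATING becomes a feature
stage2 = RandomForestClassifier(n_estimators=100, random_state=42).fit(stage2_X, df['DECISION'])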
# y1_decision = students['DECISION']
# df = df.drop('DECISION', axis =1)
#df.info()
df = df.drop('DECISION', axis =1)
df.info()
#df_NA.info()
# let us divide this into train and test data
X_train, X_test, Y_train, Y_test = train_test_split(
df.loc[:, df.columns != 'RATING'],
df['RATING'], test_size = 0.2, random_state=0)
kfolds = KFold(n_splits=10, shuffle=True, random_state=42)
def rmse(y, y_pred):
    return np.sqrt(mean_squared_error(y, y_pred))
def cv_rmse(model, X=X_train, y=Y_train):
    scores = np.sqrt(-cross_val_score(model, X, y, scoring="neg_mean_squared_error", cv=kfolds))
    return scores
alphas_ridge = [14.5, 14.6, 14.7, 14.8, 14.9, 15, 15.1, 15.2, 15.3, 15.4, 15.5]
alphas_lasso = [5e-05, 0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007, 0.0008]
alphas_elastic = [0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007]
elastic_l1ratio = [0.8, 0.85, 0.9, 0.95, 0.99, 1]
alphas_krr = [0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 0.99, 1]
ridge = make_pipeline(RobustScaler(), RidgeCV(alphas=alphas_ridge, cv=kfolds))  # RobustScaler limits the influence of outliers on the fit
lasso = make_pipeline(RobustScaler(), LassoCV(max_iter=1e7, alphas=alphas_lasso, random_state=42, cv=kfolds))
#ENet = make_pipeline(RobustScaler(), ElasticNet(alpha=0.0005, l1_ratio=.9, random_state=3))
elasticnet = make_pipeline(RobustScaler(), ElasticNetCV(max_iter=1e7, alphas=alphas_elastic, cv=kfolds, l1_ratio=elastic_l1ratio))
# KernelRidge expects a scalar alpha for a single target, so pick one value from alphas_krr
krr = KernelRidge(alpha=0.6, kernel='polynomial', degree=2, coef0=2.5)
svr = make_pipeline(RobustScaler(), SVR(C= 20, epsilon= 0.008, gamma=0.0003,))
gbr = GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05, max_depth=4,
max_features='sqrt', min_samples_leaf=15, min_samples_split=10,
loss='huber', random_state =42)
lightgbm = LGBMRegressor(objective='regression',
num_leaves=4,
learning_rate=0.01,
n_estimators=5000,
max_bin=200,
bagging_fraction=0.75,
bagging_freq=5,
bagging_seed=7,
feature_fraction=0.2,
feature_fraction_seed=7,
verbose=-1,
)
xgboost = XGBRegressor(learning_rate=0.01,n_estimators=3460,
max_depth=3, min_child_weight=0.05,
gamma=0.04, subsample=0.7,
colsample_bytree=0.7,
objective='reg:squarederror', nthread=-1,
scale_pos_weight=1, random_state =42,
reg_alpha=0.06, reg_lambda=0.8571)
tsr = TheilSenRegressor(random_state=42)
rnsc = RANSACRegressor(random_state=42)
huber = HuberRegressor()
forest_reg = RandomForestRegressor(n_estimators=10, random_state=42)
lin_reg = LinearRegression()
tree_reg = DecisionTreeRegressor(criterion = 'mse',random_state = 0)
sgd_reg = linear_model.SGDRegressor(max_iter=1000, random_state = 42, eta0=0.000001, average=True)
score = cv_rmse(ridge)
print("Ridge Regression score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()) )
ridge_score= score.mean()
score = cv_rmse(lasso)
print("LASSO: {:.4f} ({:.4f})\n".format(score.mean(), score.std()) )
lasso_score=score.mean()
score = cv_rmse(elasticnet)
print("Elasticnet: {:.4f} ({:.4f})\n".format(score.mean(), score.std()) )
elasticnet_score=score.mean()
score = cv_rmse(rnsc)
print("RNSC: {:.4f} ({:.4f})\n".format(score.mean(), score.std()) )
ransac_score = score.mean()
score = cv_rmse(tsr)
print("TSR: {:.4f} ({:.4f})\n".format(score.mean(), score.std()) )
theilsen_score = score.mean()
score = cv_rmse(huber)
print("Huber: {:.4f} ({:.4f})\n".format(score.mean(), score.std()) )
huber_score = score.mean()
score = cv_rmse(krr)
print("Kernel Ridge Regression: {:.4f} ({:.4f})\n".format(score.mean(), score.std()) )
krr_score = score.mean()
score = cv_rmse(svr)
print("SVR: {:.4f} ({:.4f})\n".format(score.mean(), score.std()) )
svr_score = score.mean()
score = cv_rmse(lightgbm)
print("Light GBM: {:.4f} ({:.4f})\n".format(score.mean(), score.std()) )
lightgm_score = score.mean()
score = cv_rmse(sgd_reg)
print("SGD : {:.4f} ({:.4f})\n".format(score.mean(), score.std()) )
sgd_score = score.mean()
score = cv_rmse(lin_reg)
print("Linear Regression: {:.4f} ({:.4f})\n".format(score.mean(), score.std()) )
lin_score =score.mean()
score = cv_rmse(tree_reg)
print("Decision Tree : {:.4f} ({:.4f})\n".format(score.mean(), score.std()) )
tree_score =score.mean()
score = cv_rmse(forest_reg)
print("Random Forest : {:.4f} ({:.4f})\n".format(score.mean(), score.std()) )
forest = score.mean()
score = cv_rmse(gbr)
print("Gradient Boosting: {:.4f} ({:.4f})\n".format(score.mean(), score.std()) )
gbr_score =score.mean()
score = cv_rmse(xgboost)
print("XG boost: {:.4f} ({:.4f})\n".format(score.mean(), score.std()) )
xgb_score =score.mean()
table = pd.DataFrame({
'Model': ['Ridge Regression', 'LASSO Regression', 'Elastic Net', 'RANSAC Regressor',
'Theil-Sen Regressor', 'Huber Regressor', 'Kernel Ridge Regression', 'Epsilon-Support Vector Regression',
'Light GBM', 'SGD','Linear Regression', 'Decision Tree Regressor',
'Random Forest regressor', 'Gradient Boosting','XGBoost Regressor'],
'Score': [ridge_score,lasso_score, elasticnet_score, ransac_score,
theilsen_score, huber_score, krr_score, svr_score,
lightgm_score, sgd_score, lin_score,tree_score,
forest, gbr_score, xgb_score]})
table_df = table.sort_values(by='Score', ascending=True)
table_df = table_df.set_index('Model')
table_df
stack_gen = StackingCVRegressor(regressors=(svr,huber, ridge, sgd_reg, lasso, elasticnet, lin_reg,
tsr,forest_reg,lightgbm,xgboost),
meta_regressor=svr,
use_features_in_secondary=True)
X = X_train
y = Y_train
stack_gen_model = stack_gen.fit(np.array(X), np.array(y))
svr_model_full_data = svr.fit(X, y)
huber_model_full_data = huber.fit(X, y)
ridge_model_full_data = ridge.fit(X, y)
sgd_model_full_data = sgd_reg.fit(X, y)
lasso_model_full_data = lasso.fit(X, y)
elastic_model_full_data = elasticnet.fit(X, y)
lin_model_full_data = lin_reg.fit(X,y)
tsr_model_full_data = tsr.fit(X, y)
rf_model_full_data = forest_reg.fit(X, y)
xgb_model_full_data = xgboost.fit(X, y)
lgb_model_full_data = lightgbm.fit(X, y)
def mixed_models_predict(X):
    # Hand-weighted blend of every fitted regressor
    return ((0.01 * xgb_model_full_data.predict(X)) +
            (0.01 * lgb_model_full_data.predict(X)) +
            (0.05 * rf_model_full_data.predict(X)) +
            (0.05 * tsr_model_full_data.predict(X)) +
            (0.05 * lin_model_full_data.predict(X)) +
            (0.05 * elastic_model_full_data.predict(X)) +
            (0.05 * lasso_model_full_data.predict(X)) +
            (0.05 * sgd_model_full_data.predict(X)) +
            (0.1 * ridge_model_full_data.predict(X)) +
            (0.1 * huber_model_full_data.predict(X)) +
            (0.18 * svr_model_full_data.predict(X)) +
            (0.29 * stack_gen_model.predict(np.array(X))))
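One sanity check worth doing on a hand-weighted blend: the weights above sum to 0.99 rather than 1.0, so the blend slightly shrinks every prediction.
# The blend weights should ideally sum to 1.0; here they come to 0.99
weights = [0.01, 0.01, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.1, 0.1, 0.18, 0.29]
print(sum(weights))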
print('RMSE score on train data:')
print(rmse(y, mixed_models_predict(X)))
print('RMSE score on test data:')
print(rmse(Y_test, mixed_models_predict(X_test)))
hf = h2o.H2OFrame(df)
#df.info()
y1 = "RATING"
# Identify predictors and response
x = hf.columns
x.remove(y1) #first I removed rating
# y2 = "DECISION"
# x.remove(y2) #now I removed decision
train, test = hf.split_frame(ratios=[.8], seed=1)
aml_first = H2OAutoML(max_models = 30, max_runtime_secs=300, seed = 1)
aml_first.train(x = x, y = y1, training_frame = train)
lb = aml_first.leaderboard
lb.head()
lb.head(rows=lb.nrows) # Entire leaderboard
performance = aml_first.leader.model_performance(test)
performance.show()
preds_first = aml_first.predict(test)
diff = (test['RATING']-preds_first)**2
diff.mean()
x = 0.7061227303998321  # the MSE computed by diff.mean() above
np.sqrt(x)  # RMSE of the AutoML leader on the test set
test['RATING']
preds_first