Titanic data

Comparing Decision Tree, Random Forest, XGBoost, LightGBM, and CatBoost

Preprocessing

!pip install catboost
Collecting catboost
Installing collected packages: catboost
Successfully installed catboost-0.25.1
import os
import random

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
import lightgbm as lgbm
import catboost as cb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn import metrics
from sklearn.model_selection import RandomizedSearchCV
def set_seed(seed_value):
    random.seed(seed_value)
    np.random.seed(seed_value)
    os.environ["PYTHONHASHSEED"] = str(seed_value)


SEED = 42
set_seed(SEED)
train_df = pd.read_csv('/content/sample_data/titanic_train.csv')
test_df = pd.read_csv('/content/sample_data/titanic_test.csv')
print(f"Train shape: {train_df.shape}")
train_df.sample(3)
Train shape: (891, 12)

|     | PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked |
|-----|-------------|----------|--------|------|-----|-----|-------|-------|--------|------|-------|----------|
| 709 | 710 | 1 | 3 | Moubarek, Master. Halim Gonios ("William George") | male | NaN | 1 | 1 | 2661 | 15.2458 | NaN | C |
| 439 | 440 | 0 | 2 | Kvillner, Mr. Johan Henrik Johannesson | male | 31.0 | 0 | 0 | C.A. 18723 | 10.5000 | NaN | S |
| 840 | 841 | 0 | 3 | Alhomaki, Mr. Ilmari Rudolf | male | 20.0 | 0 | 0 | SOTON/O2 3101287 | 7.9250 | NaN | S |
print(f"Test shape: {test_df.shape}")
test_df.sample(3)
Test shape: (418, 11)

|     | PassengerId | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked |
|-----|-------------|--------|------|-----|-----|-------|-------|--------|------|-------|----------|
| 20  | 912 | 1 | Rothschild, Mr. Martin | male | 55.00 | 1 | 0 | PC 17603 | 59.40 | NaN | C |
| 338 | 1230 | 2 | Denbury, Mr. Herbert | male | 25.00 | 0 | 0 | C.A. 31029 | 31.50 | NaN | S |
| 250 | 1142 | 2 | West, Miss. Barbara J | female | 0.92 | 1 | 2 | C.A. 34651 | 27.75 | NaN | S |
full_df = pd.concat(
    [
        train_df.drop(["PassengerId", "Survived"], axis=1),
        test_df.drop(["PassengerId"], axis=1),
    ]
)
y_train = train_df["Survived"].values
full_df.isna().sum()
Pclass         0
Name           0
Sex            0
Age          263
SibSp          0
Parch          0
Ticket         0
Fare           1
Cabin       1014
Embarked       2
dtype: int64
full_df = full_df.drop(["Age", "Cabin"], axis=1)
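Age is missing for 263 rows and Cabin for 1014, so both columns are simply dropped here. As a sketch of an alternative (not applied in this post), Age could instead be imputed from group medians before any drop:

# Alternative sketch (not applied): impute Age from the median of each
# (Pclass, Sex) group instead of dropping the column entirely.
age_by_group = full_df.groupby(["Pclass", "Sex"])["Age"].transform("median")
full_df["Age"] = full_df["Age"].fillna(age_by_group)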
plt.figure(figsize=(10, 5))
plt.subplot(1, 2, 1)
plt.hist(full_df["Fare"], bins=20)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
plt.title("Fare distribution", fontsize=16)

plt.subplot(1, 2, 2)
embarked_info = full_df["Embarked"].value_counts()
plt.bar(embarked_info.index, embarked_info.values)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
plt.title("Embarked distribution", fontsize=16);

[Figure: Fare distribution (left) and Embarked distribution (right)]

full_df["Embarked"].fillna("S", inplace=True)
full_df["Fare"].fillna(full_df["Fare"].mean(), inplace=True)
full_df["Title"] = full_df["Name"].str.extract(" ([A-Za-z]+)\.")
full_df["Title"] = full_df["Title"].replace(["Ms", "Mlle"], "Miss")
full_df["Title"] = full_df["Title"].replace(["Mme", "Countess", "Lady", "Dona"], "Mrs")
full_df["Title"] = full_df["Title"].replace(["Dr", "Major", "Col", "Sir", "Rev", "Jonkheer", "Capt", "Don"], "Mr")
full_df = full_df.drop(["Name"], axis=1)
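Because the mapping below calls .astype(int), any title left unmapped would produce a NaN and raise an error, so it is worth verifying that the replacements above covered everything:

# Sketch: verify every remaining title is one of the four mapped values,
# since .astype(int) below fails on NaN produced by an unmapped title.
assert set(full_df["Title"].unique()) <= {"Mr", "Miss", "Mrs", "Master"}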
full_df["Sex"] = full_df["Sex"].map({"male": 1, "female": 0}).astype(int)    
full_df["Embarked"] = full_df["Embarked"].map({"S": 1, "C": 2, "Q": 3}).astype(int)
full_df['Title'] = full_df['Title'].map({"Mr": 0, "Miss": 1, "Mrs": 2, "Master": 3}).astype(int)
full_df["TicketNumber"] = full_df["Ticket"].str.split()
full_df["TicketNumber"] = full_df["TicketNumber"].str[-1]
full_df["TicketNumber"] = LabelEncoder().fit_transform(full_df["TicketNumber"])
full_df = full_df.drop(["Ticket"], axis=1)
full_df["FamilySize"] = full_df["SibSp"] + full_df["Parch"] + 1
full_df["IsAlone"] = full_df["FamilySize"].apply(lambda x: 1 if x == 1 else 0)
full_df.head()

|   | Pclass | Sex | SibSp | Parch | Fare | Embarked | Title | TicketNumber | FamilySize | IsAlone |
|---|--------|-----|-------|-------|------|----------|-------|--------------|------------|---------|
| 0 | 3 | 1 | 1 | 0 | 7.2500 | 1 | 0 | 209 | 2 | 0 |
| 1 | 1 | 0 | 1 | 0 | 71.2833 | 2 | 2 | 166 | 2 | 0 |
| 2 | 3 | 0 | 0 | 0 | 7.9250 | 1 | 1 | 466 | 1 | 1 |
| 3 | 1 | 0 | 1 | 0 | 53.1000 | 1 | 2 | 67 | 2 | 0 |
| 4 | 3 | 1 | 0 | 0 | 8.0500 | 1 | 0 | 832 | 1 | 1 |
X_train = full_df[:y_train.shape[0]]
X_test = full_df[y_train.shape[0]:]

print(f"Train X shape: {X_train.shape}")
print(f"Train y shape: {y_train.shape}")
print(f"Test X shape: {X_test.shape}")
Train X shape: (891, 10)
Train y shape: (891,)
Test X shape: (418, 10)
one_hot_cols = ["Embarked", "Title"]
for col in one_hot_cols:
    full_df = pd.concat(
        [full_df, pd.get_dummies(full_df[col], prefix=col)],
        axis=1,
        join="inner",
    )
full_df = full_df.drop(one_hot_cols, axis=1)
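The same expansion can be done in one call; a sketch of an equivalent alternative:

# Equivalent one-liner (sketch, not applied): get_dummies can expand the
# listed columns and drop the originals in a single call, using the
# column names as prefixes by default.
full_df = pd.get_dummies(full_df, columns=one_hot_cols)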
scaler = StandardScaler()
full_df.loc[:] = scaler.fit_transform(full_df)
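Note that the scaler here is fit on the concatenated train and test rows, so test-set statistics leak into the transform. A minimal leak-free sketch fits on the training rows only:

# Leak-free alternative (sketch): fit the scaler on the training rows
# only, then apply the same transform to the test rows. iloc is used
# because the concatenated frame has duplicate index labels.
n_train = y_train.shape[0]
scaler = StandardScaler().fit(full_df.iloc[:n_train])
full_df.iloc[:n_train] = scaler.transform(full_df.iloc[:n_train])
full_df.iloc[n_train:] = scaler.transform(full_df.iloc[n_train:])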
print(full_df)

|     | Pclass | Sex | SibSp | Parch | Fare | TicketNumber | FamilySize | IsAlone | Embarked_1 | Embarked_2 | Embarked_3 | Title_0 | Title_1 | Title_2 | Title_3 |
|-----|--------|-----|-------|-------|------|--------------|------------|---------|------------|------------|------------|---------|---------|---------|---------|
| 0 | 0.841916 | 0.743497 | 0.481288 | -0.445000 | -0.503595 | -0.846179 | 0.073352 | -1.233758 | 0.655011 | -0.50977 | -0.32204 | 0.819619 | -0.502625 | -0.425920 | -0.221084 |
| 1 | -1.546098 | -1.344995 | 0.481288 | -0.445000 | 0.734503 | -1.004578 | 0.073352 | -1.233758 | -1.526692 | 1.96167 | -0.32204 | -1.220079 | -0.502625 | 2.347858 | -0.221084 |
| 2 | 0.841916 | -1.344995 | -0.479087 | -0.445000 | -0.490544 | 0.100529 | -0.558346 | 0.810532 | 0.655011 | -0.50977 | -0.32204 | -1.220079 | 1.989556 | -0.425920 | -0.221084 |
| 3 | -1.546098 | -1.344995 | 0.481288 | -0.445000 | 0.382925 | -1.369263 | 0.073352 | -1.233758 | 0.655011 | -0.50977 | -0.32204 | -1.220079 | -0.502625 | 2.347858 | -0.221084 |
| 4 | 0.841916 | 0.743497 | -0.479087 | -0.445000 | -0.488127 | 1.448759 | -0.558346 | 0.810532 | 0.655011 | -0.50977 | -0.32204 | 0.819619 | -0.502625 | -0.425920 | -0.221084 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 413 | 0.841916 | 0.743497 | -0.479087 | -0.445000 | -0.488127 | 0.347336 | -0.558346 | 0.810532 | 0.655011 | -0.50977 | -0.32204 | 0.819619 | -0.502625 | -0.425920 | -0.221084 |
| 414 | -1.546098 | -1.344995 | -0.479087 | -0.445000 | 1.461829 | -0.938271 | -0.558346 | 0.810532 | -1.526692 | 1.96167 | -0.32204 | -1.220079 | -0.502625 | 2.347858 | -0.221084 |
| 415 | 0.841916 | 0.743497 | -0.479087 | -0.445000 | -0.503595 | 0.026855 | -0.558346 | 0.810532 | 0.655011 | -0.50977 | -0.32204 | 0.819619 | -0.502625 | -0.425920 | -0.221084 |
| 416 | 0.841916 | 0.743497 | -0.479087 | -0.445000 | -0.488127 | 1.183533 | -0.558346 | 0.810532 | 0.655011 | -0.50977 | -0.32204 | 0.819619 | -0.502625 | -0.425920 | -0.221084 |
| 417 | 0.841916 | 0.743497 | 0.481288 | 0.710763 | -0.211473 | -0.253105 | 0.705051 | -1.233758 | -1.526692 | 1.96167 | -0.32204 | -1.220079 | -0.502625 | -0.425920 | 4.523164 |

1309 rows × 15 columns

X_train_norm = full_df[:y_train.shape[0]]
X_test_norm = full_df[y_train.shape[0]:]

print(f"Train norm X shape: {X_train_norm.shape}")
print(f"Train y shape: {y_train.shape}")
print(f"Test norm X shape: {X_test_norm.shape}")
Train norm X shape: (891, 15)
Train y shape: (891,)
Test norm X shape: (418, 15)
categorical_columns = ['Sex', 'Embarked', 'Title', 'TicketNumber', 'IsAlone']
cross_valid_scores = {}
X1_train, X1_test, y1_train, y1_test = train_test_split(X_train, y_train, test_size=0.3)
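One caveat, relevant to the holdout accuracies printed later: the models below are fit on the full X_train, which contains the X1_test rows, so those scores are optimistic. A leak-free evaluation would fit on X1_train only, roughly like this sketch:

# Leak-free evaluation sketch: train on X1_train only and score the
# untouched X1_test split.
clf = RandomForestClassifier(max_depth=5, random_state=SEED)
clf.fit(X1_train, y1_train)
print('Holdout accuracy:', accuracy_score(y1_test, clf.predict(X1_test)))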

Building the decision tree

%%time
parameters = {
    "max_depth": [3, 5, 7, 9, 11, 13],
}

model_decision_tree = DecisionTreeClassifier(
    random_state=SEED,
    class_weight='balanced',
)

model_decision_tree = GridSearchCV(
    model_decision_tree,
    parameters,
    cv=5,
    scoring='accuracy',
)

model_decision_tree.fit(X_train, y_train)

print('-----')
print(f'Best parameters {model_decision_tree.best_params_}')
print(
    f'Mean cross-validated accuracy score of the best_estimator: ' + \
    f'{model_decision_tree.best_score_:.3f}'
)
cross_valid_scores['decision_tree'] = model_decision_tree.best_score_
print('-----')
-----
Best parameters {'max_depth': 11}
Mean cross-validated accuracy score of the best_estimator: 0.817
-----
CPU times: user 180 ms, sys: 2.04 ms, total: 182 ms
Wall time: 202 ms
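GridSearchCV refits the best configuration on all of X_train after the search, so the fitted object can be used for prediction directly; for instance (a sketch):

# Sketch: the refit best estimator can generate test predictions directly.
test_pred = model_decision_tree.best_estimator_.predict(X_test)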

Random forest

Grid search

%%time
parameters = {
    "n_estimators": [5, 10, 15, 20, 25],
    "max_depth": [3, 5, 7, 9, 11, 13],
}

model_random_forest = RandomForestClassifier(
    random_state=SEED,
    class_weight='balanced',
)

model_random_forest = GridSearchCV(
    model_random_forest,
    parameters,
    cv=5,
    scoring='accuracy',
)

model_random_forest.fit(X_train, y_train)

print('-----')
print(f'Best parameters {model_random_forest.best_params_}')
print(
    f'Mean cross-validated accuracy score of the best_estimator: ' + \
    f'{model_random_forest.best_score_:.3f}'
)
cross_valid_scores['random_forest'] = model_random_forest.best_score_
print('-----')
-----
Best parameters {'max_depth': 11, 'n_estimators': 25}
Mean cross-validated accuracy score of the best_estimator: 0.844
-----
CPU times: user 4.84 s, sys: 43 ms, total: 4.89 s
Wall time: 4.9 s

Random search

%%time

parameters = {
    "n_estimators": [5, 10, 15, 20, 25],
    "max_depth": [3, 5, 7, 9, 11, 13],
}

# Base estimator for the randomized search (mirrors the grid-search setup).
model2_random_forest = RandomForestClassifier(
    random_state=SEED,
    class_weight='balanced',
)

model2_random_forest_rs = RandomizedSearchCV(
    model2_random_forest,
    parameters,
    cv=5,
    n_iter=50,
    random_state=0,
    scoring="accuracy",
)
model2_random_forest_rs.fit(X_train, y_train)

print('-----')
print(f'Best parameters {model2_random_forest_rs.best_params_}')
print(
    f'Mean cross-validated accuracy score of the best_estimator: ' + \
    f'{model2_random_forest_rs.best_score_:.3f}'
)
cross_valid_scores['random_forest'] = model2_random_forest_rs.best_score_
print('-----')
/usr/local/lib/python3.7/dist-packages/sklearn/model_selection/_search.py:281: UserWarning: The total space of parameters 30 is smaller than n_iter=50. Running 30 iterations. For exhaustive searches, use GridSearchCV.
  % (grid_size, self.n_iter, grid_size), UserWarning)


-----
Best parameters {'n_estimators': 25, 'max_depth': 11}
Mean cross-validated accuracy score of the best_estimator: 0.844
-----
CPU times: user 4.68 s, sys: 27.1 ms, total: 4.71 s
Wall time: 4.73 s
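The warning above appears because this grid has only 30 combinations, fewer than n_iter=50. Random search pays off when parameters are sampled from distributions rather than a small finite grid; a sketch:

# Sketch: sampling from distributions gives random search a genuinely
# larger space to explore than a 30-point grid.
from scipy.stats import randint

parameters = {
    "n_estimators": randint(5, 200),
    "max_depth": randint(3, 14),
}
search = RandomizedSearchCV(
    RandomForestClassifier(random_state=SEED, class_weight='balanced'),
    parameters,
    cv=5,
    n_iter=50,
    random_state=0,
    scoring="accuracy",
)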

Random forest without hyperparameter tuning


model_rf1 = RandomForestClassifier(max_depth=5)
model_rf1.fit(X_train, y_train)

y_pred_rf1 = model_rf1.predict(X1_test)

print('\nAccuracy:', metrics.accuracy_score(y1_test, y_pred_rf1))
Accuracy: 1.0
model_rf2 = RandomForestClassifier(n_estimators=25, max_depth=11)
model_rf2.fit(X_train, y_train)

y_pred_rf2 = model_rf2.predict(X1_test)

print('\nAccuracy:', metrics.accuracy_score(y1_test, y_pred_rf2))
Accuracy: 0.9589552238805971

XGBoost

Grid search

%%time
parameters = {
    'max_depth': [3, 5, 7, 9],
    'n_estimators': [5, 10, 15, 20, 25, 50, 100],
    'learning_rate': [0.01, 0.05, 0.1]
}

model_xgb = xgb.XGBClassifier(
    random_state=SEED,
)

model_xgb = GridSearchCV(
    model_xgb,
    parameters,
    cv=5,
    scoring='accuracy',
)

model_xgb.fit(X_train, y_train)

print('-----')
print(f'Best parameters {model_xgb.best_params_}')
print(
    f'Mean cross-validated accuracy score of the best_estimator: ' +
    f'{model_xgb.best_score_:.3f}'
)
cross_valid_scores['xgboost'] = model_xgb.best_score_
print('-----')
-----
Best parameters {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 100}
Mean cross-validated accuracy score of the best_estimator: 0.846
-----
CPU times: user 13.7 s, sys: 177 ms, total: 13.9 s
Wall time: 14 s

Grid search was run to tune XGBoost's hyperparameters.
Time: 14.3 s
Best parameters: {'n_estimators': 100, 'max_depth': 7, 'learning_rate': 0.1}

Random search

%%time
params = {
    'max_depth': [3, 5, 7, 9],
    'n_estimators': [5, 10, 15, 20, 25, 50, 100],
    'learning_rate': [0.01, 0.05, 0.1]
}
model_xgb_random = xgb.XGBClassifier(
    random_state=SEED,
)
model_xgb_random = RandomizedSearchCV(
    model_xgb_random,
    params,
    cv=5,
    n_iter=50,
    random_state=0,
    scoring="accuracy",
)

model_xgb_random.fit(X_train, y_train)

print('-----')
print(f'Best parameters {model_xgb_random.best_params_}')
print(
    f'Mean cross-validated accuracy score of the best_estimator: ' +
    f'{model_xgb_random.best_score_:.3f}'
)
cross_valid_scores['xgboost'] = model_xgb_random.best_score_
print('-----')

-----
Best parameters {'n_estimators': 100, 'max_depth': 7, 'learning_rate': 0.1}
Mean cross-validated accuracy score of the best_estimator: 0.846
-----
CPU times: user 9.22 s, sys: 119 ms, total: 9.34 s
Wall time: 9.34 s

To compare the two search strategies on XGBoost, random search was run in the same environment.
Time: 9.46 s
Best parameters: {'n_estimators': 100, 'max_depth': 7, 'learning_rate': 0.1}

XGBoost without hyperparameter tuning


model_1 = xgb.XGBClassifier()
model_1.fit(X_train, y_train)
pred_y1 = model_1.predict(X1_test)

print('\nAccuracy:', metrics.accuracy_score(y1_test, pred_y1))
Accuracy: 0.8843283582089553

Applying the tuned hyperparameters

model_2 = xgb.XGBClassifier(learning_rate=0.1, max_depth=7, n_estimators=100)
model_2.fit(X_train, y_train)

pred_y2 = model_2.predict(X1_test)

print('\nAccuracy:', metrics.accuracy_score(y1_test, pred_y2))
Accuracy: 0.9514925373134329

Random search is noticeably faster than grid search, and accuracy rises substantially after hyperparameter tuning. (Caveat: these holdout scores are optimistic, because the models were fit on the full X_train, which contains the X1_test rows; see the leak-free sketch near the train/test split above.)

LightGBM

%%time
parameters = {
    'n_estimators': [5, 10, 15, 20, 25, 50, 100],
    'learning_rate': [0.01, 0.05, 0.1],
    'num_leaves': [7, 15, 31],
}

model_lgbm = lgbm.LGBMClassifier(
    random_state=SEED,
    class_weight='balanced',
)

model_lgbm = GridSearchCV(
    model_lgbm,
    parameters,
    cv=5,
    scoring='accuracy',
)

model_lgbm.fit(
    X_train,
    y_train,
    categorical_feature=categorical_columns,
)

print('-----')
print(f'Best parameters {model_lgbm.best_params_}')
print(
    f'Mean cross-validated accuracy score of the best_estimator: ' +
    f'{model_lgbm.best_score_:.3f}'
)
cross_valid_scores['lightgbm'] = model_lgbm.best_score_
print('-----')
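LightGBM consumes the integer-coded categoricals natively; instead of passing categorical_feature to fit, the same columns can also be marked through the pandas category dtype, as a sketch:

# Sketch: mark categoricals via dtype so LightGBM picks them up without
# the categorical_feature argument.
X_train_cat = X_train.copy()
for col in categorical_columns:
    X_train_cat[col] = X_train_cat[col].astype("category")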

CatBoost

%%time
parameters = {
    'iterations': [5, 10, 15, 20, 25, 50, 100],
    'learning_rate': [0.01, 0.05, 0.1],
    'depth': [3, 5, 7, 9, 11, 13],
}

model_catboost = cb.CatBoostClassifier(
    verbose=False,
)

model_catboost = GridSearchCV(
    model_catboost,
    parameters,
    cv=5,
    scoring='accuracy',
)

model_catboost.fit(X_train, y_train)

print('-----')
print(f'Best parameters {model_catboost.best_params_}')
print(
    f'Mean cross-validated accuracy score of the best_estimator: ' +
    f'{model_catboost.best_score_:.3f}'
)
cross_valid_scores['catboost'] = model_catboost.best_score_
print('-----')
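With every model's best cross-validated score collected in cross_valid_scores, a quick comparison is one line away (sketch):

# Sketch: rank the models by their best cross-validated accuracy.
print(pd.Series(cross_valid_scores).sort_values(ascending=False))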
Author: lhj
Posted on: 2021-04-13
Updated on: 2021-04-13
