# Titanic Data: Comparing Decision Tree, Random Forest, XGBoost, LightGBM, and CatBoost
## Preprocessing
```python
!pip install catboost
```

```
Collecting catboost
  Downloading catboost-0.25.1-cp37-none-manylinux1_x86_64.whl (67.3MB)
Installing collected packages: catboost
Successfully installed catboost-0.25.1
```
```python
import os
import random

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
import lightgbm as lgbm
import catboost as cb

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
```
```python
def set_seed(seed_value):
    random.seed(seed_value)
    np.random.seed(seed_value)
    os.environ["PYTHONHASHSEED"] = str(seed_value)

SEED = 42
set_seed(SEED)
```
```python
train_df = pd.read_csv('/content/sample_data/titanic_train.csv')
test_df = pd.read_csv('/content/sample_data/titanic_test.csv')

print(f"Train shape: {train_df.shape}")
train_df.sample(3)
```
```
Train shape: (891, 12)
```

|     | PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked |
|-----|---|---|---|---|---|---|---|---|---|---|---|---|
| 709 | 710 | 1 | 3 | Moubarek, Master. Halim Gonios ("William George") | male | NaN | 1 | 1 | 2661 | 15.2458 | NaN | C |
| 439 | 440 | 0 | 2 | Kvillner, Mr. Johan Henrik Johannesson | male | 31.0 | 0 | 0 | C.A. 18723 | 10.5000 | NaN | S |
| 840 | 841 | 0 | 3 | Alhomaki, Mr. Ilmari Rudolf | male | 20.0 | 0 | 0 | SOTON/O2 3101287 | 7.9250 | NaN | S |
```python
print(f"Test shape: {test_df.shape}")
test_df.sample(3)
```
```
Test shape: (418, 11)
```

|     | PassengerId | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked |
|-----|---|---|---|---|---|---|---|---|---|---|---|
| 20  | 912 | 1 | Rothschild, Mr. Martin | male | 55.00 | 1 | 0 | PC 17603 | 59.40 | NaN | C |
| 338 | 1230 | 2 | Denbury, Mr. Herbert | male | 25.00 | 0 | 0 | C.A. 31029 | 31.50 | NaN | S |
| 250 | 1142 | 2 | West, Miss. Barbara J | female | 0.92 | 1 | 2 | C.A. 34651 | 27.75 | NaN | S |
```python
full_df = pd.concat(
    [
        train_df.drop(["PassengerId", "Survived"], axis=1),
        test_df.drop(["PassengerId"], axis=1),
    ]
)
y_train = train_df["Survived"].values

# Count missing values per column (produces the output below).
full_df.isnull().sum()
```
```
Pclass         0
Name           0
Sex            0
Age          263
SibSp          0
Parch          0
Ticket         0
Fare           1
Cabin       1014
Embarked       2
dtype: int64
```
Age (263 missing) and Cabin (1014 missing) have too many gaps to fill reliably, so both columns are dropped.

```python
full_df = full_df.drop(["Age", "Cabin"], axis=1)
```
```python
plt.figure(figsize=(10, 5))

plt.subplot(1, 2, 1)
plt.hist(full_df["Fare"], bins=20)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
plt.title("Fare distribution", fontsize=16)

plt.subplot(1, 2, 2)
embarked_info = full_df["Embarked"].value_counts()
plt.bar(embarked_info.index, embarked_info.values)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
plt.title("Embarked distribution", fontsize=16);
```
Based on the distributions above, the two missing Embarked values are filled with the most common port ("S"), and the single missing Fare with the mean fare.

```python
full_df["Embarked"].fillna("S", inplace=True)
full_df["Fare"].fillna(full_df["Fare"].mean(), inplace=True)
```
```python
full_df["Title"] = full_df["Name"].str.extract(r" ([A-Za-z]+)\.")
full_df["Title"] = full_df["Title"].replace(["Ms", "Mlle"], "Miss")
full_df["Title"] = full_df["Title"].replace(["Mme", "Countess", "Lady", "Dona"], "Mrs")
full_df["Title"] = full_df["Title"].replace(
    ["Dr", "Major", "Col", "Sir", "Rev", "Jonkheer", "Capt", "Don"], "Mr"
)
full_df = full_df.drop(["Name"], axis=1)
```
```python
full_df["Sex"] = full_df["Sex"].map({"male": 1, "female": 0}).astype(int)
full_df["Embarked"] = full_df["Embarked"].map({"S": 1, "C": 2, "Q": 3}).astype(int)
full_df['Title'] = full_df['Title'].map({"Mr": 0, "Miss": 1, "Mrs": 2, "Master": 3}).astype(int)
```
```python
full_df["TicketNumber"] = full_df["Ticket"].str.split()
full_df["TicketNumber"] = full_df["TicketNumber"].str[-1]
full_df["TicketNumber"] = LabelEncoder().fit_transform(full_df["TicketNumber"])
full_df = full_df.drop(["Ticket"], axis=1)
```
```python
full_df["FamilySize"] = full_df["SibSp"] + full_df["Parch"] + 1
full_df["IsAlone"] = full_df["FamilySize"].apply(lambda x: 1 if x == 1 else 0)
```
After these transformations, the first rows of `full_df` look like this:

|   | Pclass | Sex | SibSp | Parch | Fare | Embarked | Title | TicketNumber | FamilySize | IsAlone |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 3 | 1 | 1 | 0 | 7.2500 | 1 | 0 | 209 | 2 | 0 |
| 1 | 1 | 0 | 1 | 0 | 71.2833 | 2 | 2 | 166 | 2 | 0 |
| 2 | 3 | 0 | 0 | 0 | 7.9250 | 1 | 1 | 466 | 1 | 1 |
| 3 | 1 | 0 | 1 | 0 | 53.1000 | 1 | 2 | 67 | 2 | 0 |
| 4 | 3 | 1 | 0 | 0 | 8.0500 | 1 | 0 | 832 | 1 | 1 |
```python
X_train = full_df[:y_train.shape[0]]
X_test = full_df[y_train.shape[0]:]

print(f"Train X shape: {X_train.shape}")
print(f"Train y shape: {y_train.shape}")
print(f"Test X shape: {X_test.shape}")
```
```
Train X shape: (891, 10)
Train y shape: (891,)
Test X shape: (418, 10)
```
```python
one_hot_cols = ["Embarked", "Title"]
for col in one_hot_cols:
    full_df = pd.concat(
        [full_df, pd.get_dummies(full_df[col], prefix=col)],
        axis=1,
        join="inner",
    )
full_df = full_df.drop(one_hot_cols, axis=1)
```
```python
scaler = StandardScaler()
full_df.loc[:] = scaler.fit_transform(full_df)
```
The scaled frame:

|     | Pclass | Sex | SibSp | Parch | Fare | TicketNumber | FamilySize | IsAlone | Embarked_1 | Embarked_2 | Embarked_3 | Title_0 | Title_1 | Title_2 | Title_3 |
|-----|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.841916 | 0.743497 | 0.481288 | -0.445000 | -0.503595 | -0.846179 | 0.073352 | -1.233758 | 0.655011 | -0.50977 | -0.32204 | 0.819619 | -0.502625 | -0.425920 | -0.221084 |
| 1 | -1.546098 | -1.344995 | 0.481288 | -0.445000 | 0.734503 | -1.004578 | 0.073352 | -1.233758 | -1.526692 | 1.96167 | -0.32204 | -1.220079 | -0.502625 | 2.347858 | -0.221084 |
| 2 | 0.841916 | -1.344995 | -0.479087 | -0.445000 | -0.490544 | 0.100529 | -0.558346 | 0.810532 | 0.655011 | -0.50977 | -0.32204 | -1.220079 | 1.989556 | -0.425920 | -0.221084 |
| 3 | -1.546098 | -1.344995 | 0.481288 | -0.445000 | 0.382925 | -1.369263 | 0.073352 | -1.233758 | 0.655011 | -0.50977 | -0.32204 | -1.220079 | -0.502625 | 2.347858 | -0.221084 |
| 4 | 0.841916 | 0.743497 | -0.479087 | -0.445000 | -0.488127 | 1.448759 | -0.558346 | 0.810532 | 0.655011 | -0.50977 | -0.32204 | 0.819619 | -0.502625 | -0.425920 | -0.221084 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 413 | 0.841916 | 0.743497 | -0.479087 | -0.445000 | -0.488127 | 0.347336 | -0.558346 | 0.810532 | 0.655011 | -0.50977 | -0.32204 | 0.819619 | -0.502625 | -0.425920 | -0.221084 |
| 414 | -1.546098 | -1.344995 | -0.479087 | -0.445000 | 1.461829 | -0.938271 | -0.558346 | 0.810532 | -1.526692 | 1.96167 | -0.32204 | -1.220079 | -0.502625 | 2.347858 | -0.221084 |
| 415 | 0.841916 | 0.743497 | -0.479087 | -0.445000 | -0.503595 | 0.026855 | -0.558346 | 0.810532 | 0.655011 | -0.50977 | -0.32204 | 0.819619 | -0.502625 | -0.425920 | -0.221084 |
| 416 | 0.841916 | 0.743497 | -0.479087 | -0.445000 | -0.488127 | 1.183533 | -0.558346 | 0.810532 | 0.655011 | -0.50977 | -0.32204 | 0.819619 | -0.502625 | -0.425920 | -0.221084 |
| 417 | 0.841916 | 0.743497 | 0.481288 | 0.710763 | -0.211473 | -0.253105 | 0.705051 | -1.233758 | -1.526692 | 1.96167 | -0.32204 | -1.220079 | -0.502625 | -0.425920 | 4.523164 |

1309 rows × 15 columns
```python
X_train_norm = full_df[:y_train.shape[0]]
X_test_norm = full_df[y_train.shape[0]:]

print(f"Train norm X shape: {X_train_norm.shape}")
print(f"Train y shape: {y_train.shape}")
print(f"Test norm X shape: {X_test_norm.shape}")
```
```
Train norm X shape: (891, 15)
Train y shape: (891,)
Test norm X shape: (418, 15)
```
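Note that the scaler above was fit on the combined train-plus-test frame, so test-set statistics leak into the training features. A minimal leakage-free sketch, assuming it replaces the `fit_transform` call above (i.e., `full_df` is still unscaled at that point):

```python
# Learn scaling statistics from the training rows only,
# then apply them to both splits.
scaler = StandardScaler().fit(full_df.iloc[:y_train.shape[0]])
X_train_norm = scaler.transform(full_df.iloc[:y_train.shape[0]])
X_test_norm = scaler.transform(full_df.iloc[y_train.shape[0]:])
```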
```python
categorical_columns = ['Sex', 'Embarked', 'Title', 'TicketNumber', 'IsAlone']
```
```python
X1_train, X1_test, y1_train, y1_test = train_test_split(X_train, y_train, test_size=0.3)
```
## Decision Tree

```python
%%time
cross_valid_scores = {}  # collect each model's best CV accuracy for the final comparison

parameters = {
    "max_depth": [3, 5, 7, 9, 11, 13],
}

model_desicion_tree = DecisionTreeClassifier(
    random_state=SEED,
    class_weight='balanced',
)

model_desicion_tree = GridSearchCV(
    model_desicion_tree,
    parameters,
    cv=5,
    scoring='accuracy',
)
model_desicion_tree.fit(X_train, y_train)

print('-----')
print(f'Best parameters {model_desicion_tree.best_params_}')
print(
    f'Mean cross-validated accuracy score of the best_estimator: '
    f'{model_desicion_tree.best_score_:.3f}'
)

cross_valid_scores['desicion_tree'] = model_desicion_tree.best_score_
print('-----')
```
```
-----
Best parameters {'max_depth': 11}
Mean cross-validated accuracy score of the best_estimator: 0.817
-----
CPU times: user 180 ms, sys: 2.04 ms, total: 182 ms
Wall time: 202 ms
```
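As a quick sanity check (not part of the original comparison), the fitted search object exposes the refit best tree, whose feature importances can be listed:

```python
# Sketch: print the tuned tree's feature importances, largest first.
best_tree = model_desicion_tree.best_estimator_
for name, imp in sorted(
    zip(X_train.columns, best_tree.feature_importances_),
    key=lambda pair: -pair[1],
):
    print(f"{name}: {imp:.3f}")
```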
## Random Forest

### Grid search

```python
%%time
parameters = {
    "n_estimators": [5, 10, 15, 20, 25],
    "max_depth": [3, 5, 7, 9, 11, 13],
}

model_random_forest = RandomForestClassifier(
    random_state=SEED,
    class_weight='balanced',
)

model_random_forest = GridSearchCV(
    model_random_forest,
    parameters,
    cv=5,
    scoring='accuracy',
)
model_random_forest.fit(X_train, y_train)

print('-----')
print(f'Best parameters {model_random_forest.best_params_}')
print(
    f'Mean cross-validated accuracy score of the best_estimator: '
    f'{model_random_forest.best_score_:.3f}'
)

cross_valid_scores['random_forest'] = model_random_forest.best_score_
print('-----')
```
```
-----
Best parameters {'max_depth': 11, 'n_estimators': 25}
Mean cross-validated accuracy score of the best_estimator: 0.844
-----
CPU times: user 4.84 s, sys: 43 ms, total: 4.89 s
Wall time: 4.9 s
```
### Random search

```python
%%time
parameters = {
    "n_estimators": [5, 10, 15, 20, 25],
    "max_depth": [3, 5, 7, 9, 11, 13],
}

# Base estimator for the randomized search.
model2_random_forest = RandomForestClassifier(
    random_state=SEED,
    class_weight='balanced',
)

model2_random_forest_rs = RandomizedSearchCV(
    model2_random_forest,
    parameters,
    cv=5,
    n_iter=50,
    random_state=0,
    scoring="accuracy",
)
model2_random_forest_rs.fit(X_train, y_train)

print('-----')
print(f'Best parameters {model2_random_forest_rs.best_params_}')
print(
    f'Mean cross-validated accuracy score of the best_estimator: '
    f'{model2_random_forest_rs.best_score_:.3f}'
)

cross_valid_scores['random_forest'] = model2_random_forest_rs.best_score_
print('-----')
```
```
/usr/local/lib/python3.7/dist-packages/sklearn/model_selection/_search.py:281: UserWarning: The total space of parameters 30 is smaller than n_iter=50. Running 30 iterations. For exhaustive searches, use GridSearchCV.
-----
Best parameters {'n_estimators': 25, 'max_depth': 11}
Mean cross-validated accuracy score of the best_estimator: 0.844
-----
CPU times: user 4.68 s, sys: 27.1 ms, total: 4.71 s
Wall time: 4.73 s
```

Since this grid contains only 5 × 6 = 30 combinations, fewer than `n_iter=50`, the randomized search degenerates into an exhaustive search (hence the warning), which is why it finds exactly the same best parameters and score as GridSearchCV.
### Random Forest without hyperparameter tuning
```python
model_rf1 = RandomForestClassifier(max_depth=5)
model_rf1.fit(X_train, y_train)
y_pred_rf1 = model_rf1.predict(X1_test)

print('\nAccuracy:', metrics.accuracy_score(y1_test, y_pred_rf1))
```
```
Accuracy: 1.0
```

The perfect score is misleading: the model was fit on all of `X_train`, which already contains every row of `X1_test`, so this is essentially training accuracy rather than a hold-out estimate.
The same evaluation with the parameters found by the searches above:

```python
model_rf2 = RandomForestClassifier(n_estimators=25, max_depth=11)
model_rf2.fit(X_train, y_train)
y_pred_rf2 = model_rf2.predict(X1_test)

print('\nAccuracy:', metrics.accuracy_score(y1_test, y_pred_rf2))
```
```
Accuracy: 0.9589552238805971
```
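For a genuinely unseen-data estimate, the model should be fit only on the split's training portion. A minimal sketch, using the `X1_*` split created earlier:

```python
# Fit on X1_train only so X1_test stays unseen; this gives an honest
# hold-out accuracy instead of a partly memorized one.
model_rf_holdout = RandomForestClassifier(n_estimators=25, max_depth=11, random_state=SEED)
model_rf_holdout.fit(X1_train, y1_train)
print('Hold-out accuracy:', metrics.accuracy_score(y1_test, model_rf_holdout.predict(X1_test)))
```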
## XGBoost

### Grid search

```python
%%time
parameters = {
    'max_depth': [3, 5, 7, 9],
    'n_estimators': [5, 10, 15, 20, 25, 50, 100],
    'learning_rate': [0.01, 0.05, 0.1],
}

model_xgb = xgb.XGBClassifier(
    random_state=SEED,
)

model_xgb = GridSearchCV(
    model_xgb,
    parameters,
    cv=5,
    scoring='accuracy',
)
model_xgb.fit(X_train, y_train)

print('-----')
print(f'Best parameters {model_xgb.best_params_}')
print(
    f'Mean cross-validated accuracy score of the best_estimator: '
    f'{model_xgb.best_score_:.3f}'
)

cross_valid_scores['xgboost'] = model_xgb.best_score_
print('-----')
```
```
-----
Best parameters {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 100}
Mean cross-validated accuracy score of the best_estimator: 0.846
-----
CPU times: user 13.7 s, sys: 177 ms, total: 13.9 s
Wall time: 14 s
```
For XGBoost, hyperparameter tuning was performed with grid search. Time: 14.3 s. Best parameters: {'n_estimators': 100, 'max_depth': 7, 'learning_rate': 0.1}.
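Since `GridSearchCV` refits the best estimator on the full training set by default (`refit=True`), the fitted search object can be used directly for prediction; a brief usage sketch:

```python
# The search object delegates predict() to its refit best estimator.
test_preds = model_xgb.predict(X_test)
print(test_preds[:10])
```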
### Random search

```python
%%time
params = {
    'max_depth': [3, 5, 7, 9],
    'n_estimators': [5, 10, 15, 20, 25, 50, 100],
    'learning_rate': [0.01, 0.05, 0.1],
}

model_xgb_random = xgb.XGBClassifier(
    random_state=SEED,
)

model_xgb_random = RandomizedSearchCV(
    model_xgb_random,
    params,
    cv=5,
    n_iter=50,
    random_state=0,
    scoring="accuracy",
)
model_xgb_random.fit(X_train, y_train)

print('-----')
print(f'Best parameters {model_xgb_random.best_params_}')
print(
    f'Mean cross-validated accuracy score of the best_estimator: '
    f'{model_xgb_random.best_score_:.3f}'
)

cross_valid_scores['xgboost'] = model_xgb_random.best_score_
print('-----')
```
```
-----
Best parameters {'n_estimators': 100, 'max_depth': 7, 'learning_rate': 0.1}
Mean cross-validated accuracy score of the best_estimator: 0.846
-----
CPU times: user 9.22 s, sys: 119 ms, total: 9.34 s
Wall time: 9.34 s
```
To compare the two search strategies on XGBoost, RandomizedSearchCV was run under identical conditions. Time: 9.46 s. Best parameters: {'n_estimators': 100, 'max_depth': 7, 'learning_rate': 0.1}.
### XGBoost without hyperparameter tuning

```python
model_1 = xgb.XGBClassifier()
model_1.fit(X_train, y_train)
pred_y1 = model_1.predict(X1_test)

print('\nAccuracy:', metrics.accuracy_score(y1_test, pred_y1))
```
```
Accuracy: 0.8843283582089553
```
### XGBoost with the tuned hyperparameters

```python
model_2 = xgb.XGBClassifier(learning_rate=0.1, max_depth=7, n_estimators=100)
model_2.fit(X_train, y_train)
pred_y2 = model_2.predict(X1_test)

print('\nAccuracy:', metrics.accuracy_score(y1_test, pred_y2))
```
```
Accuracy: 0.9514925373134329
```
Random search is noticeably faster than grid search while finding the same best parameters, and accuracy rises sharply after hyperparameter tuning. (As with the random forest above, these `X1_test` scores are optimistic because both models were fit on all of `X_train`.)
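The speed gap follows directly from the number of model fits each search performs; a back-of-the-envelope sketch, assuming no early stopping:

```python
# Grid search trains every parameter combination on every fold.
grid_fits = 4 * 7 * 3 * 5      # max_depth x n_estimators x learning_rate x folds = 420
# Randomized search caps the number of sampled combinations at n_iter.
random_fits = 50 * 5           # n_iter x folds = 250
print(grid_fits, random_fits)  # 420 250
```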
## LightGBM

```python
%%time
parameters = {
    'n_estimators': [5, 10, 15, 20, 25, 50, 100],
    'learning_rate': [0.01, 0.05, 0.1],
    'num_leaves': [7, 15, 31],
}

model_lgbm = lgbm.LGBMClassifier(
    random_state=SEED,
    class_weight='balanced',
)

model_lgbm = GridSearchCV(
    model_lgbm,
    parameters,
    cv=5,
    scoring='accuracy',
)
model_lgbm.fit(
    X_train,
    y_train,
    categorical_feature=categorical_columns,
)

print('-----')
print(f'Best parameters {model_lgbm.best_params_}')
print(
    f'Mean cross-validated accuracy score of the best_estimator: '
    f'{model_lgbm.best_score_:.3f}'
)

cross_valid_scores['lightgbm'] = model_lgbm.best_score_
print('-----')
```
## CatBoost

```python
%%time
parameters = {
    'iterations': [5, 10, 15, 20, 25, 50, 100],
    'learning_rate': [0.01, 0.05, 0.1],
    'depth': [3, 5, 7, 9, 11, 13],
}

model_catboost = cb.CatBoostClassifier(
    verbose=False,
)

model_catboost = GridSearchCV(
    model_catboost,
    parameters,
    cv=5,
    scoring='accuracy',
)
model_catboost.fit(X_train, y_train)

print('-----')
print(f'Best parameters {model_catboost.best_params_}')
print(
    f'Mean cross-validated accuracy score of the best_estimator: '
    f'{model_catboost.best_score_:.3f}'
)

cross_valid_scores['catboost'] = model_catboost.best_score_
print('-----')
```
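Once every search above has run, the collected scores can be compared in one place; a minimal sketch, assuming `cross_valid_scores` was filled as each model finished:

```python
# Rank the models by their best mean cross-validated accuracy.
scores = pd.Series(cross_valid_scores).sort_values(ascending=False)
print(scores)
```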