# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import os

for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
import pandas as pd
import numpy as np
import random
import os

from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, GridSearchCV
from sklearn.tree import DecisionTreeClassifier, export_graphviz

import lightgbm as lgb
import catboost as ctb

import graphviz
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.simplefilter('ignore')
!pip install kaggle
Requirement already satisfied: kaggle in /usr/local/lib/python3.7/dist-packages (1.5.12)
Requirement already satisfied: python-dateutil in /usr/local/lib/python3.7/dist-packages (from kaggle) (2.8.1)
Requirement already satisfied: urllib3 in /usr/local/lib/python3.7/dist-packages (from kaggle) (1.24.3)
Requirement already satisfied: requests in /usr/local/lib/python3.7/dist-packages (from kaggle) (2.23.0)
Requirement already satisfied: six>=1.10 in /usr/local/lib/python3.7/dist-packages (from kaggle) (1.15.0)
Requirement already satisfied: certifi in /usr/local/lib/python3.7/dist-packages (from kaggle) (2020.12.5)
Requirement already satisfied: python-slugify in /usr/local/lib/python3.7/dist-packages (from kaggle) (4.0.1)
Requirement already satisfied: tqdm in /usr/local/lib/python3.7/dist-packages (from kaggle) (4.41.1)
Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests->kaggle) (3.0.4)
Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests->kaggle) (2.10)
Requirement already satisfied: text-unidecode>=1.3 in /usr/local/lib/python3.7/dist-packages (from python-slugify->kaggle) (1.3)
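The cells below reference train_df, test_df, and all_df, but the loading step is missing from this excerpt. A minimal loading sketch, assuming the competition's train.csv and test.csv sit under ../input/ (the exact paths are an assumption) and that all_df is simply train and test concatenated so both receive identical transforms:

# Hypothetical loading step -- the input paths are assumptions, not the author's.
train_df = pd.read_csv('../input/train.csv')
test_df = pd.read_csv('../input/test.csv')

# all_df, used by the preprocessing cells below, is assumed to be the
# concatenation of train and test so both get the same feature engineering.
all_df = pd.concat([train_df, test_df], ignore_index=True)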
print('Rows and Columns in train dataset:', train_df.shape)
print('Rows and Columns in test dataset:', test_df.shape)
Rows and Columns in train dataset: (100000, 12)
Rows and Columns in test dataset: (100000, 11)
Print the number of missing values
print('Missing values per column in train dataset')
for col in train_df.columns:
    temp_col = train_df[col].isnull().sum()
    print(f'{col}: {temp_col}')
print()
print('Missing values per column in test dataset')
for col in test_df.columns:
    temp_col = test_df[col].isnull().sum()
    print(f'{col}: {temp_col}')
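For reference, pandas can produce the same per-column counts with one vectorized call per frame; a brief equivalent sketch:

# isnull().sum() returns a Series of per-column missing-value counts.
print(train_df.isnull().sum())
print(test_df.isnull().sum())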
# Cabin: fill missing values with 'X', then keep only the first character of the string.
# strip() removes surrounding whitespace from that single character.
all_df['Cabin'] = all_df['Cabin'].fillna('X').map(lambda x: x[0].strip())
#print(all_df['Ticket'].head(10))
# Ticket: fill missing values with 'X', then split the string and keep the first token.
# split() defaults to whitespace, which works because ticket prefixes are space-separated:
# e.g. 'A/5 21171' -> 'A/5', while a bare number like '113803' -> 'X'.
all_df['Ticket'] = all_df['Ticket'].fillna('X').map(lambda x: str(x).split()[0] if len(str(x).split()) > 1 else 'X')
# Build a Pclass -> median Fare mapping as a dictionary.
fare_map = all_df[['Fare', 'Pclass']].dropna().groupby('Pclass').median().to_dict()
# Fill each missing Fare with the median fare of that row's Pclass.
all_df['Fare'] = all_df['Fare'].fillna(all_df['Pclass'].map(fare_map['Fare']))
# Fares contain a few unusually high and low values, so take log1p to reduce the influence of outliers.
all_df['Fare'] = np.log1p(all_df['Fare'])
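The two cross-validation loops below use kfold, lgbm_model, cat_model, and the accumulators n_iter, cv_accuracy, and feature_importances without defining them in this excerpt. A minimal setup sketch, assuming a plain shuffled 5-fold split and near-default hyperparameters (the split type, seeds, and the 'Survived' target column name are all assumptions; iterations=1000 matches the CatBoost logs below). Note that the accumulators are shared across both loops, which is why the CatBoost folds are numbered 6 through 10:

# Hypothetical setup -- hyperparameters and split choice are assumptions.
kfold = KFold(n_splits=5, shuffle=True, random_state=0)

lgbm_model = lgb.LGBMClassifier(random_state=0)
cat_model = ctb.CatBoostClassifier(iterations=1000, random_seed=0)

# Features and labels for CV; the target column name is an assumption.
train_kf_feature = train_df.drop(columns=['Survived'])
train_kf_label = train_df['Survived']

# Shared accumulators: both CV loops append to these, so fold numbering continues.
n_iter = 0
cv_accuracy = []
feature_importances = pd.DataFrame()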
for train_idx, test_idx in kfold.split(train_kf_feature, train_kf_label):
    X_train = train_kf_feature.iloc[train_idx]
    X_test = train_kf_feature.iloc[test_idx]
    y_train, y_test = train_kf_label.iloc[train_idx], train_kf_label.iloc[test_idx]

    # Train
    lgbm_model.fit(X_train, y_train)
    # Predict
    fold_pred = lgbm_model.predict(X_test)
    # Accuracy
    n_iter += 1
    fold_accuracy = accuracy_score(y_test, fold_pred)
    print('\n Fold {} cross-validation accuracy: {}, train size: {}, validation size: {}'
          .format(n_iter, fold_accuracy, X_train.shape[0], X_test.shape[0]))
    cv_accuracy.append(fold_accuracy)

    # Feature importance
    fi_tmp = pd.DataFrame()
    fi_tmp["feature"] = X_train.columns.to_list()
    fi_tmp["importance"] = lgbm_model.feature_importances_
    feature_importances = pd.concat([feature_importances, fi_tmp])
print('\n Mean validation accuracy: ', np.mean(cv_accuracy))
Fold 1 cross-validation accuracy: 0.78015, train size: 80000, validation size: 20000
Fold 2 cross-validation accuracy: 0.7824, train size: 80000, validation size: 20000
Fold 3 cross-validation accuracy: 0.78185, train size: 80000, validation size: 20000
Fold 4 cross-validation accuracy: 0.7816, train size: 80000, validation size: 20000
Fold 5 cross-validation accuracy: 0.7809, train size: 80000, validation size: 20000
Mean validation accuracy: 0.78138
for train_idx, test_idx in kfold.split(train_kf_feature, train_kf_label):
    X_train = train_kf_feature.iloc[train_idx]
    X_test = train_kf_feature.iloc[test_idx]
    y_train, y_test = train_kf_label.iloc[train_idx], train_kf_label.iloc[test_idx]

    # Train (log progress every 500 iterations)
    cat_model.fit(X_train, y_train, verbose=500)
    # Predict
    fold_pred = cat_model.predict(X_test)
    # Accuracy
    n_iter += 1
    fold_accuracy = accuracy_score(y_test, fold_pred)
    print('\n Fold {} cross-validation accuracy: {}, train size: {}, validation size: {}'
          .format(n_iter, fold_accuracy, X_train.shape[0], X_test.shape[0]))
    cv_accuracy.append(fold_accuracy)

    # Feature importance -- note the API differs from LightGBM's.
    fi_tmp = pd.DataFrame()
    fi_tmp["feature"] = X_test.columns.to_list()
    fi_tmp["importance"] = cat_model.get_feature_importance()
    feature_importances = pd.concat([feature_importances, fi_tmp])
print('\n Mean validation accuracy: ', np.mean(cv_accuracy))
0: learn: 0.6881430 total: 11.2ms remaining: 11.2s
500: learn: 0.4620724 total: 5.17s remaining: 5.15s
999: learn: 0.4513527 total: 10.2s remaining: 0us
Fold 6 cross-validation accuracy: 0.77945, train size: 80000, validation size: 20000
0: learn: 0.6881914 total: 12.4ms remaining: 12.3s
500: learn: 0.4635447 total: 5.02s remaining: 5s
999: learn: 0.4529141 total: 10.2s remaining: 0us
Fold 7 cross-validation accuracy: 0.78335, train size: 80000, validation size: 20000
0: learn: 0.6881970 total: 13.6ms remaining: 13.6s
500: learn: 0.4635994 total: 5.2s remaining: 5.18s
999: learn: 0.4529137 total: 10.3s remaining: 0us
Fold 8 cross-validation accuracy: 0.78265, train size: 80000, validation size: 20000
0: learn: 0.6882583 total: 11.2ms remaining: 11.2s
500: learn: 0.4622575 total: 5.08s remaining: 5.06s
999: learn: 0.4513804 total: 10.1s remaining: 0us
Fold 9 cross-validation accuracy: 0.7821, train size: 80000, validation size: 20000
0: learn: 0.6882789 total: 15.4ms remaining: 15.3s
500: learn: 0.4630108 total: 5.1s remaining: 5.08s
999: learn: 0.4522854 total: 10.1s remaining: 0us
Fold 10 cross-validation accuracy: 0.7802, train size: 80000, validation size: 20000
Mean validation accuracy: 0.78155
# just to get ideas to improve
# Note: feature_importances accumulates rows from both the LightGBM and CatBoost
# loops above, so the bars average importances across both models' folds.
order = list(feature_importances.groupby("feature").mean()
             .sort_values("importance", ascending=False).index)

plt.figure(figsize=(10, 10))
sns.barplot(x="importance", y="feature", data=feature_importances, order=order)
plt.title("{} importance".format("CatBoostClassifier"))
plt.tight_layout()