[천재교육] 결정트리시각화, 배깅, 랜덤포레스트 부스팅 실전

728x90

결정트리 모델의 시각화

from sklearn.tree import DecisionTreeClassifier # 결정트리모델 불러오기
from sklearn.datasets import load_iris # 그 유명한 붓꽃데이터
from sklearn.model_selection import train_test_split # 학습/테스트 데이터 나누는 함수
import warnings
warnings.filterwarnings('ignore') # warning 뜨는걸 무시한다

# Create the decision tree classifier; the fixed random_state keeps the
# fitted tree reproducible between runs.
dt_clf = DecisionTreeClassifier(random_state=156)

# Load the iris dataset.
iris_data = load_iris()

# Split into training and test sets.
# Features come from iris_data.data, labels from iris_data.target, and
# test_size=0.2 holds out 20% of the samples for testing.
# The call is wrapped in parentheses on the assignment line: a bare line
# break directly after "=" is a SyntaxError.
X_train, X_test, y_train, y_test = train_test_split(
    iris_data.data, iris_data.target, test_size=0.2, random_state=11
)

# Fit the classifier on the training split; the learned tree is kept in dt_clf.
dt_clf.fit(X_train, y_train)

# accuracy, Confusion Matrix 쓰기 위해 불러오기
import sklearn.metrics as mt

# Use the model fitted on X_train / y_train (dt_clf) to predict the labels
# that correspond to X_test.
y_pred = dt_clf.predict(X_test)

# Compute the accuracy and the confusion matrix by comparing the held-out
# 20% of labels (y_test) with the labels predicted by the fitted model
# (y_pred).
accuracy = mt.accuracy_score(y_test, y_pred)
Confusion_matrix = mt.confusion_matrix(y_test, y_pred)

accuracy
=> 0.9333

Confusion_matrix
=>
[[ 9  0  0] # 클래스가 0인 데이터셋
[ 0 10  0] # 클래스가 1인 데이터셋
[ 0  2  9]] # 클래스가 2인 데이터셋

# 첫번째 컬럼은 0 예측
# 두번째 컬럼은 1 예측
# 세번째 컬럼은 2 예측

# export_graphviz()의 호출 결과로 out_file로 지정된 tree.dot 파일을 생성함.
from sklearn.tree import export_graphviz

# Export the fitted tree to tree.dot, labelling nodes with the iris class and
# feature names. The opening parenthesis must sit on the same line as the
# function name; otherwise the function is never called and the keyword
# arguments are a SyntaxError.
export_graphviz(
    dt_clf,
    out_file="tree.dot",
    class_names=iris_data.target_names,
    feature_names=iris_data.feature_names,
    impurity=True,
    filled=True,
)

# 위에서 생성된 tree.dot 파일을 Graphviz 읽어서 Jupyter Notebook상에서 시각화
import graphviz

# Read the DOT source back and hand it to Graphviz. As the last expression of
# a notebook cell, the Source object is what gets displayed.
# export_graphviz writes the file as UTF-8, so read it with the same encoding
# instead of relying on the platform default.
with open("tree.dot", encoding="utf-8") as f:
    dot_graph = f.read()
graphviz.Source(dot_graph)

=>

# 피쳐네임 확인
iris_data.feature_names
=>
['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']

# 씨본으로 그려보자
import seaborn as sns
import numpy as np
%matplotlib inline

# Pull the feature importances out of the fitted tree, rounded to 3 decimals,
# and print them as a single array.
rounded_importances = np.round(dt_clf.feature_importances_, decimals=3)
print("Feature importances:\n{0}".format(rounded_importances))
=>
Feature importances: [0.025 0. 0.555 0.42 ]

# Pair each feature name with its importance and print one line per feature.
# The loop body must be indented; at column 0 it is an IndentationError.
for name, value in zip(iris_data.feature_names, dt_clf.feature_importances_):
    print('{0} : {1:.3f}'.format(name, value))
=>
sepal length (cm) : 0.025
sepal width (cm) : 0.000
petal length (cm) : 0.555
petal width (cm) : 0.420

# Bar chart of the importances: importance values on the x axis, one bar per
# feature name on the y axis.
sns.barplot(y=iris_data.feature_names, x=dt_clf.feature_importances_)

=>

분석 목표 : 다음달에 탈퇴하는 회원의 현재달의 징조 파악하기

데이터 전처리

import pandas as pd

from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings('ignore')

# Load both membership tables, using the first CSV column as the index.
# class0_data comes from the expired/withdrawn-members file and class1_data
# from the regular-members file (per the file names).
class0_data, class1_data = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ('./datasets/만료및탈퇴회원.csv', './datasets/정회원.csv')
)
display(class0_data, class1_data)

탈퇴 회원인지 정회원인지를 구분하는 문제 --> 학생이 다음 달에 탈퇴할지를 그 전 달(현재 달)의 데이터로 예측
분석 목표 : 다음달에 탈퇴하는 회원의 현재달의 징조 파악하기


## October rows of members who withdrew in November -> "will exit" samples.
# Series.isin(values) returns a boolean mask marking the elements that are
# contained in values.
exit_user = class0_data[class0_data['mm'] == 11]['userid'].unique()
# .copy() makes exit_data an independent frame, so adding the target column
# below is not an assignment on a filtered slice of class1_data
# (which triggers pandas' SettingWithCopyWarning).
exit_data = class1_data[(class1_data['userid'].isin(exit_user)) & (class1_data['mm'] == 10)].copy()

# Label: 0 = WILL EXIT.
exit_data['target'] = 0

## October rows of members who are regular members in November -> "will stay" samples.
regular_user = class1_data[class1_data['mm'] == 11]['userid'].unique()
regular_data = class1_data[(class1_data['userid'].isin(regular_user)) & (class1_data['mm'] == 10)].copy()

# Label: 1 = WILL STAY.
regular_data['target'] = 1

# NOTE(review): a userid present in both exit_user and regular_user would
# contribute an October row under each label -- confirm the two sets are
# disjoint.
dataset = pd.concat([exit_data, regular_data])
dataset['target'].value_counts()
=>
1 4128
0 1251
Name: target, dtype: int64

# 이제 학습/테스트 데이터에 쓰일 X(입력 feature)와 Y(target) 설정
# X: 회원이 탈퇴할지 유지할지에 영향을 줄 것 같은 요소들
# Y: 탈퇴/유지 결과. 즉, dataset['target']

# Columns used as model inputs. The list is defined once so that the
# missing-value filter and the feature matrix below cannot drift apart.
feature_columns = [
    'point_gain_activeday_count', 'point_gain_count', 'point_gain',
    'point_loss_activeday_count', 'point_loss_count', 'point_loss',
    'tablet_activeday_count', 'tablet_moved_menu_count', 'tablet_leave_count', 'tablet_resume_count',
    'tablet_login_count', 'tablet_logout_count', 'study_activeday_count', 'study_count',
    'study_notcompleted_count', 'study_completed_count',
    'study_restart_count', 'total_system_learning_time', 'total_caliper_learning_time',
    'media_activeday_count', 'media_count',
    'video_action_count', 'video_start_count', 'video_restart_count',
    'video_pause_count', 'video_jump_count', 'video_resume_count',
    'video_speed_count', 'video_volume_count', 'video_end_count',
    'test_activeday_count', 'test_count', 'test_average_score',
    'test_item_count', 'test_correct_count', 'wrong_count',
    'wrong_item_count', 'wrong_correct_count',
]

# Drop every row that has a missing value in any of the feature columns.
dataset = dataset.dropna(subset=feature_columns)

# X: feature matrix, Y: target labels (0 = will exit, 1 = will stay).
X = dataset[feature_columns]
Y = dataset['target']

# Hold out 20% of the rows for testing.
# random_state makes the split reproducible, like the other seeded calls in
# this file; stratify=Y keeps the exit/stay ratio equal in both splits, which
# matters because the target classes are imbalanced.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=0, stratify=Y
)

모델적용 - Bagging (배깅)

from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import accuracy_score, recall_score, precision_score, roc_auc_score, confusion_matrix
# Base learner: logistic regression with the liblinear solver.
lr_clf = LogisticRegression(solver='liblinear')

# Wrap the base learner in a bagging ensemble.
# The estimator is passed positionally because the keyword was renamed from
# `base_estimator` to `estimator` (deprecated in scikit-learn 1.2, removed in
# 1.4); the positional form works before and after the rename.
# random_state makes the bootstrap sampling reproducible, matching the other
# seeded models in this file.
bagging_clf = BaggingClassifier(lr_clf, random_state=0)

# Fit the BaggingClassifier and predict the test labels.
bagging_clf.fit(X_train, y_train)
pred = bagging_clf.predict(X_test)

# Evaluate.
# NOTE(review): recall/precision use the default positive label (1), and
# roc_auc_score is fed hard labels from predict() rather than scores from
# predict_proba(); confirm this is the intended way to score the model.
accuracy = accuracy_score(y_test, pred)
recall = recall_score(y_test, pred)
precision = precision_score(y_test, pred)
auc = roc_auc_score(y_test, pred)
matrix = confusion_matrix(y_test, pred)

print('Bagging 분류기 정확도: {0:.4f}'.format(accuracy))
print('Bagging 분류기 Recall: {0:.4f}'.format(recall))
print('Bagging 분류기 Precision: {0:.4f}'.format(precision))
print('Bagging 분류기 AUC: {0:.4f}'.format(auc))
print('Bagging 분류기 Confusion Matrix:','\n', matrix)

=>

Bagging 분류기 정확도: 0.8232
Bagging 분류기 Recall: 0.9926
Bagging 분류기 Precision: 0.8282
Bagging 분류기 AUC: 0.4963
Bagging 분류기
Confusion Matrix:
[[ 0 112]
[ 4 540]]

# Train, predict with, and evaluate the single (non-bagged) model so it can be
# compared with the bagging ensemble.
lr_clf.fit(X_train, y_train)
pred = lr_clf.predict(X_test)
class_name = type(lr_clf).__name__

# The metrics are independent of each other; all compare y_test with pred.
# NOTE(review): roc_auc_score receives hard labels from predict() here, not
# probability scores -- confirm that is intended.
matrix = confusion_matrix(y_test, pred)
auc = roc_auc_score(y_test, pred)
precision = precision_score(y_test, pred)
recall = recall_score(y_test, pred)
accuracy = accuracy_score(y_test, pred)

print('{0} 정확도: {1:.4f}'.format(class_name, accuracy))
print('{0} Recall: {1:.4f}'.format(class_name, recall))
print('{0} Precision: {1:.4f}'.format(class_name, precision))
print('{0} AUC: {1:.4f}'.format(class_name, auc))
print('{0} Confusion Matrix:'.format(class_name),'\n', matrix)

=>

LogisticRegression 정확도: 0.8232
LogisticRegression Recall: 0.9926
LogisticRegression Precision: 0.8282
LogisticRegression AUC: 0.4963
LogisticRegression Confusion Matrix:
[[ 0 112]
[ 4 540]]

모델적용 - 랜덤포레스트

from sklearn.ensemble import RandomForestClassifier

# Random forest: 100 trees, each limited to depth 8, with a fixed seed.
rf_clf = RandomForestClassifier(max_depth=8, n_estimators=100, random_state=0)
rf_clf.fit(X_train, y_train)
pred = rf_clf.predict(X_test)

# Score the predictions against the held-out labels.
# NOTE(review): roc_auc_score receives hard labels from predict() here, not
# probability scores -- confirm that is intended.
matrix = confusion_matrix(y_test, pred)
auc = roc_auc_score(y_test, pred)
precision = precision_score(y_test, pred)
recall = recall_score(y_test, pred)
accuracy = accuracy_score(y_test, pred)

print('랜덤 포레스트 정확도: {0:.4f}'.format(accuracy))
print('랜덤 포레스트 Recall: {0:.4f}'.format(recall))
print('랜덤 포레스트 Precision: {0:.4f}'.format(precision))
print('랜덤 포레스트 AUC: {0:.4f}'.format(auc))
print('랜덤 포레스트 Confusion Matrix:','\n', matrix)

=>

랜덤 포레스트 정확도: 0.8186
랜덤 포레스트 Recall: 0.9871
랜덤 포레스트 Precision: 0.8274
랜덤 포레스트 AUC: 0.4936
랜덤 포레스트 Confusion Matrix:
[[ 0 112]
[ 7 537]]

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Label the random forest's feature importances with the training column
# names, sort them in descending order and keep the first 20 entries.
ftr_importances_values = rf_clf.feature_importances_
ftr_importances = pd.Series(data=ftr_importances_values, index=X_train.columns)
ftr_top20 = ftr_importances.sort_values(ascending=False).head(20)
ftr_top20

=>

point_gain 0.090967
test_average_score 0.047108
point_gain_count 0.046371
test_correct_count 0.042594
total_system_learning_time 0.039931
point_gain_activeday_count 0.032420
tablet_moved_menu_count 0.032343
test_item_count 0.031226
study_count 0.030450
total_caliper_learning_time 0.029988
tablet_leave_count 0.029329
video_end_count 0.028607
video_start_count 0.027604
study_restart_count 0.027581
test_activeday_count 0.026230
media_count 0.025987
tablet_resume_count 0.025507
media_activeday_count 0.025164
video_action_count 0.024785
test_count 0.023558
dtype: float64

# Bar chart of the 20 largest feature importances on an 8x6 inch figure:
# importance on the x axis, feature name on the y axis.
fig, ax = plt.subplots(figsize=(8, 6))
ax.set_title('Feature importances Top 20')
sns.barplot(x=ftr_top20, y=ftr_top20.index, ax=ax)
plt.show()

=>

모델적용 - 그래디언트 부스팅

from sklearn.ensemble import GradientBoostingClassifier
import time
import warnings
warnings.filterwarnings('ignore')

# Record the start time so the elapsed GBM run time can be reported at the end.
start_time = time.time()

# Gradient boosting with default hyperparameters and a fixed seed.
gb_clf = GradientBoostingClassifier(random_state=0)
gb_clf.fit(X_train, y_train)
gb_pred = gb_clf.predict(X_test)

# Score the predictions against the held-out labels.
# NOTE(review): roc_auc_score receives hard labels from predict() here, not
# probability scores -- confirm that is intended.
matrix = confusion_matrix(y_test, gb_pred)
auc = roc_auc_score(y_test, gb_pred)
precision = precision_score(y_test, gb_pred)
recall = recall_score(y_test, gb_pred)
accuracy = accuracy_score(y_test, gb_pred)

print('GBM 정확도: {0:.4f}'.format(accuracy))
print('GBM Recall: {0:.4f}'.format(recall))
print('GBM Precision: {0:.4f}'.format(precision))
print('GBM AUC: {0:.4f}'.format(auc))
print('GBM Confusion Matrix:','\n', matrix)

# Elapsed time covers fitting, prediction, scoring and the prints above.
elapsed_seconds = time.time() - start_time
print("GBM 수행 시간: {0:.1f} 초 ".format(elapsed_seconds))

=>

GBM 정확도: 0.8171
GBM Recall: 0.9798
GBM Precision: 0.8302
GBM AUC: 0.5033
GBM Confusion Matrix:
[[ 3 109]
[ 11 533]]
GBM 수행 시간: 1.7 초

728x90

저작자표시 (새창열림)

'#02.천재교육 빅데이터 > +06.머신러닝 기초' 카테고리의 다른 글

[천재교육] 다항회귀, 학습곡선(Learning Curve) (0)	2023.03.15
[천재교육] 회귀 (Regression) (0)	2023.03.15
[천재교육] 앙상블학습 - 보팅, 배깅(랜덤포레스트), 부스팅, 스태깅 (0)	2023.03.15
[천재교육] 결정트리, KNN(K-Nearest Neighbor) 알고리즘 (0)	2023.03.14
[천재교육] 분류(Classification) (0)	2023.03.14

돌비오의 개발일지

[천재교육] 결정트리시각화, 배깅, 랜덤포레스트 부스팅 실전

'#02.천재교육 빅데이터 > +06.머신러닝 기초' 카테고리의 다른 글

티스토리툴바

[천재교육] 결정트리시각화, 배깅, 랜덤포레스트 부스팅 실전

'#02.천재교육 빅데이터 > +06.머신러닝 기초' 카테고리의 다른 글

관련글

티스토리툴바