Data Analysis / ADP Certification Study

(파이썬 한권으로 끝내기) Machine Learning Classification Algorithms – Logistic Regression, SVM, KNN, Decision Trees, Ensemble/Voting, Naive Bayes, Artificial Neural Networks

나르시스트 2026. 4. 5. 17:59

*Recommended algorithms for regression analysis

*Recommended algorithms for classification analysis

1. Logistic Regression

Logistic regression uses the sigmoid function to predict a probability, then assigns a class label based on that probability.
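As a reminder of the mechanics, the sigmoid squashes any real-valued score z into (0, 1); a minimal numpy sketch (the sample scores are illustrative):

import numpy as np

def sigmoid(z):
    # squashes a real-valued score into the interval (0, 1)
    return 1 / (1 + np.exp(-z))

scores = np.array([-3.0, -1.0, 0.0, 1.0, 3.0])
print(sigmoid(scores))  # exactly 0.5 at z = 0; scores above 0 map to the positive class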

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_curve


body = pd.read_csv('data/bodyPerformance.csv')
print(body)

body['gender'] = np.where(body['gender'] == 'M', 0, 1)
body['class_1'] = np.where(body['class'] == 'A', 1, 0)

feature_columns = list(body.columns.difference(['class', 'class_1']))
x = body[feature_columns]
y = body['class_1']

train_x, test_x, train_y, test_y = train_test_split(x, y, stratify=y, train_size=0.7, random_state=1)
print(train_x.shape, test_x.shape, train_y.shape, test_y.shape)

## 0. Fit a baseline logistic regression model
logR = LogisticRegression()
logR.fit(train_x, train_y)

## 1. Plot the logistic regression curve
proba = pd.DataFrame(logR.predict_proba(train_x)) 
cs = logR.decision_function(train_x)  # Confidence Score

df = pd.concat([proba, pd.DataFrame(cs)], axis=1)
df.columns = ['Not A', 'A', 'decision_function']
df.sort_values(['decision_function'], inplace=True)
df.reset_index(inplace=True, drop=True)
print(df)

plt.figure(figsize=(15,5))

plt.axhline(y=0.5, linestyle='--', color='black', linewidth=1)
plt.axvline(x=0, linestyle='--', color='black', linewidth=1)
plt.plot(df['decision_function'], df['Not A'], 'g--', label='Not A')
plt.plot(df['decision_function'], df['Not A'], 'g^')
plt.plot(df['decision_function'], df['A'], 'b--', label='A')
plt.plot(df['decision_function'], df['A'], 'b*')
plt.show()

 

Matching the class probability values against the confidence score yields the estimated probability of belonging to class A and the decision boundary (the blue line crosses 0.5 where the confidence score is 0).
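As a sanity check, the plotted probability curve is exactly the sigmoid applied to the decision function; a short sketch reusing logR and train_x from the code above:

z = logR.decision_function(train_x)
p_manual = 1 / (1 + np.exp(-z))                # sigmoid of the confidence score
p_sklearn = logR.predict_proba(train_x)[:, 1]  # P(class A) from the model
print(np.allclose(p_manual, p_sklearn))        # True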

## 2. Performance evaluation
pred = logR.predict(test_x)

test_cm = confusion_matrix(test_y, pred)
test_acc = accuracy_score(test_y, pred)
test_prc = precision_score(test_y, pred)
test_rcll = recall_score(test_y, pred)
test_f1 = f1_score(test_y, pred)

print(test_cm)
print('Accuracy\t{}'.format(round(test_acc*100, 2)))
print('Precision\t{}'.format(round(test_prc*100, 2)))
print('Recall\t{}'.format(round(test_rcll*100, 2)))
print('F1\t{}'.format(round(test_f1*100, 2)))


## 3. Visualize the ROC curve and AUC
def plot_roc_curve(fper, tper):
    plt.plot(fper, tper, color='red', label='ROC')
    plt.plot([0, 1], [0, 1], color='green', linestyle='--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic Curve')
    plt.legend()
    plt.show()

fper, tper, thresholds = roc_curve(test_y, logR.predict_proba(test_x)[:, 1])  # use predicted probabilities, not hard labels
plot_roc_curve(fper, tper)

*Multi-class classification – softmax regression

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_curve

body = pd.read_csv('data/bodyPerformance.csv')
print(body)

body['gender'] = np.where(body['gender'] == 'M', 0, 1)
mapping = {'A': 0, 'B': 1, 'C': 2, 'D': 3}
body['class_2'] = body['class'].map(mapping)
print(body)

feature_columns = list(body.columns.difference(['class', 'class_2']))
x = body[feature_columns]
y = body['class_2']

train_x, test_x, train_y, test_y = train_test_split(x, y, stratify=y, train_size=0.7, random_state=1)
print(train_x.shape, test_x.shape, train_y.shape, test_y.shape)

## 1. Fit the softmax regression model
softm = LogisticRegression(multi_class='multinomial', solver='lbfgs', C=10)
softm.fit(train_x, train_y)

## 2. Performance evaluation
pred = softm.predict(test_x)

test_cm = confusion_matrix(test_y, pred)
test_acc = accuracy_score(test_y, pred)
print(test_cm)
print('Accuracy\t{}'.format(round(test_acc*100, 2)))

print(softm.predict([test_x.iloc[-1, :]]))
print(softm.predict_proba([test_x.iloc[-1, :]]))  # print the probability of each class

2. Support Vector Machine (SVM)

The kernel trick maps low-dimensional data into a higher-dimensional space, making non-linear problems solvable.
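To see this in isolation, a short sketch on a synthetic dataset that no straight line can separate (make_circles and the parameter values here are illustrative, not from the book):

from sklearn.datasets import make_circles
from sklearn.svm import SVC

# two concentric circles: linearly inseparable in the original 2D space
X, y = make_circles(n_samples=300, noise=0.1, factor=0.4, random_state=0)

for kernel in ['linear', 'rbf']:
    clf = SVC(kernel=kernel).fit(X, y)
    print(kernel, clf.score(X, y))  # the RBF kernel should score far higher than linear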

import pandas as pd
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

c = pd.read_csv('data/classification_gpt.csv')
print(c)

sns.pairplot(hue='success', data=c)

x = c[['age', 'interest']]
y = c['success']

train_x, test_x, train_y, test_y = train_test_split(x, y, stratify=y, train_size=0.7, random_state=1)
print(train_x.shape, test_x.shape, train_y.shape, test_y.shape)

scaler = StandardScaler()  # SVM is sensitive to feature scale, so standardize
train_x = scaler.fit_transform(train_x)
sns.pairplot(data=pd.concat([pd.DataFrame(train_x), train_y.reset_index(drop=True)], axis=1), hue='success')

clf = SVC(C=0.5)
clf.fit(train_x, train_y)

test_x_scal = scaler.transform(test_x)

pred = clf.predict(test_x_scal)

test_cm = confusion_matrix(test_y, pred)
test_acc = accuracy_score(test_y, pred)
test_prc = precision_score(test_y, pred)
test_rcll = recall_score(test_y, pred)
test_f1 = f1_score(test_y, pred)

print(test_cm)
print('Accuracy\t{}%'.format(round(test_acc*100, 2)))
print('Precision\t{}%'.format(round(test_prc*100, 2)))
print('Recall\t{}%'.format(round(test_rcll*100, 2)))
print('F1\t{}%'.format(round(test_f1*100, 2)))

 

*Support vector regression – SVR

import pandas as pd
import numpy as np

from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error

x = np.sort(5 * np.random.rand(40, 1), axis=0)  # generate 40 points in [0, 5)

y = np.sin(x).ravel()  # non-linear target
y[::5] += 3 * (0.5 - np.random.rand(8))  # add noise to every 5th point
print(f"{x[0:6]}\n\n{y[0:10]}")

# Create SVR models - kernels: RBF, Linear, Polynomial
svr_rbf = SVR(kernel='rbf', C=100, gamma=0.1, epsilon=0.1)
svr_lin = SVR(kernel='linear', C=100, gamma='auto')
svr_poly = SVR(kernel='poly', C=100, gamma='auto', degree=3, epsilon=0.1, coef0=1)

svr_rbf.fit(x, y)
svr_lin.fit(x, y)
svr_poly.fit(x, y)
rbf_pred = svr_rbf.predict(x)
lin_pred = svr_lin.predict(x)
poly_pred = svr_poly.predict(x)

preds = [rbf_pred, lin_pred, poly_pred]
kernel = ['RBF', 'Linear', 'Polynomial']
evls = ['mse', 'rmse', 'mae']

results = pd.DataFrame(index=kernel, columns=evls)

for pred, nm in zip(preds, kernel):
    mse = mean_squared_error(y, pred)
    mae = mean_absolute_error(y, pred)
    rmse = np.sqrt(mse)

    results.loc[nm, 'mse'] = round(mse, 2)
    results.loc[nm, 'rmse'] = round(rmse, 2)
    results.loc[nm, 'mae'] = round(mae, 2)
    
print(results)

3. KNN Classification

– Finds the 'K' nearest neighbors among data points that already have labels
– Euclidean distance is the most commonly used metric
– Choosing K well is critical! Too small and the model becomes sensitive to noise; too large and the decision boundary blurs (see the sketch below)
– Because it is non-parametric, it often predicts well when the decision boundary is highly irregular
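A quick way to feel the K trade-off is a cross-validated sweep; a small sketch (iris and the K values are illustrative choices, not from the book):

from sklearn.datasets import load_iris
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier

# small K overfits to noise, large K oversmooths the boundary
X, y = load_iris(return_X_y=True)
for k in [1, 5, 15, 50]:
    score = cross_val_score(KNeighborsClassifier(n_neighbors=k), X, y, cv=5).mean()
    print(f"K={k}: CV accuracy={score:.3f}")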

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

### Download the data: https://www.kaggle.com/datasets/uciml/indian-liver-patient-records
liver = pd.read_csv('data/indian_liver_patient.csv')
print(liver.Dataset.unique())

liver.Gender = np.where(liver.Gender=='Female', 0, 1)  # binary encoding: Female=0, Male=1
liver.dropna(axis=0, inplace=True)


x = liver[liver.columns.difference(['Dataset'])]
y = liver['Dataset']

train_x, test_x, train_y, test_y = train_test_split(x, y, stratify=y, train_size=0.7, random_state=1)
print(train_x.shape, test_x.shape, train_y.shape, test_y.shape)

clf = KNeighborsClassifier(n_neighbors=15, weights='uniform')
clf.fit(train_x, train_y)

pred = clf.predict(test_x)

test_cm = confusion_matrix(test_y, pred)
test_acc = accuracy_score(test_y, pred)
test_prc = precision_score(test_y, pred)
test_rcll = recall_score(test_y, pred)
test_f1 = f1_score(test_y, pred)

print(test_cm)
print('\n')
print('Accuracy\t{}%'.format(round(test_acc*100, 2)))
print('Precision\t{}%'.format(round(test_prc*100, 2)))
print('Recall\t{}%'.format(round(test_rcll*100, 2)))
print('F1\t{}%'.format(round(test_f1*100, 2)))

 

*KNN Regression

– A prediction for a given input is simply the average of the target values of its nearest neighbors (see the sketch below)
– Consequently, KNN regression has no regression coefficients expressing the influence of each independent variable on the dependent variable
– If the X-axis is time, a KNN regression model predicts a local average at each time point → it can also be used for time-series analysis
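A tiny sketch of both points, with made-up toy numbers (illustrative only):

import numpy as np
from sklearn.neighbors import KNeighborsRegressor

# a prediction is just the mean of the K nearest targets; no coefficients exist
X = np.array([[0.], [1.], [2.], [3.]])
y = np.array([0., 1., 4., 9.])
knn = KNeighborsRegressor(n_neighbors=2).fit(X, y)
print(knn.predict([[1.4]]))   # nearest neighbors are x=1 and x=2 -> (1 + 4) / 2 = 2.5
print(hasattr(knn, 'coef_'))  # False: there is nothing like a regression coefficient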

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error

np.random.seed(0)
x = np.sort(5 * np.random.rand(400, 1), axis=0)  # generate 400 points in [0, 5)
T = np.linspace(0, 5, 500)[:, np.newaxis]  # dense prediction grid (useful for plotting; unused below)
y = np.sin(x).ravel()  # non-linear target
y[::1] += 1 * (0.5 - np.random.rand(400))  # add noise to every point
print(x[0:6], '\n\n', y[0:10])

train_x, test_x, train_y, test_y = train_test_split(x, y, train_size=0.7, random_state=1)
print(train_x.shape, test_x.shape, train_y.shape, test_y.shape)

knn_uni = KNeighborsRegressor(n_neighbors=20, weights='uniform')
knn_dis = KNeighborsRegressor(n_neighbors=20, weights='distance')

knn_uni.fit(train_x, train_y)
knn_dis.fit(train_x, train_y)

uni_pred = knn_uni.predict(test_x)
dis_pred = knn_dis.predict(test_x)

preds = [uni_pred, dis_pred]
weights = ['uniform', 'distance']
evls = ['mse', 'rmse', 'mae']

results = pd.DataFrame(index=weights, columns=evls)

for pred, nm in zip(preds, weights):
    mse = mean_squared_error(test_y, pred)
    mae = mean_absolute_error(test_y, pred)
    rmse = np.sqrt(mse)

    results.loc[nm, 'mse'] = round(mse, 2)
    results.loc[nm, 'rmse'] = round(rmse, 2)
    results.loc[nm, 'mae'] = round(mae, 2)

print(results)

4. Decision Tree Classification

– Requires no assumptions such as linearity or normality, so model performance is not heavily affected by preprocessing

  • Discrete target variable – classification analysis
    – Split criteria: Gini index, entropy, p-value of the chi-squared statistic (computed by hand in the sketch below)
  • Continuous target variable – regression analysis
    – ANOVA F-statistic, reduction in variance
  • Feature importance – expressed as a number between 0 and 1

→ Ensemble methods were proposed to overcome the weaknesses of a single decision tree
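For intuition, the two classification split criteria can be computed by hand; a short sketch in plain numpy (the class proportions are illustrative):

import numpy as np

def gini(p):
    # Gini index: 1 - sum(p_k^2); 0 for a pure node, larger when classes are mixed
    p = np.asarray(p)
    return 1 - np.sum(p ** 2)

def entropy(p):
    # entropy: -sum(p_k * log2(p_k)); also 0 for a pure node
    p = np.asarray(p)
    p = p[p > 0]
    return -np.sum(p * np.log2(p))

print(gini([0.5, 0.5]), entropy([0.5, 0.5]))  # most impure two-class node: 0.5, 1.0
print(gini([1.0, 0.0]), entropy([1.0, 0.0]))  # pure node: both 0 (entropy may print as -0.0)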

import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, classification_report, RocCurveDisplay, roc_auc_score

credit = pd.read_csv('data/credit_final_gpt.csv')
feature_columns = list(credit.columns.difference(['credit.rating']))

x = credit[feature_columns]
y = credit['credit.rating']

train_x, test_x, train_y, test_y = train_test_split(x, y, stratify=y, train_size=0.3, random_state=1)
print(train_x.shape, test_x.shape, train_y.shape, test_y.shape)

clf = DecisionTreeClassifier(max_depth=5)
clf.fit(train_x, train_y)

pred = clf.predict(test_x)

test_cm = confusion_matrix(test_y, pred)
test_acc = accuracy_score(test_y, pred)
test_prc = precision_score(test_y, pred)
test_rcll = recall_score(test_y, pred)
test_f1 = f1_score(test_y, pred)

print(test_cm)
print('\n')
print('Accuracy\t{}%'.format(round(test_acc*100, 2)))
print('Precision\t{}%'.format(round(test_prc*100, 2)))
print('Recall\t{}%'.format(round(test_rcll*100, 2)))
print('F1\t{}%'.format(round(test_f1*100, 2)))

report = classification_report(test_y, pred)
print(report)

RocCurveDisplay.from_estimator(clf, test_x, test_y)  # formerly plot_roc_curve
plt.show()
R_A_score = roc_auc_score(test_y, clf.predict_proba(test_x)[:, 1])
print("ROC_AUC_score: ", R_A_score)

*Computing feature importance

importances = clf.feature_importances_
column_nm = pd.DataFrame(x.columns)
feature_importances = pd.concat([column_nm, pd.DataFrame(importances)], axis=1)
feature_importances.columns = ['feature_nm', 'importances']
print(feature_importances)

*Visualizing the decision tree

import pandas as pd
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier, plot_tree
import matplotlib.pyplot as plt

iris = load_iris()
X = iris.data
y = iris.target

clf = DecisionTreeClassifier(max_depth=3)
clf.fit(X, y)

plt.figure(figsize=(12, 8))
plot_tree(clf, filled=True, feature_names=iris.feature_names, class_names=iris.target_names, rounded=True)
plt.show()

※ DecisionTreeRegressor(random_state=42)

5. Ensemble

  • Bagging – trains multiple decision trees in parallel
  • Boosting – trains multiple decision trees sequentially
    *Each tree puts more weight on the samples the previous tree misclassified, learning in a way that gradually reduces the error
    *XGBoost, AdaBoost, Gradient Boosting, etc.
  • Random forest – a form of bagging that trains many decision trees, but each tree uses a randomly selected subset of the features

 

*Bagging

Bagging is effective at improving performance while preventing overfitting.

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

from sklearn.model_selection import train_test_split

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import confusion_matrix, RocCurveDisplay

breast = pd.read_csv('data/breast_cancer_gpt.csv')
plt.figure()
sns.countplot(x='diagnosis', data=breast)

breast['diagnosis'] = np.where(breast['diagnosis']=='M', 1, 0)
features = ['area_mean', 'area_worst']

x = breast[features]
y = breast['diagnosis']

train_x, test_x, train_y, test_y = train_test_split(x, y, stratify=y, train_size=0.3, random_state=1)
print(train_x.shape, test_x.shape, train_y.shape, test_y.shape)

clf = BaggingClassifier(estimator=DecisionTreeClassifier())
clf.fit(train_x, train_y)
pred = clf.predict(test_x)
print('Accuracy Score: ', clf.score(test_x, test_y))

result = pd.DataFrame(confusion_matrix(test_y, pred), index=['True[0]', 'True[1]'], columns=['Pred[0]', 'Pred[1]'])
print(result)

RocCurveDisplay.from_estimator(clf, test_x, test_y)  # formerly plot_roc_curve
plt.show()

clf_oob = BaggingClassifier(estimator=DecisionTreeClassifier(), oob_score=True)
oob = clf_oob.fit(x, y).oob_score_  # out-of-bag score: no separate cross-validation needed
print(oob)

※ base_reg = DecisionTreeRegressor(random_state=42)
bagging_reg = BaggingRegressor(estimator=base_reg, n_estimators=100, random_state=42)

 

*Boosting

AdaBoost is a powerful ensemble method; with a well-chosen learning rate and number of weak learners, its generalization performance is excellent.

import pandas as pd

import numpy as np

from sklearn.model_selection import train_test_split

from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

breast = pd.read_csv('data/breast_cancer_gpt.csv')

breast['diagnosis'] = np.where(breast['diagnosis']=='M', 1, 0)
features = ['area_mean', 'area_worst']

x = breast[features]
y = breast['diagnosis']

train_x, test_x, train_y, test_y = train_test_split(x, y, stratify=y, train_size=0.3, random_state=1)
print(train_x.shape, test_x.shape, train_y.shape, test_y.shape)

clf = AdaBoostClassifier(estimator=None)  # estimator=None uses the default decision stump (max_depth=1 tree)
clf.fit(train_x, train_y)
pred = clf.predict(test_x)
print('Accuracy Score: ', clf.score(test_x, test_y))

test_cm = confusion_matrix(test_y, pred)
test_acc = accuracy_score(test_y, pred)
test_prc = precision_score(test_y, pred)
test_rcll = recall_score(test_y, pred)
test_f1 = f1_score(test_y, pred)

print(test_cm)
print('\n')
print('Accuracy\t{}%'.format(round(test_acc*100, 2)))
print('Precision\t{}%'.format(round(test_prc*100, 2)))
print('Recall\t{}%'.format(round(test_rcll*100, 2)))
print('F1\t{}%'.format(round(test_f1*100, 2)))

importances = clf.feature_importances_
column_nm = pd.DataFrame(['area_mean', 'area_worst'])
feature_importances = pd.concat([column_nm, pd.DataFrame(importances)], axis=1)
feature_importances.columns = ['feature_nm', 'importances']
print(feature_importances)

※ base_reg = DecisionTreeRegressor(max_depth=3)
ada_reg = AdaBoostRegressor(estimator=base_reg, n_estimators=100, learning_rate=0.1, random_state=42)

*XGBoost(Extreme Gradient Boosting)

XGBoost excels in speed and performance, with advantages such as overfitting control and parallel processing.

import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score

iris = load_iris()
X, y = iris.data, iris.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

xgb_clf = xgb.XGBClassifier(eval_metric='mlogloss')  # use_label_encoder was deprecated and later removed in recent xgboost releases
xgb_clf.fit(X_train, y_train)

y_pred = xgb_clf.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"테스트 세트 정확도: {accuracy:.4f}")

※ XGBRegressor(n_estimators=1000)  # learning_rate=0.1, max_depth=3

 

*LightGBM

A high-performance gradient boosting algorithm developed by Microsoft.
It is especially efficient on large datasets, boasting fast training and low memory usage.

import lightgbm as lgb
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

iris = load_iris()
X, y = iris.data, iris.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

lgbm_clf = lgb.LGBMClassifier(n_estimators=100, learning_rate=0.1, num_leaves=31, random_state=42)
lgbm_clf.fit(X_train, y_train)

y_pred = lgbm_clf.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"테스트 세트의 정확도: {accuracy:.4f}")

※ lgb.LGBMRegressor(n_estimators=100, learning_rate=0.1, num_leaves=31, random_state=42)

 

*Random Forest

Random forests generalize well and are effective at preventing overfitting,
making them a flagship ensemble method that performs strongly across a wide range of problems.

import pandas as pd
import matplotlib.pyplot as plt

import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

breast = pd.read_csv('data/breast_cancer_gpt.csv')

breast['diagnosis'] = np.where(breast['diagnosis']=='M', 1, 0)
features = ['area_mean', 'area_worst']

x = breast[features]
y = breast['diagnosis']

train_x, test_x, train_y, test_y = train_test_split(x, y, stratify=y, train_size=0.3, random_state=1)
print(train_x.shape, test_x.shape, train_y.shape, test_y.shape)

clf = RandomForestClassifier(n_estimators=100, min_samples_split=5)
clf.fit(train_x, train_y)
pred = clf.predict(test_x)
print('Accuracy Score: ', clf.score(test_x, test_y))

test_cm = confusion_matrix(test_y, pred)
test_acc = accuracy_score(test_y, pred)
test_prc = precision_score(test_y, pred)
test_rcll = recall_score(test_y, pred)
test_f1 = f1_score(test_y, pred)

print(test_cm)
print('\n')
print('Accuracy\t{}%'.format(round(test_acc*100, 2)))
print('Precision\t{}%'.format(round(test_prc*100, 2)))
print('Recall\t{}%'.format(round(test_rcll*100, 2)))
print('F1\t{}%'.format(round(test_f1*100, 2)))

importances = clf.feature_importances_
column_nm = pd.DataFrame(['area_mean', 'area_worst'])
feature_importances = pd.concat([column_nm, pd.DataFrame(importances)], axis=1)
feature_importances.columns = ['feature_nm', 'importances']
print(feature_importances)

plt.bar(features, importances, align='center')
plt.show()

※ RandomForestRegressor(n_estimators=100)  # random_state=42

*Voting algorithms

*Hard vs Soft

from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.ensemble import VotingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

# load the data
iris = datasets.load_iris()
X, y = iris.data, iris.target

# split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# create the individual models
svm_clf = SVC(probability=True)
knn_clf = KNeighborsClassifier()
nb_clf = GaussianNB()

# create the voting classifier
voting_clf = VotingClassifier(
    estimators=[('svm', svm_clf), ('knn', knn_clf), ('nb', nb_clf)],
    voting='soft'
)  
# soft: averages the probability predictions of all classifiers to make the final decision
# hard: picks the class that receives the most votes

# train the model
voting_clf.fit(X_train, y_train)

# predict and evaluate
y_pred = voting_clf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

# load the data
cancer_data = load_breast_cancer()
X = cancer_data.data
y = cancer_data.target

# Logistic Regression is sensitive to feature scale
scaler = StandardScaler()
X = scaler.fit_transform(X)

# split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# create several different classifiers
clf1 = LogisticRegression(random_state=42)
clf2 = DecisionTreeClassifier(random_state=42)
clf3 = SVC(random_state=42)

# list to store each classifier's accuracy
accuracies = []

# measure and print the accuracy of each classifier
for clf, name in zip([clf1, clf2, clf3], ['Logistic Regression', 'Decision Tree', 'SVM']):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    accuracies.append(accuracy)
    print(f"{name}의 정확도: {accuracy}")

# create the voting classifier
voting_clf = VotingClassifier(estimators=[('lr', clf1), ('dt', clf2), ('svc', clf3)], voting='hard')

# train the voting classifier
voting_clf.fit(X_train, y_train)

# measure and print the voting classifier's accuracy
y_pred = voting_clf.predict(X_test)
voting_accuracy = accuracy_score(y_test, y_pred)
accuracies.append(voting_accuracy)
print(f"보팅 분류기의 정확도: {voting_accuracy}")

 

*SVC(random_state=42, probability=True)  # enables probability estimates

**voting='soft'** makes the final prediction from a weighted average of the classifiers' probability estimates, so SVC must also be configured to produce probabilities.
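What soft voting computes can be checked by hand; a self-contained sketch on iris (the two-model ensemble here is illustrative — with no weights given, the averaging is uniform):

import numpy as np
from sklearn import datasets
from sklearn.ensemble import VotingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

X, y = datasets.load_iris(return_X_y=True)
clf = VotingClassifier(
    estimators=[('svm', SVC(probability=True, random_state=0)), ('knn', KNeighborsClassifier())],
    voting='soft'
).fit(X, y)

# soft voting = argmax of the averaged predict_proba of the fitted base models
avg_proba = np.mean([est.predict_proba(X) for est in clf.estimators_], axis=0)
print(np.array_equal(np.argmax(avg_proba, axis=1), clf.predict(X)))  # True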

*Voting regression

import numpy as np
#from sklearn.datasets import load_boston
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.ensemble import VotingRegressor
from sklearn.metrics import mean_squared_error

#boston = load_boston()
#X, y = boston.data, boston.target
housing = fetch_california_housing()
X, y = housing.data, housing.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

reg1 = LinearRegression()
reg2 = RandomForestRegressor(n_estimators=100, random_state=42)
reg3 = SVR()

voting_reg = VotingRegressor(estimators=[('lr', reg1), ('rf', reg2), ('svr', reg3)])
voting_reg.fit(X_train, y_train)

y_pred = voting_reg.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
print(f"Voting Regressor MSE: {mse:.4f}")

In regression there is no concept of class probabilities, so the hard/soft distinction does not exist.
VotingRegressor simply averages the predictions of its constituent models.
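That averaging is easy to verify; a short sketch reusing voting_reg and X_test from the block above:

import numpy as np

# a VotingRegressor prediction is the plain mean of its fitted base models' predictions
manual_pred = np.mean([est.predict(X_test) for est in voting_reg.estimators_], axis=0)
print(np.allclose(manual_pred, voting_reg.predict(X_test)))  # True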

6. Naive Bayes – no regression counterpart

A probability-based classification algorithm that assumes the features are independent and applies Bayes' theorem to classify.
Frequently used for problems such as text classification (spam filtering, sentiment analysis).

  • Laplace smoothing: adds 1 to every count so that words unseen in the training data do not drive the probability to zero (see the sketch below)
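A worked micro-example of the idea in plain Python (the counts are made up; in scikit-learn this corresponds to the alpha parameter of the NB classes, 1.0 by default):

# word counts within the "spam" class: 'meeting' was never seen; vocabulary of 4 words
counts = {'free': 3, 'win': 5, 'now': 2, 'meeting': 0}
total, vocab = sum(counts.values()), len(counts)

p_unsmoothed = counts['meeting'] / total               # 0.0 -> zeroes out the whole product
p_laplace = (counts['meeting'] + 1) / (total + vocab)  # (0+1)/(10+4) ≈ 0.071
print(p_unsmoothed, p_laplace)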

 

*BernoulliNB – for binary data, e.g. spam mail classification

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import accuracy_score, classification_report

spam = pd.read_csv('data/spam_gpt.csv')

spam = spam[['v1', 'v2']]
spam['v1'].unique()

spam['label'] = np.where(spam['v1'] == 'spam', 1, 0)

x = spam['v2']
y = spam['label']

train_x, test_x, train_y, test_y = train_test_split(x, y, stratify=y, train_size=0.3, random_state=1)
print(train_x.shape, test_x.shape, train_y.shape, test_y.shape)

cv = CountVectorizer(binary=True)  # 1 if a word appears at least once, 0 otherwise
x_train_cv = cv.fit_transform(train_x)

encoded_input = x_train_cv.toarray()
print(cv.inverse_transform(encoded_input[[0]]))  ## inspect the words in the first document
print(cv.get_feature_names_out()[1:10], end='\n')  ## inspect words by index

bnb = BernoulliNB()
bnb.fit(x_train_cv, train_y)
x_test_cv = cv.transform(test_x)
pred = bnb.predict(x_test_cv)

acc = accuracy_score(test_y, pred)
print('Accuracy Score: ', acc)

print(classification_report(test_y, pred))

 

*MultinomialNB – for count data, e.g. classifying movie reviews as positive/negative

import pandas as pd

from keras.datasets import imdb
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

(train_x, train_y), (test_x, test_y) = imdb.load_data()
print(train_x.shape)
print(test_x.shape)


### Code that converts the imdb data back to text; in the ADP exam the converted data is provided
word_to_index = imdb.get_word_index()
index_to_word = {}

for key, value in word_to_index.items():
    index_to_word[value+3] = key
for index, token in enumerate(('<pad>', '<sos>', '<unk>')):
    index_to_word[index] = token

train_reviews = []
for x in train_x:
    tmp = ' '.join([index_to_word[index] for index in x])
    train_reviews.append(tmp)

test_reviews = []
for x in test_x:
    tmp = ' '.join([index_to_word[index] for index in x])
    test_reviews.append(tmp)

train = pd.concat([pd.DataFrame(train_reviews), pd.DataFrame(train_y)], axis=1)
train.columns = ['reviews', 'label']
train['reviews'] = train['reviews'].str[6:]

test = pd.concat([pd.DataFrame(test_reviews), pd.DataFrame(test_y)], axis=1)
test.columns = ['reviews', 'label']
test['reviews'] = test['reviews'].str[6:]
print(train)
print(test)

train_x, test_x = train['reviews'].values, test['reviews'].values
train_y, test_y = train['label'].values, test['label'].values
print(train_x.shape, test_x.shape, train_y.shape, test_y.shape)

cv = CountVectorizer(binary=False)  # count word occurrences
x_train_cv = cv.fit_transform(train_x)

mnb = MultinomialNB()
mnb.fit(x_train_cv, train_y)
x_test_cv = cv.transform(test_x)
pred = mnb.predict(x_test_cv)

acc = accuracy_score(test_y, pred)
print('Accuracy Score: ', acc)

print(classification_report(test_y, pred))

 

*GaussianNB – for continuous data

import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report

sky = pd.read_csv('data/Skyserver_gpt.csv')

features = list(sky.columns)  # get the column names
features.remove('class')

x = sky[features]
y = sky['class']

train_x, test_x, train_y, test_y = train_test_split(x, y, stratify=y, train_size=0.3, random_state=1)
print(train_x.shape, test_x.shape, train_y.shape, test_y.shape)

gnb = GaussianNB()
gnb.fit(train_x, train_y)
pred = gnb.predict(test_x)
print('Accuracy Score: ', gnb.score(test_x, test_y))

print(gnb.predict_proba(test_x)[[0, 4, 8]])
print(gnb.predict(test_x)[[0, 4, 8]])

pred = gnb.predict(test_x)
print(classification_report(test_y, pred))

7. Artificial Neural Network Classifiers

 

*Examples of neural-network classifiers based on the multi-layer perceptron (MLP)

 

*Binary classification

from sklearn.neural_network import MLPClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# generate a synthetic binary classification dataset
X, y = make_classification(n_samples=100, n_features=10, n_classes=2, random_state=0)

# split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# create the ANN model
mlp = MLPClassifier(hidden_layer_sizes=(10,), max_iter=1000, random_state=0)

# train the model
mlp.fit(X_train, y_train)
pred = mlp.predict(X_test)

# print the accuracy
print("Accuracy:", accuracy_score(y_test, pred))

 

*Multi-class classification

# generate a synthetic multi-class dataset
X, y = make_classification(n_samples=100, n_features=10, n_classes=3, random_state=0, n_clusters_per_class=1)

# splitting, training, prediction, and evaluation are the same as in the binary example

 

*Applying scaling (practically essential)

from sklearn.neural_network import MLPClassifier
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

iris = load_iris()
X, y = iris.data, iris.target

scaler = StandardScaler()
X = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# MLPClassifier with two hidden layers [64, 32], ReLU activation, Adam solver
mlp_clf = MLPClassifier(hidden_layer_sizes=(64, 32), activation='relu', solver='adam', max_iter=500, random_state=42)
mlp_clf.fit(X_train, y_train)

y_pred = mlp_clf.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"테스트 세트의 정확도: {accuracy:.4f}")

※ MLPRegressor(hidden_layer_sizes=(64, 32), activation='relu', solver='adam', max_iter=1000, random_state=42)
A model that uses the multi-layer perceptron (MLP) architecture to predict continuous output values, suited to regression problems.
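A minimal runnable sketch of the regressor variant (the synthetic noisy-sine data mirrors the SVR/KNN regression examples above; the sizes are illustrative):

import numpy as np
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

rng = np.random.RandomState(42)
X = np.sort(5 * rng.rand(200, 1), axis=0)     # 200 points in [0, 5)
y = np.sin(X).ravel() + 0.1 * rng.randn(200)  # noisy sine target

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=42)

mlp_reg = MLPRegressor(hidden_layer_sizes=(64, 32), activation='relu', solver='adam', max_iter=1000, random_state=42)
mlp_reg.fit(X_train, y_train)
print('MSE:', mean_squared_error(y_test, mlp_reg.predict(X_test)))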

*Comparing neural network models

 

*Basic neural network (MLP: Multi-layer Perceptron)

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.utils import to_categorical

data = load_iris()
X = data.data
y = data.target

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
y_one_hot = to_categorical(y, num_classes=3)

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_one_hot, test_size=0.3, random_state=42)

model_mlp = Sequential()
model_mlp.add(Dense(10, input_dim=4, activation='relu'))
model_mlp.add(Dense(8, activation='relu'))
model_mlp.add(Dense(3, activation='softmax'))

# compile the model
model_mlp.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# train the model
model_mlp.fit(X_train, y_train, epochs=50, batch_size=10, verbose=1)

# evaluate accuracy
accuracy_mlp = model_mlp.evaluate(X_test, y_test)
print(f"MLP 모델 정확도: {accuracy_mlp[1] * 100:.2f}%")

 

*Convolutional Neural Network (CNN) – suited to image processing

from tensorflow.keras.datasets import cifar10
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense
from tensorflow.keras.utils import to_categorical

(x_train, y_train), (x_test, y_test) = cifar10.load_data()

x_train = x_train / 255.0
x_test = x_test / 255.0
y_train = to_categorical(y_train, 10)
y_test = to_categorical(y_test, 10)

model_cnn = Sequential()
model_cnn.add(Conv2D(32, kernel_size=(3, 3), activation='relu', input_shape=(32, 32, 3)))
model_cnn.add(MaxPooling2D(pool_size=(2, 2)))
model_cnn.add(Conv2D(64, kernel_size=(3, 3), activation='relu'))
model_cnn.add(MaxPooling2D(pool_size=(2, 2)))
model_cnn.add(Flatten())
model_cnn.add(Dense(128, activation='relu'))
model_cnn.add(Dense(10, activation='softmax'))

model_cnn.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

model_cnn.fit(x_train, y_train, epochs=10, batch_size=64, verbose=1)

accuracy_cnn = model_cnn.evaluate(x_test, y_test)
print(f"CNN 모델 정확도: {accuracy_cnn[1] * 100:.2f}%")

*Recurrent Neural Network (RNN)

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, Dense, Embedding, GlobalAveragePooling1D
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing.sequence import pad_sequences

(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=10000)  # must match the Embedding layer's input_dim

x_train = pad_sequences(x_train, maxlen=200)
x_test = pad_sequences(x_test, maxlen=200)

model_rnn = Sequential()
model_rnn.add(Embedding(input_dim=10000, output_dim=128, input_length=200))
model_rnn.add(SimpleRNN(128, activation='relu'))
model_rnn.add(Dense(1, activation='sigmoid'))

model_rnn.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

model_rnn.fit(x_train, y_train, epochs=5, batch_size=64, verbose=1)

accuracy_rnn = model_rnn.evaluate(x_test, y_test)
print(f"RNN 모델 정확도: {accuracy_rnn[1] * 100:.2f}%")

 

→ Example using the Iris data (not really suitable, since it is not sequence data)

model_rnn = Sequential()
model_rnn.add(SimpleRNN(50, activation='relu', input_shape=(X_train.shape[1], 1)))
model_rnn.add(Dense(3, activation='softmax'))

model_rnn.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# reshape the data to the (samples, timesteps, features) shape RNNs expect
X_train_rnn = X_train.reshape((X_train.shape[0], X_train.shape[1], 1))
X_test_rnn = X_test.reshape((X_test.shape[0], X_test.shape[1], 1))

model_rnn.fit(X_train_rnn, y_train, epochs=50, batch_size=10, verbose=1)

*Long Short-Term Memory networks (LSTM)

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing.sequence import pad_sequences

(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=10000)  # cap the vocabulary to match the Embedding layer's input_dim

x_train = pad_sequences(x_train, maxlen=200)
x_test = pad_sequences(x_test, maxlen=200)

model_lstm = Sequential()
model_lstm.add(Embedding(input_dim=10000, output_dim=128, input_length=200))
model_lstm.add(LSTM(128))
model_lstm.add(Dense(1, activation='sigmoid'))

model_lstm.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

model_lstm.fit(x_train, y_train, epochs=5, batch_size=64, verbose=1)

accuracy_lstm = model_lstm.evaluate(x_test, y_test)
print(f"LSTM 모델 정확도: {accuracy_lstm[1] * 100:.2f}%")

 

→ Example using the Iris data (not really suitable, since it is not sequence data)

model_lstm = Sequential()
model_lstm.add(LSTM(50, activation='relu', input_shape=(X_train.shape[1], 1)))
model_lstm.add(Dense(3, activation='softmax'))

model_lstm.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

X_train_lstm = X_train.reshape((X_train.shape[0], X_train.shape[1], 1))
X_test_lstm = X_test.reshape((X_test.shape[0], X_test.shape[1], 1))

model_lstm.fit(X_train_lstm, y_train, epochs=50, batch_size=10, verbose=1)

accuracy_lstm = model_lstm.evaluate(X_test_lstm, y_test)
print(f"LSTM 모델 정확도: {accuracy_lstm[1] * 100:.2f}%")

8. LDA (Linear Discriminant Analysis) – no regression counterpart

A supervised learning algorithm for classification that can perform dimensionality reduction and classification at the same time.
It transforms the data to maximize the separation between classes while minimizing the variance within each class,
so it performs well at both dimensionality reduction and classification.

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt

iris = load_iris()
X, y = iris.data, iris.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

lda = LinearDiscriminantAnalysis()
lda.fit(X_train, y_train)

y_pred = lda.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"테스트 세트의 정확도: {accuracy:.4f}")

# LDA dimensionality reduction (project to 2D)
X_r2 = lda.transform(X_train)

# visualize the data projected to 2D
plt.figure(figsize=(12, 6))

# first subplot: relationship between two features of the original data
plt.subplot(1, 2, 1)
colors = ['navy', 'turquoise', 'darkorange']
for color, i, target_name in zip(colors, [0, 1, 2], iris.target_names):
    plt.scatter(X_train[y_train == i, 0], X_train[y_train == i, 1], alpha=.8, color=color, label=target_name)
plt.title('Original IRIS dataset')
plt.xlabel(iris.feature_names[0])
plt.ylabel(iris.feature_names[1])
plt.legend(loc='best')

# second subplot: data projected to 2D by LDA
plt.subplot(1, 2, 2)
for color, i, target_name in zip(colors, [0, 1, 2], iris.target_names):
    plt.scatter(X_r2[y_train == i, 0], X_r2[y_train == i, 1], alpha=.8, color=color, label=target_name)
plt.title('LDA of IRIS dataset (2D projection)')
plt.xlabel('LD1')
plt.ylabel('LD2')
plt.legend(loc='best')

plt.tight_layout()
plt.show()

*Stacking

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.metrics import accuracy_score

data = load_iris()
X = data.data
y = data.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

base_learners = [
    ('rf', RandomForestClassifier(n_estimators=100, random_state=42)),
    ('gb', GradientBoostingClassifier(n_estimators=100, random_state=42)),
    ('lr', LogisticRegression(max_iter=200, random_state=42))  # max_iter=200 avoids convergence warnings
]

meta_learner = RandomForestClassifier(n_estimators=100, random_state=42)
stacking_model = StackingClassifier(estimators=base_learners, final_estimator=meta_learner)
stacking_model.fit(X_train, y_train)

y_pred = stacking_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Stacking 모델 정확도: {accuracy * 100:.2f}%")