
[ADP Practical Exam] 1. Classification Analysis Summary


Dataset: hotel_bookings.csv (4.22 MB)

 

(1) Data Exploration

*Checking the data

import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

hotel = pd.read_csv('data/hotel_bookings.csv')
hotel.isna().sum()             # count missing values per column
hotel.info()                   # dtypes and non-null counts
hotel.deposit_type.describe()  # summary of a categorical column
hotel.groupby('deposit_type')['adr'].mean()  # mean adr per deposit_type

※ pd.set_option('display.max_columns', None)
※ pd.set_option('display.max_colwidth', None)

 

*Checking with visualizations

import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

df = pd.read_csv('data/diabetes_for_test.csv')
print(df)

diabetes = df.groupby('Outcome').mean()  # per-class feature means
print(diabetes)

fig, axes = plt.subplots(2, 4, figsize=(20, 14))

# one barplot of class-wise means per feature, laid out on a 2x4 grid
for i in range(8):
    ax = axes[i // 4][i % 4]
    sns.barplot(x=diabetes.index, y=diabetes.iloc[:, i], ax=ax)
    ax.set_title(diabetes.columns[i])

plt.suptitle('EDA')
plt.show()

(2) Missing Value Detection and Handling

  • If less than 1% of values are missing, simply drop the affected rows
  • If 1% or more are missing, impute with the mode or the mean
hotel_nonull = hotel.copy()
hotel_nonull.dropna(subset=['lead_time'], axis=0, inplace=True)  # drop rows
hotel_nonull['is_repeated_guest'] = hotel_nonull['is_repeated_guest'].fillna(0)  # fill with a constant

fill_mean_func = lambda g: g.fillna(g.mean())
hotel_nonull = hotel_nonull.groupby('deposit_type').apply(fill_mean_func)  # impute with the group mean

# hotel_nonull.index = hotel_nonull.index.droplevel(0)  # drop the group level if apply() added one
hotel_nonull.sort_index(inplace=True)

hotel_nonull.isna().sum()

import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

hotel_nonull['is_canceled'] = hotel_nonull['is_canceled'].apply(lambda x: 1 if x == 'yes' else 0)  # encode the target as 0/1

sns.countplot(x='is_canceled', data=hotel_nonull)
plt.show()
ratio0 = round(len(hotel_nonull[hotel_nonull['is_canceled'] == 0]) / len(hotel_nonull) * 100, 2)
ratio1 = round(len(hotel_nonull[hotel_nonull['is_canceled'] == 1]) / len(hotel_nonull) * 100, 2)
print('ratio of class 0: {}%'.format(ratio0))
print('ratio of class 1: {}%'.format(ratio1))

 

*Imputing missing values with KNN

from sklearn.impute import KNNImputer

KNN_data = df.drop(columns=['school', 'sex', 'paid', 'activities'])  # keep numeric columns only; KNNImputer needs numeric input

imputer = KNNImputer()
df_filled = imputer.fit_transform(KNN_data)                    # returns a NumPy array
df_filled = pd.DataFrame(df_filled, columns=KNN_data.columns)
df[KNN_data.columns] = df_filled                               # write the imputed values back

 

(3) Feature engineering

→ Normalizing distributions (e.g., a log transform; sketched after the list below), removing outliers, scaling, under/over-sampling, one-hot encoding, grouping

  • Correct outliers in the lead_time variable with the IQR method to remove extreme values (see the sketch after this list)
  • Collect more data for the is_repeated_guest variable to resolve its class imbalance
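
A minimal sketch of the IQR correction and the log transform mentioned above, assuming the hotel_nonull frame from section (2); the 1.5×IQR fences and the choice of the lead_time column are illustrative:

import numpy as np

# IQR rule: values outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] are treated as outliers
q1, q3 = hotel_nonull['lead_time'].quantile([0.25, 0.75])
iqr = q3 - q1
lower, upper = q1 - 1.5 * iqr, q3 + 1.5 * iqr
hotel_nonull['lead_time'] = hotel_nonull['lead_time'].clip(lower, upper)  # cap instead of dropping rows

# Log transform for right-skewed variables; log1p is safe when zeros are present
hotel_nonull['lead_time_log'] = np.log1p(hotel_nonull['lead_time'])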

*Replacing outliers

X = df.drop(columns=['Outcome'])
df_v1 = pd.melt(X, var_name='col', value_name='value')  # reshape so all boxplots can be drawn at once
print(df_v1)

plt.figure(figsize=(15, 7))
sns.boxplot(x='col', y='value', data=df_v1)
plt.ylim([-50, 1100])
plt.xticks(range(8), X.columns)
plt.show()

outlier_index = df[df['Age'] > 400].index       # rows above a domain-specific threshold
df.loc[outlier_index, 'Age'] = df.Age.median()  # replace them with the median
sns.boxplot(x=df['Age'])
plt.show()

*Oversampling

from imblearn.over_sampling import RandomOverSampler, SMOTE
import time

hotel_nonull = pd.get_dummies(hotel_nonull)

X = hotel_nonull[hotel_nonull.columns.difference(['is_canceled'])]
y = hotel_nonull['is_canceled']

start = time.time()
ros = RandomOverSampler(random_state=42)  # duplicates minority-class rows
X_ro, y_ro = ros.fit_resample(X, y)
print('time: ', time.time() - start)

start = time.time()
sm = SMOTE(random_state=42)               # synthesizes new minority-class rows; slower
X_sm, y_sm = sm.fit_resample(X, y)
print('time: ', time.time() - start)

 

*Scaling

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

pipeline = Pipeline([
    ('scaler', StandardScaler()),    # standardize features first
    ('model', LogisticRegression())  # then fit the classifier
])

 

(4) Modeling and Performance Evaluation

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import time

start = time.time()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=100)
clf = RandomForestClassifier(n_estimators=100, min_samples_split=10)
clf.fit(X_train, y_train)
pred = clf.predict(X_test)
print('train accuracy: ', clf.score(X_train, y_train), '\n')
print(classification_report(y_test, pred))
print('time: ', time.time() - start)

 

*Random split, stratified sampling

import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris
import pandas as pd

iris = load_iris()
X = iris.data    # feature data
y = iris.target  # label data

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)

 

*Cross-validation

from sklearn.model_selection import cross_validate, StratifiedKFold
from sklearn.linear_model import LogisticRegression

# load_iris() returns a Bunch, so build a DataFrame before calling drop()
iris_df = pd.DataFrame(iris.data, columns=iris.feature_names)
iris_df['target'] = iris.target

X = iris_df.drop('target', axis=1)
y = iris_df['target']
LOGREG = LogisticRegression(max_iter=300, C=0.1)
SKF = StratifiedKFold(n_splits=4)
# KF = KFold(n_splits=5, shuffle=True, random_state=42)
result = cross_validate(LOGREG, X, y, cv=SKF, return_train_score=True)
print(pd.DataFrame(result))

 

*GridSearch

  • LogisticRegression
from sklearn.model_selection import cross_validate, StratifiedKFold, GridSearchCV
from sklearn.linear_model import LogisticRegression

param_grid={'C': [0.01, 0.1, 1, 10], 'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']}
grid = GridSearchCV(LOGREG, param_grid, cv=SKF)
grid.fit(X, y)
print('Best cross-validation score: {:.2f}'.format(grid.best_score_))
print('Best parameters: {}'.format(grid.best_params_))

results = pd.DataFrame(grid.cv_results_)
sorted_results = results.sort_values(by='mean_test_score', ascending=False)

print(sorted_results[['params', 'mean_test_score', 'rank_test_score']])

  • SVM
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

# Parameter grid for the SVM model
param_grid = {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf'], 'gamma': [0.01, 0.1, 1, 10]}

# Train with GridSearchCV
grid_search = GridSearchCV(SVC(), param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Print the best parameters
print("Best parameters:", grid_search.best_params_)
print(grid_search.best_estimator_)
# Note: SVC has no feature_importances_; for a linear kernel, inspect grid_search.best_estimator_.coef_ instead
  • KNN: {'n_neighbors': [3, 5, 7], 'weights': ['uniform', 'distance'], 'metric': ['euclidean', 'manhattan', 'minkowski']}
  • Naive Bayes
    ※ Bernoulli: {'alpha': [0.1, 0.5, 1.0, 2.0], 'fit_prior': [True, False], 'binarize': [0.0, 0.5, 1.0]}
    ※ Multinomial: {'alpha': [0.1, 0.5, 1.0, 2.0], 'fit_prior': [True, False]}  # alpha = Laplace smoothing
    ※ Gaussian: {'var_smoothing': [1e-9, 1e-8, 1e-7, 1e-6]}  # value added to variances for numerical stability
  • Decision tree: {'max_depth': [None, 10, 20], 'min_samples_split': [2, 5, 10, 20], 'min_samples_leaf': [1, 5, 10], 'criterion': ['gini', 'entropy'], 'max_features': [None, 'sqrt', 'log2'], 'splitter': ['best', 'random']}
  • BaggingClassifier: {'n_estimators': [50, 100, 200], 'max_samples': [0.5, 0.7, 1.0], 'max_features': [0.5, 0.7, 1.0], 'bootstrap': [True, False], 'bootstrap_features': [True, False], 'base_estimator': [DecisionTreeClassifier(), None]}  # base_estimator was renamed to estimator in scikit-learn 1.2+
  • AdaBoostClassifier: {'n_estimators': [50, 100, 200], 'learning_rate': [0.01, 0.1, 1], 'base_estimator': [DecisionTreeClassifier(max_depth=1), DecisionTreeClassifier(max_depth=2)], 'algorithm': ['SAMME', 'SAMME.R']}  # SAMME.R uses probabilistic outputs and can perform better
  • XGBoost: {'n_estimators': [50, 100, 200], 'max_depth': [3, 5, 7], 'learning_rate': [0.01, 0.1, 0.3], 'subsample': [0.5, 0.7, 1.0], 'colsample_bytree': [0.5, 0.7, 1.0]}
  • LightGBM: {'n_estimators': [50, 100, 200], 'learning_rate': [0.01, 0.1, 0.3], 'num_leaves': [31, 50, 100], 'max_depth': [-1, 10, 20], 'colsample_bytree': [0.6, 0.8, 1.0], 'subsample': [0.6, 0.8, 1.0]}
  • Random forest: {'n_estimators': [50, 100, 200], 'max_depth': [None, 10, 20], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4]}
  • Voting: {'lr__C': [0.1, 1, 10], 'rf__n_estimators': [10, 50, 100], 'svc__C': [0.1, 1, 10], 'voting': ['hard', 'soft'], 'weights': [(1, 1, 1), (2, 1, 1), (1, 2, 1)]}
  • MLP: {'hidden_layer_sizes': [(100,), (50, 50)], 'activation': ['relu', 'tanh'], 'solver': ['adam', 'sgd']}

  • LDA: {'solver': ['svd', 'lsqr', 'eigen'], 'shrinkage': [None, 'auto', 0.1, 0.5], 'n_components': [1, 2, 3], 'store_covariance': [True, False]}
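
Any of the grids above plugs into GridSearchCV the same way; a minimal sketch with the random forest grid, reusing the iris X_train/y_train split from the stratified-sampling example (cv=5 is an illustrative choice):

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

param_grid = {'n_estimators': [50, 100, 200], 'max_depth': [None, 10, 20],
              'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4]}

# exhaustive search over the grid with 5-fold cross-validation
grid_search = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=5)
grid_search.fit(X_train, y_train)
print(grid_search.best_params_)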

*Feature Importance

→ How important each feature is to the machine learning model's predictions

  • LogisticRegression
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd

data = load_breast_cancer()
X, y = data.data, data.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

log_reg = LogisticRegression(max_iter=1000, random_state=42)
log_reg.fit(X_train_scaled, y_train)

feature_importance = log_reg.coef_[0]  # signed coefficients; use abs() to rank by magnitude

feature_importance_df = pd.DataFrame({
    'Feature': data.feature_names,
    'Importance': feature_importance
})

feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
print(feature_importance_df)

  • SVM
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
import pandas as pd

data = load_breast_cancer()
X, y = data.data, data.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

linear_svm = LinearSVC(dual=False, max_iter=10000, random_state=42)
linear_svm.fit(X_train, y_train)

feature_importance = linear_svm.coef_[0]  # signed weights of the linear decision function

# Print feature importances with feature names
feature_importance_df = pd.DataFrame({
    'Feature': data.feature_names,
    'Importance': feature_importance
})

feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
print(feature_importance_df)
  • KNN: a nonlinear, instance-based learner, so it computes no weights or feature importances during training.
    Predictions are based on distances between data points, so there is no coefficient-like quantity that directly expresses each feature's influence.
    When feature importance is still needed, each feature's contribution can be assessed indirectly, e.g. with permutation importance:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.inspection import permutation_importance
import pandas as pd

data = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(data.data, data.target, test_size=0.2, random_state=42)

knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)

result = permutation_importance(knn, X_test, y_test, n_repeats=10, random_state=42)

feature_importance_df = pd.DataFrame({
    'Feature': data.feature_names,
    'Importance': result.importances_mean
})

feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
print(feature_importance_df)
  • Naive Bayes: a probability-based classifier that predicts from per-feature conditional probabilities.
    It learns no explicit per-feature weights, so it provides no direct feature importance; permutation importance works here as well:
from sklearn.datasets import load_iris
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.inspection import permutation_importance
import pandas as pd

data = load_iris()
X_train, X_test, y_train, y_test = train_test_split(data.data, data.target, test_size=0.2, random_state=42)

nb = GaussianNB()
nb.fit(X_train, y_train)

result = permutation_importance(nb, X_test, y_test, n_repeats=10, random_state=42)

feature_importance_df = pd.DataFrame({
    'Feature': data.feature_names,
    'Importance': result.importances_mean
})

feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
print(feature_importance_df)

  • Decision tree
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
import numpy as np

iris = load_iris()
X = iris.data
y = iris.target

dt = DecisionTreeClassifier()
dt.fit(X, y)

dt_feature_importances = dt.feature_importances_
important_features = np.argsort(dt_feature_importances)[::-1]

print("Features by importance:")
for i in important_features:
    print(f"{iris.feature_names[i]}: {dt_feature_importances[i]:.4f}")

  • BaggingClassifier
from sklearn.datasets import load_iris
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.inspection import permutation_importance
import pandas as pd

iris = load_iris()
X, y = iris.data, iris.target

bagging_clf = BaggingClassifier(DecisionTreeClassifier(), n_estimators=10, random_state=42)
bagging_clf.fit(X, y)

result = permutation_importance(bagging_clf, X, y, n_repeats=10, random_state=42)

feature_importance_df = pd.DataFrame({
    'Feature': iris.feature_names,
    'Importance': result.importances_mean
})

feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
print(feature_importance_df)
  • AdaBoostClassifier
from sklearn.datasets import load_iris
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
import numpy as np
import pandas as pd

iris = load_iris()
X, y = iris.data, iris.target

ada_clf = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1), n_estimators=50, random_state=42)
ada_clf.fit(X, y)

feature_importances = ada_clf.feature_importances_

feature_importance_df = pd.DataFrame({
    'Feature': iris.feature_names,
    'Importance': feature_importances
})

feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
print(feature_importance_df)

 

  • XGBoost: three importance types are available in addition to feature_importances_, via get_score()

import xgboost as xgb
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
import pandas as pd

iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.2, random_state=42)

xgb_clf = xgb.XGBClassifier(eval_metric='mlogloss')
xgb_clf.fit(X_train, y_train)

booster = xgb_clf.get_booster()
importance_by_gain = booster.get_score(importance_type='gain')
importance_by_weight = booster.get_score(importance_type='weight')
importance_by_cover = booster.get_score(importance_type='cover')

print("Importance by gain:", importance_by_gain)
print("Importance by weight:", importance_by_weight)
print("Importance by cover:", importance_by_cover)

plot_importance is XGBoost's built-in function for visualizing feature importance.
By default it ranks features by 'weight' (how often a feature is used in splits); pass importance_type='gain' to rank by gain instead.

import matplotlib.pyplot as plt
import xgboost as xgb

xgb.plot_importance(xgb_clf)
plt.show()

  • LightGBM

import lightgbm as lgb
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
import pandas as pd

iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.2, random_state=42)

lgb_clf = lgb.LGBMClassifier(random_state=42)
lgb_clf.fit(X_train, y_train)

feature_importances = lgb_clf.feature_importances_

feature_importance_df = pd.DataFrame({
    'Feature': iris.feature_names,
    'Importance': feature_importances
})

feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
print(feature_importance_df)
import matplotlib.pyplot as plt
import lightgbm as lgb

lgb.plot_importance(lgb_clf, importance_type='split')
plt.show()
  • Random forest
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
import pandas as pd

iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.2, random_state=42)

rf_clf = RandomForestClassifier(random_state=42)
rf_clf.fit(X_train, y_train)

feature_importances = rf_clf.feature_importances_

feature_importance_df = pd.DataFrame({
    'Feature': iris.feature_names,
    'Importance': feature_importances
})

feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
print(feature_importance_df)

import matplotlib.pyplot as plt

# Visualize feature importances
plt.figure(figsize=(10, 6))
plt.barh(feature_importance_df['Feature'], feature_importance_df['Importance'])
plt.xlabel('Feature Importance')
plt.ylabel('Feature')
plt.title('Random Forest Feature Importance')
plt.gca().invert_yaxis()  # largest values at the top
plt.show()

  • Voting: no built-in feature importance (see the permutation-importance sketch below)
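
Although VotingClassifier exposes no feature_importances_, the same permutation-importance approach used above for KNN and Naive Bayes applies; a minimal sketch (the two base estimators here are illustrative choices):

from sklearn.datasets import load_iris
from sklearn.ensemble import VotingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.inspection import permutation_importance
import pandas as pd

iris = load_iris()
X, y = iris.data, iris.target

voting_clf = VotingClassifier(
    estimators=[('lr', LogisticRegression(max_iter=500)),
                ('rf', RandomForestClassifier(random_state=42))],
    voting='soft')
voting_clf.fit(X, y)

# Model-agnostic: shuffle each feature and measure the drop in score
result = permutation_importance(voting_clf, X, y, n_repeats=10, random_state=42)

feature_importance_df = pd.DataFrame({
    'Feature': iris.feature_names,
    'Importance': result.importances_mean
}).sort_values(by='Importance', ascending=False)
print(feature_importance_df)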

  • MLP: learns weights for prediction, but those weights are hard to interpret directly as feature importances; permutation importance can be used instead:
from sklearn.neural_network import MLPClassifier
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.inspection import permutation_importance
from sklearn.preprocessing import StandardScaler
import pandas as pd

iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit the MLP on the scaled data
mlp_clf = MLPClassifier(random_state=42, max_iter=500)
mlp_clf.fit(X_train_scaled, y_train)

result = permutation_importance(mlp_clf, X_test_scaled, y_test, n_repeats=10, random_state=42)  # use the scaled test set the model was trained against

feature_importance_df = pd.DataFrame({
    'Feature': iris.feature_names,
    'Importance': result.importances_mean
})

feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
print(feature_importance_df)
  • LDA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.datasets import load_iris
import pandas as pd

iris = load_iris()
X = iris.data
y = iris.target

lda = LinearDiscriminantAnalysis()
lda.fit(X, y)

lda_coefficients = lda.coef_

feature_importance_df = pd.DataFrame({
    'Feature': iris.feature_names,
    'LDA Coefficient (abs)': abs(lda_coefficients[0])  # use absolute values as importances
})

feature_importance_df = feature_importance_df.sort_values(by='LDA Coefficient (abs)', ascending=False)
print(feature_importance_df)

*Drop Column Importance

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

iris = load_iris()
X = iris.data
y = iris.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

base_model = RandomForestClassifier(random_state=42)
base_model.fit(X_train, y_train)
base_accuracy = accuracy_score(y_test, base_model.predict(X_test))

# Measure accuracy after dropping each feature
drop_column_accuracies = []
for i in range(X.shape[1]):
    X_train_mod = X_train[:, [j for j in range(X_train.shape[1]) if j != i]]
    X_test_mod = X_test[:, [j for j in range(X_test.shape[1]) if j != i]]
    
    model = RandomForestClassifier(random_state=42)
    model.fit(X_train_mod, y_train)
    accuracy = accuracy_score(y_test, model.predict(X_test_mod))
    drop_column_accuracies.append(accuracy)

print("Baseline model accuracy:", base_accuracy)
print("Accuracy with each feature dropped:")
for i, accuracy in enumerate(drop_column_accuracies):
    print(f"Feature {i+1}: {accuracy}")

→ The more the accuracy drops when a feature is removed, the more important that feature is.

(5) Dimensionality Reduction to Improve Speed

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split

# Fit the principal components
scaler = StandardScaler()
X_train_s = scaler.fit_transform(X_train)
pca = PCA(n_components=8)
pca.fit(X_train_s)  # fit() returns the fitted PCA object, not transformed data

print(pca.explained_variance_ratio_)
print(pca.explained_variance_ratio_[:5].sum())  # variance explained by the first 5 components

# Apply the principal components (X_train_resample comes from the earlier oversampling step)
scaler = StandardScaler()
X_train_res_s = scaler.fit_transform(X_train_resample)
X_test_s = scaler.transform(X_test)

pca = PCA(n_components=5)
X_train_pca = pca.fit_transform(X_train_res_s)
X_test_pca = pca.transform(X_test_s)

model.fit(X_train_pca, y_train_resample)
pred = model.predict(X_test_pca)

*매니폴드 학습

Unfolds nonlinear structure onto a low-dimensional plane.
Very useful for visualization and as a preprocessing step for clustering.

import pandas as pd
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Generate data (converted to a DataFrame)
X, y = make_classification(n_samples=500, n_features=20, n_informative=10, random_state=42)
df = pd.DataFrame(X, columns=[f'feature_{i}' for i in range(X.shape[1])])
df['target'] = y
from sklearn.manifold import Isomap

isomap = Isomap(n_neighbors=10, n_components=5)
X_iso = isomap.fit_transform(df.drop(columns='target'))

X_train, X_test, y_train, y_test = train_test_split(X_iso, df['target'], test_size=0.2, random_state=42)
clf = RandomForestClassifier()
clf.fit(X_train, y_train)
print("Isomap results:")
print(classification_report(y_test, clf.predict(X_test)))

*NMF analysis

from sklearn.decomposition import NMF
from sklearn.preprocessing import MinMaxScaler

# NMF allows only non-negative values → rescale the data to [0, 1]
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(df.drop(columns='target'))

nmf = NMF(n_components=5, random_state=42)
X_nmf = nmf.fit_transform(X_scaled)

X_train, X_test, y_train, y_test = train_test_split(X_nmf, df['target'], test_size=0.2, random_state=42)
clf = RandomForestClassifier()
clf.fit(X_train, y_train)
print("NMF results:")
print(classification_report(y_test, clf.predict(X_test)))

*Factor analysis

A statistical method that analyzes correlations among variables to uncover latent, unobservable common factors.
Used for dimensionality reduction and latent-variable discovery when analyzing multivariate data.
Applied in psychology, sociology, economics, marketing, and other fields to understand correlations among variables and extract the key factors.

import numpy as np
import pandas as pd
from sklearn.decomposition import FactorAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.datasets import load_iris

# Load the data (iris dataset)
iris = load_iris()
X = iris.data
y = iris.target

# Run the factor analysis
n_components = 2  # number of factors to extract
fa = FactorAnalysis(n_components=n_components, random_state=0)
X_factors = fa.fit_transform(X)  # data reduced to the factor space

# Print the factor loadings
factor_loadings = pd.DataFrame(fa.components_, columns=iris.feature_names)
print("Factor Loadings:")
print(factor_loadings)

# Print the factor-transformed data (first 5 observations)
print("\nFactor-transformed data:")
print(X_factors[:5])

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_factors, y, test_size=0.3, random_state=42)

# Create and train a logistic regression model
model = LogisticRegression(max_iter=200)
model.fit(X_train, y_train)

# Predict and evaluate
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"Logistic regression accuracy after factor analysis: {accuracy:.4f}")

Factor loadings: the first factor loads heavily on "sepal length", "petal length", and "petal width".
Factor-transformed data: each observation expressed through the two latent factors.
※ Loadings of roughly 0.3–0.5 are considered moderate.

Factor analysis can summarize many survey questions into a few core themes,
or extract a handful of key factors from many financial indicators to analyze the state of the economy.

*Independent component analysis (p.327)

A method that separates a multivariate signal into maximally independent additive subcomponents.
Generally used not to reduce dimensionality but to unmix superimposed signals.
Example: isolating one person's voice from a recording of several people talking in a room (source separation).
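
A minimal sketch with scikit-learn's FastICA, using two made-up synthetic sources and an arbitrary mixing matrix:

import numpy as np
from sklearn.decomposition import FastICA

# Two synthetic source signals (stand-ins for two speakers)
rng = np.random.RandomState(42)
t = np.linspace(0, 8, 2000)
s1 = np.sin(2 * t)                        # smooth sinusoidal source
s2 = np.sign(np.sin(3 * t))               # square-wave source
S = np.c_[s1, s2] + 0.1 * rng.normal(size=(2000, 2))  # add a little noise

A = np.array([[1.0, 0.5], [0.5, 2.0]])    # arbitrary mixing matrix
X = S @ A.T                               # observed mixed signals

ica = FastICA(n_components=2, random_state=42)
S_est = ica.fit_transform(X)              # recovered (approximately) independent sources
print(S_est[:5])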

*Multidimensional scaling (p.331)

A technique that reduces high-dimensional data to a low dimension (usually 2D or 3D) for visualization.
Stress close to 0 is perfect; below 0.1 indicates a good fit, above 0.15 a poor fit.

  • Metric MDS: interval- or ratio-scale data
  • Non-metric MDS: ordinal-scale data
import numpy as np
import matplotlib.pyplot as plt
from sklearn.manifold import MDS
from sklearn.metrics import pairwise_distances

# Generate data (10 points at random)
np.random.seed(42)
X = np.random.rand(10, 5)  # 10 data points in 5 dimensions

mds = MDS(n_components=2, dissimilarity='euclidean', random_state=42, normalized_stress=False)
# mds = MDS(n_components=2, dissimilarity='euclidean', metric=False, random_state=42)  # non-metric MDS

X_mds = mds.fit_transform(X)
print('stress:', mds.stress_)  # compare against the fit criteria above
# Note: MDS has no separate transform(); new data must go through fit_transform

plt.scatter(X_mds[:, 0], X_mds[:, 1], color='blue')

for i in range(X_mds.shape[0]):
    plt.text(X_mds[i, 0], X_mds[i, 1], f'Point {i+1}', fontsize=12)

plt.xlabel("MDS1")
plt.ylabel("MDS2")
plt.grid(True)
plt.show()

*Setting a prediction threshold

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

X, y = np.random.rand(100, 5), np.random.randint(2, size=100)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

y_prob = rf.predict_proba(X_test)[:, 1]  # probability of class 1

y_pred_default = np.where(y_prob > 0.5, 1, 0)  # predictions at the default 0.5 threshold
threshold = 0.7
y_pred_custom = np.where(y_prob > threshold, 1, 0)  # predictions at a custom 0.7 threshold

# Evaluate both thresholds
print("Confusion Matrix (Threshold = 0.5):")
print(confusion_matrix(y_test, y_pred_default))
print("\nClassification Report (Threshold = 0.5):")
print(classification_report(y_test, y_pred_default))

print("\nConfusion Matrix (Threshold = 0.7):")
print(confusion_matrix(y_test, y_pred_custom))
print("\nClassification Report (Threshold = 0.7):")
print(classification_report(y_test, y_pred_custom))

 

*Random forest – multi-class classification

import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

X, y = make_classification(n_samples=200, n_features=5, n_classes=3, n_informative=3, random_state=42)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

y_prob = rf.predict_proba(X_test)

# Predict a class only when its highest probability exceeds the threshold
threshold = 0.6
y_pred_custom = np.array([np.argmax(probs) if np.max(probs) > threshold else -1 for probs in y_prob])

# Evaluate (-1 marks an 'uncertain' prediction)
print("Classification Report (with custom threshold):")
print(classification_report(y_test, y_pred_custom, labels=np.unique(y_pred_custom)))

 

*SVM – multi-class classification

import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import StandardScaler

iris = datasets.load_iris()
X = iris.data[:, :2]  # use only 2 features for convenience (sepal length, sepal width)
y = iris.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

svm_model = SVC(kernel='rbf', C=1.0, gamma='scale')
svm_model.fit(X_train, y_train)

y_pred = svm_model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

def plot_decision_boundary(X, y, model):
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.01),
                         np.arange(y_min, y_max, 0.01))

    Z = model.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)

    plt.contourf(xx, yy, Z, alpha=0.8, cmap=plt.cm.coolwarm)
    plt.scatter(X[:, 0], X[:, 1], c=y, edgecolors='k', marker='o', cmap=plt.cm.coolwarm)
    plt.title("SVM Decision Boundary")
    plt.show()

plot_decision_boundary(X_train, y_train, svm_model)

 

*SVM example with probability-based prediction enabled

import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score

iris = datasets.load_iris()
X = iris.data[:, :2]  # use only 2 features for convenience
y = iris.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Create an SVM with probability estimates enabled (probability=True)
svm_model = SVC(kernel='rbf', C=1.0, gamma='scale', probability=True)
svm_model.fit(X_train, y_train)

# Predict class-membership probabilities
y_proba = svm_model.predict_proba(X_test)
print("Predicted probabilities:\n", y_proba)

# Apply a threshold: keep the predicted class only when its top probability clears the cutoff
threshold = 0.6
y_pred_custom = np.where(y_proba.max(axis=1) >= threshold, svm_model.predict(X_test), -1)  # -1 = uncertain

print("\nClassification Report (with custom threshold):\n", classification_report(y_test, y_pred_custom))