In [24]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
from sklearn.metrics import f1_score, confusion_matrix, precision_recall_curve, roc_curve
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
diabetes_data = pd.read_csv('diabetes.csv')
print(diabetes_data['Outcome'].value_counts())
diabetes_data.head(3)
0 500
1 268
Name: Outcome, dtype: int64
Out[24]:
Pregnancies | Glucose | BloodPressure | SkinThickness | Insulin | BMI | DiabetesPedigreeFunction | Age | Outcome | |
---|---|---|---|---|---|---|---|---|---|
0 | 6 | 148 | 72 | 35 | 0 | 33.6 | 0.627 | 50 | 1 |
1 | 1 | 85 | 66 | 29 | 0 | 26.6 | 0.351 | 31 | 0 |
2 | 8 | 183 | 64 | 0 | 0 | 23.3 | 0.672 | 32 | 1 |
In [2]:
diabetes_data.info( )
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Pregnancies 768 non-null int64
1 Glucose 768 non-null int64
2 BloodPressure 768 non-null int64
3 SkinThickness 768 non-null int64
4 Insulin 768 non-null int64
5 BMI 768 non-null float64
6 DiabetesPedigreeFunction 768 non-null float64
7 Age 768 non-null int64
8 Outcome 768 non-null int64
dtypes: float64(2), int64(7)
memory usage: 54.1 KB
In [2]:
# 수정된 get_clf_eval() 함수
def get_clf_eval(y_test, pred=None, pred_proba=None):
confusion = confusion_matrix( y_test, pred)
accuracy = accuracy_score(y_test , pred)
precision =
(y_test , pred)
recall = recall_score(y_test , pred)
f1 = f1_score(y_test,pred)
# ROC-AUC 추가
roc_auc = roc_auc_score(y_test, pred_proba)
print('오차 행렬')
print(confusion)
# ROC-AUC print 추가
print('정확도: {0:.4f}, 정밀도: {1:.4f}, 재현율: {2:.4f},\
F1: {3:.4f}, AUC:{4:.4f}'.format(accuracy, precision, recall, f1, roc_auc))
In [32]:
thresholds
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Input In [32], in <cell line: 1>()
----> 1 thresholds
NameError: name 'thresholds' is not defined
In [3]:
def precision_recall_curve_plot(y_test=None, pred_proba_c1=None):
# threshold ndarray와 이 threshold에 따른 정밀도, 재현율 ndarray 추출.
precisions, recalls, thresholds = precision_recall_curve( y_test, pred_proba_c1)
# X축을 threshold값으로, Y축은 정밀도, 재현율 값으로 각각 Plot 수행. 정밀도는 점선으로 표시
plt.figure(figsize=(8,6))
threshold_boundary = thresholds.shape[0]
plt.plot(thresholds, precisions[0:threshold_boundary], linestyle='--', label='precision')
plt.plot(thresholds, recalls[0:threshold_boundary],label='recall')
# threshold 값 X 축의 Scale을 0.1 단위로 변경
start, end = plt.xlim()
plt.xticks(np.round(np.arange(start, end, 0.1),2))
# x축, y축 label과 legend, 그리고 grid 설정
plt.xlabel('Threshold value'); plt.ylabel('Precision and Recall value')
plt.legend(); plt.grid()
plt.show()
In [4]:
# 피처 데이터 세트 X, 레이블 데이터 세트 y를 추출.
# 맨 끝이 Outcome 컬럼으로 레이블 값임. 컬럼 위치 -1을 이용해 추출
X = diabetes_data.iloc[:, :-1]
y = diabetes_data.iloc[:, -1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 156, stratify=y)
# 로지스틱 회귀로 학습,예측 및 평가 수행.
lr_clf = LogisticRegression()
lr_clf.fit(X_train , y_train)
pred = lr_clf.predict(X_test)
pred_proba = lr_clf.predict_proba(X_test)[:, 1]
get_clf_eval(y_test , pred, pred_proba)
오차 행렬
[[88 12]
[23 31]]
정확도: 0.7727, 정밀도: 0.7209, 재현율: 0.5741, F1: 0.6392, AUC:0.7919
C:\Users\82105\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py:814: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
In [6]:
pred_proba_c1 = lr_clf.predict_proba(X_test)[:, 1]
precision_recall_curve_plot(y_test, pred_proba_c1)
In [25]:
diabetes_data.describe()
Out[25]:
Pregnancies | Glucose | BloodPressure | SkinThickness | Insulin | BMI | DiabetesPedigreeFunction | Age | Outcome | |
---|---|---|---|---|---|---|---|---|---|
count | 768.000000 | 768.000000 | 768.000000 | 768.000000 | 768.000000 | 768.000000 | 768.000000 | 768.000000 | 768.000000 |
mean | 3.845052 | 120.894531 | 69.105469 | 20.536458 | 79.799479 | 31.992578 | 0.471876 | 33.240885 | 0.348958 |
std | 3.369578 | 31.972618 | 19.355807 | 15.952218 | 115.244002 | 7.884160 | 0.331329 | 11.760232 | 0.476951 |
min | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.078000 | 21.000000 | 0.000000 |
25% | 1.000000 | 99.000000 | 62.000000 | 0.000000 | 0.000000 | 27.300000 | 0.243750 | 24.000000 | 0.000000 |
50% | 3.000000 | 117.000000 | 72.000000 | 23.000000 | 30.500000 | 32.000000 | 0.372500 | 29.000000 | 0.000000 |
75% | 6.000000 | 140.250000 | 80.000000 | 32.000000 | 127.250000 | 36.600000 | 0.626250 | 41.000000 | 1.000000 |
max | 17.000000 | 199.000000 | 122.000000 | 99.000000 | 846.000000 | 67.100000 | 2.420000 | 81.000000 | 1.000000 |
In [26]:
plt.hist(diabetes_data['Glucose'], bins=10)
Out[26]:
(array([ 5., 0., 4., 32., 156., 211., 163., 95., 56., 46.]),
array([ 0. , 19.9, 39.8, 59.7, 79.6, 99.5, 119.4, 139.3, 159.2,
179.1, 199. ]),
<BarContainer object of 10 artists>)
In [27]:
# 0값을 검사할 피처명 리스트 객체 설정
zero_features = ['Glucose', 'BloodPressure','SkinThickness','Insulin','BMI']
# 전체 데이터 건수
total_count = diabetes_data['Glucose'].count()
# 피처별로 반복 하면서 데이터 값이 0 인 데이터 건수 추출하고, 퍼센트 계산
for feature in zero_features:
zero_count = diabetes_data[diabetes_data[feature] == 0][feature].count()
print('{0} 0 건수는 {1}, 퍼센트는 {2:.2f} %'.format(feature, zero_count, 100*zero_count/total_count))
768
Glucose 0 건수는 5, 퍼센트는 0.65 %
BloodPressure 0 건수는 35, 퍼센트는 4.56 %
SkinThickness 0 건수는 227, 퍼센트는 29.56 %
Insulin 0 건수는 374, 퍼센트는 48.70 %
BMI 0 건수는 11, 퍼센트는 1.43 %
In [28]:
# zero_features 리스트 내부에 저장된 개별 피처들에 대해서 0값을 평균 값으로 대체
diabetes_data[zero_features]=diabetes_data[zero_features].replace(0, diabetes_data[zero_features].mean())
In [43]:
X_scaler
Out[43]:
array([[ 0.63994726, 0.84832379, 0.14964075, ..., 0.20401277,
0.46849198, 1.4259954 ],
[-0.84488505, -1.12339636, -0.16054575, ..., -0.68442195,
-0.36506078, -0.19067191],
[ 1.23388019, 1.94372388, -0.26394125, ..., -1.10325546,
0.60439732, -0.10558415],
...,
[ 0.3429808 , 0.00330087, 0.14964075, ..., -0.73518964,
-0.68519336, -0.27575966],
[-0.84488505, 0.1597866 , -0.47073225, ..., -0.24020459,
-0.37110101, 1.17073215],
[-0.84488505, -0.8730192 , 0.04624525, ..., -0.20212881,
-0.47378505, -0.87137393]])
In [39]:
x
Out[39]:
Pregnancies | Glucose | BloodPressure | SkinThickness | Insulin | BMI | DiabetesPedigreeFunction | Age | |
---|---|---|---|---|---|---|---|---|
0 | 6 | 148.0 | 72.0 | 35.000000 | 79.799479 | 33.6 | 0.627 | 50 |
1 | 1 | 85.0 | 66.0 | 29.000000 | 79.799479 | 26.6 | 0.351 | 31 |
2 | 8 | 183.0 | 64.0 | 20.536458 | 79.799479 | 23.3 | 0.672 | 32 |
3 | 1 | 89.0 | 66.0 | 23.000000 | 94.000000 | 28.1 | 0.167 | 21 |
4 | 0 | 137.0 | 40.0 | 35.000000 | 168.000000 | 43.1 | 2.288 | 33 |
... | ... | ... | ... | ... | ... | ... | ... | ... |
763 | 10 | 101.0 | 76.0 | 48.000000 | 180.000000 | 32.9 | 0.171 | 63 |
764 | 2 | 122.0 | 70.0 | 27.000000 | 79.799479 | 36.8 | 0.340 | 27 |
765 | 5 | 121.0 | 72.0 | 23.000000 | 112.000000 | 26.2 | 0.245 | 30 |
766 | 1 | 126.0 | 60.0 | 20.536458 | 79.799479 | 30.1 | 0.349 | 47 |
767 | 1 | 93.0 | 70.0 | 31.000000 | 79.799479 | 30.4 | 0.315 | 23 |
768 rows × 8 columns
In [41]:
x= diabetes_data.iloc[:,:-1]
y= diabetes_data.iloc[:,-1]
scaler = StandardScaler()
X_scaler = scaler.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X_scaler, y, test_size=0.2, random_state=156, stratify=y)
# 로지스틱 회귀로 학습 예측 평가 수행
Ir_clf= LogisticRegression()
Ir_clf.fit(X_train, y_train)
pred = Ir_clf.predict(X_test)
pred_proba= Ir_clf.predict_proba(X_test)[:,1]
get_clf_eval(y_test, pred, pred_proba)
오차 행렬
[[90 10]
[21 33]]
정확도: 0.7987, 정밀도: 0.7674, 재현율: 0.6111, F1: 0.6804, AUC:0.8059
In [31]:
from sklearn.preprocessing import Binarizer
def get_eval_by_threshold(y_test , pred_proba_c1, thresholds):
# thresholds 리스트 객체내의 값을 차례로 iteration하면서 Evaluation 수행.
for custom_threshold in thresholds:
binarizer = Binarizer(threshold=custom_threshold).fit(pred_proba_c1)
custom_predict = binarizer.transform(pred_proba_c1)
print('임곗값:',custom_threshold)
get_clf_eval(y_test , custom_predict, pred_proba_c1)
In [11]:
pred_proba
Out[11]:
array([0.02179032, 0.20837173, 0.15498853, 0.4941816 , 0.04526058,
0.58843895, 0.74410215, 0.20383534, 0.15109265, 0.15563549,
0.6782234 , 0.49867664, 0.06558148, 0.41047078, 0.21085978,
0.12147092, 0.75385462, 0.40641399, 0.03587325, 0.78502572,
0.46228279, 0.30145931, 0.06816902, 0.33980826, 0.34349613,
0.11173355, 0.91766496, 0.65558046, 0.17045457, 0.81173691,
0.20080454, 0.20547538, 0.13990007, 0.23134555, 0.15174827,
0.70460185, 0.279011 , 0.68412086, 0.80784478, 0.6081399 ,
0.40821482, 0.33914245, 0.65038489, 0.25119566, 0.31268375,
0.0420607 , 0.76011507, 0.13038814, 0.48976725, 0.29486048,
0.34285975, 0.82613595, 0.84772613, 0.08557145, 0.15880061,
0.31518675, 0.09996615, 0.36651998, 0.39399353, 0.40920345,
0.22744671, 0.73449652, 0.36858328, 0.55663361, 0.23902005,
0.10306695, 0.19947756, 0.71114524, 0.16518775, 0.67841842,
0.08485784, 0.35792512, 0.14400514, 0.46235207, 0.67466537,
0.26727045, 0.09017055, 0.1016968 , 0.14214714, 0.37735129,
0.09691981, 0.10235276, 0.72065574, 0.19731935, 0.28334899,
0.34913346, 0.76136862, 0.7179411 , 0.08307342, 0.06624957,
0.06796572, 0.14006207, 0.73930977, 0.26233103, 0.35228678,
0.64469586, 0.21427968, 0.50456785, 0.05806128, 0.69629419,
0.85502499, 0.57057556, 0.33030962, 0.25317543, 0.12721291,
0.44627571, 0.37844401, 0.42983575, 0.05201154, 0.10248308,
0.5328593 , 0.47094454, 0.16304704, 0.19515468, 0.27300856,
0.40592549, 0.77345106, 0.1128333 , 0.32399469, 0.98963766,
0.53472597, 0.01674584, 0.87830282, 0.65748036, 0.20680959,
0.3155667 , 0.10489277, 0.06950607, 0.25502009, 0.28096776,
0.07986631, 0.0896269 , 0.35067776, 0.09205424, 0.06755042,
0.47191584, 0.83343602, 0.76228563, 0.07474722, 0.13720128,
0.1078253 , 0.56006274, 0.53100402, 0.16089097, 0.65907902,
0.05813538, 0.18058874, 0.05477717, 0.76676586, 0.52226013,
0.40683347, 0.06596279, 0.47880029, 0.13516918])
In [34]:
thresholds = [0.3 , 0.33 ,0.36,0.39, 0.42 , 0.45 ,0.48, 0.50]
pred_proba = lr_clf.predict_proba(X_test)
get_eval_by_threshold(y_test, pred_proba[:,1].reshape(-1,1), thresholds )
임곗값: 0.3
오차 행렬
[[98 2]
[54 0]]
정확도: 0.6364, 정밀도: 0.0000, 재현율: 0.0000, F1: 0.0000, AUC:0.5572
임곗값: 0.33
오차 행렬
[[98 2]
[54 0]]
정확도: 0.6364, 정밀도: 0.0000, 재현율: 0.0000, F1: 0.0000, AUC:0.5572
임곗값: 0.36
오차 행렬
[[98 2]
[54 0]]
정확도: 0.6364, 정밀도: 0.0000, 재현율: 0.0000, F1: 0.0000, AUC:0.5572
임곗값: 0.39
오차 행렬
[[98 2]
[54 0]]
정확도: 0.6364, 정밀도: 0.0000, 재현율: 0.0000, F1: 0.0000, AUC:0.5572
임곗값: 0.42
오차 행렬
[[98 2]
[54 0]]
정확도: 0.6364, 정밀도: 0.0000, 재현율: 0.0000, F1: 0.0000, AUC:0.5572
임곗값: 0.45
오차 행렬
[[99 1]
[54 0]]
정확도: 0.6429, 정밀도: 0.0000, 재현율: 0.0000, F1: 0.0000, AUC:0.5572
임곗값: 0.48
오차 행렬
[[99 1]
[54 0]]
정확도: 0.6429, 정밀도: 0.0000, 재현율: 0.0000, F1: 0.0000, AUC:0.5572
임곗값: 0.5
오차 행렬
[[99 1]
[54 0]]
정확도: 0.6429, 정밀도: 0.0000, 재현율: 0.0000, F1: 0.0000, AUC:0.5572
C:\Users\82105\anaconda3\lib\site-packages\sklearn\base.py:450: UserWarning: X does not have valid feature names, but LogisticRegression was fitted with feature names
warnings.warn(
In [35]:
# 임곗값를 0.48로 설정한 Binarizer 생성
binarizer = Binarizer(threshold=0.48)
# 위에서 구한 lr_clf의 predict_proba() 예측 확률 array에서 1에 해당하는 컬럼값을 Binarizer변환.
pred_th_048 = binarizer.fit_transform(pred_proba[:, 1].reshape(-1,1))
get_clf_eval(y_test , pred_th_048, pred_proba[:, 1])
오차 행렬
[[99 1]
[54 0]]
정확도: 0.6429, 정밀도: 0.0000, 재현율: 0.0000, F1: 0.0000, AUC:0.5572
In [ ]:
'머신러닝 > 분류' 카테고리의 다른 글
머신러닝_분류_개념 정확도 정밀도 재현율 (0) | 2022.10.25 |
---|