In [24]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
from sklearn.metrics import f1_score, confusion_matrix, precision_recall_curve, roc_curve
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

diabetes_data = pd.read_csv('diabetes.csv')
print(diabetes_data['Outcome'].value_counts())
diabetes_data.head(3)

0    500
1    268
Name: Outcome, dtype: int64

Out[24]:

	Pregnancies	Glucose	BloodPressure	SkinThickness	BMI	DiabetesPedigreeFunction	Age	Outcome
0	6	148	72	35	33.6	0.627	50	1
1	1	85	66	29	26.6	0.351	31	0
2	8	183	64	0	23.3	0.672	32	1

In [2]:

diabetes_data.info( )

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB

In [2]:

# 수정된 get_clf_eval() 함수 
def get_clf_eval(y_test, pred=None, pred_proba=None):
    confusion = confusion_matrix( y_test, pred)
    accuracy = accuracy_score(y_test , pred)
    precision = 
    (y_test , pred)
    recall = recall_score(y_test , pred)
    f1 = f1_score(y_test,pred)
    # ROC-AUC 추가 
    roc_auc = roc_auc_score(y_test, pred_proba)
    print('오차 행렬')
    print(confusion)
    # ROC-AUC print 추가
    print('정확도: {0:.4f}, 정밀도: {1:.4f}, 재현율: {2:.4f},\
    F1: {3:.4f}, AUC:{4:.4f}'.format(accuracy, precision, recall, f1, roc_auc))

In [32]:

thresholds

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Input In [32], in <cell line: 1>()
----> 1 thresholds

NameError: name 'thresholds' is not defined

In [3]:

def precision_recall_curve_plot(y_test=None, pred_proba_c1=None):
    # threshold ndarray와 이 threshold에 따른 정밀도, 재현율 ndarray 추출. 
    precisions, recalls, thresholds = precision_recall_curve( y_test, pred_proba_c1)
    
    # X축을 threshold값으로, Y축은 정밀도, 재현율 값으로 각각 Plot 수행. 정밀도는 점선으로 표시
    plt.figure(figsize=(8,6))
    threshold_boundary = thresholds.shape[0]
    plt.plot(thresholds, precisions[0:threshold_boundary], linestyle='--', label='precision')
    plt.plot(thresholds, recalls[0:threshold_boundary],label='recall')
    
    # threshold 값 X 축의 Scale을 0.1 단위로 변경
    start, end = plt.xlim()
    plt.xticks(np.round(np.arange(start, end, 0.1),2))
    
    # x축, y축 label과 legend, 그리고 grid 설정
    plt.xlabel('Threshold value'); plt.ylabel('Precision and Recall value')
    plt.legend(); plt.grid()
    plt.show()

In [4]:

# 피처 데이터 세트 X, 레이블 데이터 세트 y를 추출. 
# 맨 끝이 Outcome 컬럼으로 레이블 값임. 컬럼 위치 -1을 이용해 추출 
X = diabetes_data.iloc[:, :-1]
y = diabetes_data.iloc[:, -1]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 156, stratify=y)

# 로지스틱 회귀로 학습,예측 및 평가 수행. 
lr_clf = LogisticRegression()
lr_clf.fit(X_train , y_train)
pred = lr_clf.predict(X_test)
pred_proba = lr_clf.predict_proba(X_test)[:, 1]

get_clf_eval(y_test , pred, pred_proba)

오차 행렬
[[88 12]
 [23 31]]
정확도: 0.7727, 정밀도: 0.7209, 재현율: 0.5741,    F1: 0.6392, AUC:0.7919

C:\Users\82105\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py:814: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(

In [6]:

pred_proba_c1 = lr_clf.predict_proba(X_test)[:, 1]
precision_recall_curve_plot(y_test, pred_proba_c1)

In [25]:

diabetes_data.describe()

Out[25]:

	Pregnancies	Glucose	BloodPressure	SkinThickness	Insulin	BMI	DiabetesPedigreeFunction	Age	Outcome
count	768.000000	768.000000	768.000000	768.000000	768.000000	768.000000	768.000000	768.000000	768.000000
mean	3.845052	120.894531	69.105469	20.536458	79.799479	31.992578	0.471876	33.240885	0.348958
std	3.369578	31.972618	19.355807	15.952218	115.244002	7.884160	0.331329	11.760232	0.476951
min	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.078000	21.000000	0.000000
25%	1.000000	99.000000	62.000000	0.000000	0.000000	27.300000	0.243750	24.000000	0.000000
50%	3.000000	117.000000	72.000000	23.000000	30.500000	32.000000	0.372500	29.000000	0.000000
75%	6.000000	140.250000	80.000000	32.000000	127.250000	36.600000	0.626250	41.000000	1.000000
max	17.000000	199.000000	122.000000	99.000000	846.000000	67.100000	2.420000	81.000000	1.000000

In [26]:

plt.hist(diabetes_data['Glucose'], bins=10)

Out[26]:

(array([  5.,   0.,   4.,  32., 156., 211., 163.,  95.,  56.,  46.]),
 array([  0. ,  19.9,  39.8,  59.7,  79.6,  99.5, 119.4, 139.3, 159.2,
        179.1, 199. ]),
 <BarContainer object of 10 artists>)

In [27]:

# 0값을 검사할 피처명 리스트 객체 설정
zero_features = ['Glucose', 'BloodPressure','SkinThickness','Insulin','BMI']

# 전체 데이터 건수
total_count = diabetes_data['Glucose'].count()

# 피처별로 반복 하면서 데이터 값이 0 인 데이터 건수 추출하고, 퍼센트 계산
for feature in zero_features:
    zero_count = diabetes_data[diabetes_data[feature] == 0][feature].count()
    print('{0} 0 건수는 {1}, 퍼센트는 {2:.2f} %'.format(feature, zero_count, 100*zero_count/total_count))

768
Glucose 0 건수는 5, 퍼센트는 0.65 %
BloodPressure 0 건수는 35, 퍼센트는 4.56 %
SkinThickness 0 건수는 227, 퍼센트는 29.56 %
Insulin 0 건수는 374, 퍼센트는 48.70 %
BMI 0 건수는 11, 퍼센트는 1.43 %

In [28]:

# zero_features 리스트 내부에 저장된 개별 피처들에 대해서 0값을 평균 값으로 대체
diabetes_data[zero_features]=diabetes_data[zero_features].replace(0, diabetes_data[zero_features].mean())

In [43]:

X_scaler

Out[43]:

array([[ 0.63994726,  0.84832379,  0.14964075, ...,  0.20401277,
         0.46849198,  1.4259954 ],
       [-0.84488505, -1.12339636, -0.16054575, ..., -0.68442195,
        -0.36506078, -0.19067191],
       [ 1.23388019,  1.94372388, -0.26394125, ..., -1.10325546,
         0.60439732, -0.10558415],
       ...,
       [ 0.3429808 ,  0.00330087,  0.14964075, ..., -0.73518964,
        -0.68519336, -0.27575966],
       [-0.84488505,  0.1597866 , -0.47073225, ..., -0.24020459,
        -0.37110101,  1.17073215],
       [-0.84488505, -0.8730192 ,  0.04624525, ..., -0.20212881,
        -0.47378505, -0.87137393]])

In [39]:

Out[39]:

	Pregnancies	Glucose	BloodPressure	SkinThickness	Insulin	BMI	DiabetesPedigreeFunction	Age
0	6	148.0	72.0	35.000000	79.799479	33.6	0.627	50
1	1	85.0	66.0	29.000000	79.799479	26.6	0.351	31
2	8	183.0	64.0	20.536458	79.799479	23.3	0.672	32
3	1	89.0	66.0	23.000000	94.000000	28.1	0.167	21
4	0	137.0	40.0	35.000000	168.000000	43.1	2.288	33
...	...	...	...	...	...	...	...	...
763	10	101.0	76.0	48.000000	180.000000	32.9	0.171	63
764	2	122.0	70.0	27.000000	79.799479	36.8	0.340	27
765	5	121.0	72.0	23.000000	112.000000	26.2	0.245	30
766	1	126.0	60.0	20.536458	79.799479	30.1	0.349	47
767	1	93.0	70.0	31.000000	79.799479	30.4	0.315	23

768 rows × 8 columns

In [41]:

x= diabetes_data.iloc[:,:-1]
y= diabetes_data.iloc[:,-1]

scaler = StandardScaler()
X_scaler = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_scaler, y, test_size=0.2, random_state=156, stratify=y)

# 로지스틱 회귀로 학습 예측 평가 수행
Ir_clf= LogisticRegression()
Ir_clf.fit(X_train, y_train)
pred = Ir_clf.predict(X_test)
pred_proba= Ir_clf.predict_proba(X_test)[:,1]
get_clf_eval(y_test, pred, pred_proba)

오차 행렬
[[90 10]
 [21 33]]
정확도: 0.7987, 정밀도: 0.7674, 재현율: 0.6111,    F1: 0.6804, AUC:0.8059

In [31]:

from sklearn.preprocessing import Binarizer

def get_eval_by_threshold(y_test , pred_proba_c1, thresholds):
    # thresholds 리스트 객체내의 값을 차례로 iteration하면서 Evaluation 수행.
    for custom_threshold in thresholds:
        binarizer = Binarizer(threshold=custom_threshold).fit(pred_proba_c1) 
        custom_predict = binarizer.transform(pred_proba_c1)
        print('임곗값:',custom_threshold)
        get_clf_eval(y_test , custom_predict, pred_proba_c1)

In [11]:

pred_proba

Out[11]:

array([0.02179032, 0.20837173, 0.15498853, 0.4941816 , 0.04526058,
       0.58843895, 0.74410215, 0.20383534, 0.15109265, 0.15563549,
       0.6782234 , 0.49867664, 0.06558148, 0.41047078, 0.21085978,
       0.12147092, 0.75385462, 0.40641399, 0.03587325, 0.78502572,
       0.46228279, 0.30145931, 0.06816902, 0.33980826, 0.34349613,
       0.11173355, 0.91766496, 0.65558046, 0.17045457, 0.81173691,
       0.20080454, 0.20547538, 0.13990007, 0.23134555, 0.15174827,
       0.70460185, 0.279011  , 0.68412086, 0.80784478, 0.6081399 ,
       0.40821482, 0.33914245, 0.65038489, 0.25119566, 0.31268375,
       0.0420607 , 0.76011507, 0.13038814, 0.48976725, 0.29486048,
       0.34285975, 0.82613595, 0.84772613, 0.08557145, 0.15880061,
       0.31518675, 0.09996615, 0.36651998, 0.39399353, 0.40920345,
       0.22744671, 0.73449652, 0.36858328, 0.55663361, 0.23902005,
       0.10306695, 0.19947756, 0.71114524, 0.16518775, 0.67841842,
       0.08485784, 0.35792512, 0.14400514, 0.46235207, 0.67466537,
       0.26727045, 0.09017055, 0.1016968 , 0.14214714, 0.37735129,
       0.09691981, 0.10235276, 0.72065574, 0.19731935, 0.28334899,
       0.34913346, 0.76136862, 0.7179411 , 0.08307342, 0.06624957,
       0.06796572, 0.14006207, 0.73930977, 0.26233103, 0.35228678,
       0.64469586, 0.21427968, 0.50456785, 0.05806128, 0.69629419,
       0.85502499, 0.57057556, 0.33030962, 0.25317543, 0.12721291,
       0.44627571, 0.37844401, 0.42983575, 0.05201154, 0.10248308,
       0.5328593 , 0.47094454, 0.16304704, 0.19515468, 0.27300856,
       0.40592549, 0.77345106, 0.1128333 , 0.32399469, 0.98963766,
       0.53472597, 0.01674584, 0.87830282, 0.65748036, 0.20680959,
       0.3155667 , 0.10489277, 0.06950607, 0.25502009, 0.28096776,
       0.07986631, 0.0896269 , 0.35067776, 0.09205424, 0.06755042,
       0.47191584, 0.83343602, 0.76228563, 0.07474722, 0.13720128,
       0.1078253 , 0.56006274, 0.53100402, 0.16089097, 0.65907902,
       0.05813538, 0.18058874, 0.05477717, 0.76676586, 0.52226013,
       0.40683347, 0.06596279, 0.47880029, 0.13516918])

In [34]:

thresholds = [0.3 , 0.33 ,0.36,0.39, 0.42 , 0.45 ,0.48, 0.50]
pred_proba = lr_clf.predict_proba(X_test)
get_eval_by_threshold(y_test, pred_proba[:,1].reshape(-1,1), thresholds )

임곗값: 0.3
오차 행렬
[[98  2]
 [54  0]]
정확도: 0.6364, 정밀도: 0.0000, 재현율: 0.0000,    F1: 0.0000, AUC:0.5572
임곗값: 0.33
오차 행렬
[[98  2]
 [54  0]]
정확도: 0.6364, 정밀도: 0.0000, 재현율: 0.0000,    F1: 0.0000, AUC:0.5572
임곗값: 0.36
오차 행렬
[[98  2]
 [54  0]]
정확도: 0.6364, 정밀도: 0.0000, 재현율: 0.0000,    F1: 0.0000, AUC:0.5572
임곗값: 0.39
오차 행렬
[[98  2]
 [54  0]]
정확도: 0.6364, 정밀도: 0.0000, 재현율: 0.0000,    F1: 0.0000, AUC:0.5572
임곗값: 0.42
오차 행렬
[[98  2]
 [54  0]]
정확도: 0.6364, 정밀도: 0.0000, 재현율: 0.0000,    F1: 0.0000, AUC:0.5572
임곗값: 0.45
오차 행렬
[[99  1]
 [54  0]]
정확도: 0.6429, 정밀도: 0.0000, 재현율: 0.0000,    F1: 0.0000, AUC:0.5572
임곗값: 0.48
오차 행렬
[[99  1]
 [54  0]]
정확도: 0.6429, 정밀도: 0.0000, 재현율: 0.0000,    F1: 0.0000, AUC:0.5572
임곗값: 0.5
오차 행렬
[[99  1]
 [54  0]]
정확도: 0.6429, 정밀도: 0.0000, 재현율: 0.0000,    F1: 0.0000, AUC:0.5572

C:\Users\82105\anaconda3\lib\site-packages\sklearn\base.py:450: UserWarning: X does not have valid feature names, but LogisticRegression was fitted with feature names
  warnings.warn(

In [35]:

# 임곗값를 0.48로 설정한 Binarizer 생성
binarizer = Binarizer(threshold=0.48)

# 위에서 구한 lr_clf의 predict_proba() 예측 확률 array에서 1에 해당하는 컬럼값을 Binarizer변환. 
pred_th_048 = binarizer.fit_transform(pred_proba[:, 1].reshape(-1,1)) 

get_clf_eval(y_test , pred_th_048, pred_proba[:, 1])

오차 행렬
[[99  1]
 [54  0]]
정확도: 0.6429, 정밀도: 0.0000, 재현율: 0.0000,    F1: 0.0000, AUC:0.5572

In [ ]:

'머신러닝 > 분류' 카테고리의 다른 글

머신러닝_분류_개념 정확도 정밀도 재현율 (0)	2022.10.25

with_open_형준

피마 인디언 당뇨병 예측 사이킷런

'머신러닝 > 분류' 카테고리의 다른 글

티스토리툴바

피마 인디언 당뇨병 예측 사이킷런

'머신러닝 > 분류' 카테고리의 다른 글

'머신러닝/분류' Related Articles

티스토리툴바