붓꽃 품종 예측하기

In [1]:

pip install  sklearn

Requirement already satisfied: sklearn in c:\users\82105\anaconda3\lib\site-packages (0.0)
Requirement already satisfied: scikit-learn in c:\users\82105\anaconda3\lib\site-packages (from sklearn) (1.0.2)
Requirement already satisfied: joblib>=0.11 in c:\users\82105\anaconda3\lib\site-packages (from scikit-learn->sklearn) (1.1.0)
Requirement already satisfied: numpy>=1.14.6 in c:\users\82105\anaconda3\lib\site-packages (from scikit-learn->sklearn) (1.21.5)
Requirement already satisfied: threadpoolctl>=2.0.0 in c:\users\82105\anaconda3\lib\site-packages (from scikit-learn->sklearn) (2.2.0)
Requirement already satisfied: scipy>=1.1.0 in c:\users\82105\anaconda3\lib\site-packages (from scikit-learn->sklearn) (1.7.3)
Note: you may need to restart the kernel to use updated packages.

In [2]:

from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

In [4]:

import pandas as pd
iris =load_iris()

iris_data = iris.data

iris_label =iris.target
print('iris target값:',iris_label)
iris_label =iris.target
print('iris target명:',iris.target_names)

iris_df = pd.DataFrame(data=iris_data, columns=iris.feature_names)
iris_df['label']= iris.target
iris_df.head()

iris target값: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]
iris target명: ['setosa' 'versicolor' 'virginica']

Out[4]:

	sepal length (cm)	sepal width (cm)	petal length (cm)	petal width (cm)
0	5.1	3.5	1.4	0.2
1	4.9	3.0	1.4	0.2
2	4.7	3.2	1.3	0.2
3	4.6	3.1	1.5	0.2
4	5.0	3.6	1.4	0.2

In [5]:

X_train,X_test, y_train, y_test= train_test_split(iris_data,iris_label,
                                                 test_size=0.2, random_state=11)

In [61]:

print('학습용',X_train)
print('-'*50)
print('테스트용',X_test)
print('-'*50)
print('학습용',y_train)
print('테스트용',y_test)

---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
Input In [61], in <cell line: 1>()
----> 1 print('학습용',X_train.head())
      2 print('-'*50)
      3 print('테스트용',X_test.head())

AttributeError: 'numpy.ndarray' object has no attribute 'head'

In [55]:

dt_clf

Out[55]:

DecisionTreeClassifier(random_state=156)

In [58]:

dt_clf= DecisionTreeClassifier(random_state=11)

dt_clf.fit(X_train,y_train)

Out[58]:

DecisionTreeClassifier(random_state=11)

In [59]:

pred= dt_clf.predict(X_test)

In [60]:

pred

Out[60]:

array([1, 2, 1, 0, 0, 1, 1, 1, 1, 2, 2, 1, 1, 0, 0, 2, 1, 0, 2, 0, 2, 2,
       1, 1, 1, 1, 0, 0, 2, 2])

In [9]:

from sklearn.metrics import accuracy_score
print('예측 정확도:{0:.4f}'.format(accuracy_score(y_test,pred)))

예측 정확도:0.9333

In [10]:

iris = load_iris()
dt_clf =DecisionTreeClassifier()
train_data= iris.data
train_label=iris.target
dt_clf.fit(train_data,train_label)

#학습 데이터 셋으로 예측 수행
pred=dt_clf.predict(train_data)
print('예측 정확도:',accuracy_score(train_label,pred))

예측 정확도: 1.0

In [11]:

dt_clf= DecisionTreeClassifier()
iris_data= load_iris()

X_test,X_train, y_test,y_train = train_test_split(iris_data.data, iris_data.target,
                                                 test_size=0.3, random_state=121)

In [17]:

dt_clf.fit(X_train,y_train)
pred=dt_clf.predict(X_test)
print('예측 정확도:{0:.3f}'.format(accuracy_score(y_test,pred)))

예측 정확도:0.962

In [18]:

from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
import numpy as np

In [19]:

iris = load_iris()
features = iris.data
label =iris.target
dt_clf= DecisionTreeClassifier(random_state=156)

kfold= KFold(n_splits=5)
cv_accuracy =[]
print('붓꽃 데이터 세트 크기:', features.shape[0])

붓꽃 데이터 세트 크기: 150

In [28]:

import numpy as np
n_iter = 0

for train_index,test_index, in kfold.split(features):
    X_train, X_test= features[train_index],features[test_index]
    y_train, y_test= label[train_index], label[test_index]


    dt_clf.fit(X_train,y_train)
    pred= dt_clf.predict(X_test)
    n_iter += 1

    accuracy =np.round(accuracy_score(y_test,pred), 4)
    train_size= X_train.shape[0]
    test_size= X_test.shape[0]
    print('\n {0} 교차 검증 정확도: {1} , 학습 데이터 크기: {2}, 검증 데이터 크기 :{3}'.format(n_iter, accuracy, train_size, test_size))
    print('\n {0} 검증 세트 인데스 :{1}'.format(n_iter, test_index))
    cv_accuracy.append(accuracy)

print('\n 평균 검증 정확도:',np.mean(cv_accuracy))

 1 교차 검증 정확도: 1.0 , 학습 데이터 크기: 120, 검증 데이터 크기 :30

 1 검증 세트 인데스 :[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29]

 2 교차 검증 정확도: 0.9667 , 학습 데이터 크기: 120, 검증 데이터 크기 :30

 2 검증 세트 인데스 :[30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53
 54 55 56 57 58 59]

 3 교차 검증 정확도: 0.8667 , 학습 데이터 크기: 120, 검증 데이터 크기 :30

 3 검증 세트 인데스 :[60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83
 84 85 86 87 88 89]

 4 교차 검증 정확도: 0.9333 , 학습 데이터 크기: 120, 검증 데이터 크기 :30

 4 검증 세트 인데스 :[ 90  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106 107
 108 109 110 111 112 113 114 115 116 117 118 119]

 5 교차 검증 정확도: 0.7333 , 학습 데이터 크기: 120, 검증 데이터 크기 :30

 5 검증 세트 인데스 :[120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137
 138 139 140 141 142 143 144 145 146 147 148 149]

 평균 검증 정확도: 0.9

In [29]:

iris= load_iris()

iris_label=pd.DataFrame(data=iris.data, columns=iris.feature_names)
iris_df['label']= iris.target
iris_df['label'].value_counts()

Out[29]:

0    50
1    50
2    50
Name: label, dtype: int64

In [31]:

kfold =KFold(n_splits=3)

n_iter=0
for train_index, test_index in kfold.split(iris_df):
    n_iter+=1
    label_train =iris_df['label'].iloc[train_index]
    label_test= iris_df['label'].iloc[test_index]
    print('## 교차 검증:{0}'.format(n_iter))
    print('학습 레이블 데이터 분포 : \n', label_train.value_counts())
    print('검증 레이블 데이터 분포: \n', label_test.value_counts())

## 교차 검증:1
학습 레이블 데이터 분포 : 
 1    50
2    50
Name: label, dtype: int64
검증 레이블 데이터 분포: 
 0    50
Name: label, dtype: int64
## 교차 검증:2
학습 레이블 데이터 분포 : 
 0    50
2    50
Name: label, dtype: int64
검증 레이블 데이터 분포: 
 1    50
Name: label, dtype: int64
## 교차 검증:3
학습 레이블 데이터 분포 : 
 0    50
1    50
Name: label, dtype: int64
검증 레이블 데이터 분포: 
 2    50
Name: label, dtype: int64

In [33]:

import sklearn
print(sklearn.__version__)

1.0.2

In [35]:

from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split,cross_val_score,cross_validate
iris_data= load_iris()
dt_clf =DecisionTreeClassifier(random_state=156)
data = iris_data.data
label= iris_data.target

scores = cross_val_score(dt_clf, data , label, scoring = 'accuracy', cv=3)
print('교차검증별 정확도:' ,np.round(scores,4))
print('평균 검증 정확도:', np.round(np.mean(scores),4))

교차검증별 정확도: [0.98 0.94 0.98]
평균 검증 정확도: 0.9667

In [37]:

iris =load_iris()
X_train, X_test, y_train,y_test=train_test_split(iris_data.data,iris_data.target, test_size=0.2, random_state=121)
dtree = DecisionTreeClassifier()

parameters ={'max_depth':[1,2,3],'min_samples_split':[2,3]}

In [43]:

from sklearn.model_selection import GridSearchCV
grid_dtree= GridSearchCV(dtree, param_grid=parameters, cv=3,refit=True)

grid_dtree.fit(X_train, y_train)

scores_df= pd.DataFrame(grid_dtree.cv_results_)
scores_df[['params','mean_test_score', 'rank_test_score',\
           'split0_test_score', 'split1_test_score', 'split2_test_score']]

Out[43]:

	params	mean_test_score	rank_test_score	split0_test_score	split1_test_score	split2_test_score
0	{'max_depth': 1, 'min_samples_split': 2}	0.700000	5	0.700	0.7	0.70
1	{'max_depth': 1, 'min_samples_split': 3}	0.700000	5	0.700	0.7	0.70
2	{'max_depth': 2, 'min_samples_split': 2}	0.958333	3	0.925	1.0	0.95
3	{'max_depth': 2, 'min_samples_split': 3}	0.958333	3	0.925	1.0	0.95
4	{'max_depth': 3, 'min_samples_split': 2}	0.975000	1	0.975	1.0	0.95
5	{'max_depth': 3, 'min_samples_split': 3}	0.975000	1	0.975	1.0	0.95

In [46]:

print('GridSearchCV 최적의 파라미터:', grid_dtree.best_params_)
print('GridSearchCV 최고의 정확도:{0:.4f}'.format(grid_dtree.best_score_))

GridSearchCV 최적의 파라미터: {'max_depth': 3, 'min_samples_split': 2}
GridSearchCV 최고의 정확도:0.9750

In [45]:

estimator = grid_dtree.best_estimator_

pred= estimator.predict(X_test)
print('테스트 데이터 셋 정확도:{0:.4f}'.format(accuracy_score(y_test,pred)))

테스트 데이터 셋 정확도:0.9667

In [ ]:

내 블로그 - 관리자 홈 전환	`Q` `Q`
새 글 쓰기	`W` `W`

글 수정 (권한 있는 경우)	`E` `E`
댓글 영역으로 이동	`C` `C`

이 페이지의 URL 복사	`S` `S`
맨 위로 이동	`T` `T`
티스토리 홈 이동	`H` `H`
단축키 안내	`Shift` + `/` `⇧` + `/`

with_open_형준

붓꽃 품종 예측하기_사이킷런

티스토리툴바

단축키

내 블로그

블로그 게시글

모든 영역

	sepal length (cm)	sepal width (cm)	petal length (cm)	petal width (cm)
0	5.1	3.5	1.4	0.2
1	4.9	3.0	1.4	0.2
2	4.7	3.2	1.3	0.2
3	4.6	3.1	1.5	0.2
4	5.0	3.6	1.4	0.2

	sepal length (cm)	sepal width (cm)	petal length (cm)	petal width (cm)
0	5.1	3.5	1.4	0.2
1	4.9	3.0	1.4	0.2
2	4.7	3.2	1.3	0.2
3	4.6	3.1	1.5	0.2
4	5.0	3.6	1.4	0.2

	sepal length (cm)	sepal width (cm)	petal length (cm)	petal width (cm)
0	5.1	3.5	1.4	0.2
1	4.9	3.0	1.4	0.2
2	4.7	3.2	1.3	0.2
3	4.6	3.1	1.5	0.2
4	5.0	3.6	1.4	0.2