라이브러리와 app 데이터 세트 로딩¶
In [160]:
from IPython.core.display import display, HTML
display(HTML("<style>.container {width:90% !important;}</style>"))
C:\Users\82105\AppData\Local\Temp\ipykernel_19812\2892539641.py:1: DeprecationWarning: Importing display from IPython.core.display is deprecated since IPython 7.14, please import from IPython display
from IPython.core.display import display, HTML
In [ ]:
import numpy as np
import pandas as pd
import gc
import time
import matplotlib.pyplot as plt
import seaborn as sns
#import warning
%matplotlib inline
#warning.ignorewarning(...)
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 200)
In [120]:
app_train = pd.read_csv('데이터셋/home-credit-default-risk/application_train.csv')
app_test = pd.read_csv('데이터셋/home-credit-default-risk/application_test.csv')
In [4]:
app_train.head()
Out[4]:
SK_ID_CURR | TARGET | NAME_CONTRACT_TYPE | CODE_GENDER | FLAG_OWN_CAR | FLAG_OWN_REALTY | CNT_CHILDREN | AMT_INCOME_TOTAL | AMT_CREDIT | AMT_ANNUITY | AMT_GOODS_PRICE | NAME_TYPE_SUITE | NAME_INCOME_TYPE | NAME_EDUCATION_TYPE | NAME_FAMILY_STATUS | NAME_HOUSING_TYPE | REGION_POPULATION_RELATIVE | DAYS_BIRTH | DAYS_EMPLOYED | DAYS_REGISTRATION | DAYS_ID_PUBLISH | OWN_CAR_AGE | FLAG_MOBIL | FLAG_EMP_PHONE | FLAG_WORK_PHONE | FLAG_CONT_MOBILE | FLAG_PHONE | FLAG_EMAIL | OCCUPATION_TYPE | CNT_FAM_MEMBERS | REGION_RATING_CLIENT | REGION_RATING_CLIENT_W_CITY | WEEKDAY_APPR_PROCESS_START | HOUR_APPR_PROCESS_START | REG_REGION_NOT_LIVE_REGION | REG_REGION_NOT_WORK_REGION | LIVE_REGION_NOT_WORK_REGION | REG_CITY_NOT_LIVE_CITY | REG_CITY_NOT_WORK_CITY | LIVE_CITY_NOT_WORK_CITY | ORGANIZATION_TYPE | EXT_SOURCE_1 | EXT_SOURCE_2 | EXT_SOURCE_3 | APARTMENTS_AVG | BASEMENTAREA_AVG | YEARS_BEGINEXPLUATATION_AVG | YEARS_BUILD_AVG | COMMONAREA_AVG | ELEVATORS_AVG | ENTRANCES_AVG | FLOORSMAX_AVG | FLOORSMIN_AVG | LANDAREA_AVG | LIVINGAPARTMENTS_AVG | LIVINGAREA_AVG | NONLIVINGAPARTMENTS_AVG | NONLIVINGAREA_AVG | APARTMENTS_MODE | BASEMENTAREA_MODE | YEARS_BEGINEXPLUATATION_MODE | YEARS_BUILD_MODE | COMMONAREA_MODE | ELEVATORS_MODE | ENTRANCES_MODE | FLOORSMAX_MODE | FLOORSMIN_MODE | LANDAREA_MODE | LIVINGAPARTMENTS_MODE | LIVINGAREA_MODE | NONLIVINGAPARTMENTS_MODE | NONLIVINGAREA_MODE | APARTMENTS_MEDI | BASEMENTAREA_MEDI | YEARS_BEGINEXPLUATATION_MEDI | YEARS_BUILD_MEDI | COMMONAREA_MEDI | ELEVATORS_MEDI | ENTRANCES_MEDI | FLOORSMAX_MEDI | FLOORSMIN_MEDI | LANDAREA_MEDI | LIVINGAPARTMENTS_MEDI | LIVINGAREA_MEDI | NONLIVINGAPARTMENTS_MEDI | NONLIVINGAREA_MEDI | FONDKAPREMONT_MODE | HOUSETYPE_MODE | TOTALAREA_MODE | WALLSMATERIAL_MODE | EMERGENCYSTATE_MODE | OBS_30_CNT_SOCIAL_CIRCLE | DEF_30_CNT_SOCIAL_CIRCLE | OBS_60_CNT_SOCIAL_CIRCLE | DEF_60_CNT_SOCIAL_CIRCLE | DAYS_LAST_PHONE_CHANGE | FLAG_DOCUMENT_2 | FLAG_DOCUMENT_3 | FLAG_DOCUMENT_4 | FLAG_DOCUMENT_5 | FLAG_DOCUMENT_6 | FLAG_DOCUMENT_7 | FLAG_DOCUMENT_8 | FLAG_DOCUMENT_9 | FLAG_DOCUMENT_10 | FLAG_DOCUMENT_11 | FLAG_DOCUMENT_12 | FLAG_DOCUMENT_13 | FLAG_DOCUMENT_14 | FLAG_DOCUMENT_15 | FLAG_DOCUMENT_16 | FLAG_DOCUMENT_17 | FLAG_DOCUMENT_18 | FLAG_DOCUMENT_19 | FLAG_DOCUMENT_20 | FLAG_DOCUMENT_21 | AMT_REQ_CREDIT_BUREAU_HOUR | AMT_REQ_CREDIT_BUREAU_DAY | AMT_REQ_CREDIT_BUREAU_WEEK | AMT_REQ_CREDIT_BUREAU_MON | AMT_REQ_CREDIT_BUREAU_QRT | AMT_REQ_CREDIT_BUREAU_YEAR | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 100002 | 1 | Cash loans | M | N | Y | 0 | 202500.0 | 406597.5 | 24700.5 | 351000.0 | Unaccompanied | Working | Secondary / secondary special | Single / not married | House / apartment | 0.018801 | -9461 | -637 | -3648.0 | -2120 | NaN | 1 | 1 | 0 | 1 | 1 | 0 | Laborers | 1.0 | 2 | 2 | WEDNESDAY | 10 | 0 | 0 | 0 | 0 | 0 | 0 | Business Entity Type 3 | 0.083037 | 0.262949 | 0.139376 | 0.0247 | 0.0369 | 0.9722 | 0.6192 | 0.0143 | 0.00 | 0.0690 | 0.0833 | 0.1250 | 0.0369 | 0.0202 | 0.0190 | 0.0000 | 0.0000 | 0.0252 | 0.0383 | 0.9722 | 0.6341 | 0.0144 | 0.0000 | 0.0690 | 0.0833 | 0.1250 | 0.0377 | 0.022 | 0.0198 | 0.0 | 0.0 | 0.0250 | 0.0369 | 0.9722 | 0.6243 | 0.0144 | 0.00 | 0.0690 | 0.0833 | 0.1250 | 0.0375 | 0.0205 | 0.0193 | 0.0000 | 0.00 | reg oper account | block of flats | 0.0149 | Stone, brick | No | 2.0 | 2.0 | 2.0 | 2.0 | -1134.0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
1 | 100003 | 0 | Cash loans | F | N | N | 0 | 270000.0 | 1293502.5 | 35698.5 | 1129500.0 | Family | State servant | Higher education | Married | House / apartment | 0.003541 | -16765 | -1188 | -1186.0 | -291 | NaN | 1 | 1 | 0 | 1 | 1 | 0 | Core staff | 2.0 | 1 | 1 | MONDAY | 11 | 0 | 0 | 0 | 0 | 0 | 0 | School | 0.311267 | 0.622246 | NaN | 0.0959 | 0.0529 | 0.9851 | 0.7960 | 0.0605 | 0.08 | 0.0345 | 0.2917 | 0.3333 | 0.0130 | 0.0773 | 0.0549 | 0.0039 | 0.0098 | 0.0924 | 0.0538 | 0.9851 | 0.8040 | 0.0497 | 0.0806 | 0.0345 | 0.2917 | 0.3333 | 0.0128 | 0.079 | 0.0554 | 0.0 | 0.0 | 0.0968 | 0.0529 | 0.9851 | 0.7987 | 0.0608 | 0.08 | 0.0345 | 0.2917 | 0.3333 | 0.0132 | 0.0787 | 0.0558 | 0.0039 | 0.01 | reg oper account | block of flats | 0.0714 | Block | No | 1.0 | 0.0 | 1.0 | 0.0 | -828.0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
2 | 100004 | 0 | Revolving loans | M | Y | Y | 0 | 67500.0 | 135000.0 | 6750.0 | 135000.0 | Unaccompanied | Working | Secondary / secondary special | Single / not married | House / apartment | 0.010032 | -19046 | -225 | -4260.0 | -2531 | 26.0 | 1 | 1 | 1 | 1 | 1 | 0 | Laborers | 1.0 | 2 | 2 | MONDAY | 9 | 0 | 0 | 0 | 0 | 0 | 0 | Government | NaN | 0.555912 | 0.729567 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0.0 | 0.0 | 0.0 | 0.0 | -815.0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
3 | 100006 | 0 | Cash loans | F | N | Y | 0 | 135000.0 | 312682.5 | 29686.5 | 297000.0 | Unaccompanied | Working | Secondary / secondary special | Civil marriage | House / apartment | 0.008019 | -19005 | -3039 | -9833.0 | -2437 | NaN | 1 | 1 | 0 | 1 | 0 | 0 | Laborers | 2.0 | 2 | 2 | WEDNESDAY | 17 | 0 | 0 | 0 | 0 | 0 | 0 | Business Entity Type 3 | NaN | 0.650442 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 2.0 | 0.0 | 2.0 | 0.0 | -617.0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | NaN | NaN | NaN | NaN | NaN | NaN |
4 | 100007 | 0 | Cash loans | M | N | Y | 0 | 121500.0 | 513000.0 | 21865.5 | 513000.0 | Unaccompanied | Working | Secondary / secondary special | Single / not married | House / apartment | 0.028663 | -19932 | -3038 | -4311.0 | -3458 | NaN | 1 | 1 | 0 | 1 | 0 | 0 | Core staff | 1.0 | 2 | 2 | THURSDAY | 11 | 0 | 0 | 0 | 0 | 1 | 1 | Religion | NaN | 0.322738 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0.0 | 0.0 | 0.0 | 0.0 | -1106.0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
In [121]:
app_train.shape, app_test.shape
Out[121]:
((307511, 122), (48744, 121))
TARGET 값 분포 및 AMT_INCOME_TOTAL 값 Histogram¶
- TARGET값 별 분포도, Pandas, Matplotlib, Seaborn으로 histogram 표현
In [6]:
app_train['TARGET'].value_counts()
Out[6]:
0 282686
1 24825
Name: TARGET, dtype: int64
In [43]:
# AMT_CREDIT _ Credit amount of the loan, 대출 총액
app_train['AMT_CREDIT'].hist()
#app_train[app_train['AMT_CREDIT'] == 4050000]
#test_hist['AMT_CREDIT'].hist()
#히스토그램 y축은 금액에 해당하는 행 수
#x축은 금액을 소수점 단위로 변환해서 비율
Out[43]:
<AxesSubplot:>
In [8]:
# AMT_INCOME_TOTAL == 수입
plt.hist(app_train['AMT_INCOME_TOTAL'])
Out[8]:
(array([3.07508e+05, 2.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,
0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 1.00000e+00]),
array([2.56500000e+04, 1.17230850e+07, 2.34205200e+07, 3.51179550e+07,
4.68153900e+07, 5.85128250e+07, 7.02102600e+07, 8.19076950e+07,
9.36051300e+07, 1.05302565e+08, 1.17000000e+08]),
<BarContainer object of 10 artists>)
In [46]:
# 수입
sns.distplot(app_train['AMT_INCOME_TOTAL'])
C:\Users\82105\anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).
warnings.warn(msg, FutureWarning)
Out[46]:
<AxesSubplot:xlabel='AMT_INCOME_TOTAL', ylabel='Density'>
In [11]:
# 수입
sns.boxplot(app_train['AMT_INCOME_TOTAL'])
C:\Users\82105\anaconda3\lib\site-packages\seaborn\_decorators.py:36: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
warnings.warn(
Out[11]:
<AxesSubplot:xlabel='AMT_INCOME_TOTAL'>
AMT_INCOME_TOTAL이 1000000 이하인 값에 대한 분포도¶
- boolean indexing으로 filtering 후 histogram 표현
In [ ]:
# boolean indexing 으로 filtering 적용
# AMT_INCOME_TOTAL == 수입
app_train[app_train['AMT_INCOME_TOTAL'] < 1000000]['AMT_INCOME_TOTAL'].hist()
#app_train[app_train['AMT_INCOME_TOTAL'] < 1000000].sum()
In [13]:
# distplot으로 histogram 표현
# 대출 총액
sns.distplot(app_train[app_train['AMT_CREDIT'] < 1000000]['AMT_CREDIT'])
C:\Users\82105\anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).
warnings.warn(msg, FutureWarning)
Out[13]:
<AxesSubplot:xlabel='AMT_CREDIT', ylabel='Density'>
TARGET 값에 따른 AMT_INCOME_TOTAL값 분포도 비교¶
- distplot과 violinplot 시각화
- plt.subplots() 기반으로 seaborn의 distplot과 violinplot으로 분포도 비교 시각화
In [50]:
# TARGET값에 따른 Filtering 조건 각각 설정.
cond1 = (app_train['TARGET'] == 1) # 그짓말쟁이
cond0 = (app_train['TARGET'] == 0) # 착한 사람
# AMT_INCOME_TOTAL_(수입)은 매우 큰 값이 있으므로 이는 제외.
cond_amt = (app_train['AMT_INCOME_TOTAL'] < 500000)
# distplot으로 TARGET=1이면 빨간색으로, 0이면 푸른색으로 Histogram 표현
#cond_amt을 추가 안하면 y축이 50이상 가장 큰값까지 나오기 떄문에 제한을 두었다
sns.distplot(app_train[cond0 & cond_amt]['AMT_INCOME_TOTAL'], label='0', color='blue')
sns.distplot(app_train[cond1 & cond_amt]['AMT_INCOME_TOTAL'], label='1', color='red')
C:\Users\82105\anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).
warnings.warn(msg, FutureWarning)
C:\Users\82105\anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).
warnings.warn(msg, FutureWarning)
Out[50]:
<AxesSubplot:xlabel='AMT_INCOME_TOTAL', ylabel='Density'>
In [15]:
# violinplot을 이용하면 Category 값별로 연속형 값의 분포도를 알수 있음. x는 category컬럼, y는 연속형 컬럼
sns.violinplot(x='TARGET', y='AMT_INCOME_TOTAL', data=app_train[cond_amt]) # 수입이 500000 이하인것만
Out[15]:
<AxesSubplot:xlabel='TARGET', ylabel='AMT_INCOME_TOTAL'>
In [51]:
# 2개의 subplot을 생성
fig, axs = plt.subplots(figsize=(12, 4), nrows=1, ncols=2)
In [53]:
# TARGET 값 유형에 따른 Boolean Indexing 조건
cond1 = (app_train['TARGET'] == 1)
cond0 = (app_train['TARGET'] == 0)
cond_amt = (app_train['AMT_INCOME_TOTAL'] < 500000)
# 2개의 subplot을 생성하고 왼쪽에는 violinplot을 오른쪽에는 distplot을 표현
fig, axs = plt.subplots(figsize=(12, 4), nrows=1, ncols=2, squeeze=False)
# violin plot을 왼쪽 subplot에 그림.
sns.violinplot(x='TARGET', y='AMT_INCOME_TOTAL', data=app_train[cond_amt], ax=axs[0][0] )
# Histogram을 오른쪽 subplot에 그림.
sns.distplot(app_train[cond0 & cond_amt]['AMT_INCOME_TOTAL'], ax=axs[0][1], label='0', color='blue')
sns.distplot(app_train[cond1 & cond_amt]['AMT_INCOME_TOTAL'], ax=axs[0][1], label='1', color='red')
C:\Users\82105\anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).
warnings.warn(msg, FutureWarning)
C:\Users\82105\anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).
warnings.warn(msg, FutureWarning)
Out[53]:
<AxesSubplot:xlabel='AMT_INCOME_TOTAL', ylabel='Density'>
In [ ]:
##위에 코드 함수화
In [56]:
def show_column_hist_by_target(df, column, is_amt=False):
cond1 = (df['TARGET'] == 1)
cond0 = (df['TARGET'] == 0)
fig, axs = plt.subplots(figsize=(12, 4), nrows=1, ncols=2, squeeze=False)
# is_amt가 True이면 < 500000 조건으로 filtering
cond_amt = True
if is_amt:
cond_amt = df[column] < 500000
sns.violinplot(x='TARGET', y=column, data=df[cond_amt], ax=axs[0][0] )
sns.distplot(df[cond0 & cond_amt][column], ax=axs[0][1], label='0', color='blue')
sns.distplot(df[cond1 & cond_amt][column], ax=axs[0][1], label='1', color='red')
show_column_hist_by_target(app_train, 'AMT_CREDIT', is_amt=True)
C:\Users\82105\anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).
warnings.warn(msg, FutureWarning)
C:\Users\82105\anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).
warnings.warn(msg, FutureWarning)
app_train과 app_test를 합쳐서 한번에 데이터 preprocessing 수행.¶
In [122]:
app_train.shape, app_test.shape
Out[122]:
((307511, 122), (48744, 121))
In [123]:
# pandas의 concat()을 이용하여 app_train과 app_test를 결합
apps = pd.concat([app_train, app_test])
apps.shape
Out[123]:
(356255, 122)
In [124]:
#널값이 있다.
#위에 356255보다 작다
# app_test셋은 테스트 셋이기 떄문에 tarfet컬럼이 없다.
apps['TARGET'].value_counts().sum()
Out[124]:
307511
In [125]:
# app_train의 TARGET 값을 Null로 입력됨.
apps['TARGET'].value_counts(dropna=False)
#target컬럼의 범주별 수와 널값을 한번에 알수 있다.
Out[125]:
0.0 282686
NaN 48744
1.0 24825
Name: TARGET, dtype: int64
In [126]:
# 널을 안없애는 이유는 test셋을 이용하기 위함이다.
#test셋은 target컬럼을 제외한 컬럼들만 존재하기 때문에
# train 셋을 통해 학습시켜 test셋을 예측 하려 한다,
apps['TARGET'].isnull().sum()
Out[126]:
48744
Object feature들을 Label Encoding¶
- pandas의 factorize()를 이용
In [99]:
apps.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 356255 entries, 0 to 48743
Columns: 122 entries, SK_ID_CURR to AMT_REQ_CREDIT_BUREAU_YEAR
dtypes: float64(66), int64(40), object(16)
memory usage: 334.3+ MB
In [127]:
len(pd.factorize(apps['CODE_GENDER'])[1])
# [1]은 인코딩이 안된 것을 모아둔거 같다 길이도 3밖에 되지 않는다.
Out[127]:
3
In [129]:
# pdapps.factorize()는 편리하게 Category(범주형) 컬럼을 Label인코딩 수행.
# pd.factorize(Category컬럼 Series)는 Label인코딩된 Series와 uniq한 Category값을 반환함. [0]을 이용하여 Label인코딩 Series만 취함.
#0과 1로 인코딩된 시리즈만 옭기기
# CODE_GENDER == 성별
apps['CODE_GENDER'] = pd.factorize(apps['CODE_GENDER'])[0]
# 문자형 범주형이 CODE_GENDER뿐이다.
In [130]:
pd.factorize(apps['CODE_GENDER'])#두개를 반환한다..그래서 첫번째만 반환해라 해서, [0]을 넣는다.
# 2가 있는것은 성별 범주가 [F/M/XNA] 3가지라서 그렇다
Out[130]:
(array([0, 1, 0, ..., 1, 0, 1], dtype=int64),
Int64Index([0, 1, 2], dtype='int64'))
In [131]:
apps['CODE_GENDER'].value_counts()
Out[131]:
1 235126
0 121125
2 4
Name: CODE_GENDER, dtype: int64
In [132]:
apps.info()
#오브젝트 컬럼이 하나 줄었다,
#int는 1 추가 되었다.
<class 'pandas.core.frame.DataFrame'>
Int64Index: 356255 entries, 0 to 48743
Columns: 122 entries, SK_ID_CURR to AMT_REQ_CREDIT_BUREAU_YEAR
dtypes: float64(66), int64(41), object(15)
memory usage: 334.3+ MB
In [135]:
# Label 인코딩을 위해 object 유형의 컬럼만 추출
object_columns = apps.dtypes[apps.dtypes == 'object'].index.tolist()
# 범주형 문자컬럼이 아닌 컬럼들을 한번에 레이블화 시켜본다
In [136]:
object_columns
# 이렇게 많은 문자형 컬럼이 있었다
Out[136]:
['NAME_CONTRACT_TYPE',
'FLAG_OWN_CAR',
'FLAG_OWN_REALTY',
'NAME_TYPE_SUITE',
'NAME_INCOME_TYPE',
'NAME_EDUCATION_TYPE',
'NAME_FAMILY_STATUS',
'NAME_HOUSING_TYPE',
'OCCUPATION_TYPE',
'WEEKDAY_APPR_PROCESS_START',
'ORGANIZATION_TYPE',
'FONDKAPREMONT_MODE',
'HOUSETYPE_MODE',
'WALLSMATERIAL_MODE',
'EMERGENCYSTATE_MODE']
In [137]:
# pd.factorize()는 한개의 컬럼만 Label 인코딩이 가능하므로 object형 컬럼들을 iteration하면서 변환 수행.
# 문자형 컬럼들을 인트로 lable화 시켰다
#factorize 함수는 factorize와 다르게 한번에 인코딩이 안대서 하나씩 해야 하므로 for문을 썻다.
for column in object_columns:
apps[column] = pd.factorize(apps[column])[0]
In [139]:
apps.head()
# 컬럼에 문자가 없다.
Out[139]:
SK_ID_CURR | TARGET | NAME_CONTRACT_TYPE | CODE_GENDER | FLAG_OWN_CAR | FLAG_OWN_REALTY | CNT_CHILDREN | AMT_INCOME_TOTAL | AMT_CREDIT | AMT_ANNUITY | AMT_GOODS_PRICE | NAME_TYPE_SUITE | NAME_INCOME_TYPE | NAME_EDUCATION_TYPE | NAME_FAMILY_STATUS | NAME_HOUSING_TYPE | REGION_POPULATION_RELATIVE | DAYS_BIRTH | DAYS_EMPLOYED | DAYS_REGISTRATION | DAYS_ID_PUBLISH | OWN_CAR_AGE | FLAG_MOBIL | FLAG_EMP_PHONE | FLAG_WORK_PHONE | FLAG_CONT_MOBILE | FLAG_PHONE | FLAG_EMAIL | OCCUPATION_TYPE | CNT_FAM_MEMBERS | REGION_RATING_CLIENT | REGION_RATING_CLIENT_W_CITY | WEEKDAY_APPR_PROCESS_START | HOUR_APPR_PROCESS_START | REG_REGION_NOT_LIVE_REGION | REG_REGION_NOT_WORK_REGION | LIVE_REGION_NOT_WORK_REGION | REG_CITY_NOT_LIVE_CITY | REG_CITY_NOT_WORK_CITY | LIVE_CITY_NOT_WORK_CITY | ORGANIZATION_TYPE | EXT_SOURCE_1 | EXT_SOURCE_2 | EXT_SOURCE_3 | APARTMENTS_AVG | BASEMENTAREA_AVG | YEARS_BEGINEXPLUATATION_AVG | YEARS_BUILD_AVG | COMMONAREA_AVG | ELEVATORS_AVG | ENTRANCES_AVG | FLOORSMAX_AVG | FLOORSMIN_AVG | LANDAREA_AVG | LIVINGAPARTMENTS_AVG | LIVINGAREA_AVG | NONLIVINGAPARTMENTS_AVG | NONLIVINGAREA_AVG | APARTMENTS_MODE | BASEMENTAREA_MODE | YEARS_BEGINEXPLUATATION_MODE | YEARS_BUILD_MODE | COMMONAREA_MODE | ELEVATORS_MODE | ENTRANCES_MODE | FLOORSMAX_MODE | FLOORSMIN_MODE | LANDAREA_MODE | LIVINGAPARTMENTS_MODE | LIVINGAREA_MODE | NONLIVINGAPARTMENTS_MODE | NONLIVINGAREA_MODE | APARTMENTS_MEDI | BASEMENTAREA_MEDI | YEARS_BEGINEXPLUATATION_MEDI | YEARS_BUILD_MEDI | COMMONAREA_MEDI | ELEVATORS_MEDI | ENTRANCES_MEDI | FLOORSMAX_MEDI | FLOORSMIN_MEDI | LANDAREA_MEDI | LIVINGAPARTMENTS_MEDI | LIVINGAREA_MEDI | NONLIVINGAPARTMENTS_MEDI | NONLIVINGAREA_MEDI | FONDKAPREMONT_MODE | HOUSETYPE_MODE | TOTALAREA_MODE | WALLSMATERIAL_MODE | EMERGENCYSTATE_MODE | OBS_30_CNT_SOCIAL_CIRCLE | DEF_30_CNT_SOCIAL_CIRCLE | OBS_60_CNT_SOCIAL_CIRCLE | DEF_60_CNT_SOCIAL_CIRCLE | DAYS_LAST_PHONE_CHANGE | FLAG_DOCUMENT_2 | FLAG_DOCUMENT_3 | FLAG_DOCUMENT_4 | FLAG_DOCUMENT_5 | FLAG_DOCUMENT_6 | FLAG_DOCUMENT_7 | FLAG_DOCUMENT_8 | FLAG_DOCUMENT_9 | FLAG_DOCUMENT_10 | FLAG_DOCUMENT_11 | FLAG_DOCUMENT_12 | FLAG_DOCUMENT_13 | FLAG_DOCUMENT_14 | FLAG_DOCUMENT_15 | FLAG_DOCUMENT_16 | FLAG_DOCUMENT_17 | FLAG_DOCUMENT_18 | FLAG_DOCUMENT_19 | FLAG_DOCUMENT_20 | FLAG_DOCUMENT_21 | AMT_REQ_CREDIT_BUREAU_HOUR | AMT_REQ_CREDIT_BUREAU_DAY | AMT_REQ_CREDIT_BUREAU_WEEK | AMT_REQ_CREDIT_BUREAU_MON | AMT_REQ_CREDIT_BUREAU_QRT | AMT_REQ_CREDIT_BUREAU_YEAR | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 100002 | 1.0 | 0 | 0 | 0 | 0 | 0 | 202500.0 | 406597.5 | 24700.5 | 351000.0 | 0 | 0 | 0 | 0 | 0 | 0.018801 | -9461 | -637 | -3648.0 | -2120 | NaN | 1 | 1 | 0 | 1 | 1 | 0 | 0 | 1.0 | 2 | 2 | 0 | 10 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.083037 | 0.262949 | 0.139376 | 0.0247 | 0.0369 | 0.9722 | 0.6192 | 0.0143 | 0.00 | 0.0690 | 0.0833 | 0.1250 | 0.0369 | 0.0202 | 0.0190 | 0.0000 | 0.0000 | 0.0252 | 0.0383 | 0.9722 | 0.6341 | 0.0144 | 0.0000 | 0.0690 | 0.0833 | 0.1250 | 0.0377 | 0.022 | 0.0198 | 0.0 | 0.0 | 0.0250 | 0.0369 | 0.9722 | 0.6243 | 0.0144 | 0.00 | 0.0690 | 0.0833 | 0.1250 | 0.0375 | 0.0205 | 0.0193 | 0.0000 | 0.00 | 0 | 0 | 0.0149 | 0 | 0 | 2.0 | 2.0 | 2.0 | 2.0 | -1134.0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
1 | 100003 | 0.0 | 0 | 1 | 0 | 1 | 0 | 270000.0 | 1293502.5 | 35698.5 | 1129500.0 | 1 | 1 | 1 | 1 | 0 | 0.003541 | -16765 | -1188 | -1186.0 | -291 | NaN | 1 | 1 | 0 | 1 | 1 | 0 | 1 | 2.0 | 1 | 1 | 1 | 11 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.311267 | 0.622246 | NaN | 0.0959 | 0.0529 | 0.9851 | 0.7960 | 0.0605 | 0.08 | 0.0345 | 0.2917 | 0.3333 | 0.0130 | 0.0773 | 0.0549 | 0.0039 | 0.0098 | 0.0924 | 0.0538 | 0.9851 | 0.8040 | 0.0497 | 0.0806 | 0.0345 | 0.2917 | 0.3333 | 0.0128 | 0.079 | 0.0554 | 0.0 | 0.0 | 0.0968 | 0.0529 | 0.9851 | 0.7987 | 0.0608 | 0.08 | 0.0345 | 0.2917 | 0.3333 | 0.0132 | 0.0787 | 0.0558 | 0.0039 | 0.01 | 0 | 0 | 0.0714 | 1 | 0 | 1.0 | 0.0 | 1.0 | 0.0 | -828.0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
2 | 100004 | 0.0 | 1 | 0 | 1 | 0 | 0 | 67500.0 | 135000.0 | 6750.0 | 135000.0 | 0 | 0 | 0 | 0 | 0 | 0.010032 | -19046 | -225 | -4260.0 | -2531 | 26.0 | 1 | 1 | 1 | 1 | 1 | 0 | 0 | 1.0 | 2 | 2 | 1 | 9 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | NaN | 0.555912 | 0.729567 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | -1 | -1 | NaN | -1 | -1 | 0.0 | 0.0 | 0.0 | 0.0 | -815.0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
3 | 100006 | 0.0 | 0 | 1 | 0 | 0 | 0 | 135000.0 | 312682.5 | 29686.5 | 297000.0 | 0 | 0 | 0 | 2 | 0 | 0.008019 | -19005 | -3039 | -9833.0 | -2437 | NaN | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 2.0 | 2 | 2 | 0 | 17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | NaN | 0.650442 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | -1 | -1 | NaN | -1 | -1 | 2.0 | 0.0 | 2.0 | 0.0 | -617.0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | NaN | NaN | NaN | NaN | NaN | NaN |
4 | 100007 | 0.0 | 0 | 0 | 0 | 0 | 0 | 121500.0 | 513000.0 | 21865.5 | 513000.0 | 0 | 0 | 0 | 0 | 0 | 0.028663 | -19932 | -3038 | -4311.0 | -3458 | NaN | 1 | 1 | 0 | 1 | 0 | 0 | 1 | 1.0 | 2 | 2 | 2 | 11 | 0 | 0 | 0 | 0 | 1 | 1 | 3 | NaN | 0.322738 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | -1 | -1 | NaN | -1 | -1 | 0.0 | 0.0 | 0.0 | 0.0 | -1106.0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
Null값 일괄 변환¶
In [140]:
apps.isnull().sum().head(100)
#은행가면 형관펜 표시된것만 적는것과 같다,
#중요하지 않은것은 널값이 많다.
Out[140]:
SK_ID_CURR 0
TARGET 48744
NAME_CONTRACT_TYPE 0
CODE_GENDER 0
FLAG_OWN_CAR 0
FLAG_OWN_REALTY 0
CNT_CHILDREN 0
AMT_INCOME_TOTAL 0
AMT_CREDIT 0
AMT_ANNUITY 36
AMT_GOODS_PRICE 278
NAME_TYPE_SUITE 0
NAME_INCOME_TYPE 0
NAME_EDUCATION_TYPE 0
NAME_FAMILY_STATUS 0
NAME_HOUSING_TYPE 0
REGION_POPULATION_RELATIVE 0
DAYS_BIRTH 0
DAYS_EMPLOYED 0
DAYS_REGISTRATION 0
DAYS_ID_PUBLISH 0
OWN_CAR_AGE 235241
FLAG_MOBIL 0
FLAG_EMP_PHONE 0
FLAG_WORK_PHONE 0
FLAG_CONT_MOBILE 0
FLAG_PHONE 0
FLAG_EMAIL 0
OCCUPATION_TYPE 0
CNT_FAM_MEMBERS 2
REGION_RATING_CLIENT 0
REGION_RATING_CLIENT_W_CITY 0
WEEKDAY_APPR_PROCESS_START 0
HOUR_APPR_PROCESS_START 0
REG_REGION_NOT_LIVE_REGION 0
REG_REGION_NOT_WORK_REGION 0
LIVE_REGION_NOT_WORK_REGION 0
REG_CITY_NOT_LIVE_CITY 0
REG_CITY_NOT_WORK_CITY 0
LIVE_CITY_NOT_WORK_CITY 0
ORGANIZATION_TYPE 0
EXT_SOURCE_1 193910
EXT_SOURCE_2 668
EXT_SOURCE_3 69633
APARTMENTS_AVG 179948
BASEMENTAREA_AVG 207584
YEARS_BEGINEXPLUATATION_AVG 172863
YEARS_BUILD_AVG 236306
COMMONAREA_AVG 248360
ELEVATORS_AVG 189080
ENTRANCES_AVG 178407
FLOORSMAX_AVG 176341
FLOORSMIN_AVG 241108
LANDAREA_AVG 210844
LIVINGAPARTMENTS_AVG 242979
LIVINGAREA_AVG 177902
NONLIVINGAPARTMENTS_AVG 246861
NONLIVINGAREA_AVG 195766
APARTMENTS_MODE 179948
BASEMENTAREA_MODE 207584
YEARS_BEGINEXPLUATATION_MODE 172863
YEARS_BUILD_MODE 236306
COMMONAREA_MODE 248360
ELEVATORS_MODE 189080
ENTRANCES_MODE 178407
FLOORSMAX_MODE 176341
FLOORSMIN_MODE 241108
LANDAREA_MODE 210844
LIVINGAPARTMENTS_MODE 242979
LIVINGAREA_MODE 177902
NONLIVINGAPARTMENTS_MODE 246861
NONLIVINGAREA_MODE 195766
APARTMENTS_MEDI 179948
BASEMENTAREA_MEDI 207584
YEARS_BEGINEXPLUATATION_MEDI 172863
YEARS_BUILD_MEDI 236306
COMMONAREA_MEDI 248360
ELEVATORS_MEDI 189080
ENTRANCES_MEDI 178407
FLOORSMAX_MEDI 176341
FLOORSMIN_MEDI 241108
LANDAREA_MEDI 210844
LIVINGAPARTMENTS_MEDI 242979
LIVINGAREA_MEDI 177902
NONLIVINGAPARTMENTS_MEDI 246861
NONLIVINGAREA_MEDI 195766
FONDKAPREMONT_MODE 0
HOUSETYPE_MODE 0
TOTALAREA_MODE 171055
WALLSMATERIAL_MODE 0
EMERGENCYSTATE_MODE 0
OBS_30_CNT_SOCIAL_CIRCLE 1050
DEF_30_CNT_SOCIAL_CIRCLE 1050
OBS_60_CNT_SOCIAL_CIRCLE 1050
DEF_60_CNT_SOCIAL_CIRCLE 1050
DAYS_LAST_PHONE_CHANGE 1
FLAG_DOCUMENT_2 0
FLAG_DOCUMENT_3 0
FLAG_DOCUMENT_4 0
FLAG_DOCUMENT_5 0
dtype: int64
In [141]:
# -999로 모든 컬럼들의 Null값 변환
#모든 컬럼의 널값을 -999로 바꾼다.
apps = apps.fillna(-999)
In [142]:
apps.isnull().sum().head(100)
Out[142]:
SK_ID_CURR 0
TARGET 0
NAME_CONTRACT_TYPE 0
CODE_GENDER 0
FLAG_OWN_CAR 0
FLAG_OWN_REALTY 0
CNT_CHILDREN 0
AMT_INCOME_TOTAL 0
AMT_CREDIT 0
AMT_ANNUITY 0
AMT_GOODS_PRICE 0
NAME_TYPE_SUITE 0
NAME_INCOME_TYPE 0
NAME_EDUCATION_TYPE 0
NAME_FAMILY_STATUS 0
NAME_HOUSING_TYPE 0
REGION_POPULATION_RELATIVE 0
DAYS_BIRTH 0
DAYS_EMPLOYED 0
DAYS_REGISTRATION 0
DAYS_ID_PUBLISH 0
OWN_CAR_AGE 0
FLAG_MOBIL 0
FLAG_EMP_PHONE 0
FLAG_WORK_PHONE 0
FLAG_CONT_MOBILE 0
FLAG_PHONE 0
FLAG_EMAIL 0
OCCUPATION_TYPE 0
CNT_FAM_MEMBERS 0
REGION_RATING_CLIENT 0
REGION_RATING_CLIENT_W_CITY 0
WEEKDAY_APPR_PROCESS_START 0
HOUR_APPR_PROCESS_START 0
REG_REGION_NOT_LIVE_REGION 0
REG_REGION_NOT_WORK_REGION 0
LIVE_REGION_NOT_WORK_REGION 0
REG_CITY_NOT_LIVE_CITY 0
REG_CITY_NOT_WORK_CITY 0
LIVE_CITY_NOT_WORK_CITY 0
ORGANIZATION_TYPE 0
EXT_SOURCE_1 0
EXT_SOURCE_2 0
EXT_SOURCE_3 0
APARTMENTS_AVG 0
BASEMENTAREA_AVG 0
YEARS_BEGINEXPLUATATION_AVG 0
YEARS_BUILD_AVG 0
COMMONAREA_AVG 0
ELEVATORS_AVG 0
ENTRANCES_AVG 0
FLOORSMAX_AVG 0
FLOORSMIN_AVG 0
LANDAREA_AVG 0
LIVINGAPARTMENTS_AVG 0
LIVINGAREA_AVG 0
NONLIVINGAPARTMENTS_AVG 0
NONLIVINGAREA_AVG 0
APARTMENTS_MODE 0
BASEMENTAREA_MODE 0
YEARS_BEGINEXPLUATATION_MODE 0
YEARS_BUILD_MODE 0
COMMONAREA_MODE 0
ELEVATORS_MODE 0
ENTRANCES_MODE 0
FLOORSMAX_MODE 0
FLOORSMIN_MODE 0
LANDAREA_MODE 0
LIVINGAPARTMENTS_MODE 0
LIVINGAREA_MODE 0
NONLIVINGAPARTMENTS_MODE 0
NONLIVINGAREA_MODE 0
APARTMENTS_MEDI 0
BASEMENTAREA_MEDI 0
YEARS_BEGINEXPLUATATION_MEDI 0
YEARS_BUILD_MEDI 0
COMMONAREA_MEDI 0
ELEVATORS_MEDI 0
ENTRANCES_MEDI 0
FLOORSMAX_MEDI 0
FLOORSMIN_MEDI 0
LANDAREA_MEDI 0
LIVINGAPARTMENTS_MEDI 0
LIVINGAREA_MEDI 0
NONLIVINGAPARTMENTS_MEDI 0
NONLIVINGAREA_MEDI 0
FONDKAPREMONT_MODE 0
HOUSETYPE_MODE 0
TOTALAREA_MODE 0
WALLSMATERIAL_MODE 0
EMERGENCYSTATE_MODE 0
OBS_30_CNT_SOCIAL_CIRCLE 0
DEF_30_CNT_SOCIAL_CIRCLE 0
OBS_60_CNT_SOCIAL_CIRCLE 0
DEF_60_CNT_SOCIAL_CIRCLE 0
DAYS_LAST_PHONE_CHANGE 0
FLAG_DOCUMENT_2 0
FLAG_DOCUMENT_3 0
FLAG_DOCUMENT_4 0
FLAG_DOCUMENT_5 0
dtype: int64
학습 데이터와 테스트 데이터 다시 분리¶
In [143]:
# 왜 target컬럼의 널을 제거 하지 않았냐
# >>> target값이 널인 행을 가지고 돈을 값을 것인지 안값을 것인지를 예측하기 위해서다.
# app_test의 TARGET 컬럼은 원래 null이었는데 앞에서 fillna(-999)로 -999로 변환됨. 이를 추출함.
# TARGETdl -999값이 아닌 행만 모두 app_test로 넣기
app_train = apps[apps['TARGET'] != -999]
#TARGET이 -999값이 있는 행만 모두 app_test로 넣기
app_test = apps[apps['TARGET']== -999]
app_train.shape, app_test.shape
# app_train, app_test에 원래 있던 데이터는 없어진것 같다
# shape시 새로 넣은 데이터만 있다
Out[143]:
((307511, 122), (48744, 122))
In [144]:
# app_test의 TARGET컬럼을 Drop
app_test = app_test.drop('TARGET', axis=1)
# test셋이라 타켓값컬럼을 삭제
In [145]:
app_test.shape
Out[145]:
(48744, 121)
In [146]:
app_test.dtypes
Out[146]:
SK_ID_CURR int64
NAME_CONTRACT_TYPE int64
CODE_GENDER int64
FLAG_OWN_CAR int64
FLAG_OWN_REALTY int64
...
AMT_REQ_CREDIT_BUREAU_DAY float64
AMT_REQ_CREDIT_BUREAU_WEEK float64
AMT_REQ_CREDIT_BUREAU_MON float64
AMT_REQ_CREDIT_BUREAU_QRT float64
AMT_REQ_CREDIT_BUREAU_YEAR float64
Length: 121, dtype: object
학습 데이터를 검증 데이터로 분리하고 LGBM Classifier로 학습 수행.¶
- 피처용 데이터와 타겟 데이터 분리
- 학습용/검증용 데이터 세트 분리
In [147]:
app_train.head()
Out[147]:
SK_ID_CURR | TARGET | NAME_CONTRACT_TYPE | CODE_GENDER | FLAG_OWN_CAR | FLAG_OWN_REALTY | CNT_CHILDREN | AMT_INCOME_TOTAL | AMT_CREDIT | AMT_ANNUITY | AMT_GOODS_PRICE | NAME_TYPE_SUITE | NAME_INCOME_TYPE | NAME_EDUCATION_TYPE | NAME_FAMILY_STATUS | NAME_HOUSING_TYPE | REGION_POPULATION_RELATIVE | DAYS_BIRTH | DAYS_EMPLOYED | DAYS_REGISTRATION | DAYS_ID_PUBLISH | OWN_CAR_AGE | FLAG_MOBIL | FLAG_EMP_PHONE | FLAG_WORK_PHONE | FLAG_CONT_MOBILE | FLAG_PHONE | FLAG_EMAIL | OCCUPATION_TYPE | CNT_FAM_MEMBERS | REGION_RATING_CLIENT | REGION_RATING_CLIENT_W_CITY | WEEKDAY_APPR_PROCESS_START | HOUR_APPR_PROCESS_START | REG_REGION_NOT_LIVE_REGION | REG_REGION_NOT_WORK_REGION | LIVE_REGION_NOT_WORK_REGION | REG_CITY_NOT_LIVE_CITY | REG_CITY_NOT_WORK_CITY | LIVE_CITY_NOT_WORK_CITY | ORGANIZATION_TYPE | EXT_SOURCE_1 | EXT_SOURCE_2 | EXT_SOURCE_3 | APARTMENTS_AVG | BASEMENTAREA_AVG | YEARS_BEGINEXPLUATATION_AVG | YEARS_BUILD_AVG | COMMONAREA_AVG | ELEVATORS_AVG | ENTRANCES_AVG | FLOORSMAX_AVG | FLOORSMIN_AVG | LANDAREA_AVG | LIVINGAPARTMENTS_AVG | LIVINGAREA_AVG | NONLIVINGAPARTMENTS_AVG | NONLIVINGAREA_AVG | APARTMENTS_MODE | BASEMENTAREA_MODE | YEARS_BEGINEXPLUATATION_MODE | YEARS_BUILD_MODE | COMMONAREA_MODE | ELEVATORS_MODE | ENTRANCES_MODE | FLOORSMAX_MODE | FLOORSMIN_MODE | LANDAREA_MODE | LIVINGAPARTMENTS_MODE | LIVINGAREA_MODE | NONLIVINGAPARTMENTS_MODE | NONLIVINGAREA_MODE | APARTMENTS_MEDI | BASEMENTAREA_MEDI | YEARS_BEGINEXPLUATATION_MEDI | YEARS_BUILD_MEDI | COMMONAREA_MEDI | ELEVATORS_MEDI | ENTRANCES_MEDI | FLOORSMAX_MEDI | FLOORSMIN_MEDI | LANDAREA_MEDI | LIVINGAPARTMENTS_MEDI | LIVINGAREA_MEDI | NONLIVINGAPARTMENTS_MEDI | NONLIVINGAREA_MEDI | FONDKAPREMONT_MODE | HOUSETYPE_MODE | TOTALAREA_MODE | WALLSMATERIAL_MODE | EMERGENCYSTATE_MODE | OBS_30_CNT_SOCIAL_CIRCLE | DEF_30_CNT_SOCIAL_CIRCLE | OBS_60_CNT_SOCIAL_CIRCLE | DEF_60_CNT_SOCIAL_CIRCLE | DAYS_LAST_PHONE_CHANGE | FLAG_DOCUMENT_2 | FLAG_DOCUMENT_3 | FLAG_DOCUMENT_4 | FLAG_DOCUMENT_5 | FLAG_DOCUMENT_6 | FLAG_DOCUMENT_7 | FLAG_DOCUMENT_8 | FLAG_DOCUMENT_9 | FLAG_DOCUMENT_10 | FLAG_DOCUMENT_11 | FLAG_DOCUMENT_12 | FLAG_DOCUMENT_13 | FLAG_DOCUMENT_14 | FLAG_DOCUMENT_15 | FLAG_DOCUMENT_16 | FLAG_DOCUMENT_17 | FLAG_DOCUMENT_18 | FLAG_DOCUMENT_19 | FLAG_DOCUMENT_20 | FLAG_DOCUMENT_21 | AMT_REQ_CREDIT_BUREAU_HOUR | AMT_REQ_CREDIT_BUREAU_DAY | AMT_REQ_CREDIT_BUREAU_WEEK | AMT_REQ_CREDIT_BUREAU_MON | AMT_REQ_CREDIT_BUREAU_QRT | AMT_REQ_CREDIT_BUREAU_YEAR | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 100002 | 1.0 | 0 | 0 | 0 | 0 | 0 | 202500.0 | 406597.5 | 24700.5 | 351000.0 | 0 | 0 | 0 | 0 | 0 | 0.018801 | -9461 | -637 | -3648.0 | -2120 | -999.0 | 1 | 1 | 0 | 1 | 1 | 0 | 0 | 1.0 | 2 | 2 | 0 | 10 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.083037 | 0.262949 | 0.139376 | 0.0247 | 0.0369 | 0.9722 | 0.6192 | 0.0143 | 0.00 | 0.0690 | 0.0833 | 0.1250 | 0.0369 | 0.0202 | 0.0190 | 0.0000 | 0.0000 | 0.0252 | 0.0383 | 0.9722 | 0.6341 | 0.0144 | 0.0000 | 0.0690 | 0.0833 | 0.1250 | 0.0377 | 0.022 | 0.0198 | 0.0 | 0.0 | 0.0250 | 0.0369 | 0.9722 | 0.6243 | 0.0144 | 0.00 | 0.0690 | 0.0833 | 0.1250 | 0.0375 | 0.0205 | 0.0193 | 0.0000 | 0.00 | 0 | 0 | 0.0149 | 0 | 0 | 2.0 | 2.0 | 2.0 | 2.0 | -1134.0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
1 | 100003 | 0.0 | 0 | 1 | 0 | 1 | 0 | 270000.0 | 1293502.5 | 35698.5 | 1129500.0 | 1 | 1 | 1 | 1 | 0 | 0.003541 | -16765 | -1188 | -1186.0 | -291 | -999.0 | 1 | 1 | 0 | 1 | 1 | 0 | 1 | 2.0 | 1 | 1 | 1 | 11 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.311267 | 0.622246 | -999.000000 | 0.0959 | 0.0529 | 0.9851 | 0.7960 | 0.0605 | 0.08 | 0.0345 | 0.2917 | 0.3333 | 0.0130 | 0.0773 | 0.0549 | 0.0039 | 0.0098 | 0.0924 | 0.0538 | 0.9851 | 0.8040 | 0.0497 | 0.0806 | 0.0345 | 0.2917 | 0.3333 | 0.0128 | 0.079 | 0.0554 | 0.0 | 0.0 | 0.0968 | 0.0529 | 0.9851 | 0.7987 | 0.0608 | 0.08 | 0.0345 | 0.2917 | 0.3333 | 0.0132 | 0.0787 | 0.0558 | 0.0039 | 0.01 | 0 | 0 | 0.0714 | 1 | 0 | 1.0 | 0.0 | 1.0 | 0.0 | -828.0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
2 | 100004 | 0.0 | 1 | 0 | 1 | 0 | 0 | 67500.0 | 135000.0 | 6750.0 | 135000.0 | 0 | 0 | 0 | 0 | 0 | 0.010032 | -19046 | -225 | -4260.0 | -2531 | 26.0 | 1 | 1 | 1 | 1 | 1 | 0 | 0 | 1.0 | 2 | 2 | 1 | 9 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | -999.000000 | 0.555912 | 0.729567 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.00 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.000 | -999.0000 | -999.0 | -999.0 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.00 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.00 | -1 | -1 | -999.0000 | -1 | -1 | 0.0 | 0.0 | 0.0 | 0.0 | -815.0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
3 | 100006 | 0.0 | 0 | 1 | 0 | 0 | 0 | 135000.0 | 312682.5 | 29686.5 | 297000.0 | 0 | 0 | 0 | 2 | 0 | 0.008019 | -19005 | -3039 | -9833.0 | -2437 | -999.0 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 2.0 | 2 | 2 | 0 | 17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -999.000000 | 0.650442 | -999.000000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.00 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.000 | -999.0000 | -999.0 | -999.0 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.00 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.00 | -1 | -1 | -999.0000 | -1 | -1 | 2.0 | 0.0 | 2.0 | 0.0 | -617.0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -999.0 | -999.0 | -999.0 | -999.0 | -999.0 | -999.0 |
4 | 100007 | 0.0 | 0 | 0 | 0 | 0 | 0 | 121500.0 | 513000.0 | 21865.5 | 513000.0 | 0 | 0 | 0 | 0 | 0 | 0.028663 | -19932 | -3038 | -4311.0 | -3458 | -999.0 | 1 | 1 | 0 | 1 | 0 | 0 | 1 | 1.0 | 2 | 2 | 2 | 11 | 0 | 0 | 0 | 0 | 1 | 1 | 3 | -999.000000 | 0.322738 | -999.000000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.00 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.000 | -999.0000 | -999.0 | -999.0 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.00 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.00 | -1 | -1 | -999.0000 | -1 | -1 | 0.0 | 0.0 | 0.0 | 0.0 | -1106.0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
In [148]:
# SK_ID_CURR' == 대출 id , 'TARGET == 0,1 사기구분
# 학습시키기 위해서 트레인에는 타켓값과 타켓값과 유사한 SK_ID_CURR도 같이 제외
ftr_app = app_train.drop(['SK_ID_CURR', 'TARGET'], axis=1) #prime key값도 제외 시키자
target_app = app_train['TARGET'] # 사기인지 알수 있는 기준인 타켓값만 넣기
In [149]:
from sklearn.model_selection import train_test_split
train_x, valid_x, train_y, valid_y = train_test_split(ftr_app, target_app, test_size=0.3, random_state=2020)
train_x.shape, valid_x.shape
Out[149]:
((215257, 120), (92254, 120))
In [ ]:
In [150]:
from lightgbm import LGBMClassifier
clf = LGBMClassifier(
n_jobs=-1,
n_estimators=1000,
learning_rate=0.02,
num_leaves=32,
subsample=0.8,
max_depth=12,
silent=-1,
verbose=-1
)
clf.fit(train_x, train_y, eval_set=[(train_x, train_y), (valid_x, valid_y)],
eval_metric= 'auc', verbose= 100, early_stopping_rounds= 50)
# early_stopping_rounds 최고 성능이 나오고 50번 더 돌렸는데 더 좋은 성능이 안나오면 기존 최고 성능이 최고 성능이 된다
C:\Users\82105\anaconda3\lib\site-packages\lightgbm\sklearn.py:598: UserWarning: 'silent' argument is deprecated and will be removed in a future release of LightGBM. Pass 'verbose' parameter via keyword arguments instead.
_log_warning("'silent' argument is deprecated and will be removed in a future release of LightGBM. "
C:\Users\82105\anaconda3\lib\site-packages\lightgbm\sklearn.py:726: UserWarning: 'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.
_log_warning("'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. "
C:\Users\82105\anaconda3\lib\site-packages\lightgbm\sklearn.py:736: UserWarning: 'verbose' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.
_log_warning("'verbose' argument is deprecated and will be removed in a future release of LightGBM. "
[100] training's auc: 0.752205 training's binary_logloss: 0.250372 valid_1's auc: 0.744317 valid_1's binary_logloss: 0.251593
[200] training's auc: 0.771473 training's binary_logloss: 0.243554 valid_1's auc: 0.754053 valid_1's binary_logloss: 0.247539
[300] training's auc: 0.784885 training's binary_logloss: 0.239292 valid_1's auc: 0.757737 valid_1's binary_logloss: 0.246203
[400] training's auc: 0.796336 training's binary_logloss: 0.235948 valid_1's auc: 0.758946 valid_1's binary_logloss: 0.245732
[500] training's auc: 0.806016 training's binary_logloss: 0.233017 valid_1's auc: 0.759411 valid_1's binary_logloss: 0.24555
Out[150]:
LGBMClassifier(learning_rate=0.02, max_depth=12, n_estimators=1000,
num_leaves=32, silent=-1, subsample=0.8, verbose=-1)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LGBMClassifier(learning_rate=0.02, max_depth=12, n_estimators=1000,
num_leaves=32, silent=-1, subsample=0.8, verbose=-1)
Feature importance 시각화¶
In [151]:
from lightgbm import plot_importance
plot_importance(clf, figsize=(16, 32))
#EXT_SOURCE_3 Normalized score from external data source,normalized
# DAYS_BIRTH == 신청날 고객 나이
# EXT_SOURCE_3 == 정규화 점수
Out[151]:
<AxesSubplot:title={'center':'Feature importance'}, xlabel='Feature importance', ylabel='Features'>
학습된 Classifier를 이용하여 테스트 데이터을 예측하고 결과를 Kaggle로 Submit 수행.¶
In [152]:
#학습된 classifier의 predict_proba()를 이용하여 binary classification에서 1이될 확률만 추출
preds = clf.predict_proba(app_test.drop(['SK_ID_CURR'], axis=1))[:, 1 ]
In [153]:
clf.predict_proba(app_test.drop(['SK_ID_CURR'],axis=1)) #0과 1이 될 확룔이 두개다 표시가 된다.
Out[153]:
array([[0.97246502, 0.02753498],
[0.87904013, 0.12095987],
[0.98381082, 0.01618918],
...,
[0.96802646, 0.03197354],
[0.94283805, 0.05716195],
[0.82020099, 0.17979901]])
In [154]:
app_test['TARGET'] = preds
app_test['TARGET'].head(10)
Out[154]:
0 0.027535
1 0.120960
2 0.016189
3 0.037421
4 0.146077
5 0.036411
6 0.017445
7 0.041752
8 0.016933
9 0.087198
Name: TARGET, dtype: float64
In [155]:
# SK_ID_CURR과 TARGET 값만 csv 형태로 생성.
app_test[['SK_ID_CURR', 'TARGET']].to_csv('app_baseline_01.csv', index=False)
In [156]:
app_test.head()
Out[156]:
SK_ID_CURR | NAME_CONTRACT_TYPE | CODE_GENDER | FLAG_OWN_CAR | FLAG_OWN_REALTY | CNT_CHILDREN | AMT_INCOME_TOTAL | AMT_CREDIT | AMT_ANNUITY | AMT_GOODS_PRICE | NAME_TYPE_SUITE | NAME_INCOME_TYPE | NAME_EDUCATION_TYPE | NAME_FAMILY_STATUS | NAME_HOUSING_TYPE | REGION_POPULATION_RELATIVE | DAYS_BIRTH | DAYS_EMPLOYED | DAYS_REGISTRATION | DAYS_ID_PUBLISH | OWN_CAR_AGE | FLAG_MOBIL | FLAG_EMP_PHONE | FLAG_WORK_PHONE | FLAG_CONT_MOBILE | FLAG_PHONE | FLAG_EMAIL | OCCUPATION_TYPE | CNT_FAM_MEMBERS | REGION_RATING_CLIENT | REGION_RATING_CLIENT_W_CITY | WEEKDAY_APPR_PROCESS_START | HOUR_APPR_PROCESS_START | REG_REGION_NOT_LIVE_REGION | REG_REGION_NOT_WORK_REGION | LIVE_REGION_NOT_WORK_REGION | REG_CITY_NOT_LIVE_CITY | REG_CITY_NOT_WORK_CITY | LIVE_CITY_NOT_WORK_CITY | ORGANIZATION_TYPE | EXT_SOURCE_1 | EXT_SOURCE_2 | EXT_SOURCE_3 | APARTMENTS_AVG | BASEMENTAREA_AVG | YEARS_BEGINEXPLUATATION_AVG | YEARS_BUILD_AVG | COMMONAREA_AVG | ELEVATORS_AVG | ENTRANCES_AVG | FLOORSMAX_AVG | FLOORSMIN_AVG | LANDAREA_AVG | LIVINGAPARTMENTS_AVG | LIVINGAREA_AVG | NONLIVINGAPARTMENTS_AVG | NONLIVINGAREA_AVG | APARTMENTS_MODE | BASEMENTAREA_MODE | YEARS_BEGINEXPLUATATION_MODE | YEARS_BUILD_MODE | COMMONAREA_MODE | ELEVATORS_MODE | ENTRANCES_MODE | FLOORSMAX_MODE | FLOORSMIN_MODE | LANDAREA_MODE | LIVINGAPARTMENTS_MODE | LIVINGAREA_MODE | NONLIVINGAPARTMENTS_MODE | NONLIVINGAREA_MODE | APARTMENTS_MEDI | BASEMENTAREA_MEDI | YEARS_BEGINEXPLUATATION_MEDI | YEARS_BUILD_MEDI | COMMONAREA_MEDI | ELEVATORS_MEDI | ENTRANCES_MEDI | FLOORSMAX_MEDI | FLOORSMIN_MEDI | LANDAREA_MEDI | LIVINGAPARTMENTS_MEDI | LIVINGAREA_MEDI | NONLIVINGAPARTMENTS_MEDI | NONLIVINGAREA_MEDI | FONDKAPREMONT_MODE | HOUSETYPE_MODE | TOTALAREA_MODE | WALLSMATERIAL_MODE | EMERGENCYSTATE_MODE | OBS_30_CNT_SOCIAL_CIRCLE | DEF_30_CNT_SOCIAL_CIRCLE | OBS_60_CNT_SOCIAL_CIRCLE | DEF_60_CNT_SOCIAL_CIRCLE | DAYS_LAST_PHONE_CHANGE | FLAG_DOCUMENT_2 | FLAG_DOCUMENT_3 | FLAG_DOCUMENT_4 | FLAG_DOCUMENT_5 | FLAG_DOCUMENT_6 | FLAG_DOCUMENT_7 | FLAG_DOCUMENT_8 | FLAG_DOCUMENT_9 | FLAG_DOCUMENT_10 | FLAG_DOCUMENT_11 | FLAG_DOCUMENT_12 | FLAG_DOCUMENT_13 | FLAG_DOCUMENT_14 | FLAG_DOCUMENT_15 | FLAG_DOCUMENT_16 | FLAG_DOCUMENT_17 | FLAG_DOCUMENT_18 | FLAG_DOCUMENT_19 | FLAG_DOCUMENT_20 | FLAG_DOCUMENT_21 | AMT_REQ_CREDIT_BUREAU_HOUR | AMT_REQ_CREDIT_BUREAU_DAY | AMT_REQ_CREDIT_BUREAU_WEEK | AMT_REQ_CREDIT_BUREAU_MON | AMT_REQ_CREDIT_BUREAU_QRT | AMT_REQ_CREDIT_BUREAU_YEAR | TARGET | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 100001 | 0 | 1 | 0 | 0 | 0 | 135000.0 | 568800.0 | 20560.5 | 450000.0 | 0 | 0 | 1 | 1 | 0 | 0.018850 | -19241 | -2329 | -5170.0 | -812 | -999.0 | 1 | 1 | 0 | 1 | 0 | 1 | -1 | 2.0 | 2 | 2 | 6 | 18 | 0 | 0 | 0 | 0 | 0 | 0 | 13 | 0.752614 | 0.789654 | 0.159520 | 0.0660 | 0.0590 | 0.9732 | -999.0000 | -999.0000 | -999.00 | 0.1379 | 0.125 | -999.0000 | -999.0000 | -999.0000 | 0.0505 | -999.0000 | -999.00 | 0.0672 | 0.0612 | 0.9732 | -999.0000 | -999.0000 | -999.0000 | 0.1379 | 0.125 | -999.0000 | -999.0000 | -999.0000 | 0.0526 | -999.0000 | -999.0000 | 0.0666 | 0.0590 | 0.9732 | -999.0000 | -999.0000 | -999.00 | 0.1379 | 0.125 | -999.0000 | -999.0000 | -999.0000 | 0.0514 | -999.0000 | -999.0000 | -1 | 0 | 0.0392 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | -1740.0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.027535 |
1 | 100005 | 0 | 0 | 0 | 0 | 0 | 99000.0 | 222768.0 | 17370.0 | 180000.0 | 0 | 0 | 0 | 1 | 0 | 0.035792 | -18064 | -4469 | -9118.0 | -1623 | -999.0 | 1 | 1 | 0 | 1 | 0 | 0 | 13 | 2.0 | 2 | 2 | 5 | 9 | 0 | 0 | 0 | 0 | 0 | 0 | 9 | 0.564990 | 0.291656 | 0.432962 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.00 | -999.0000 | -999.000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.00 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.00 | -999.0000 | -999.000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -1 | -1 | -999.0000 | -1 | -1 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 3.0 | 0.120960 |
2 | 100013 | 0 | 0 | 1 | 0 | 0 | 202500.0 | 663264.0 | 69777.0 | 630000.0 | -1 | 0 | 1 | 1 | 0 | 0.019101 | -20038 | -4458 | -2175.0 | -3503 | 5.0 | 1 | 1 | 0 | 1 | 0 | 0 | 4 | 2.0 | 2 | 2 | 1 | 14 | 0 | 0 | 0 | 0 | 0 | 0 | 25 | -999.000000 | 0.699787 | 0.610991 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.00 | -999.0000 | -999.000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.00 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.00 | -999.0000 | -999.000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -1 | -1 | -999.0000 | -1 | -1 | 0.0 | 0.0 | 0.0 | 0.0 | -856.0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 4.0 | 0.016189 |
3 | 100028 | 0 | 1 | 0 | 0 | 2 | 315000.0 | 1575000.0 | 49018.5 | 1575000.0 | 0 | 0 | 0 | 1 | 0 | 0.026392 | -13976 | -1866 | -2000.0 | -4208 | -999.0 | 1 | 1 | 0 | 1 | 1 | 0 | 5 | 4.0 | 2 | 2 | 0 | 11 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.525734 | 0.509677 | 0.612704 | 0.3052 | 0.1974 | 0.9970 | 0.9592 | 0.1165 | 0.32 | 0.2759 | 0.375 | 0.0417 | 0.2042 | 0.2404 | 0.3673 | 0.0386 | 0.08 | 0.3109 | 0.2049 | 0.9970 | 0.9608 | 0.1176 | 0.3222 | 0.2759 | 0.375 | 0.0417 | 0.2089 | 0.2626 | 0.3827 | 0.0389 | 0.0847 | 0.3081 | 0.1974 | 0.9970 | 0.9597 | 0.1173 | 0.32 | 0.2759 | 0.375 | 0.0417 | 0.2078 | 0.2446 | 0.3739 | 0.0388 | 0.0817 | 0 | 0 | 0.3700 | 2 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | -1805.0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 3.0 | 0.037421 |
4 | 100038 | 0 | 0 | 1 | 1 | 1 | 180000.0 | 625500.0 | 32067.0 | 625500.0 | 0 | 0 | 0 | 1 | 0 | 0.010032 | -13040 | -2191 | -4000.0 | -4262 | 16.0 | 1 | 1 | 1 | 1 | 0 | 0 | -1 | 3.0 | 2 | 2 | 5 | 5 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0.202145 | 0.425687 | -999.000000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.00 | -999.0000 | -999.000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.00 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.00 | -999.0000 | -999.000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -1 | -1 | -999.0000 | -1 | -1 | 0.0 | 0.0 | 0.0 | 0.0 | -821.0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -999.0 | -999.0 | -999.0 | -999.0 | -999.0 | -999.0 | 0.146077 |
In [157]:
app_train.head()
Out[157]:
SK_ID_CURR | TARGET | NAME_CONTRACT_TYPE | CODE_GENDER | FLAG_OWN_CAR | FLAG_OWN_REALTY | CNT_CHILDREN | AMT_INCOME_TOTAL | AMT_CREDIT | AMT_ANNUITY | AMT_GOODS_PRICE | NAME_TYPE_SUITE | NAME_INCOME_TYPE | NAME_EDUCATION_TYPE | NAME_FAMILY_STATUS | NAME_HOUSING_TYPE | REGION_POPULATION_RELATIVE | DAYS_BIRTH | DAYS_EMPLOYED | DAYS_REGISTRATION | DAYS_ID_PUBLISH | OWN_CAR_AGE | FLAG_MOBIL | FLAG_EMP_PHONE | FLAG_WORK_PHONE | FLAG_CONT_MOBILE | FLAG_PHONE | FLAG_EMAIL | OCCUPATION_TYPE | CNT_FAM_MEMBERS | REGION_RATING_CLIENT | REGION_RATING_CLIENT_W_CITY | WEEKDAY_APPR_PROCESS_START | HOUR_APPR_PROCESS_START | REG_REGION_NOT_LIVE_REGION | REG_REGION_NOT_WORK_REGION | LIVE_REGION_NOT_WORK_REGION | REG_CITY_NOT_LIVE_CITY | REG_CITY_NOT_WORK_CITY | LIVE_CITY_NOT_WORK_CITY | ORGANIZATION_TYPE | EXT_SOURCE_1 | EXT_SOURCE_2 | EXT_SOURCE_3 | APARTMENTS_AVG | BASEMENTAREA_AVG | YEARS_BEGINEXPLUATATION_AVG | YEARS_BUILD_AVG | COMMONAREA_AVG | ELEVATORS_AVG | ENTRANCES_AVG | FLOORSMAX_AVG | FLOORSMIN_AVG | LANDAREA_AVG | LIVINGAPARTMENTS_AVG | LIVINGAREA_AVG | NONLIVINGAPARTMENTS_AVG | NONLIVINGAREA_AVG | APARTMENTS_MODE | BASEMENTAREA_MODE | YEARS_BEGINEXPLUATATION_MODE | YEARS_BUILD_MODE | COMMONAREA_MODE | ELEVATORS_MODE | ENTRANCES_MODE | FLOORSMAX_MODE | FLOORSMIN_MODE | LANDAREA_MODE | LIVINGAPARTMENTS_MODE | LIVINGAREA_MODE | NONLIVINGAPARTMENTS_MODE | NONLIVINGAREA_MODE | APARTMENTS_MEDI | BASEMENTAREA_MEDI | YEARS_BEGINEXPLUATATION_MEDI | YEARS_BUILD_MEDI | COMMONAREA_MEDI | ELEVATORS_MEDI | ENTRANCES_MEDI | FLOORSMAX_MEDI | FLOORSMIN_MEDI | LANDAREA_MEDI | LIVINGAPARTMENTS_MEDI | LIVINGAREA_MEDI | NONLIVINGAPARTMENTS_MEDI | NONLIVINGAREA_MEDI | FONDKAPREMONT_MODE | HOUSETYPE_MODE | TOTALAREA_MODE | WALLSMATERIAL_MODE | EMERGENCYSTATE_MODE | OBS_30_CNT_SOCIAL_CIRCLE | DEF_30_CNT_SOCIAL_CIRCLE | OBS_60_CNT_SOCIAL_CIRCLE | DEF_60_CNT_SOCIAL_CIRCLE | DAYS_LAST_PHONE_CHANGE | FLAG_DOCUMENT_2 | FLAG_DOCUMENT_3 | FLAG_DOCUMENT_4 | FLAG_DOCUMENT_5 | FLAG_DOCUMENT_6 | FLAG_DOCUMENT_7 | FLAG_DOCUMENT_8 | FLAG_DOCUMENT_9 | FLAG_DOCUMENT_10 | FLAG_DOCUMENT_11 | FLAG_DOCUMENT_12 | FLAG_DOCUMENT_13 | FLAG_DOCUMENT_14 | FLAG_DOCUMENT_15 | FLAG_DOCUMENT_16 | FLAG_DOCUMENT_17 | FLAG_DOCUMENT_18 | FLAG_DOCUMENT_19 | FLAG_DOCUMENT_20 | FLAG_DOCUMENT_21 | AMT_REQ_CREDIT_BUREAU_HOUR | AMT_REQ_CREDIT_BUREAU_DAY | AMT_REQ_CREDIT_BUREAU_WEEK | AMT_REQ_CREDIT_BUREAU_MON | AMT_REQ_CREDIT_BUREAU_QRT | AMT_REQ_CREDIT_BUREAU_YEAR | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 100002 | 1.0 | 0 | 0 | 0 | 0 | 0 | 202500.0 | 406597.5 | 24700.5 | 351000.0 | 0 | 0 | 0 | 0 | 0 | 0.018801 | -9461 | -637 | -3648.0 | -2120 | -999.0 | 1 | 1 | 0 | 1 | 1 | 0 | 0 | 1.0 | 2 | 2 | 0 | 10 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.083037 | 0.262949 | 0.139376 | 0.0247 | 0.0369 | 0.9722 | 0.6192 | 0.0143 | 0.00 | 0.0690 | 0.0833 | 0.1250 | 0.0369 | 0.0202 | 0.0190 | 0.0000 | 0.0000 | 0.0252 | 0.0383 | 0.9722 | 0.6341 | 0.0144 | 0.0000 | 0.0690 | 0.0833 | 0.1250 | 0.0377 | 0.022 | 0.0198 | 0.0 | 0.0 | 0.0250 | 0.0369 | 0.9722 | 0.6243 | 0.0144 | 0.00 | 0.0690 | 0.0833 | 0.1250 | 0.0375 | 0.0205 | 0.0193 | 0.0000 | 0.00 | 0 | 0 | 0.0149 | 0 | 0 | 2.0 | 2.0 | 2.0 | 2.0 | -1134.0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
1 | 100003 | 0.0 | 0 | 1 | 0 | 1 | 0 | 270000.0 | 1293502.5 | 35698.5 | 1129500.0 | 1 | 1 | 1 | 1 | 0 | 0.003541 | -16765 | -1188 | -1186.0 | -291 | -999.0 | 1 | 1 | 0 | 1 | 1 | 0 | 1 | 2.0 | 1 | 1 | 1 | 11 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.311267 | 0.622246 | -999.000000 | 0.0959 | 0.0529 | 0.9851 | 0.7960 | 0.0605 | 0.08 | 0.0345 | 0.2917 | 0.3333 | 0.0130 | 0.0773 | 0.0549 | 0.0039 | 0.0098 | 0.0924 | 0.0538 | 0.9851 | 0.8040 | 0.0497 | 0.0806 | 0.0345 | 0.2917 | 0.3333 | 0.0128 | 0.079 | 0.0554 | 0.0 | 0.0 | 0.0968 | 0.0529 | 0.9851 | 0.7987 | 0.0608 | 0.08 | 0.0345 | 0.2917 | 0.3333 | 0.0132 | 0.0787 | 0.0558 | 0.0039 | 0.01 | 0 | 0 | 0.0714 | 1 | 0 | 1.0 | 0.0 | 1.0 | 0.0 | -828.0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
2 | 100004 | 0.0 | 1 | 0 | 1 | 0 | 0 | 67500.0 | 135000.0 | 6750.0 | 135000.0 | 0 | 0 | 0 | 0 | 0 | 0.010032 | -19046 | -225 | -4260.0 | -2531 | 26.0 | 1 | 1 | 1 | 1 | 1 | 0 | 0 | 1.0 | 2 | 2 | 1 | 9 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | -999.000000 | 0.555912 | 0.729567 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.00 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.000 | -999.0000 | -999.0 | -999.0 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.00 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.00 | -1 | -1 | -999.0000 | -1 | -1 | 0.0 | 0.0 | 0.0 | 0.0 | -815.0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
3 | 100006 | 0.0 | 0 | 1 | 0 | 0 | 0 | 135000.0 | 312682.5 | 29686.5 | 297000.0 | 0 | 0 | 0 | 2 | 0 | 0.008019 | -19005 | -3039 | -9833.0 | -2437 | -999.0 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 2.0 | 2 | 2 | 0 | 17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -999.000000 | 0.650442 | -999.000000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.00 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.000 | -999.0000 | -999.0 | -999.0 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.00 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.00 | -1 | -1 | -999.0000 | -1 | -1 | 2.0 | 0.0 | 2.0 | 0.0 | -617.0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -999.0 | -999.0 | -999.0 | -999.0 | -999.0 | -999.0 |
4 | 100007 | 0.0 | 0 | 0 | 0 | 0 | 0 | 121500.0 | 513000.0 | 21865.5 | 513000.0 | 0 | 0 | 0 | 0 | 0 | 0.028663 | -19932 | -3038 | -4311.0 | -3458 | -999.0 | 1 | 1 | 0 | 1 | 0 | 0 | 1 | 1.0 | 2 | 2 | 2 | 11 | 0 | 0 | 0 | 0 | 1 | 1 | 3 | -999.000000 | 0.322738 | -999.000000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.00 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.000 | -999.0000 | -999.0 | -999.0 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.00 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.0000 | -999.00 | -1 | -1 | -999.0000 | -1 | -1 | 0.0 | 0.0 | 0.0 | 0.0 | -1106.0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
'사기 예측 (언발란스 데이터)' 카테고리의 다른 글
10_prev_baseline_01_exercise_pandas_대출상환 예측_피처 엔지니어링_2단계(판다스 고급기술 활용) (0) | 2022.11.03 |
---|---|
09_4_app_baseline_02_exercise_피처 엔지니어_1단계 (0) | 2022.11.03 |