In [2]:
import pandas as pd
In [47]:
from IPython.core.display import display, HTML
display(HTML("<style>.container {width:90% !important;}</style>"))
C:\Users\82105\AppData\Local\Temp\ipykernel_14852\3510566465.py:1: DeprecationWarning: Importing display from IPython.core.display is deprecated since IPython 7.14, please import from IPython display
from IPython.core.display import display, HTML
테스트용 DataFrame 생성¶
In [3]:
cust_dict = {
'customer_name':['Alice', 'Tom', 'James', 'Yerin', 'Min', 'Katherine', 'John', 'Park', 'Maria', 'Derik', 'Jin'],
'cust_country':['US', 'GB', 'US', 'KOR', 'KOR', 'GB', 'US', 'KOR', 'US', 'GB', 'KOR'],
'gender':['F', 'M', 'M', 'F', 'M', 'F', 'M', 'M', 'F', 'M', 'F'],
'grade':[3, 1, 2, 5, 4, 3, 5, 7, 2, 4, 5],
'age':[25, 34, 26, 33, 67, 29, 54, 21, 77, 29, 16]
}
customer = pd.DataFrame(cust_dict)
customer.head(11)
Out[3]:
customer_name | cust_country | gender | grade | age | |
---|---|---|---|---|---|
0 | Alice | US | F | 3 | 25 |
1 | Tom | GB | M | 1 | 34 |
2 | James | US | M | 2 | 26 |
3 | Yerin | KOR | F | 5 | 33 |
4 | Min | KOR | M | 4 | 67 |
5 | Katherine | GB | F | 3 | 29 |
6 | John | US | M | 5 | 54 |
7 | Park | KOR | M | 7 | 21 |
8 | Maria | US | F | 2 | 77 |
9 | Derik | GB | M | 4 | 29 |
10 | Jin | KOR | F | 5 | 16 |
SQL Group by¶
SELECT cust_country, sum(grade) sum_grade, max(grade) max_grade, avg(age) avg_age , max(age) max_age FROM customer group by cust_country
Pandas Groupby¶
grouby key 컬럼으로 DataFrameGroupby 를 생성¶
In [11]:
cust_group = customer.groupby('cust_country')
type(cust_group)
Out[11]:
pandas.core.groupby.generic.DataFrameGroupBy
In [12]:
cust_group.head()
Out[12]:
customer_name | cust_country | gender | grade | age | |
---|---|---|---|---|---|
0 | Alice | US | F | 3 | 25 |
1 | Tom | GB | M | 1 | 34 |
2 | James | US | M | 2 | 26 |
3 | Yerin | KOR | F | 5 | 33 |
4 | Min | KOR | M | 4 | 67 |
5 | Katherine | GB | F | 3 | 29 |
6 | John | US | M | 5 | 54 |
7 | Park | KOR | M | 7 | 21 |
8 | Maria | US | F | 2 | 77 |
9 | Derik | GB | M | 4 | 29 |
10 | Jin | KOR | F | 5 | 16 |
개별 aggregation 컬럼 및 aggregation 연산별로 API 호출¶
In [32]:
cust_agg = pd.DataFrame()
cust_agg
Out[32]:
In [33]:
cust_agg = pd.DataFrame()
cust_agg['sum_grade'] = cust_group['grade'].sum()
cust_agg['max_grade'] = cust_group['grade'].max()
cust_agg.head()
Out[33]:
sum_grade | max_grade | |
---|---|---|
cust_country | ||
GB | 8 | 4 |
KOR | 21 | 7 |
US | 12 | 5 |
In [34]:
cust_agg['avg_age'] = cust_group['age'].mean()
cust_agg['max_age'] = cust_group['age'].max()
cust_agg.head()
Out[34]:
sum_grade | max_grade | avg_age | max_age | |
---|---|---|---|---|
cust_country | ||||
GB | 8 | 4 | 30.666667 | 34 |
KOR | 21 | 7 | 34.250000 | 67 |
US | 12 | 5 | 45.500000 | 77 |
In [37]:
print(cust_agg.index.name)
cust_agg['avg_age'] = cust_group['age'].mean()
cust_agg['max_age'] = cust_group['age'].max()
cust_agg.head()
cust_country
Out[37]:
sum_grade | max_grade | avg_age | max_age | |
---|---|---|---|---|
cust_country | ||||
GB | 8 | 4 | 30.666667 | 34 |
KOR | 21 | 7 | 34.250000 | 67 |
US | 12 | 5 | 45.500000 | 77 |
In [38]:
print(cust_agg.index.name)
cust_country
agg()함수를 활용. agg() 인자로 aggregation 함수 리스트를 입력. 여전히 aggregation 컬럼별로 API 적용¶
In [9]:
cust_agg1 = cust_group['grade'].agg(['sum', 'max'])
cust_agg2 = cust_group['age'].agg(['mean', 'max'])
print(cust_agg1.head())
print(cust_agg2.head())
sum max
cust_country
GB 8 4
KOR 21 7
US 12 5
mean max
cust_country
GB 30.666667 34
KOR 34.250000 67
US 45.500000 77
In [10]:
cust_agg = cust_agg1.merge(cust_agg2, on='cust_country', how='left')
cust_agg.head()
Out[10]:
sum | max_x | mean | max_y | |
---|---|---|---|---|
cust_country | ||||
GB | 8 | 4 | 30.666667 | 34 |
KOR | 21 | 7 | 34.250000 | 67 |
US | 12 | 5 | 45.500000 | 77 |
In [11]:
cust_agg = pd.DataFrame()
cust_agg[['sum_grade', 'max_grade']] = cust_group['grade'].agg(['sum', 'max'])
cust_agg[['avg_age', 'max_age']] = cust_group['age'].agg(['mean', 'max'])
cust_agg.head()
Out[11]:
sum_grade | max_grade | avg_age | max_age | |
---|---|---|---|---|
cust_country | ||||
GB | 8 | 4 | 30.666667 | 34 |
KOR | 21 | 7 | 34.250000 | 67 |
US | 12 | 5 | 45.500000 | 77 |
agg()에 인자로 Aggregation 컬럼값과 Aggregation 함수 리스트를 기재한 dictinary값 입력¶
In [42]:
cus_ff=pd.DataFrame()
In [46]:
agg_dict = {
'grade':['sum', 'max'],
'age':['mean', 'max']
}
cust_agg = cust_group.agg(agg_dict)
cust_agg.head()
cus_ff=pd.DataFrame()
aff={
'grade':['sum','max'],
'age':['max','min']
}
cus_ff=cust_group.agg(aff)
cus_ff
Out[46]:
grade | age | |||
---|---|---|---|---|
sum | max | max | min | |
cust_country | ||||
GB | 8 | 4 | 34 | 29 |
KOR | 21 | 7 | 67 | 16 |
US | 12 | 5 | 77 | 25 |
In [13]:
print(cust_agg.columns)
[('_').join(column) for column in cust_agg.columns]
MultiIndex([('grade', 'sum'),
('grade', 'max'),
( 'age', 'mean'),
( 'age', 'max')],
)
Out[13]:
['grade_sum', 'grade_max', 'age_mean', 'age_max']
groupby 수행 결과 DataFrame의 컬럼명 변경.¶
In [14]:
cust_agg.columns = [('_').join(column) for column in cust_agg.columns]
In [15]:
cust_agg.head()
Out[15]:
grade_sum | grade_max | age_mean | age_max | |
---|---|---|---|---|
cust_country | ||||
GB | 8 | 4 | 30.666667 | 34 |
KOR | 21 | 7 | 34.250000 | 67 |
US | 12 | 5 | 45.500000 | 77 |
Pandas Group by Case When¶
SQL¶
SELECT cust_country, sum(grade) total_sum,
Sum(case when gender==‘M’ then grade end) male_sum,
Sum(case when gender==‘F’ then grade end) female_sum FROM customer GROUP BY cust_country
case when 조건으로 각각 filtering하여 별도의 DataFrame생성¶
In [16]:
cond_male = customer['gender'] == 'M'
cond_female = customer['gender'] == 'F'
cust_male = customer[cond_male]
cust_female = customer[cond_female]
In [17]:
cust_female
Out[17]:
customer_name | cust_country | gender | grade | age | |
---|---|---|---|---|---|
0 | Alice | US | F | 3 | 25 |
3 | Yerin | KOR | F | 5 | 33 |
5 | Katherine | GB | F | 3 | 29 |
8 | Maria | US | F | 2 | 77 |
10 | Jin | KOR | F | 5 | 16 |
원본 데이터에 cust_country로 groupby 수행.¶
In [18]:
cust_agg = customer.groupby('cust_country')['grade'].sum()
cust_agg = cust_agg.reset_index()
In [19]:
cust_agg.head()
Out[19]:
cust_country | grade | |
---|---|---|
0 | GB | 8 |
1 | KOR | 21 |
2 | US | 12 |
filtering된 데이터에 cust_country로 groupby 수행.¶
In [20]:
cust_male_agg = cust_male.groupby('cust_country')['grade'].sum()
cust_female_agg = cust_female.groupby('cust_country')['grade'].sum()
cust_male_agg = cust_male_agg.reset_index()
cust_female_agg = cust_female_agg.reset_index()
In [21]:
cust_male_agg
Out[21]:
cust_country | grade | |
---|---|---|
0 | GB | 5 |
1 | KOR | 11 |
2 | US | 7 |
원본 데이터에 groupby 된 데이터 세트와 filtering된 데이터에 groupby 된 데이터 세트를 조인.¶
In [22]:
cust_agg = cust_agg.merge(cust_male_agg, on='cust_country', how='left')
cust_agg = cust_agg.merge(cust_female_agg, on='cust_country', how='left')
cust_agg.head()
Out[22]:
cust_country | grade_x | grade_y | grade | |
---|---|---|---|---|
0 | GB | 8 | 5 | 3 |
1 | KOR | 21 | 11 | 10 |
2 | US | 12 | 7 | 5 |
In [23]:
cust_agg.columns = ['cust_country', 'sum_grade', 'sum_male_grade', 'sum_female_grade']
cust_agg
Out[23]:
cust_country | sum_grade | sum_male_grade | sum_female_grade | |
---|---|---|---|---|
0 | GB | 8 | 5 | 3 |
1 | KOR | 21 | 11 | 10 |
2 | US | 12 | 7 | 5 |
In [ ]: