In [2]:

import pandas as pd

In [47]:

from IPython.core.display import display, HTML
display(HTML("<style>.container {width:90% !important;}</style>"))

C:\Users\82105\AppData\Local\Temp\ipykernel_14852\3510566465.py:1: DeprecationWarning: Importing display from IPython.core.display is deprecated since IPython 7.14, please import from IPython display
  from IPython.core.display import display, HTML

테스트용 DataFrame 생성¶

In [3]:

cust_dict = {
    'customer_name':['Alice', 'Tom', 'James', 'Yerin', 'Min', 'Katherine', 'John', 'Park', 'Maria', 'Derik', 'Jin'],
    'cust_country':['US', 'GB', 'US', 'KOR', 'KOR', 'GB', 'US', 'KOR', 'US', 'GB', 'KOR'],
    'gender':['F', 'M', 'M', 'F', 'M', 'F', 'M', 'M', 'F', 'M', 'F'],
    'grade':[3, 1, 2, 5, 4, 3, 5, 7, 2, 4, 5],
    'age':[25, 34, 26, 33, 67, 29, 54, 21, 77, 29, 16]
}
customer = pd.DataFrame(cust_dict)

customer.head(11)

Out[3]:

	customer_name	cust_country	gender	grade	age
0	Alice	US	F	3	25
1	Tom	GB	M	1	34
2	James	US	M	2	26
3	Yerin	KOR	F	5	33
4	Min	KOR	M	4	67
5	Katherine	GB	F	3	29
6	John	US	M	5	54
7	Park	KOR	M	7	21
8	Maria	US	F	2	77
9	Derik	GB	M	4	29
10	Jin	KOR	F	5	16

SQL Group by¶

SELECT cust_country, sum(grade) sum_grade, max(grade) max_grade, avg(age) avg_age , max(age) max_age FROM customer group by cust_country

Pandas Groupby¶

grouby key 컬럼으로 DataFrameGroupby 를 생성¶

In [11]:

cust_group = customer.groupby('cust_country')
type(cust_group)

Out[11]:

pandas.core.groupby.generic.DataFrameGroupBy

In [12]:

cust_group.head()

Out[12]:

	customer_name	cust_country	gender	grade	age
0	Alice	US	F	3	25
1	Tom	GB	M	1	34
2	James	US	M	2	26
3	Yerin	KOR	F	5	33
4	Min	KOR	M	4	67
5	Katherine	GB	F	3	29
6	John	US	M	5	54
7	Park	KOR	M	7	21
8	Maria	US	F	2	77
9	Derik	GB	M	4	29
10	Jin	KOR	F	5	16

개별 aggregation 컬럼 및 aggregation 연산별로 API 호출¶

In [32]:

cust_agg = pd.DataFrame()
cust_agg

Out[32]:

In [33]:

cust_agg = pd.DataFrame()
cust_agg['sum_grade'] = cust_group['grade'].sum()
cust_agg['max_grade'] = cust_group['grade'].max()
cust_agg.head()

Out[33]:

	sum_grade	max_grade
cust_country
GB	8	4
KOR	21	7
US	12	5

In [34]:

cust_agg['avg_age'] = cust_group['age'].mean()
cust_agg['max_age'] = cust_group['age'].max()
cust_agg.head()

Out[34]:

	sum_grade	max_grade	avg_age	max_age
cust_country
GB	8	4	30.666667	34
KOR	21	7	34.250000	67
US	12	5	45.500000	77

In [37]:

print(cust_agg.index.name)
cust_agg['avg_age'] = cust_group['age'].mean()
cust_agg['max_age'] = cust_group['age'].max()
cust_agg.head()

cust_country

Out[37]:

	sum_grade	max_grade	avg_age	max_age
cust_country
GB	8	4	30.666667	34
KOR	21	7	34.250000	67
US	12	5	45.500000	77

In [38]:

print(cust_agg.index.name)

cust_country

agg()함수를 활용. agg() 인자로 aggregation 함수 리스트를 입력. 여전히 aggregation 컬럼별로 API 적용¶

In [9]:

cust_agg1 = cust_group['grade'].agg(['sum', 'max'])
cust_agg2 = cust_group['age'].agg(['mean', 'max'])
print(cust_agg1.head())
print(cust_agg2.head())

              sum  max
cust_country          
GB              8    4
KOR            21    7
US             12    5
                   mean  max
cust_country                
GB            30.666667   34
KOR           34.250000   67
US            45.500000   77

In [10]:

cust_agg = cust_agg1.merge(cust_agg2, on='cust_country', how='left')
cust_agg.head()

Out[10]:

	sum	max_x	mean	max_y
cust_country
GB	8	4	30.666667	34
KOR	21	7	34.250000	67
US	12	5	45.500000	77

In [11]:

cust_agg = pd.DataFrame()
cust_agg[['sum_grade', 'max_grade']] = cust_group['grade'].agg(['sum', 'max'])
cust_agg[['avg_age', 'max_age']] = cust_group['age'].agg(['mean', 'max'])
cust_agg.head()

Out[11]:

	sum_grade	max_grade	avg_age	max_age
cust_country
GB	8	4	30.666667	34
KOR	21	7	34.250000	67
US	12	5	45.500000	77

agg()에 인자로 Aggregation 컬럼값과 Aggregation 함수 리스트를 기재한 dictinary값 입력¶

In [42]:

cus_ff=pd.DataFrame()

In [46]:

agg_dict = {
    'grade':['sum', 'max'],
    'age':['mean', 'max']
}
cust_agg = cust_group.agg(agg_dict)
cust_agg.head()


cus_ff=pd.DataFrame()

aff={
    'grade':['sum','max'],
    'age':['max','min']
    
}

cus_ff=cust_group.agg(aff)
cus_ff

Out[46]:

	grade		age
	sum	max	max	min
cust_country
GB	8	4	34	29
KOR	21	7	67	16
US	12	5	77	25

In [13]:

print(cust_agg.columns)
[('_').join(column) for column in cust_agg.columns]

MultiIndex([('grade',  'sum'),
            ('grade',  'max'),
            (  'age', 'mean'),
            (  'age',  'max')],
           )

Out[13]:

['grade_sum', 'grade_max', 'age_mean', 'age_max']

groupby 수행 결과 DataFrame의 컬럼명 변경.¶

In [14]:

cust_agg.columns = [('_').join(column) for column in cust_agg.columns]

In [15]:

cust_agg.head()

Out[15]:

	grade_sum	grade_max	age_mean	age_max
cust_country
GB	8	4	30.666667	34
KOR	21	7	34.250000	67
US	12	5	45.500000	77

Pandas Group by Case When¶

SQL¶

SELECT cust_country, sum(grade) total_sum,
Sum(case when gender==‘M’ then grade end) male_sum,
Sum(case when gender==‘F’ then grade end) female_sum FROM customer GROUP BY cust_country

case when 조건으로 각각 filtering하여 별도의 DataFrame생성¶

In [16]:

cond_male = customer['gender'] == 'M'
cond_female = customer['gender'] == 'F'
cust_male = customer[cond_male]
cust_female = customer[cond_female]

In [17]:

cust_female

Out[17]:

	customer_name	cust_country	gender	grade	age
0	Alice	US	F	3	25
3	Yerin	KOR	F	5	33
5	Katherine	GB	F	3	29
8	Maria	US	F	2	77
10	Jin	KOR	F	5	16

원본 데이터에 cust_country로 groupby 수행.¶

In [18]:

cust_agg = customer.groupby('cust_country')['grade'].sum()
cust_agg = cust_agg.reset_index()

In [19]:

cust_agg.head()

Out[19]:

	cust_country	grade
0	GB	8
1	KOR	21
2	US	12

filtering된 데이터에 cust_country로 groupby 수행.¶

In [20]:

cust_male_agg = cust_male.groupby('cust_country')['grade'].sum()
cust_female_agg = cust_female.groupby('cust_country')['grade'].sum()
cust_male_agg = cust_male_agg.reset_index()
cust_female_agg = cust_female_agg.reset_index()

In [21]:

cust_male_agg

Out[21]:

	cust_country	grade
0	GB	5
1	KOR	11
2	US	7

원본 데이터에 groupby 된 데이터 세트와 filtering된 데이터에 groupby 된 데이터 세트를 조인.¶

In [22]:

cust_agg = cust_agg.merge(cust_male_agg, on='cust_country', how='left')
cust_agg = cust_agg.merge(cust_female_agg, on='cust_country', how='left')
cust_agg.head()

Out[22]:

	cust_country	grade_x	grade_y	grade
0	GB	8	5	3
1	KOR	21	11	10
2	US	12	7	5

In [23]:

cust_agg.columns = ['cust_country', 'sum_grade', 'sum_male_grade', 'sum_female_grade']
cust_agg

Out[23]:

	cust_country	sum_grade	sum_male_grade	sum_female_grade
0	GB	8	5	3
1	KOR	21	11	10
2	US	12	7	5

In [ ]:

with_open_형준

판다스 그룹바이

테스트용 DataFrame 생성¶

SQL Group by¶

Pandas Groupby¶

grouby key 컬럼으로 DataFrameGroupby 를 생성¶

개별 aggregation 컬럼 및 aggregation 연산별로 API 호출¶

agg()함수를 활용. agg() 인자로 aggregation 함수 리스트를 입력. 여전히 aggregation 컬럼별로 API 적용¶

agg()에 인자로 Aggregation 컬럼값과 Aggregation 함수 리스트를 기재한 dictinary값 입력¶

groupby 수행 결과 DataFrame의 컬럼명 변경.¶

Pandas Group by Case When¶

SQL¶

case when 조건으로 각각 filtering하여 별도의 DataFrame생성¶

원본 데이터에 cust_country로 groupby 수행.¶

filtering된 데이터에 cust_country로 groupby 수행.¶

원본 데이터에 groupby 된 데이터 세트와 filtering된 데이터에 groupby 된 데이터 세트를 조인.¶

'파이썬 기본' 카테고리의 다른 글

티스토리툴바

판다스 그룹바이

테스트용 DataFrame 생성¶

SQL Group by¶

Pandas Groupby¶

grouby key 컬럼으로 DataFrameGroupby 를 생성¶

개별 aggregation 컬럼 및 aggregation 연산별로 API 호출¶

agg()함수를 활용. agg() 인자로 aggregation 함수 리스트를 입력. 여전히 aggregation 컬럼별로 API 적용¶

agg()에 인자로 Aggregation 컬럼값과 Aggregation 함수 리스트를 기재한 dictinary값 입력¶

groupby 수행 결과 DataFrame의 컬럼명 변경.¶

Pandas Group by Case When¶

SQL¶

case when 조건으로 각각 filtering하여 별도의 DataFrame생성¶

원본 데이터에 cust_country로 groupby 수행.¶

filtering된 데이터에 cust_country로 groupby 수행.¶

원본 데이터에 groupby 된 데이터 세트와 filtering된 데이터에 groupby 된 데이터 세트를 조인.¶

'파이썬 기본' 카테고리의 다른 글

'파이썬 기본' Related Articles

티스토리툴바