In [17]:
import pandas as pd

# Load the per-year recovered-cases files and merge them into a single
# wide table keyed on Country_Region.
years = ['2020', '2021', '2022']
df_year = []
for year in years:
    path = 'final_covid_data_for_Recovered_%s.csv' % year
    covid_df = pd.read_csv(path, encoding='utf-8-sig', index_col=0)
    df_year.append(covid_df)

covid_df = pd.merge(df_year[0], df_year[1], on='Country_Region')
covid_df = pd.merge(covid_df, df_year[2], on='Country_Region')

# Every yearly file carries the flag-URL column, so after the two merges
# it appears three times: Country_Flag_x (2020), Country_Flag_y (2021)
# and Country_Flag (2022).  Keep a single copy under its original name
# instead of removing entries from a column list and renaming by position.
covid_df = (
    covid_df
    .drop(columns=['Country_Flag_y', 'Country_Flag'])
    .rename(columns={'Country_Flag_x': 'Country_Flag'})
)
covid_df.to_csv("test_Confirmed_2020~2022.csv")
In [16]:
# Re-read the three yearly files and display the most recent one (2022)
# to eyeball the merged columns.
years = ['2020', '2021', '2022']
df_year = []
for year in years:
    path = 'final_covid_data_for_Recovered_%s.csv' % year
    covid_df = pd.read_csv(path, encoding='utf-8-sig', index_col=0)
    df_year.append(covid_df)
df_year[2]
Out[16]:
| Country_Region | Country_Flag | 1/01/2022 | 1/02/2022 | 1/03/2022 | 1/04/2022 | 1/05/2022 | 1/06/2022 | 1/07/2022 | 1/08/2022 | ... | 7/29/2022 | 7/30/2022 | 7/31/2022 | 8/01/2022 | 8/02/2022 | 8/03/2022 | 8/04/2022 | 8/05/2022 | 8/06/2022 | 8/07/2022 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Afghanistan | https://flagcdn.com/48x36/af.png | 158107 | 158189 | 158183 | 158205 | 158245 | 158275 | 158300 | 158309 | ... | 185481 | 185552 | 185749 | 185930 | 186120 | 186393 | 186697 | 187037 | 187109 | 187442 |
| 1 | Albania | https://flagcdn.com/48x36/al.png | 210224 | 210885 | 210885 | 212021 | 212021 | 213257 | 214905 | 214905 | ... | 310362 | 311381 | 312097 | 312375 | 313582 | 314561 | 315337 | 316145 | 316976 | 317514 |
| 2 | Algeria | https://flagcdn.com/48x36/dz.png | 218818 | 219159 | 219532 | 219953 | 220415 | 220825 | 221316 | 221742 | ... | 267287 | 267374 | 267454 | 267546 | 267657 | 267777 | 267902 | 268033 | 268141 | 268254 |
| 3 | Andorra | https://flagcdn.com/48x36/ad.png | 23740 | 23740 | 24502 | 24802 | 25289 | 25289 | 26408 | 26408 | ... | 45508 | 45508 | 45508 | 45508 | 45508 | 45793 | 45793 | 45793 | 45793 | 45793 |
| 4 | Angola | https://flagcdn.com/48x36/ao.png | 82398 | 82920 | 83764 | 84666 | 86636 | 87625 | 88775 | 89251 | ... | 102301 | 102301 | 102301 | 102301 | 102301 | 102301 | 102301 | 102636 | 102636 | 102636 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 192 | West Bank and Gaza | https://flagcdn.com/48x36/ps.png | 469748 | 469748 | 469748 | 471090 | 471414 | 471414 | 471414 | 471414 | ... | 682096 | 682096 | 682096 | 682096 | 691065 | 691065 | 691065 | 691065 | 691065 | 691065 |
| 193 | Winter Olympics 2022 | https://flagcdn.com/48x36/.png | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 535 | 535 | 535 | 535 | 535 | 535 | 535 | 535 | 535 | 535 |
| 194 | Yemen | https://flagcdn.com/48x36/ye.png | 10127 | 10130 | 10138 | 10143 | 10146 | 10152 | 10159 | 10178 | ... | 11849 | 11877 | 11877 | 11877 | 11877 | 11877 | 11895 | 11895 | 11895 | 11895 |
| 195 | Zambia | https://flagcdn.com/48x36/zm.png | 257948 | 259677 | 261221 | 265479 | 269991 | 274087 | 278276 | 281576 | ... | 329483 | 329483 | 329483 | 329483 | 330575 | 330575 | 330575 | 331074 | 331074 | 331358 |
| 196 | Zimbabwe | https://flagcdn.com/48x36/zw.png | 214214 | 214214 | 216087 | 217678 | 219057 | 220178 | 221282 | 221918 | ... | 256376 | 256376 | 256378 | 256382 | 256403 | 256423 | 256444 | 256444 | 256444 | 256447 |
196 rows × 221 columns
In [2]:
# Quick sanity check: the loop yields each year label in order.
years = ['2020', '2021', '2022']
for year in years:
    print(year)
2020
2021
2022
In [13]:
# Demo: append the same three-row customer frame ten times, then
# display one element to show list indexing of DataFrames.
dfs = []
for _ in range(10):
    df1 = pd.DataFrame(
        {
            'id': [1, 2, 3],
            'customer_id': [1, 2, 3],
            'customer_name': ['Robert', 'Peter', 'Dave'],
        },
        columns=['id', 'customer_id', 'customer_name'],
    )
    dfs.append(df1)
dfs[1]
Out[13]:
| id | customer_id | customer_name | |
|---|---|---|---|
| 0 | 1 | 1 | Robert |
| 1 | 2 | 2 | Peter |
| 2 | 3 | 3 | Dave |
In [ ]:
미국만 꺼내고
주별로 그룹바이해서
년도별로 묶고
In [51]:
import pandas as pd
import json, os

# Lookup table that maps inconsistent country spellings across the
# daily report files onto one canonical name.
with open('csse_covid_19_data2020/country_convert.json', 'r', encoding='utf-8-sig') as json_file:
    json_data = json.load(json_file)

def country_name_convert(row):
    """Return the canonical country name for a dataframe row.

    Falls back to the row's own Country_Region when the name has no
    entry in the conversion table.
    """
    return json_data.get(row['Country_Region'], row['Country_Region'])
def create_dateframe(filename):
    """Read one daily-report CSV and return US confirmed counts summed by state.

    The result is indexed by Province_State and has a single column whose
    name is the report date derived from the filename.  Relies on the
    module-level PATH and country_name_convert.
    """
    # 1. Read the csv file.
    doc = pd.read_csv(PATH + filename, encoding='utf-8-sig')
    # 2. Keep only the needed columns; older files use '/' in the headers.
    try:
        doc = doc[['Province_State', 'Country_Region', 'Confirmed']]
    except KeyError:
        doc = doc[['Province/State', 'Country/Region', 'Confirmed']]
        doc.columns = ['Province_State', 'Country_Region', 'Confirmed']
    # 3. Drop rows with no confirmed count.
    doc = doc.dropna(subset=['Confirmed'])
    # 4. Normalise country names so they match across files.
    doc['Country_Region'] = doc.apply(country_name_convert, axis=1)
    # 5. Confirmed counts are whole numbers; cast from float.
    #    (This statement had been accidentally swallowed into the comment
    #    on the previous line, so the cast never ran.)
    doc = doc.astype({'Confirmed': 'int64'})
    # 6. Keep US rows only, then sum duplicate entries per state.
    doc = doc[doc['Country_Region'] == 'US']
    doc = doc.groupby('Province_State').sum()
    # 7. Build the date column label from the filename,
    #    e.g. '01-22-2020.csv' -> '1/22/2020'.
    date_column = filename.split(".")[0].lstrip('0').replace('-', '/')
    doc.columns = [date_column]
    return doc
In [52]:
import os

def generate_dateframe_by_path(PATH):
    """Merge every daily-report CSV under PATH into one wide frame.

    Each csv contributes one date column; frames are outer-joined on the
    index (Province_State) and gaps are filled with 0.

    Raises FileNotFoundError when the directory contains no csv files
    (the original version died with an UnboundLocalError in that case).
    """
    csv_list = sorted(f for f in os.listdir(PATH) if f.split(".")[-1] == 'csv')
    if not csv_list:
        raise FileNotFoundError('no csv files found in %s' % PATH)
    final_doc = None
    for file in csv_list:
        doc = create_dateframe(file)
        if final_doc is None:
            final_doc = doc
        else:
            final_doc = pd.merge(final_doc, doc, how='outer',
                                 left_index=True, right_index=True)
    return final_doc.fillna(0)

PATH = 'csse_covid_19_data2020/csse_covid_19_daily_reports_stay/'
df_confirmed = generate_dateframe_by_path(PATH)
df_confirmed = df_confirmed.astype('int64')
df_confirmed.to_csv("Province_test1 data.csv")
In [ ]:
def create_flag_link(row):
    """Build the flagcdn 48x36 PNG url for a two-letter country code."""
    return 'https://flagcdn.com/48x36/' + row + '.png'
# Build the confirmed table with a flag-image column.
# NOTE(review): this cell was never executed (In [ ]) and merges on
# 'Country_Region' even though df_confirmed is indexed by Province_State
# with only date columns — confirm the intended merge key before running.
PATH = 'csse_covid_19_data/csse_covid_19_daily_reports_stay/'
df_confirmed = generate_dateframe_by_path(PATH)
df_confirmed = df_confirmed.astype('int64')

country_info = pd.read_csv("csse_covid_19_data/UID_ISO_FIPS_LookUp_Table.csv",
                           encoding='utf-8-sig', keep_default_na=False)
# Bug fixes: the columns used below are iso2/Country_Region (the original
# selected 'admin2' and then crashed on the missing 'iso2'); a call to the
# nonexistent Series.split('') was removed; keep='fist' was a typo.
country_info = country_info[['iso2', 'Country_Region']]
country_info = country_info.drop_duplicates(subset='Country_Region', keep='first')

doc_final_country = pd.merge(df_confirmed, country_info, how='left', on='Country_Region')
doc_final_country = doc_final_country.dropna(subset=['iso2'])
doc_final_country['iso2'] = doc_final_country['iso2'].apply(create_flag_link)

# Move the flag column next to the country name and rename it.
cols = doc_final_country.columns.tolist()
cols.remove('iso2')
cols.insert(1, 'iso2')
doc_final_country = doc_final_country[cols]
cols[1] = 'Country_Flag'
doc_final_country.columns = cols
doc_final_country['Country_Flag'] = doc_final_country['Country_Flag'].str.lower()
doc_final_country.to_csv("Province_State_covid_data.csv")
In [66]:
# Reload the saved state-level table for post-processing.
path = 'Province_test1 data.csv'
covid_df = pd.read_csv(path, encoding='utf-8-sig', index_col=0)
In [67]:
covid_df
Out[67]:
| 1/22/2020 | 1/23/2020 | 1/24/2020 | 1/25/2020 | 1/26/2020 | 1/27/2020 | 1/28/2020 | 1/29/2020 | 1/30/2020 | 1/31/2020 | ... | 6/08/2020 | 6/09/2020 | 6/10/2020 | 6/11/2020 | 6/12/2020 | 6/13/2020 | 6/14/2020 | 6/15/2020 | 6/16/2020 | 6/17/2020 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Province_State | |||||||||||||||||||||
| Norfolk County, MA | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| Alabama | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 20925 | 21422 | 21989 | 22845 | 23710 | 24601 | 25615 | 26272 | 26912 | 27312 |
| Alameda County, CA | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| Alaska | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 562 | 572 | 592 | 609 | 624 | 653 | 660 | 663 | 675 | 695 |
| American Samoa | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| Williamson County, TN | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| Wisconsin | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 21038 | 21308 | 21593 | 21926 | 22246 | 22518 | 22758 | 22932 | 23198 | 23456 |
| Wuhan Evacuee | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| Wyoming | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 960 | 970 | 980 | 1009 | 1027 | 1050 | 1060 | 1079 | 1089 | 1114 |
| Yolo County, CA | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
199 rows × 148 columns
In [68]:
# Promote the Province_State index back to a regular column so it can be
# cleaned with string operations.
# (Removed a leftover commented-out, syntactically invalid lambda.)
covid_df = covid_df.reset_index()
In [70]:
# Keep only the state part of entries like 'Norfolk County, MA'.
covid_df['Province_State'] = covid_df['Province_State'].apply(lambda s: s.split(',')[0])
In [71]:
covid_df
Out[71]:
| Province_State | 1/22/2020 | 1/23/2020 | 1/24/2020 | 1/25/2020 | 1/26/2020 | 1/27/2020 | 1/28/2020 | 1/29/2020 | 1/30/2020 | ... | 6/08/2020 | 6/09/2020 | 6/10/2020 | 6/11/2020 | 6/12/2020 | 6/13/2020 | 6/14/2020 | 6/15/2020 | 6/16/2020 | 6/17/2020 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Norfolk County | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 1 | Alabama | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 20925 | 21422 | 21989 | 22845 | 23710 | 24601 | 25615 | 26272 | 26912 | 27312 |
| 2 | Alameda County | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 3 | Alaska | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 562 | 572 | 592 | 609 | 624 | 653 | 660 | 663 | 675 | 695 |
| 4 | American Samoa | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 194 | Williamson County | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 195 | Wisconsin | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 21038 | 21308 | 21593 | 21926 | 22246 | 22518 | 22758 | 22932 | 23198 | 23456 |
| 196 | Wuhan Evacuee | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 197 | Wyoming | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 960 | 970 | 980 | 1009 | 1027 | 1050 | 1060 | 1079 | 1089 | 1114 |
| 198 | Yolo County | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
199 rows × 149 columns
In [76]:
# Collapse the duplicate state names created by stripping the ', XX'
# suffix, summing their daily counts.
covid_df.groupby(by='Province_State').sum()
Out[76]:
| 1/22/2020 | 1/23/2020 | 1/24/2020 | 1/25/2020 | 1/26/2020 | 1/27/2020 | 1/28/2020 | 1/29/2020 | 1/30/2020 | 1/31/2020 | ... | 6/08/2020 | 6/09/2020 | 6/10/2020 | 6/11/2020 | 6/12/2020 | 6/13/2020 | 6/14/2020 | 6/15/2020 | 6/16/2020 | 6/17/2020 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Province_State | |||||||||||||||||||||
| Norfolk County | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| Alabama | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 20925 | 21422 | 21989 | 22845 | 23710 | 24601 | 25615 | 26272 | 26912 | 27312 |
| Alameda County | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| Alaska | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 562 | 572 | 592 | 609 | 624 | 653 | 660 | 663 | 675 | 695 |
| American Samoa | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| Williamson County | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| Wisconsin | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 21038 | 21308 | 21593 | 21926 | 22246 | 22518 | 22758 | 22932 | 23198 | 23456 |
| Wuhan Evacuee | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| Wyoming | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 960 | 970 | 980 | 1009 | 1027 | 1050 | 1060 | 1079 | 1089 | 1114 |
| Yolo County | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
184 rows × 148 columns
'파이썬 활용 > 전처리 모음집' 카테고리의 다른 글
| 인구통계분석_학교 수업 때 배운 부분 (0) | 2022.10.25 |
|---|---|
| 브라질 이커머스 쇼핑몰 매우 큰 데이터 셋으로 전처리 후 분석 해보기 (0) | 2022.10.25 |
| 코로나 년도별 확진자수 분석 전처리 (0) | 2022.10.25 |
| 데이터 전처리 판다스 병합, 분할, 특정 조회, csv로 저장 (0) | 2022.10.25 |